Address most comments from PR review

Hector Sanjuan 2021-10-05 14:04:28 +02:00
parent cf4fb74993
commit 6b31f44351
7 changed files with 69 additions and 63 deletions

View File

@@ -39,6 +39,16 @@ import (
// ReplicationFactorMax is reached. Error if there are less than
// ReplicationFactorMin.
// A wrapper to carry peer metrics that have been classified.
type classifiedMetrics struct {
current api.MetricsSet
currentPeers []peer.ID
candidate api.MetricsSet
candidatePeers []peer.ID
priority api.MetricsSet
priorityPeers []peer.ID
}
// allocate finds peers to allocate a hash using the informer and the monitor
// it should only be used with valid replicationFactors (if rplMin and rplMax
// are > 0, then rplMin <= rplMax).
@@ -65,13 +75,16 @@ func (c *Cluster) allocate(ctx context.Context, hash cid.Cid, currentPin *api.Pi
currentAllocs = currentPin.Allocations
}
// Get metrics that the allocator is interested in
mSet := make(api.MetricsSet)
metrics := c.allocator.Metrics()
for _, metricName := range metrics {
mSet[metricName] = c.monitor.LatestMetrics(ctx, metricName)
}
curSet, curPeers, candSet, candPeers, prioSet, prioPeers := filterMetrics(
// Filter and divide metrics. The resulting sets only have peers that
// have all the metrics needed and are not blacklisted.
classified := filterMetrics(
mSet,
len(metrics),
currentAllocs,
@@ -84,12 +97,7 @@ func (c *Cluster) allocate(ctx context.Context, hash cid.Cid, currentPin *api.Pi
hash,
rplMin,
rplMax,
curSet,
candSet,
prioSet,
curPeers,
candPeers,
prioPeers,
classified,
)
if err != nil {
return newAllocs, err
@@ -108,14 +116,7 @@ func (c *Cluster) allocate(ctx context.Context, hash cid.Cid, currentPin *api.Pi
//
// For a metric/peer to be included in a group, it is necessary that it has
// metrics for all informers.
func filterMetrics(mSet api.MetricsSet, numMetrics int, currentAllocs, priorityList, blacklist []peer.ID) (
curSet api.MetricsSet,
curPeers []peer.ID,
candSet api.MetricsSet,
candPeers []peer.ID,
prioSet api.MetricsSet,
prioPeers []peer.ID,
) {
func filterMetrics(mSet api.MetricsSet, numMetrics int, currentAllocs, priorityList, blacklist []peer.ID) classifiedMetrics {
curPeersMap := make(map[peer.ID][]*api.Metric)
candPeersMap := make(map[peer.ID][]*api.Metric)
prioPeersMap := make(map[peer.ID][]*api.Metric)
@@ -141,8 +142,11 @@ func filterMetrics(mSet api.MetricsSet, numMetrics int, currentAllocs, priorityL
mSet := make(api.MetricsSet)
peers := make([]peer.ID, 0, len(peersMap))
// Put the metrics in their sets if peers have metrics for all informers
// Record peers
// Put the metrics in their sets if peers have metrics for all
// informers. Record the peers. This relies on LatestMetrics
// returning exactly one metric per peer: a peer that has all the
// needed metrics has exactly numMetrics of them. Otherwise, the
// peer is ignored.
for p, metrics := range peersMap {
if len(metrics) == numMetrics {
for _, m := range metrics {
@@ -154,11 +158,18 @@ func filterMetrics(mSet api.MetricsSet, numMetrics int, currentAllocs, priorityL
return mSet, peers
}
curSet, curPeers = fillMetricsSet(curPeersMap)
candSet, candPeers = fillMetricsSet(candPeersMap)
prioSet, prioPeers = fillMetricsSet(prioPeersMap)
curSet, curPeers := fillMetricsSet(curPeersMap)
candSet, candPeers := fillMetricsSet(candPeersMap)
prioSet, prioPeers := fillMetricsSet(prioPeersMap)
return
return classifiedMetrics{
current: curSet,
currentPeers: curPeers,
candidate: candSet,
candidatePeers: candPeers,
priority: prioSet,
priorityPeers: prioPeers,
}
}
// allocationError logs an allocation error
@@ -182,21 +193,20 @@ func (c *Cluster) obtainAllocations(
ctx context.Context,
hash cid.Cid,
rplMin, rplMax int,
currentValidMetrics, candidatesMetrics, priorityMetrics api.MetricsSet,
currentPeers, candidatePeers, priorityPeers []peer.ID,
metrics classifiedMetrics,
) ([]peer.ID, error) {
ctx, span := trace.StartSpan(ctx, "cluster/obtainAllocations")
defer span.End()
nCurrentValid := len(currentPeers)
nAvailableValid := len(candidatePeers) + len(priorityPeers)
nCurrentValid := len(metrics.currentPeers)
nAvailableValid := len(metrics.candidatePeers) + len(metrics.priorityPeers)
needed := rplMin - nCurrentValid // The minimum we need
wanted := rplMax - nCurrentValid // The maximum we want
logger.Debugf("obtainAllocations: current: %d", nCurrentValid)
logger.Debugf("obtainAllocations: available: %d", nAvailableValid)
logger.Debugf("obtainAllocations: candidates: %d", len(candidatePeers))
logger.Debugf("obtainAllocations: priority: %d", len(priorityPeers))
logger.Debugf("obtainAllocations: candidates: %d", len(metrics.candidatePeers))
logger.Debugf("obtainAllocations: priority: %d", len(metrics.priorityPeers))
logger.Debugf("obtainAllocations: Needed: %d", needed)
logger.Debugf("obtainAllocations: Wanted: %d", wanted)
@@ -206,7 +216,7 @@ func (c *Cluster) obtainAllocations(
// This could be done more intelligently by dropping them
// according to the allocator order (i.e. free-ing peers
// with most used space first).
return currentPeers[0 : len(currentPeers)+wanted], nil
return metrics.currentPeers[0 : len(metrics.currentPeers)+wanted], nil
}
if needed <= 0 { // allocations are above minimal threshold
@@ -215,7 +225,7 @@ func (c *Cluster) obtainAllocations(
}
if nAvailableValid < needed { // not enough candidates
return nil, allocationError(hash, needed, wanted, append(priorityPeers, candidatePeers...))
return nil, allocationError(hash, needed, wanted, append(metrics.priorityPeers, metrics.candidatePeers...))
}
// We can allocate from this point. Use the allocator to decide
@@ -225,9 +235,9 @@ func (c *Cluster) obtainAllocations(
finalAllocs, err := c.allocator.Allocate(
ctx,
hash,
currentValidMetrics,
candidatesMetrics,
priorityMetrics,
metrics.current,
metrics.candidate,
metrics.priority,
)
if err != nil {
return nil, logError(err.Error())
@@ -245,5 +255,5 @@ func (c *Cluster) obtainAllocations(
// the final result is the currently valid allocations
// along with the ones provided by the allocator
return append(currentPeers, finalAllocs[0:allocationsToUse]...), nil
return append(metrics.currentPeers, finalAllocs[0:allocationsToUse]...), nil
}
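As an aside, not part of this commit: a minimal, self-contained sketch of the grouping rule that filterMetrics applies above, namely that a peer only makes it into a set when it has exactly one metric per allocator metric name (numMetrics in total). All names here are purely illustrative.

package main

import "fmt"

func main() {
	// The allocator asked for two metrics, e.g. "tag:group" and "freespace".
	numMetrics := 2

	// Latest metrics grouped per peer, assuming the monitor returned at
	// most one metric of each name per peer.
	byPeer := map[string][]string{
		"peerA": {"tag:group", "freespace"}, // has all metrics: kept
		"peerB": {"freespace"},              // missing the tag metric: ignored
	}

	for p, names := range byPeer {
		if len(names) == numMetrics {
			fmt.Printf("%s: candidate with metrics %v\n", p, names)
		} else {
			fmt.Printf("%s: ignored (has %d of %d metrics)\n", p, len(names), numMetrics)
		}
	}
}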

View File

@@ -13,7 +13,7 @@ const envConfigKey = "cluster_metricsalloc"
// These are the default values for a Config.
var (
DefaultAllocateBy = []string{"freespace"}
DefaultAllocateBy = []string{"tag:group", "freespace"}
)
// Config allows to initialize the Allocator.
@@ -76,7 +76,7 @@ func (cfg *Config) LoadJSON(raw []byte) error {
}
func (cfg *Config) applyJSONConfig(jcfg *jsonConfig) error {
// When unset - use default
// When unset, leave default
if len(jcfg.AllocateBy) > 0 {
cfg.AllocateBy = jcfg.AllocateBy
}
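For illustration only, a hedged sketch of loading this section from JSON. The import path and the "allocate_by" JSON key name are assumptions; only Config, AllocateBy, LoadJSON and the "tag:group"/"freespace" defaults appear in this diff.

package main

import (
	"fmt"

	metrics "github.com/ipfs/ipfs-cluster/allocator/metrics" // assumed import path
)

func main() {
	cfg := &metrics.Config{}
	// "allocate_by" is an assumed JSON key name for the AllocateBy field.
	raw := []byte(`{"allocate_by": ["tag:group", "freespace"]}`)
	if err := cfg.LoadJSON(raw); err != nil {
		fmt.Println("error loading config:", err)
		return
	}
	fmt.Println(cfg.AllocateBy) // expected: [tag:group freespace]
}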

View File

@@ -2,8 +2,8 @@
// based on multiple metrics, where metrics may be an arbitrary way to
// partition a set of peers.
//
// For example, allocating by [tags, disk] will
// first order candidate peers by tag metric, and then by disk metric.
// For example, allocating by ["tag:region", "disk"] will
// first order candidate peers by the tag metric, and then by the "disk" metric.
// The final list will pick up allocations from each tag metric group,
// based on the given order of metrics.
package metrics
@@ -16,12 +16,10 @@ import (
"github.com/ipfs/ipfs-cluster/api"
peer "github.com/libp2p/go-libp2p-core/peer"
rpc "github.com/libp2p/go-libp2p-gorpc"
"go.opencensus.io/trace"
)
// Allocator is an allocator that partitions metrics and orders
// the final least of allocation by selecting for each partition.
// the final list of allocations by selecting from each partition.
type Allocator struct {
config *Config
rpcClient *rpc.Client
@@ -48,9 +46,6 @@ func (a *Allocator) SetClient(c *rpc.Client) {
// Shutdown is called on cluster shutdown. We just invalidate
// any metrics from this point.
func (a *Allocator) Shutdown(ctx context.Context) error {
_, span := trace.StartSpan(ctx, "allocator/metrics/Shutdown")
defer span.End()
a.rpcClient = nil
return nil
}
@@ -62,13 +57,13 @@ type partitionedMetric struct {
partitions []*partition // they are in order of their values
}
// Returned a list of peers sorted by never choosing twice from the same
// Returns a list of peers, sorted so that we never choose twice from
// the same partition while another partition still has peers to choose from.
func (pnedm *partitionedMetric) sortedPeers() []peer.ID {
peers := []peer.ID{}
for {
peer := pnedm.chooseNext()
if peer == "" {
if peer == "" { // This means we are done.
break
}
peers = append(peers, peer)
@@ -125,7 +120,7 @@ func (pnedm *partitionedMetric) chooseNext() peer.ID {
type partition struct {
value string
peers map[peer.ID]bool
peers map[peer.ID]bool // the bool tracks whether the peer has already been picked from this partition during the final sort
sub *partitionedMetric // all peers in sub-partitions will have the same value for this metric
}
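An illustrative, standalone sketch (none of these names come from the repository) of the idea behind sortedPeers: peers are taken round-robin across partitions, so we never pick twice from the same partition while another one still has peers left.

package main

import "fmt"

func main() {
	// Peers already ordered within each tag partition (e.g. by free space).
	partitions := map[string][]string{
		"eu": {"peerA", "peerB"},
		"us": {"peerC"},
	}
	order := []string{"eu", "us"} // partition order given by the tag metric

	var picked []string
	for {
		progressed := false
		for _, tag := range order {
			if peers := partitions[tag]; len(peers) > 0 {
				picked = append(picked, peers[0])
				partitions[tag] = peers[1:]
				progressed = true
			}
		}
		if !progressed {
			break
		}
	}
	fmt.Println(picked) // [peerA peerC peerB]
}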

View File

@@ -1100,7 +1100,7 @@ func (n *NodeWithMeta) Size() uint64 {
return uint64(len(n.Data))
}
// MetricsSet is map to carry slices of metric indexed by type.
// MetricsSet is a map to carry slices of metrics indexed by type.
type MetricsSet map[string][]*Metric
// Metric transports information about a peer.ID. It is used to decide
@@ -1126,7 +1126,11 @@ func (m *Metric) SetTTL(d time.Duration) {
// GetTTL returns the time left before the Metric expires
func (m *Metric) GetTTL() time.Duration {
expDate := time.Unix(0, m.Expire)
return time.Until(expDate)
ttl := time.Until(expDate)
if ttl < 0 {
ttl = 0
}
return ttl
}
// Expired returns if the Metric has expired
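To spell out the clamping added to GetTTL above, a tiny self-contained sketch: an already expired metric now reports a TTL of 0 instead of a negative duration.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Expire is stored as UnixNano, as in api.Metric.
	expire := time.Now().Add(-5 * time.Second).UnixNano() // expired 5s ago

	ttl := time.Until(time.Unix(0, expire))
	if ttl < 0 {
		ttl = 0
	}
	fmt.Println(ttl) // 0s
}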

View File

@@ -283,6 +283,7 @@ func (c *Cluster) watchPinset() {
}
}
// sendInformerMetrics pushes the metrics obtained from the given informer
// and returns the smallest TTL among them.
func (c *Cluster) sendInformerMetrics(ctx context.Context, informer Informer) (time.Duration, error) {
ctx, span := trace.StartSpan(ctx, "cluster/sendInformerMetric")
defer span.End()
@@ -298,10 +299,7 @@ func (c *Cluster) sendInformerMetrics(ctx context.Context, informer Informer) (t
for _, metric := range metrics {
metric.Peer = c.id
ttl := metric.GetTTL()
if minTTL == 0 {
minTTL = metric.GetTTL()
}
if ttl < minTTL && ttl > 0 {
if ttl > 0 && (ttl < minTTL || minTTL == 0) {
minTTL = ttl
}
err := c.monitor.PublishMetric(ctx, metric)
@@ -310,7 +308,7 @@ func (c *Cluster) sendInformerMetrics(ctx context.Context, informer Informer) (t
logger.Warnf("error sending metric %s: %s", metric.Name, err)
}
}
return minTTL, nil
return minTTL, errors
}
func (c *Cluster) sendInformersMetrics(ctx context.Context) error {
@@ -354,8 +352,8 @@ func (c *Cluster) pushInformerMetrics(ctx context.Context, informer Informer) {
minTTL, err := c.sendInformerMetrics(ctx, informer)
if minTTL == 0 {
logger.Errorf("informer %s reported metric ttl 0. This must be a bug. Aborting this informer", informer.Name())
return
minTTL = 30 * time.Second
logger.Warnf("informer %s reported a minimum metric TTL of 0s; defaulting to 30s", informer.Name())
}
if err != nil {
if (retries % retryWarnMod) == 0 {
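A standalone sketch of the minTTL selection rewritten above: zero or negative TTLs are skipped, and minTTL == 0 doubles as "not set yet". The ttls slice is made up for the example.

package main

import (
	"fmt"
	"time"
)

func main() {
	ttls := []time.Duration{0, 30 * time.Second, 10 * time.Second}

	var minTTL time.Duration
	for _, ttl := range ttls {
		if ttl > 0 && (ttl < minTTL || minTTL == 0) {
			minTTL = ttl
		}
	}
	fmt.Println(minTTL) // 10s
}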

View File

@@ -12,8 +12,6 @@ import (
logging "github.com/ipfs/go-log/v2"
rpc "github.com/libp2p/go-libp2p-gorpc"
"go.opencensus.io/trace"
)
var logger = logging.Logger("tags")
@@ -46,7 +44,8 @@ func New(cfg *Config) (*Informer, error) {
}, nil
}
// Name returns the name of the metric issued by this informer.
// Name returns the name of this informer. Note that this informer issues
// metrics with custom names.
func (tags *Informer) Name() string {
return MetricName
}
@@ -62,9 +61,6 @@ func (tags *Informer) SetClient(c *rpc.Client) {
// Shutdown is called on cluster shutdown. We just invalidate
// any metrics from this point.
func (tags *Informer) Shutdown(ctx context.Context) error {
_, span := trace.StartSpan(ctx, "informer/tags/Shutdown")
defer span.End()
tags.mu.Lock()
defer tags.mu.Unlock()

View File

@@ -137,6 +137,8 @@ type PinTracker interface {
type Informer interface {
Component
Name() string
// GetMetrics returns the metrics obtained by this Informer. It must
// always return at least one metric.
GetMetrics(context.Context) []*api.Metric
}
@@ -172,8 +174,9 @@ type PeerMonitor interface {
// PublishMetric sends a metric to the rest of the peers.
// How to send it, and to who, is to be decided by the implementation.
PublishMetric(context.Context, *api.Metric) error
// LatestMetrics returns a map with the latest metrics of matching name
// for the current cluster peers.
// LatestMetrics returns the latest metrics of matching name for the
// current cluster peers. The result should contain at most one metric
// per peer.
LatestMetrics(ctx context.Context, name string) []*api.Metric
// MetricNames returns a list of metric names.
MetricNames(ctx context.Context) []string
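Finally, a hypothetical sketch of an Informer honouring the "at least one metric" contract documented above. Only the Name/GetMetrics methods, api.Metric's Name/Value/Valid fields and SetTTL appear in this diff; everything else (package name, metric name, the omitted Component methods) is illustrative.

package myinformer

import (
	"context"
	"time"

	"github.com/ipfs/ipfs-cluster/api"
)

// Informer is a toy informer; SetClient/Shutdown are omitted for brevity.
type Informer struct{}

// Name returns the informer's name.
func (inf *Informer) Name() string { return "mymetric" }

// GetMetrics always returns at least one metric, as the interface requires.
func (inf *Informer) GetMetrics(ctx context.Context) []*api.Metric {
	m := &api.Metric{
		Name:  "mymetric",
		Value: "42",
		Valid: true,
	}
	m.SetTTL(30 * time.Second)
	return []*api.Metric{m}
}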