9b9d76f92d
This commit introduces the new go-libp2p-gorpc streaming capabilities for Cluster. The main aim is to work towards heavily reducing memory usage when working with very large pinsets. As a side-effect, it takes the chance to revamp all types for all public methods so that pointers to what should be static objects are not used anymore. This should heavily reduce heap allocations and GC activity. The main change is that state.List now returns a channel from which to read the pins, rather than pins being all loaded into a huge slice. Things reading pins have all been updated to iterate on the channel rather than on the slice. The full pinset is no longer fully loaded into memory for things that run regularly like StateSync(). Additionally, the /allocations endpoint of the REST API no longer returns an array of pins, but rather streams JSON-encoded pin objects directly. This change has been extended to the restapi client (which puts pins into a channel as they arrive) and to ipfs-cluster-ctl. There are still pending improvements like StatusAll() calls, which should also stream responses, and especially BlockPut calls, which should stream blocks directly into IPFS on a single call. These are coming up in future commits.
308 lines
8.3 KiB
Go
308 lines
8.3 KiB
Go
// Package balanced implements an allocator that can sort allocations
|
|
// based on multiple metrics, where metrics may be an arbitrary way to
|
|
// partition a set of peers.
|
|
//
|
|
// For example, allocating by ["tag:region", "disk"] the resulting peer
|
|
// candidate order will be balanced between regions and ordered by the value of
|
|
// the weight of the disk metric.
|
|
package balanced
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sort"
|
|
|
|
cid "github.com/ipfs/go-cid"
|
|
logging "github.com/ipfs/go-log/v2"
|
|
api "github.com/ipfs/ipfs-cluster/api"
|
|
peer "github.com/libp2p/go-libp2p-core/peer"
|
|
rpc "github.com/libp2p/go-libp2p-gorpc"
|
|
)
|
|
|
|
// logger is the package-level logger, registered under the "allocator" name.
var logger = logging.Logger("allocator")
|
|
|
|
// Allocator is an allocator that partitions metrics and orders
|
|
// the final list of allocation by selecting for each partition.
|
|
type Allocator struct {
|
|
config *Config
|
|
rpcClient *rpc.Client
|
|
}
|
|
|
|
// New returns an initialized Allocator.
|
|
func New(cfg *Config) (*Allocator, error) {
|
|
err := cfg.Validate()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &Allocator{
|
|
config: cfg,
|
|
}, nil
|
|
}
|
|
|
|
// SetClient provides us with an rpc.Client which allows
|
|
// contacting other components in the cluster.
|
|
func (a *Allocator) SetClient(c *rpc.Client) {
|
|
a.rpcClient = c
|
|
}
|
|
|
|
// Shutdown is called on cluster shutdown. We just invalidate
|
|
// any metrics from this point.
|
|
func (a *Allocator) Shutdown(ctx context.Context) error {
|
|
a.rpcClient = nil
|
|
return nil
|
|
}
|
|
|
|
type partitionedMetric struct {
|
|
metricName string
|
|
curChoosingIndex int
|
|
noMore bool
|
|
partitions []*partition // they are in order of their values
|
|
}
|
|
|
|
type partition struct {
|
|
value string
|
|
weight int64
|
|
peers map[peer.ID]bool // the bool tracks whether the peer has been picked already out of the partition when doing the final sort.
|
|
sub *partitionedMetric // all peers in sub-partitions will have the same value for this metric
|
|
}
|
|
|
|
// Returns a partitionedMetric which has partitions and subpartitions based
|
|
// on the metrics and values given by the "by" slice. The partitions
|
|
// are ordered based on the cumulative weight.
|
|
func partitionMetrics(set api.MetricsSet, by []string) *partitionedMetric {
|
|
rootMetric := by[0]
|
|
pnedMetric := &partitionedMetric{
|
|
metricName: rootMetric,
|
|
partitions: partitionValues(set[rootMetric]),
|
|
}
|
|
|
|
// For sorting based on weight (more to less)
|
|
lessF := func(i, j int) bool {
|
|
wi := pnedMetric.partitions[i].weight
|
|
wj := pnedMetric.partitions[j].weight
|
|
// Strict order
|
|
if wi == wj {
|
|
return pnedMetric.partitions[i].value < pnedMetric.partitions[j].value
|
|
}
|
|
// Descending!
|
|
return wj < wi
|
|
}
|
|
|
|
if len(by) == 1 { // we are done
|
|
sort.Slice(pnedMetric.partitions, lessF)
|
|
return pnedMetric
|
|
}
|
|
|
|
// process sub-partitions
|
|
for _, partition := range pnedMetric.partitions {
|
|
filteredSet := make(api.MetricsSet)
|
|
for k, v := range set {
|
|
if k == rootMetric { // not needed anymore
|
|
continue
|
|
}
|
|
for _, m := range v {
|
|
// only leave metrics for peers in current partition
|
|
if _, ok := partition.peers[m.Peer]; ok {
|
|
filteredSet[k] = append(filteredSet[k], m)
|
|
}
|
|
}
|
|
}
|
|
|
|
partition.sub = partitionMetrics(filteredSet, by[1:])
|
|
// Add the weight of our subpartitions
|
|
for _, subp := range partition.sub.partitions {
|
|
partition.weight += subp.weight
|
|
}
|
|
}
|
|
sort.Slice(pnedMetric.partitions, lessF)
|
|
return pnedMetric
|
|
}
|
|
|
|
func partitionValues(metrics []api.Metric) []*partition {
|
|
partitions := []*partition{}
|
|
|
|
if len(metrics) <= 0 {
|
|
return partitions
|
|
}
|
|
|
|
// We group peers with the same value in the same partition.
|
|
partitionsByValue := make(map[string]*partition)
|
|
|
|
for _, m := range metrics {
|
|
// Sometimes two metrics have the same value / weight, but we
|
|
// still want to put them in different partitions. Otherwise
|
|
// their weights get added and they form a bucket and
|
|
// therefore not they are not selected in order: 3 peers with
|
|
// freespace=100 and one peer with freespace=200 would result
|
|
// in one of the peers with freespace 100 being chosen first
|
|
// because the partition's weight is 300.
|
|
//
|
|
// We are going to call these metrics (like free-space),
|
|
// non-partitionable metrics. This is going to be the default
|
|
// (for backwards compat reasons).
|
|
//
|
|
// The informers must set the Partitionable field accordingly
|
|
// when two metrics with the same value must be grouped in the
|
|
// same partition.
|
|
if !m.Partitionable {
|
|
partitions = append(partitions, &partition{
|
|
value: m.Value,
|
|
weight: m.GetWeight(),
|
|
peers: map[peer.ID]bool{
|
|
m.Peer: false,
|
|
},
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Any other case, we partition by value.
|
|
if p, ok := partitionsByValue[m.Value]; ok {
|
|
p.peers[m.Peer] = false
|
|
p.weight += m.GetWeight()
|
|
} else {
|
|
partitionsByValue[m.Value] = &partition{
|
|
value: m.Value,
|
|
weight: m.GetWeight(),
|
|
peers: map[peer.ID]bool{
|
|
m.Peer: false,
|
|
},
|
|
}
|
|
}
|
|
|
|
}
|
|
for _, p := range partitionsByValue {
|
|
partitions = append(partitions, p)
|
|
}
|
|
return partitions
|
|
}
|
|
|
|
// Returns a list of peers sorted by never choosing twice from the same
|
|
// partition if there is some other partition to choose from.
|
|
func (pnedm *partitionedMetric) sortedPeers() []peer.ID {
|
|
peers := []peer.ID{}
|
|
for {
|
|
peer := pnedm.chooseNext()
|
|
if peer == "" { // This means we are done.
|
|
break
|
|
}
|
|
peers = append(peers, peer)
|
|
}
|
|
return peers
|
|
}
|
|
|
|
func (pnedm *partitionedMetric) chooseNext() peer.ID {
|
|
lenp := len(pnedm.partitions)
|
|
if lenp == 0 {
|
|
return ""
|
|
}
|
|
|
|
if pnedm.noMore {
|
|
return ""
|
|
}
|
|
|
|
var peer peer.ID
|
|
|
|
curPartition := pnedm.partitions[pnedm.curChoosingIndex]
|
|
done := 0
|
|
for {
|
|
if curPartition.sub != nil {
|
|
// Choose something from the sub-partitionedMetric
|
|
peer = curPartition.sub.chooseNext()
|
|
} else {
|
|
// We are a bottom-partition. Choose one of our peers
|
|
for pid, used := range curPartition.peers {
|
|
if !used {
|
|
peer = pid
|
|
curPartition.peers[pid] = true // mark as used
|
|
break
|
|
}
|
|
}
|
|
}
|
|
// look in next partition next time
|
|
pnedm.curChoosingIndex = (pnedm.curChoosingIndex + 1) % lenp
|
|
curPartition = pnedm.partitions[pnedm.curChoosingIndex]
|
|
done++
|
|
|
|
if peer != "" {
|
|
break
|
|
}
|
|
|
|
// no peer and we have looked in as many partitions as we have
|
|
if done == lenp {
|
|
pnedm.noMore = true
|
|
break
|
|
}
|
|
}
|
|
|
|
return peer
|
|
}
|
|
|
|
// Allocate produces a sorted list of cluster peer IDs based on different
|
|
// metrics provided for those peer IDs.
|
|
// It works as follows:
|
|
//
|
|
// - First, it buckets each peer metrics based on the AllocateBy list. The
|
|
// metric name must match the bucket name, otherwise they are put at the end.
|
|
// - Second, based on the AllocateBy order, it orders the first bucket and
|
|
// groups peers by ordered value.
|
|
// - Third, it selects metrics on the second bucket for the most prioritary
|
|
// peers of the first bucket and orders their metrics. Then for the peers in
|
|
// second position etc.
|
|
// - It repeats the process until there is no more buckets to sort.
|
|
// - Finally, it returns the first peer of the first
|
|
// - Third, based on the AllocateBy order, it select the first metric
|
|
func (a *Allocator) Allocate(
|
|
ctx context.Context,
|
|
c cid.Cid,
|
|
current, candidates, priority api.MetricsSet,
|
|
) ([]peer.ID, error) {
|
|
|
|
// For the allocation to work well, there have to be metrics of all
|
|
// the types for all the peers. There cannot be a metric of one type
|
|
// for a peer that does not appear in the other types.
|
|
//
|
|
// Removing such occurences is done in allocate.go, before the
|
|
// allocator is called.
|
|
//
|
|
// Otherwise, the sorting might be funny.
|
|
|
|
candidatePartition := partitionMetrics(candidates, a.config.AllocateBy)
|
|
priorityPartition := partitionMetrics(priority, a.config.AllocateBy)
|
|
|
|
logger.Debugf("Balanced allocator partitions:\n%s\n", printPartition(candidatePartition, 0))
|
|
|
|
first := priorityPartition.sortedPeers()
|
|
last := candidatePartition.sortedPeers()
|
|
|
|
return append(first, last...), nil
|
|
}
|
|
|
|
// Metrics returns the names of the metrics that have been registered
|
|
// with this allocator.
|
|
func (a *Allocator) Metrics() []string {
|
|
return a.config.AllocateBy
|
|
}
|
|
|
|
func printPartition(m *partitionedMetric, ind int) string {
|
|
str := ""
|
|
indent := func() {
|
|
for i := 0; i < ind+2; i++ {
|
|
str += " "
|
|
}
|
|
}
|
|
|
|
for _, p := range m.partitions {
|
|
indent()
|
|
str += fmt.Sprintf(" | %s:%s - %d - [", m.metricName, p.value, p.weight)
|
|
for p, u := range p.peers {
|
|
str += fmt.Sprintf("%s|%t, ", p, u)
|
|
}
|
|
str += "]\n"
|
|
if p.sub != nil {
|
|
str += printPartition(p.sub, ind+2)
|
|
}
|
|
}
|
|
return str
|
|
}
|