use allocations list to choose peer to repin
License: MIT Signed-off-by: Adrian Lanzafame <adrianlanzafame92@gmail.com>
This commit is contained in:
parent
638cf73f5f
commit
31af640e33
38
cluster.go
38
cluster.go
|
@ -5,6 +5,7 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"mime/multipart"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
|
@ -297,16 +298,14 @@ func (c *Cluster) alertsHandler() {
|
|||
case <-c.ctx.Done():
|
||||
return
|
||||
case alrt := <-c.monitor.Alerts():
|
||||
// TODO(lanzafame): create a psuedo-leader election for crdts
|
||||
// only the leader handles alerts
|
||||
leader, err := c.consensus.Leader(c.ctx)
|
||||
if err == nil && leader == c.id {
|
||||
logger.Warningf(
|
||||
"Peer %s received alert for %s in %s",
|
||||
c.id, alrt.MetricName, alrt.Peer,
|
||||
)
|
||||
switch alrt.MetricName {
|
||||
case pingMetricName:
|
||||
list := c.state.List(c.ctx)
|
||||
for _, pin := range list {
|
||||
if len(pin.Allocations) < 1 && containsPeer(pin.Allocations, alrt.Peer) {
|
||||
logger.Error("a pin with only one allocation cannot be repinned")
|
||||
logger.Error("to make repinning possible, pin with a replication factor of 2+")
|
||||
continue
|
||||
}
|
||||
if c.shouldPeerRepinCid(alrt.Peer, pin) {
|
||||
c.repinFromPeer(c.ctx, alrt.Peer)
|
||||
}
|
||||
}
|
||||
|
@ -314,6 +313,25 @@ func (c *Cluster) alertsHandler() {
|
|||
}
|
||||
}
|
||||
|
||||
// shouldPeerRepinCid returns true if the current peer is the top of the
|
||||
// allocs list. The failed peer is ignored, i.e. if current peer is
|
||||
// second and the failed peer is first, the function will still
|
||||
// return true.
|
||||
func (c *Cluster) shouldPeerRepinCid(failed peer.ID, pin *api.Pin) bool {
|
||||
if containsPeer(pin.Allocations, failed) && containsPeer(pin.Allocations, c.id) {
|
||||
allocs := sort.StringSlice(api.PeersToStrings(pin.Allocations))
|
||||
allocs.Sort()
|
||||
if allocs[0] == c.id.String() {
|
||||
return true
|
||||
}
|
||||
|
||||
if allocs[1] == c.id.String() && allocs[0] == failed.String() {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// detects any changes in the peerset and saves the configuration. When it
|
||||
// detects that we have been removed from the peerset, it shuts down this peer.
|
||||
func (c *Cluster) watchPeers() {
|
||||
|
|
|
@ -102,7 +102,8 @@ var testingTrackerCfg = []byte(`
|
|||
`)
|
||||
|
||||
var testingMonCfg = []byte(`{
|
||||
"check_interval": "300ms"
|
||||
"check_interval": "300ms",
|
||||
"failure_threshold": 1
|
||||
}`)
|
||||
|
||||
var testingDiskInfCfg = []byte(`{
|
||||
|
|
|
@ -38,6 +38,14 @@ func NewChecker(metrics *Store, threshold float64) *Checker {
|
|||
// when they have expired and no alert has been sent before.
|
||||
func (mc *Checker) CheckPeers(peers []peer.ID) error {
|
||||
for _, peer := range peers {
|
||||
// shortcut checking all metrics based on heartbeat
|
||||
// failure detection
|
||||
if mc.Failed(peer) {
|
||||
err := mc.alert(peer, "ping")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, metric := range mc.metrics.PeerMetrics(peer) {
|
||||
err := mc.alertIfExpired(metric)
|
||||
if err != nil {
|
||||
|
|
|
@ -64,7 +64,7 @@ func (cfg *Config) Validate() error {
|
|||
}
|
||||
|
||||
if cfg.FailureThreshold <= 0 {
|
||||
return errors.New("basic.failure_threshold too low")
|
||||
return errors.New("pubsubmon.failure_threshold too low")
|
||||
}
|
||||
|
||||
return nil
|
||||
|
|
Loading…
Reference in New Issue
Block a user