use allocations list to choose peer to repin

License: MIT Signed-off-by: Adrian Lanzafame <adrianlanzafame92@gmail.com>
2019-04-01 13:33:58 +08:00 · 2019-04-01 13:33:58 +08:00 · 31af640e33
commit 31af640e33
parent 638cf73f5f
4 changed files with 39 additions and 12 deletions
--- a/cluster.go
+++ b/cluster.go
@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"mime/multipart"
+	"sort"
 	"sync"
 	"time"

@ -297,16 +298,14 @@ func (c *Cluster) alertsHandler() {
 		case <-c.ctx.Done():
 			return
 		case alrt := <-c.monitor.Alerts():
-			// TODO(lanzafame): create a psuedo-leader election for crdts
-			// only the leader handles alerts
-			leader, err := c.consensus.Leader(c.ctx)
-			if err == nil && leader == c.id {
-				logger.Warningf(
-					"Peer %s received alert for %s in %s",
-					c.id, alrt.MetricName, alrt.Peer,
-				)
-				switch alrt.MetricName {
-				case pingMetricName:
+			list := c.state.List(c.ctx)
+			for _, pin := range list {
+				if len(pin.Allocations) < 1 && containsPeer(pin.Allocations, alrt.Peer) {
+					logger.Error("a pin with only one allocation cannot be repinned")
+					logger.Error("to make repinning possible, pin with a replication factor of 2+")
+					continue
+				}
+				if c.shouldPeerRepinCid(alrt.Peer, pin) {
 					c.repinFromPeer(c.ctx, alrt.Peer)
 				}
 			}
@ -314,6 +313,25 @@ func (c *Cluster) alertsHandler() {
 	}
 }

+// shouldPeerRepinCid returns true if the current peer is the top of the
+// allocs list. The failed peer is ignored, i.e. if current peer is
+// second and the failed peer is first, the function will still
+// return true.
+func (c *Cluster) shouldPeerRepinCid(failed peer.ID, pin *api.Pin) bool {
+	if containsPeer(pin.Allocations, failed) && containsPeer(pin.Allocations, c.id) {
+		allocs := sort.StringSlice(api.PeersToStrings(pin.Allocations))
+		allocs.Sort()
+		if allocs[0] == c.id.String() {
+			return true
+		}
+
+		if allocs[1] == c.id.String() && allocs[0] == failed.String() {
+			return true
+		}
+	}
+	return false
+}
+
 // detects any changes in the peerset and saves the configuration. When it
 // detects that we have been removed from the peerset, it shuts down this peer.
 func (c *Cluster) watchPeers() {
--- a/config_test.go
+++ b/config_test.go
@ -102,7 +102,8 @@ var testingTrackerCfg = []byte(`
 `)

 var testingMonCfg = []byte(`{
-    "check_interval": "300ms"
+    "check_interval": "300ms",
+	"failure_threshold": 1
 }`)

 var testingDiskInfCfg = []byte(`{
--- a/monitor/metrics/checker.go
+++ b/monitor/metrics/checker.go
@ -38,6 +38,14 @@ func NewChecker(metrics *Store, threshold float64) *Checker {
 // when they have expired and no alert has been sent before.
 func (mc *Checker) CheckPeers(peers []peer.ID) error {
 	for _, peer := range peers {
+		// shortcut checking all metrics based on heartbeat
+		// failure detection
+		if mc.Failed(peer) {
+			err := mc.alert(peer, "ping")
+			if err != nil {
+				return err
+			}
+		}
 		for _, metric := range mc.metrics.PeerMetrics(peer) {
 			err := mc.alertIfExpired(metric)
 			if err != nil {
--- a/monitor/pubsubmon/config.go
+++ b/monitor/pubsubmon/config.go
@ -64,7 +64,7 @@ func (cfg *Config) Validate() error {
 	}

 	if cfg.FailureThreshold <= 0 {
-		return errors.New("basic.failure_threshold too low")
+		return errors.New("pubsubmon.failure_threshold too low")
 	}

 	return nil