use allocations list to choose peer to repin

License: MIT
Signed-off-by: Adrian Lanzafame <adrianlanzafame92@gmail.com>
This commit is contained in:
Adrian Lanzafame 2019-04-01 13:33:58 +08:00
parent 638cf73f5f
commit 31af640e33
No known key found for this signature in database
GPG Key ID: 87E40C5D62EAE192
4 changed files with 39 additions and 12 deletions

View File

@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"mime/multipart"
"sort"
"sync"
"time"
@ -297,16 +298,14 @@ func (c *Cluster) alertsHandler() {
case <-c.ctx.Done():
return
case alrt := <-c.monitor.Alerts():
// TODO(lanzafame): create a psuedo-leader election for crdts
// only the leader handles alerts
leader, err := c.consensus.Leader(c.ctx)
if err == nil && leader == c.id {
logger.Warningf(
"Peer %s received alert for %s in %s",
c.id, alrt.MetricName, alrt.Peer,
)
switch alrt.MetricName {
case pingMetricName:
list := c.state.List(c.ctx)
for _, pin := range list {
if len(pin.Allocations) < 1 && containsPeer(pin.Allocations, alrt.Peer) {
logger.Error("a pin with only one allocation cannot be repinned")
logger.Error("to make repinning possible, pin with a replication factor of 2+")
continue
}
if c.shouldPeerRepinCid(alrt.Peer, pin) {
c.repinFromPeer(c.ctx, alrt.Peer)
}
}
@ -314,6 +313,25 @@ func (c *Cluster) alertsHandler() {
}
}
// shouldPeerRepinCid returns true if the current peer is the top of the
// allocs list. The failed peer is ignored, i.e. if current peer is
// second and the failed peer is first, the function will still
// return true.
func (c *Cluster) shouldPeerRepinCid(failed peer.ID, pin *api.Pin) bool {
if containsPeer(pin.Allocations, failed) && containsPeer(pin.Allocations, c.id) {
allocs := sort.StringSlice(api.PeersToStrings(pin.Allocations))
allocs.Sort()
if allocs[0] == c.id.String() {
return true
}
if allocs[1] == c.id.String() && allocs[0] == failed.String() {
return true
}
}
return false
}
// detects any changes in the peerset and saves the configuration. When it
// detects that we have been removed from the peerset, it shuts down this peer.
func (c *Cluster) watchPeers() {

View File

@ -102,7 +102,8 @@ var testingTrackerCfg = []byte(`
`)
var testingMonCfg = []byte(`{
"check_interval": "300ms"
"check_interval": "300ms",
"failure_threshold": 1
}`)
var testingDiskInfCfg = []byte(`{

View File

@ -38,6 +38,14 @@ func NewChecker(metrics *Store, threshold float64) *Checker {
// when they have expired and no alert has been sent before.
func (mc *Checker) CheckPeers(peers []peer.ID) error {
for _, peer := range peers {
// shortcut checking all metrics based on heartbeat
// failure detection
if mc.Failed(peer) {
err := mc.alert(peer, "ping")
if err != nil {
return err
}
}
for _, metric := range mc.metrics.PeerMetrics(peer) {
err := mc.alertIfExpired(metric)
if err != nil {

View File

@ -64,7 +64,7 @@ func (cfg *Config) Validate() error {
}
if cfg.FailureThreshold <= 0 {
return errors.New("basic.failure_threshold too low")
return errors.New("pubsubmon.failure_threshold too low")
}
return nil