Fix #167: Useful messages when consensus doesn't start

This will display a few hints when consensus fails to start. If consensus doesn't start (normally WaitForLeader times out), it's because of libp2p not being able to reach other peers. This sometimes also means that the wrong protector key (secret) is being used, even though libp2p does not give us clear indications. License: MIT Signed-off-by: Hector Sanjuan <code@hector.link>
2018-03-12 22:45:29 +01:00 · 2018-03-12 22:45:29 +01:00 · 740f314976
commit 740f314976
parent c276c31ad6
3 changed files with 49 additions and 20 deletions
--- a/cluster.go
+++ b/cluster.go
@ -139,7 +139,7 @@ func NewCluster(
 		return nil, errors.New("bootstrap unsuccessful")
 	}
 	go func() {
-		c.ready()
+		c.ready(consensusCfg.WaitForLeaderTimeout * 2)
 		c.run()
 	}()
 	return c, nil
@ -422,14 +422,22 @@ func (c *Cluster) run() {
 	go c.alertsHandler()
 }

-func (c *Cluster) ready() {
+func (c *Cluster) ready(timeout time.Duration) {
 	// We bootstrapped first because with dirty state consensus
 	// may have a peerset and not find a leader so we cannot wait
 	// for it.
-	timer := time.NewTimer(30 * time.Second)
+	timer := time.NewTimer(timeout)
 	select {
 	case <-timer.C:
-		logger.Error("consensus start timed out")
+		logger.Error("**************************************************")
+		logger.Error("***** ipfs-cluster consensus start timed out *****")
+		logger.Error("This peer was not able to become part of the cluster.")
+		logger.Error("This might be due to one or several causes:")
+		logger.Error(`  - Check that there is connectivity to the "bootstrap" and "peers" multiaddresses`)
+		logger.Error(`  - Check that all cluster peers are using the same "secret"`)
+		logger.Error(`  - Check that this peer is reachable on its "listen_multiaddress"`)
+		logger.Error(`  - Check that there is a majority of available peers`)
+		logger.Error("**************************************************")
 		c.Shutdown()
 		return
 	case <-c.consensus.Ready():
--- a/consensus/raft/logging.go
+++ b/consensus/raft/logging.go
@ -3,6 +3,7 @@ package raft
 import (
 	"log"
 	"strings"
+	"time"

 	logging "github.com/ipfs/go-log"
 )
@ -14,15 +15,14 @@ const (
 	err
 )

+const repeatPoolSize = 10
+const repeatReset = time.Minute
+
 // This provides a custom logger for Raft which intercepts Raft log messages
 // and redirects them to our own logger (for the "raft" facility).
 type logForwarder struct {
-	last map[int]*lastMsg
-}
-
-type lastMsg struct {
-	msg    string
-	tipped bool
+	lastMsgs map[int][]string
+	lastTip  map[int]time.Time
 }

 var raftStdLogger = log.New(&logForwarder{}, "", 0)
@ -58,19 +58,38 @@ func (fw *logForwarder) Write(p []byte) (n int, e error) {
 }

 func (fw *logForwarder) repeated(t int, msg string) bool {
-	if fw.last == nil {
-		fw.last = make(map[int]*lastMsg)
+	if fw.lastMsgs == nil {
+		fw.lastMsgs = make(map[int][]string)
+		fw.lastTip = make(map[int]time.Time)
 	}

-	last, ok := fw.last[t]
-	if !ok || last.msg != msg {
-		fw.last[t] = &lastMsg{msg, false}
-		return false
+	// If we haven't tipped about repeated log messages
+	// in a while, do it and forget the list
+	if time.Now().After(fw.lastTip[t].Add(repeatReset)) {
+		fw.lastTip[t] = time.Now()
+		fw.lastMsgs[t] = nil
+		fw.log(t, "NOTICE: Some RAFT log messages repeat and will only be logged once")
 	}
-	if !last.tipped {
-		fw.log(t, "NOTICE: The last RAFT log message repeats and will only be logged once")
-		last.tipped = true
+
+	var found string
+
+	// Do we already know about this message?
+	for _, lmsg := range fw.lastMsgs[t] {
+		if lmsg == msg {
+			found = lmsg
+			break
+		}
 	}
+
+	if found == "" { // new message. Add to slice.
+		if len(fw.lastMsgs[t]) >= repeatPoolSize { // drop oldest
+			fw.lastMsgs[t] = fw.lastMsgs[t][1:]
+		}
+		fw.lastMsgs[t] = append(fw.lastMsgs[t], msg)
+		return false // not-repeated
+	}
+
+	// repeated, don't log
 	return true
 }

--- a/docs/ipfs-cluster-guide.md
+++ b/docs/ipfs-cluster-guide.md
@ -433,6 +433,8 @@ This is usually the result of a desync between the *shared state* and the *local
 Since cluster is built on top of libp2p, many errors that new users face come from libp2p and have confusing messages which are not obvious at first sight. This list compiles some of them:

 * `dial attempt failed: misdial to <peer.ID XXXXXX> through ....`: this means that the multiaddress you are contacting has a different peer in it than expected.
-* `dial attempt failed: context deadline exceeded`: this means that the address is not reachable.
+* `dial attempt failed: connection refused`: the peer is not running or not listening on the expected address/protocol/port.
+* `dial attempt failed: context deadline exceeded`: this means that the address is not reachable or that the wrong secret is being used.
+* `dial backoff`: same as above, i.e. the address is not reachable or the wrong secret is being used.
 * `dial attempt failed: incoming message was too large`: this probably means that your cluster peers are not sharing the same secret.
 * `version not supported`: this means that your nodes are running different versions of raft/cluster.