diff --git a/cluster.go b/cluster.go index 089333d7..754b37e8 100644 --- a/cluster.go +++ b/cluster.go @@ -139,7 +139,7 @@ func NewCluster( return nil, errors.New("bootstrap unsuccessful") } go func() { - c.ready() + c.ready(consensusCfg.WaitForLeaderTimeout * 2) c.run() }() return c, nil @@ -422,14 +422,22 @@ func (c *Cluster) run() { go c.alertsHandler() } -func (c *Cluster) ready() { +func (c *Cluster) ready(timeout time.Duration) { // We bootstrapped first because with dirty state consensus // may have a peerset and not find a leader so we cannot wait // for it. - timer := time.NewTimer(30 * time.Second) + timer := time.NewTimer(timeout) select { case <-timer.C: - logger.Error("consensus start timed out") + logger.Error("**************************************************") + logger.Error("***** ipfs-cluster consensus start timed out *****") + logger.Error("This peer was not able to become part of the cluster.") + logger.Error("This might be due to one or several causes:") + logger.Error(` - Check that there is connectivity to the "bootstrap" and "peers" multiaddresses`) + logger.Error(` - Check that all cluster peers are using the same "secret"`) + logger.Error(` - Check that this peer is reachable on its "listen_multiaddress"`) + logger.Error(` - Check that there is a majority of available peers`) + logger.Error("**************************************************") c.Shutdown() return case <-c.consensus.Ready(): diff --git a/consensus/raft/logging.go b/consensus/raft/logging.go index 5ec64da9..154bc5d8 100644 --- a/consensus/raft/logging.go +++ b/consensus/raft/logging.go @@ -3,6 +3,7 @@ package raft import ( "log" "strings" + "time" logging "github.com/ipfs/go-log" ) @@ -14,15 +15,14 @@ const ( err ) +const repeatPoolSize = 10 +const repeatReset = time.Minute + // This provides a custom logger for Raft which intercepts Raft log messages // and rewrites us to our own logger (for "raft" facility). 
type logForwarder struct { - last map[int]*lastMsg -} - -type lastMsg struct { - msg string - tipped bool + lastMsgs map[int][]string + lastTip map[int]time.Time } var raftStdLogger = log.New(&logForwarder{}, "", 0) @@ -58,19 +58,38 @@ func (fw *logForwarder) Write(p []byte) (n int, e error) { } func (fw *logForwarder) repeated(t int, msg string) bool { - if fw.last == nil { - fw.last = make(map[int]*lastMsg) + if fw.lastMsgs == nil { + fw.lastMsgs = make(map[int][]string) + fw.lastTip = make(map[int]time.Time) } - last, ok := fw.last[t] - if !ok || last.msg != msg { - fw.last[t] = &lastMsg{msg, false} - return false + // If we haven't tipped about repeated log messages + // in a while, do it and forget the list + if time.Now().After(fw.lastTip[t].Add(repeatReset)) { + fw.lastTip[t] = time.Now() + fw.lastMsgs[t] = nil + fw.log(t, "NOTICE: Some RAFT log messages repeat and will only be logged once") } - if !last.tipped { - fw.log(t, "NOTICE: The last RAFT log message repeats and will only be logged once") - last.tipped = true + + var found string + + // Do we know about this message + for _, lmsg := range fw.lastMsgs[t] { + if lmsg == msg { + found = lmsg + break + } } + + if found == "" { // new message. Add to slice. + if len(fw.lastMsgs[t]) >= repeatPoolSize { // drop oldest + fw.lastMsgs[t] = fw.lastMsgs[t][1:] + } + fw.lastMsgs[t] = append(fw.lastMsgs[t], msg) + return false // not-repeated + } + + // repeated, don't log return true } diff --git a/docs/ipfs-cluster-guide.md b/docs/ipfs-cluster-guide.md index 229fadf0..3cd128c8 100644 --- a/docs/ipfs-cluster-guide.md +++ b/docs/ipfs-cluster-guide.md @@ -433,6 +433,8 @@ This is usually the result of a desync between the *shared state* and the *local Since cluster is built on top of libp2p, many errors that new users face come from libp2p and have confusing messages which are not obvious at first sight. 
This list compiles some of them: * `dial attempt failed: misdial to through ....`: this means that the multiaddress you are contacting has a different peer in it than expected. -* `dial attempt failed: context deadline exceeded`: this means that the address is not reachable. +* `dial attempt failed: connection refused`: the peer is not running or not listening on the expected address/protocol/port. +* `dial attempt failed: context deadline exceeded`: this means that the address is not reachable or that the wrong secret is being used. +* `dial backoff`: same as `context deadline exceeded` above (the address is not reachable or the wrong secret is being used). * `dial attempt failed: incoming message was too large`: this probably means that your cluster peers are not sharing the same secret. * `version not supported`: this means that your nodes are running different versions of raft/cluster.