Fix #167: Useful messages when consensus doesn't start

This will display a few hints when consensus fails to start.
If consensus doesn't start (normally WaitForLeader times out),
it's because of libp2p not being able to reach other peers.

This sometimes also means that the wrong protector key (secret)
is being used, even though libp2p does not give us clear
indications.

License: MIT
Signed-off-by: Hector Sanjuan <code@hector.link>
This commit is contained in:
Hector Sanjuan 2018-03-12 22:45:29 +01:00
parent c276c31ad6
commit 740f314976
3 changed files with 49 additions and 20 deletions

View File

@ -139,7 +139,7 @@ func NewCluster(
return nil, errors.New("bootstrap unsuccessful")
}
go func() {
c.ready()
c.ready(consensusCfg.WaitForLeaderTimeout * 2)
c.run()
}()
return c, nil
@ -422,14 +422,22 @@ func (c *Cluster) run() {
go c.alertsHandler()
}
func (c *Cluster) ready() {
func (c *Cluster) ready(timeout time.Duration) {
// We bootstrapped first because with dirty state consensus
// may have a peerset and not find a leader so we cannot wait
// for it.
timer := time.NewTimer(30 * time.Second)
timer := time.NewTimer(timeout)
select {
case <-timer.C:
logger.Error("consensus start timed out")
logger.Error("**************************************************")
logger.Error("***** ipfs-cluster consensus start timed out *****")
logger.Error("This peer was not able to become part of the cluster.")
logger.Error("This might be due to one or several causes:")
logger.Error(` - Check that there is connectivity to the "bootstrap" and "peers" multiaddresses`)
logger.Error(` - Check that all cluster peers are using the same "secret"`)
logger.Error(` - Check that this peer is reachable on its "listen_multiaddress"`)
logger.Error(` - Check that there is a majority of available peers`)
logger.Error("**************************************************")
c.Shutdown()
return
case <-c.consensus.Ready():

View File

@ -3,6 +3,7 @@ package raft
import (
"log"
"strings"
"time"
logging "github.com/ipfs/go-log"
)
@ -14,15 +15,14 @@ const (
err
)
const repeatPoolSize = 10
const repeatReset = time.Minute
// This provides a custom logger for Raft which intercepts Raft log messages
// and forwards them to our own logger (for "raft" facility).
type logForwarder struct {
last map[int]*lastMsg
}
type lastMsg struct {
msg string
tipped bool
lastMsgs map[int][]string
lastTip map[int]time.Time
}
var raftStdLogger = log.New(&logForwarder{}, "", 0)
@ -58,19 +58,38 @@ func (fw *logForwarder) Write(p []byte) (n int, e error) {
}
func (fw *logForwarder) repeated(t int, msg string) bool {
if fw.last == nil {
fw.last = make(map[int]*lastMsg)
if fw.lastMsgs == nil {
fw.lastMsgs = make(map[int][]string)
fw.lastTip = make(map[int]time.Time)
}
last, ok := fw.last[t]
if !ok || last.msg != msg {
fw.last[t] = &lastMsg{msg, false}
return false
// If we haven't tipped about repeated log messages
// in a while, do it and forget the list
if time.Now().After(fw.lastTip[t].Add(repeatReset)) {
fw.lastTip[t] = time.Now()
fw.lastMsgs[t] = nil
fw.log(t, "NOTICE: Some RAFT log messages repeat and will only be logged once")
}
if !last.tipped {
fw.log(t, "NOTICE: The last RAFT log message repeats and will only be logged once")
last.tipped = true
var found string
// Do we already know about this message?
for _, lmsg := range fw.lastMsgs[t] {
if lmsg == msg {
found = lmsg
break
}
}
if found == "" { // new message. Add to slice.
if len(fw.lastMsgs[t]) >= repeatPoolSize { // drop oldest
fw.lastMsgs[t] = fw.lastMsgs[t][1:]
}
fw.lastMsgs[t] = append(fw.lastMsgs[t], msg)
return false // not-repeated
}
// repeated, don't log
return true
}

View File

@ -433,6 +433,8 @@ This is usually the result of a desync between the *shared state* and the *local
Since cluster is built on top of libp2p, many errors that new users face come from libp2p and have confusing messages which are not obvious at first sight. This list compiles some of them:
* `dial attempt failed: misdial to <peer.ID XXXXXX> through ....`: this means that the multiaddress you are contacting has a different peer in it than expected.
* `dial attempt failed: context deadline exceeded`: this means that the address is not reachable.
* `dial attempt failed: connection refused`: the peer is not running or not listening on the expected address/protocol/port.
* `dial attempt failed: context deadline exceeded`: this means that the address is not reachable or that the wrong secret is being used.
* `dial backoff`: same as above.
* `dial attempt failed: incoming message was too large`: this probably means that your cluster peers are not sharing the same secret.
* `version not supported`: this means that your nodes are running different versions of raft/cluster.