Fix #1436: Do not block peer startup waiting for RecoverAll

On large pinsets this may take a very long time and prevents metrics and
re-bootstrapping from starting, among other things. See bug description.

This lets watchPinset trigger an immediate RecoverAllLocal instead, but this
happens in its own goroutine and should allow everything else to start.
This commit is contained in:
Hector Sanjuan 2021-08-06 10:27:07 +02:00
parent 0c01079eca
commit 67497c4eb4

View File

@ -251,20 +251,32 @@ func (c *Cluster) watchPinset() {
ctx, span := trace.StartSpan(c.ctx, "cluster/watchPinset")
defer span.End()
stateSyncTicker := time.NewTicker(c.config.StateSyncInterval)
recoverTicker := time.NewTicker(c.config.PinRecoverInterval)
stateSyncTimer := time.NewTimer(c.config.StateSyncInterval)
// Upon start, every item in the state that is not pinned will appear
// as PinError when doing a Status, we should proceed to recover
// (try pinning) all of those right away.
recoverTimer := time.NewTimer(0) // 0 so that it does an initial recover right away
// This prevents doing a StateSync while doing a RecoverAllLocal, which is
// intended behaviour, as RecoverAllLocal may take a very long time on very
// large pinsets.
for {
select {
case <-stateSyncTicker.C:
case <-stateSyncTimer.C:
logger.Debug("auto-triggering StateSync()")
c.StateSync(ctx)
case <-recoverTicker.C:
stateSyncTimer.Reset(c.config.StateSyncInterval)
case <-recoverTimer.C:
logger.Debug("auto-triggering RecoverAllLocal()")
c.RecoverAllLocal(ctx)
recoverTimer.Reset(c.config.PinRecoverInterval)
case <-c.ctx.Done():
stateSyncTicker.Stop()
recoverTicker.Stop()
if !stateSyncTimer.Stop() {
<-stateSyncTimer.C
}
if !recoverTimer.Stop() {
<-recoverTimer.C
}
return
}
}
@ -627,10 +639,7 @@ This might be due to one or several causes:
c.Shutdown(ctx)
return
case <-c.consensus.Ready(ctx):
// Consensus ready means the state is up to date. Every item
// in the state that is not pinned will appear as PinError so
// we can proceed to recover all of those in the tracker.
c.RecoverAllLocal(ctx)
// Consensus ready means the state is up to date.
case <-c.ctx.Done():
return
}