ipfs-cluster/consensus/raft/raft.go
Hector Sanjuan 1a06baeb23 Raft: do not ever remove state, rename it and leave it around
This commit changes the way consensus.Clean() works. Before, it
deleted the whole data folder. Now it renames the folder to <name>.old.0
and leaves it in place. When Clean() is called again, <name>.old.0 is
renamed to <name>.old.1 and the current data becomes <name>.old.0. A higher
number means an older backup. The number of backups is fixed at 5: when 5
backups exist and a new one is created, the oldest is discarded.

License: MIT
Signed-off-by: Hector Sanjuan <hector@protocol.ai>
2017-11-13 18:18:52 +01:00
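The rotation described in the commit message above is performed by the data
backup helper that Clean() calls below (newDataBackupHelper, defined elsewhere
in the package). A minimal sketch of that scheme, assuming hypothetical names
(rotateBackups, maxBackups) that are not part of the actual code:

package main

import (
	"fmt"
	"os"
)

// maxBackups mirrors the fixed number of backups described in the commit
// message: <name>.old.0 (newest) through <name>.old.4 (oldest).
const maxBackups = 5

// rotateBackups shifts every existing backup one position up
// (<folder>.old.N -> <folder>.old.N+1), discarding the oldest one, and then
// renames the current data folder to <folder>.old.0.
func rotateBackups(folder string) error {
	// Discard the oldest backup, if any.
	os.RemoveAll(fmt.Sprintf("%s.old.%d", folder, maxBackups-1))

	// Shift the remaining backups: .old.3 -> .old.4, ..., .old.0 -> .old.1.
	for i := maxBackups - 2; i >= 0; i-- {
		src := fmt.Sprintf("%s.old.%d", folder, i)
		dst := fmt.Sprintf("%s.old.%d", folder, i+1)
		if _, err := os.Stat(src); err == nil {
			if err := os.Rename(src, dst); err != nil {
				return err
			}
		}
	}

	// The current data folder becomes the newest backup.
	return os.Rename(folder, folder+".old.0")
}

func main() {
	// Example: back up a hypothetical data folder before cleaning state.
	if err := rotateBackups("ipfs-cluster-data"); err != nil {
		fmt.Println("rotation failed:", err)
	}
}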


package raft
import (
"context"
"errors"
"os"
"path/filepath"
"time"
hraft "github.com/hashicorp/raft"
raftboltdb "github.com/hashicorp/raft-boltdb"
host "github.com/libp2p/go-libp2p-host"
peer "github.com/libp2p/go-libp2p-peer"
p2praft "github.com/libp2p/go-libp2p-raft"
)
// errBadRaftState is returned when the consensus component cannot start
// because the cluster peers do not match the raft peers.
var errBadRaftState = errors.New("cluster peers do not match raft peers")
// errWaitingForSelf is returned when we are waiting for ourselves to depart
// the peer set, which won't happen.
var errWaitingForSelf = errors.New("waiting for ourselves to depart")
// RaftMaxSnapshots indicates how many snapshots to keep in the consensus data
// folder.
// TODO: Maybe include this in Config. Not sure how useful it is to touch
// this anyways.
var RaftMaxSnapshots = 5
// RaftLogCacheSize is the maximum number of logs to cache in-memory.
// This is used to reduce disk I/O for the recently committed entries.
var RaftLogCacheSize = 512
// Are we compiled on a 64-bit architecture?
// https://groups.google.com/forum/#!topic/golang-nuts/vAckmhUMAdQ
// This is used below because raft Observers panic on 32-bit.
const sixtyfour = uint64(^uint(0)) == ^uint64(0)
// raftWrapper performs all Raft-specific operations which are needed by
// Cluster but are not fulfilled by the consensus interface. It should contain
// most of the Raft-related stuff so it can be easily replaced in the future,
// if need be.
type raftWrapper struct {
raft *hraft.Raft
dataFolder string
srvConfig hraft.Configuration
transport *hraft.NetworkTransport
snapshotStore hraft.SnapshotStore
logStore hraft.LogStore
stableStore hraft.StableStore
boltdb *raftboltdb.BoltStore
}
// newRaftWrapper creates and launches a go-libp2p-raft consensus peer.
func newRaftWrapper(peers []peer.ID, host host.Host, cfg *Config, fsm hraft.FSM) (*raftWrapper, error) {
// Set correct LocalID
cfg.RaftConfig.LocalID = hraft.ServerID(peer.IDB58Encode(host.ID()))
// Prepare data folder
dataFolder, err := makeDataFolder(cfg.BaseDir, cfg.DataFolder)
if err != nil {
return nil, err
}
srvCfg := makeServerConf(peers)
logger.Debug("creating libp2p Raft transport")
transport, err := p2praft.NewLibp2pTransport(host, cfg.NetworkTimeout)
if err != nil {
return nil, err
}
var log hraft.LogStore
var stable hraft.StableStore
var snap hraft.SnapshotStore
logger.Debug("creating raft snapshot store")
snapstore, err := hraft.NewFileSnapshotStoreWithLogger(
dataFolder, RaftMaxSnapshots, raftStdLogger)
if err != nil {
return nil, err
}
logger.Debug("creating BoltDB store")
store, err := raftboltdb.NewBoltStore(
filepath.Join(dataFolder, "raft.db"))
if err != nil {
return nil, err
}
// Wrap the store in a LogCache to improve performance.
// See consul/agent/consul/server.go
cacheStore, err := hraft.NewLogCache(RaftLogCacheSize, store)
if err != nil {
return nil, err
}
stable = store
log = cacheStore
snap = snapstore
logger.Debug("checking for existing raft states")
hasState, err := hraft.HasExistingState(log, stable, snap)
if err != nil {
return nil, err
}
if !hasState {
logger.Info("bootstrapping raft cluster")
err := hraft.BootstrapCluster(cfg.RaftConfig,
log, stable, snap, transport, srvCfg)
if err != nil {
logger.Error("bootstrapping cluster: ", err)
return nil, err
}
} else {
logger.Info("raft cluster is already bootstrapped")
}
logger.Debug("creating Raft")
r, err := hraft.NewRaft(cfg.RaftConfig,
fsm, log, stable, snap, transport)
if err != nil {
logger.Error("initializing raft: ", err)
return nil, err
}
raftW := &raftWrapper{
raft: r,
dataFolder: dataFolder,
srvConfig: srvCfg,
transport: transport,
snapshotStore: snap,
logStore: log,
stableStore: stable,
boltdb: store,
}
// Handle existing, different configuration
if hasState {
cf := r.GetConfiguration()
if err := cf.Error(); err != nil {
return nil, err
}
currentCfg := cf.Configuration()
added, removed := diffConfigurations(srvCfg, currentCfg)
if len(added)+len(removed) > 0 {
raftW.Shutdown()
logger.Error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
logger.Error("Raft peers do not match cluster peers from the configuration.")
logger.Error("This likely indicates that this peer has left the cluster and/or")
logger.Error("has a dirty state. Clean the raft state for this peer")
logger.Error("(%s)", dataFolder)
logger.Error("bootstrap it to a working cluster.")
logger.Error("Raft peers:")
for _, s := range currentCfg.Servers {
logger.Errorf(" - %s", s.ID)
}
logger.Error("Cluster configuration peers:")
for _, s := range srvCfg.Servers {
logger.Errorf(" - %s", s.ID)
}
logger.Errorf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
return nil, errBadRaftState
}
}
return raftW, nil
}
// makeDataFolder returns the folder path after creating it.
// If folder is empty, it uses baseDir/DefaultDataSubFolder.
func makeDataFolder(baseDir, folder string) (string, error) {
if folder == "" {
folder = filepath.Join(baseDir, DefaultDataSubFolder)
}
err := os.MkdirAll(folder, 0700)
if err != nil {
return "", err
}
return folder, nil
}
// makeServerConf creates a Raft server configuration from a list of
// peer IDs, skipping duplicates.
func makeServerConf(peers []peer.ID) hraft.Configuration {
sm := make(map[string]struct{})
servers := make([]hraft.Server, 0)
for _, pid := range peers {
p := peer.IDB58Encode(pid)
_, ok := sm[p]
if !ok { // avoid dups
sm[p] = struct{}{}
servers = append(servers, hraft.Server{
Suffrage: hraft.Voter,
ID: hraft.ServerID(p),
Address: hraft.ServerAddress(p),
})
}
}
return hraft.Configuration{
Servers: servers,
}
}
// diffConfigurations returns the server IDs added and removed in c2
// with respect to c1.
func diffConfigurations(
c1, c2 hraft.Configuration) (added, removed []hraft.ServerID) {
m1 := make(map[hraft.ServerID]struct{})
m2 := make(map[hraft.ServerID]struct{})
added = make([]hraft.ServerID, 0)
removed = make([]hraft.ServerID, 0)
for _, s := range c1.Servers {
m1[s.ID] = struct{}{}
}
for _, s := range c2.Servers {
m2[s.ID] = struct{}{}
}
for k := range m1 {
_, ok := m2[k]
if !ok {
removed = append(removed, k)
}
}
for k := range m2 {
_, ok := m1[k]
if !ok {
added = append(added, k)
}
}
return
}
// WaitForLeader holds until Raft says we have a leader.
// Returns an error if ctx is cancelled first.
func (rw *raftWrapper) WaitForLeader(ctx context.Context) (string, error) {
obsCh := make(chan hraft.Observation, 1)
if sixtyfour { // 32-bit systems don't support observers
observer := hraft.NewObserver(obsCh, false, nil)
rw.raft.RegisterObserver(observer)
defer rw.raft.DeregisterObserver(observer)
}
ticker := time.NewTicker(time.Second / 2)
defer ticker.Stop()
for {
select {
case obs := <-obsCh:
_ = obs
// See https://github.com/hashicorp/raft/issues/254
// switch obs.Data.(type) {
// case hraft.LeaderObservation:
// lObs := obs.Data.(hraft.LeaderObservation)
// logger.Infof("Raft Leader elected: %s",
// lObs.Leader)
// return string(lObs.Leader), nil
// }
case <-ticker.C:
if l := rw.raft.Leader(); l != "" {
logger.Debug("waitForleaderTimer")
logger.Infof("Raft Leader elected: %s", l)
ticker.Stop()
return string(l), nil
}
case <-ctx.Done():
return "", ctx.Err()
}
}
}
// WaitForUpdates holds until Raft has synced to the last index in the log
func (rw *raftWrapper) WaitForUpdates(ctx context.Context) error {
logger.Info("Raft state is catching up")
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
lai := rw.raft.AppliedIndex()
li := rw.raft.LastIndex()
logger.Debugf("current Raft index: %d/%d",
lai, li)
if lai == li {
return nil
}
time.Sleep(500 * time.Millisecond)
}
}
}
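// WaitForPeer holds until the given peer ID appears in the Raft peer set
// (or, when depart is true, until it is gone from it). Returns if ctx is
// cancelled.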
func (rw *raftWrapper) WaitForPeer(ctx context.Context, pid string, depart bool) error {
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
peers, err := rw.Peers()
if err != nil {
return err
}
if len(peers) == 1 && pid == peers[0] && depart {
return errWaitingForSelf
}
found := find(peers, pid)
// departing
if depart && !found {
return nil
}
// joining
if !depart && found {
return nil
}
time.Sleep(50 * time.Millisecond)
}
}
}
// Snapshot tells Raft to take a snapshot.
func (rw *raftWrapper) Snapshot() error {
future := rw.raft.Snapshot()
err := future.Error()
if err != nil && err.Error() != hraft.ErrNothingNewToSnapshot.Error() {
return err
}
return nil
}
// Shutdown shuts down Raft and closes the BoltDB store.
func (rw *raftWrapper) Shutdown() error {
future := rw.raft.Shutdown()
err := future.Error()
errMsgs := ""
if err != nil {
errMsgs += "could not shutdown raft: " + err.Error() + ".\n"
}
err = rw.boltdb.Close() // important!
if err != nil {
errMsgs += "could not close boltdb: " + err.Error()
}
if errMsgs != "" {
return errors.New(errMsgs)
}
return nil
}
// AddPeer adds a peer to Raft
func (rw *raftWrapper) AddPeer(peer string) error {
// Check that we don't have it already,
// to avoid wasting log entries if we do.
peers, err := rw.Peers()
if err != nil {
return err
}
if find(peers, peer) {
logger.Infof("%s is already a raft peer", peer)
return nil
}
future := rw.raft.AddVoter(
hraft.ServerID(peer),
hraft.ServerAddress(peer),
0,
0) // TODO: Extra cfg value?
err = future.Error()
if err != nil {
logger.Error("raft cannot add peer: ", err)
}
return err
}
// RemovePeer removes a peer from Raft
func (rw *raftWrapper) RemovePeer(peer string) error {
// Check that we have it, to avoid wasting
// log entries if we don't.
peers, err := rw.Peers()
if err != nil {
return err
}
if !find(peers, peer) {
logger.Infof("%s is not among raft peers", peer)
return nil
}
if len(peers) == 1 && peers[0] == peer {
return errors.New("cannot remove ourselves from a 1-peer cluster")
}
rmFuture := rw.raft.RemoveServer(
hraft.ServerID(peer),
0,
0) // TODO: Extra cfg value?
err = rmFuture.Error()
if err != nil {
logger.Error("raft cannot remove peer: ", err)
return err
}
return nil
}
// Leader returns Raft's leader. It may be an empty string if
// there is no leader or it is unknown.
func (rw *raftWrapper) Leader() string {
return string(rw.raft.Leader())
}
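// Peers returns the IDs of the peers in the current Raft configuration.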
func (rw *raftWrapper) Peers() ([]string, error) {
ids := make([]string, 0)
configFuture := rw.raft.GetConfiguration()
if err := configFuture.Error(); err != nil {
return nil, err
}
for _, server := range configFuture.Configuration().Servers {
ids = append(ids, string(server.ID))
}
return ids, nil
}
// Clean moves the current data folder aside as a numbered backup instead
// of deleting it. Only call when Raft is shutdown.
func (rw *raftWrapper) Clean() error {
dbh := newDataBackupHelper(rw.dataFolder)
err := dbh.makeBackup()
if err != nil {
logger.Warning(err)
logger.Warning("the state could not be cleaned properly")
logger.Warning("manual intervention may be needed before starting cluster again")
}
return nil
}
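// find returns true if elem is present in the given slice.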
func find(s []string, elem string) bool {
for _, selem := range s {
if selem == elem {
return true
}
}
return false
}