ipfs-cluster/consensus/raft/consensus.go
Hector Sanjuan acbd7fda60 Consensus: add new "crdt" consensus component
This adds a new "crdt" consensus component using go-ds-crdt.

This implies several refactors to fully make cluster consensus-component
independent:

* Delete mapstate and fully adopt dsstate (after people have migrated).
* Return errors from state methods rather than ignoring them.
* Add a new "datastore" module so that we can configure datastores in the
   main configuration like other components.
* Let the consensus components fully define the "state.State". Thus, they do
not receive the state, they receive the storage where we put the state (a
go-datastore).
* Allow to customize how the monitor component obtains Peers() (the current
  peerset), including avoiding using the current peerset. At the moment the
  crdt consensus uses the monitoring component to define the current peerset.
  Therefore the monitor component cannot rely on the consensus component to
  produce a peerset.
* Re-factor/re-implementation of "ipfs-cluster-service state"
  operations. Includes the disappearance of the "migrate" one.

The CRDT consensus component creates a crdt-datastore (with ipfs-lite)
and uses it to initialize a dsstate. Thus the crdt-store is elegantly
wrapped. Any modifications to the state get automatically replicated to other
peers. We store all the CRDT DAG blocks in the local datastore.

The consensus components only expose a ReadOnly state, as any modifications to
the shared state should happen through them.

DHT and PubSub facilities must now be created outside of Cluster and passed in
so they can be re-used by different components.
2019-04-17 19:14:26 +02:00

565 lines
15 KiB
Go

// Package raft implements a Consensus component for IPFS Cluster which uses
// Raft (go-libp2p-raft).
package raft
import (
"context"
"errors"
"fmt"
"sort"
"sync"
"time"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
"github.com/ipfs/ipfs-cluster/api"
"github.com/ipfs/ipfs-cluster/state"
"github.com/ipfs/ipfs-cluster/state/dsstate"
ds "github.com/ipfs/go-datastore"
logging "github.com/ipfs/go-log"
consensus "github.com/libp2p/go-libp2p-consensus"
rpc "github.com/libp2p/go-libp2p-gorpc"
host "github.com/libp2p/go-libp2p-host"
peer "github.com/libp2p/go-libp2p-peer"
libp2praft "github.com/libp2p/go-libp2p-raft"
ma "github.com/multiformats/go-multiaddr"
)
// logger is the package-wide logger, created under the name "raft".
var logger = logging.Logger("raft")
// Consensus handles the work of keeping a shared-state between
// the peers of an IPFS Cluster, as well as modifying that state and
// applying any updates in a thread-safe manner.
type Consensus struct {
	// ctx is the component-lifetime context created in NewConsensus;
	// cancel cancels it and is invoked by Shutdown.
	ctx    context.Context
	cancel func()

	// config is the configuration validated by NewConsensus.
	config *Config

	// host is the libp2p host handed to the raft wrapper. Shutdown closes
	// it when config.hostShutdown is set.
	host host.Host

	// consensus is the operation log built with libp2praft.NewOpLog and
	// actor is the libp2praft actor attached to it.
	consensus consensus.OpLogConsensus
	actor     consensus.Actor

	// baseOp is the LogOp given to libp2praft.NewOpLog; its consensus
	// field points back to this component.
	baseOp *LogOp

	// raft wraps the underlying Raft instance.
	raft *raftWrapper

	// rpcClient is set by SetClient and used to redirect operations to
	// the leader.
	rpcClient *rpc.Client

	// rpcReady receives a value from SetClient and is closed by Shutdown.
	rpcReady chan struct{}

	// readyCh receives a value once finishBootstrap completes successfully.
	readyCh chan struct{}

	// shutdownLock guards shutdown. It is read-locked while committing so
	// that Shutdown (which takes the write lock) cannot run concurrently.
	shutdownLock sync.RWMutex
	shutdown     bool
}
// NewConsensus builds a new ClusterConsensus component using Raft.
//
// Raft saves state snapshots regularly and persists log data in a bolt
// datastore. Therefore, unless memory usage is a concern, it is recommended
// to use an in-memory go-datastore as store parameter.
//
// The staging parameter controls if the Raft peer should start in
// staging mode (used when joining a new Raft peerset with other peers).
//
// The store parameter should be a thread-safe datastore.
//
// On success the returned component has already launched a background
// goroutine (finishBootstrap) which completes the bootstrap once an RPC
// client has been provided through SetClient.
func NewConsensus(
	host host.Host,
	cfg *Config,
	store ds.Datastore,
	staging bool, // this peer must not be bootstrapped if no state exists
) (*Consensus, error) {
	if err := cfg.Validate(); err != nil {
		return nil, err
	}

	logger.Debug("starting Consensus and waiting for a leader...")

	// The base operation is the template the op-log uses for its entries.
	baseOp := &LogOp{tracing: cfg.Tracing}

	// The shared state lives in the given datastore, under the configured
	// namespace.
	st, err := dsstate.New(store, cfg.DatastoreNamespace, dsstate.DefaultHandle())
	if err != nil {
		return nil, err
	}

	opLog := libp2praft.NewOpLog(st, baseOp)

	wrapper, err := newRaftWrapper(host, cfg, opLog.FSM(), staging)
	if err != nil {
		logger.Error("error creating raft: ", err)
		return nil, err
	}

	raftActor := libp2praft.NewActor(wrapper.raft)
	opLog.SetActor(raftActor)

	ctx, cancel := context.WithCancel(context.Background())

	cc := &Consensus{
		ctx:       ctx,
		cancel:    cancel,
		config:    cfg,
		host:      host,
		consensus: opLog,
		actor:     raftActor,
		baseOp:    baseOp,
		raft:      wrapper,
		rpcReady:  make(chan struct{}, 1),
		readyCh:   make(chan struct{}, 1),
	}

	// Operations need a way back to the component that applies them.
	baseOp.consensus = cc

	go cc.finishBootstrap()
	return cc, nil
}
// WaitForSync waits for a leader and for the state to be up to date, then
// returns. It returns an error describing which of the waiting stages
// failed.
//
// The stages are:
//
//  1. wait for a leader (bounded by config.WaitForLeaderTimeout),
//  2. wait until this peer is a Voter,
//  3. wait until the last index is applied.
//
// From the raft docs: once a staging server receives enough log entries to
// be sufficiently caught up to the leader's log, the leader will invoke a
// membership change to change the Staging server to a Voter. Thus, waiting
// to be a Voter is a guarantee that we have a reasonably up to date state.
// Otherwise, we might return too early (see
// https://github.com/ipfs/ipfs-cluster/issues/378).
func (cc *Consensus) WaitForSync(ctx context.Context) error {
	ctx, span := trace.StartSpan(ctx, "consensus/WaitForSync")
	defer span.End()

	// Only the leader wait gets the extra timeout; the later stages run
	// under the caller's context.
	leaderCtx, cancel := context.WithTimeout(ctx, cc.config.WaitForLeaderTimeout)
	defer cancel()

	if _, err := cc.raft.WaitForLeader(leaderCtx); err != nil {
		return errors.New("error waiting for leader: " + err.Error())
	}

	if err := cc.raft.WaitForVoter(ctx); err != nil {
		return errors.New("error waiting to become a Voter: " + err.Error())
	}

	if err := cc.raft.WaitForUpdates(ctx); err != nil {
		return errors.New("error waiting for consensus updates: " + err.Error())
	}

	return nil
}
// finishBootstrap waits until an RPC client is available, bootstraps Raft
// and waits for the state to be in sync. On success it signals readyCh.
// If the component is shut down first, or any step fails, it returns
// without ever signaling the component as Ready; failures are logged so
// that a component which never becomes ready can be diagnosed.
func (cc *Consensus) finishBootstrap() {
	// wait until we have RPC to perform any actions.
	select {
	case <-cc.ctx.Done():
		return
	case _, ok := <-cc.rpcReady:
		// Shutdown cancels the context AND closes rpcReady. When both
		// cases are ready select picks one at random, so a receive on
		// the closed channel must be treated like cancellation instead
		// of like a SetClient signal.
		if !ok {
			return
		}
	}

	// Sometimes bootstrap is a no-op. It only applies when
	// no state exists and staging=false.
	if _, err := cc.raft.Bootstrap(); err != nil {
		logger.Errorf("error bootstrapping raft: %s", err)
		return
	}

	if err := cc.WaitForSync(cc.ctx); err != nil {
		// Stay quiet when the failure is just our own shutdown.
		if cc.ctx.Err() == nil {
			logger.Errorf("error waiting for raft to sync: %s", err)
		}
		return
	}

	logger.Debug("Raft state is now up to date")
	logger.Debug("consensus ready")
	// readyCh is buffered (size 1), so this single send never blocks.
	cc.readyCh <- struct{}{}
}
// Shutdown stops the component so it will not process any
// more updates. The underlying consensus is permanently
// shutdown, along with the libp2p transport.
//
// Calling Shutdown more than once is safe: later calls are no-ops. The
// returned error is always nil; a failure to stop Raft is only logged.
func (cc *Consensus) Shutdown(ctx context.Context) error {
	ctx, span := trace.StartSpan(ctx, "consensus/Shutdown")
	defer span.End()

	// The write lock keeps in-flight commits (read lock holders) from
	// overlapping with the shutdown sequence.
	cc.shutdownLock.Lock()
	defer cc.shutdownLock.Unlock()

	if cc.shutdown {
		logger.Debug("already shutdown")
		return nil
	}

	logger.Info("stopping Consensus component")

	// Raft Shutdown
	if err := cc.raft.Shutdown(ctx); err != nil {
		logger.Error(err)
	}

	if cc.config.hostShutdown {
		cc.host.Close()
	}

	cc.shutdown = true
	cc.cancel()
	close(cc.rpcReady)
	return nil
}
// SetClient makes the component ready to perform RPC requests. It stores
// the client and signals rpcReady, which unblocks finishBootstrap.
//
// NOTE(review): rpcReady is closed by Shutdown, so calling SetClient after
// Shutdown would send on a closed channel — confirm callers never do that.
func (cc *Consensus) SetClient(c *rpc.Client) {
	cc.rpcClient = c
	cc.rpcReady <- struct{}{}
}
// Ready returns a channel which is signaled when the Consensus
// algorithm has finished bootstrapping and is ready to use. The signal is
// a single value sent by finishBootstrap; the channel is not closed.
func (cc *Consensus) Ready(ctx context.Context) <-chan struct{} {
	_, span := trace.StartSpan(ctx, "consensus/Ready")
	defer span.End()

	return cc.readyCh
}
// op builds the LogOp of the given type for the given pin. The ctx
// argument is currently unused.
func (cc *Consensus) op(ctx context.Context, pin *api.Pin, t LogOpType) *LogOp {
	logOp := new(LogOp)
	logOp.Cid = pin
	logOp.Type = t
	return logOp
}
// redirectToLeader forwards the given RPC method (with arg) to the current
// Raft leader, retrying up to config.CommitRetries additional times.
//
// The boolean result is true if the operation was redirected to the leader
// (or a redirect was attempted), in which case the error is the outcome of
// the last attempt. It is false when this peer is the leader (nil error,
// the caller must perform the operation itself) or when no leader could be
// determined (non-nil error).
//
// Note that if the leader just disappeared, the rpc call will
// fail because we haven't heard that it's gone.
//
// The operation runs under the component context (cc.ctx), not under a
// caller-provided one.
func (cc *Consensus) redirectToLeader(method string, arg interface{}) (bool, error) {
	ctx, span := trace.StartSpan(cc.ctx, "consensus/redirectToLeader")
	defer span.End()

	var finalErr error

	// Retry redirects
	for i := 0; i <= cc.config.CommitRetries; i++ {
		logger.Debugf("redirect try %d", i)
		leader, err := cc.Leader(ctx)

		// No leader, wait for one
		if err != nil {
			logger.Warning("there seems to be no leader. Waiting for one")
			rctx, cancel := context.WithTimeout(
				ctx,
				cc.config.WaitForLeaderTimeout,
			)
			pidstr, err := cc.raft.WaitForLeader(rctx)
			// Release the timeout context right away: a defer here
			// would pile up one pending context per iteration until
			// the whole function returns.
			cancel()

			// means we timed out waiting for a leader
			// we don't retry in this case
			if err != nil {
				return false, fmt.Errorf("timed out waiting for leader: %s", err)
			}
			leader, err = peer.IDB58Decode(pidstr)
			if err != nil {
				return false, err
			}
		}

		// We are the leader. Do not redirect
		if leader == cc.host.ID() {
			return false, nil
		}

		logger.Debugf("redirecting %s to leader: %s", method, leader.Pretty())
		finalErr = cc.rpcClient.CallContext(
			ctx,
			leader,
			"Cluster",
			method,
			arg,
			&struct{}{},
		)
		if finalErr != nil {
			logger.Errorf("retrying to redirect request to leader: %s", finalErr)
			// Wait two heartbeat timeouts before the next attempt.
			time.Sleep(2 * cc.config.RaftConfig.HeartbeatTimeout)
			continue
		}
		break
	}

	// We tried to redirect; finalErr tells whether it worked.
	return true, finalErr
}
// commit submits a cc.consensus commit. It retries upon failures, up to
// config.CommitRetries additional times, sleeping config.CommitRetryDelay
// after each failed attempt.
//
// When this peer is not the leader, the operation is redirected to it using
// the rpcOp method with redirectArg and the result of that redirection is
// returned. Otherwise op is committed locally and the error of the last
// commit attempt (nil on success) is returned.
func (cc *Consensus) commit(ctx context.Context, op *LogOp, rpcOp string, redirectArg interface{}) error {
	ctx, span := trace.StartSpan(ctx, "consensus/commit")
	defer span.End()

	if cc.config.Tracing {
		// required to cross the serialized boundary
		op.SpanCtx = span.SpanContext()
		if tagmap := tag.FromContext(ctx); tagmap != nil {
			op.TagCtx = tag.Encode(tagmap)
		}
	}

	var finalErr error
	for attempt := 0; attempt <= cc.config.CommitRetries; attempt++ {
		logger.Debugf("attempt #%d: committing %+v", attempt, op)

		// A non-nil finalErr means the previous attempt failed.
		if finalErr != nil {
			logger.Errorf("retrying upon failed commit (retry %d): %s ",
				attempt, finalErr)
		}

		// Try to send it to the leader. redirectToLeader has its own
		// retry loop, so whatever it reports is final.
		redirected, err := cc.redirectToLeader(rpcOp, redirectArg)
		if redirected || err != nil {
			return err
		}

		// Being here means we are the LEADER. We can commit the changes
		// to our state. Holding the read lock prevents a shutdown while
		// committing.
		cc.shutdownLock.RLock()
		_, finalErr = cc.consensus.CommitOp(op)
		cc.shutdownLock.RUnlock()

		if finalErr == nil {
			switch op.Type {
			case LogOpPin:
				logger.Infof("pin committed to global state: %s", op.Cid.Cid)
			case LogOpUnpin:
				logger.Infof("unpin committed to global state: %s", op.Cid.Cid)
			}
			return nil
		}

		time.Sleep(cc.config.CommitRetryDelay)
	}
	return finalErr
}
// LogPin submits a Cid to the shared state of the cluster. It will forward
// the operation to the leader if this is not it.
func (cc *Consensus) LogPin(ctx context.Context, pin *api.Pin) error {
	ctx, span := trace.StartSpan(ctx, "consensus/LogPin")
	defer span.End()

	return cc.commit(ctx, cc.op(ctx, pin, LogOpPin), "ConsensusLogPin", pin)
}
// LogUnpin removes a Cid from the shared state of the cluster. Like LogPin,
// it goes through commit, which forwards the operation to the leader when
// this peer is not it.
func (cc *Consensus) LogUnpin(ctx context.Context, pin *api.Pin) error {
	ctx, span := trace.StartSpan(ctx, "consensus/LogUnpin")
	defer span.End()

	return cc.commit(ctx, cc.op(ctx, pin, LogOpUnpin), "ConsensusLogUnpin", pin)
}
// AddPeer adds a new peer to participate in this consensus. It will
// forward the operation to the leader if this is not it.
//
// Failed local attempts are retried up to config.CommitRetries additional
// times, with a pause of config.CommitRetryDelay after each failure. The
// error of the last attempt is returned.
func (cc *Consensus) AddPeer(ctx context.Context, pid peer.ID) error {
	ctx, span := trace.StartSpan(ctx, "consensus/AddPeer")
	defer span.End()

	var finalErr error
	for attempt := 0; attempt <= cc.config.CommitRetries; attempt++ {
		logger.Debugf("attempt #%d: AddPeer %s", attempt, pid.Pretty())
		if finalErr != nil {
			logger.Errorf("retrying to add peer. Attempt #%d failed: %s", attempt, finalErr)
		}

		redirected, err := cc.redirectToLeader("ConsensusAddPeer", pid)
		if redirected || err != nil {
			return err
		}

		// Being here means we are the leader and can commit. The read
		// lock keeps Shutdown away while we do.
		cc.shutdownLock.RLock()
		finalErr = cc.raft.AddPeer(ctx, peer.IDB58Encode(pid))
		cc.shutdownLock.RUnlock()

		if finalErr == nil {
			logger.Infof("peer added to Raft: %s", pid.Pretty())
			break
		}
		time.Sleep(cc.config.CommitRetryDelay)
	}
	return finalErr
}
// RmPeer removes a peer from this consensus. It will
// forward the operation to the leader if this is not it.
//
// Failed local attempts are retried up to config.CommitRetries additional
// times, with a pause of config.CommitRetryDelay after each failure. The
// error of the last attempt is returned.
func (cc *Consensus) RmPeer(ctx context.Context, pid peer.ID) error {
	ctx, span := trace.StartSpan(ctx, "consensus/RmPeer")
	defer span.End()

	var finalErr error
	for attempt := 0; attempt <= cc.config.CommitRetries; attempt++ {
		logger.Debugf("attempt #%d: RmPeer %s", attempt, pid.Pretty())
		if finalErr != nil {
			logger.Errorf("retrying to remove peer. Attempt #%d failed: %s", attempt, finalErr)
		}

		redirected, err := cc.redirectToLeader("ConsensusRmPeer", pid)
		if redirected || err != nil {
			return err
		}

		// Being here means we are the leader and can commit. The read
		// lock keeps Shutdown away while we do.
		cc.shutdownLock.RLock()
		finalErr = cc.raft.RemovePeer(ctx, peer.IDB58Encode(pid))
		cc.shutdownLock.RUnlock()

		if finalErr == nil {
			logger.Infof("peer removed from Raft: %s", pid.Pretty())
			break
		}
		time.Sleep(cc.config.CommitRetryDelay)
	}
	return finalErr
}
// State retrieves the current consensus State. It may error if no State has
// been agreed upon or the state is not consistent. The returned State is the
// last agreed-upon State known by this node. No writes are allowed, as all
// writes to the shared state should happen through the Consensus component
// methods.
//
// When the log reports libp2praft.ErrNoState, an empty state is returned
// rather than an error.
func (cc *Consensus) State(ctx context.Context) (state.ReadOnly, error) {
	_, span := trace.StartSpan(ctx, "consensus/State")
	defer span.End()

	head, err := cc.consensus.GetLogHead()
	switch {
	case err == libp2praft.ErrNoState:
		return state.Empty(), nil
	case err != nil:
		return nil, err
	}

	if st, ok := head.(state.State); ok {
		return st, nil
	}
	return nil, errors.New("wrong state type")
}
// Leader returns the peerID of the Leader of the
// cluster. It returns an error when there is no leader.
func (cc *Consensus) Leader(ctx context.Context) (peer.ID, error) {
	_, span := trace.StartSpan(ctx, "consensus/Leader")
	defer span.End()

	// Note the hard-dependency on raft here: the actor must be the
	// libp2praft one (the unchecked assertion panics otherwise).
	return cc.actor.(*libp2praft.Actor).Leader()
}
// Clean removes the Raft persisted state. It refuses to do so, returning
// an error, unless the component has been shut down first.
func (cc *Consensus) Clean(ctx context.Context) error {
	_, span := trace.StartSpan(ctx, "consensus/Clean")
	defer span.End()

	cc.shutdownLock.RLock()
	defer cc.shutdownLock.RUnlock()

	if cc.shutdown {
		return CleanupRaft(cc.config)
	}
	return errors.New("consensus component is not shutdown")
}
// Rollback replaces the current agreed-upon
// state with the state provided. Only the consensus leader
// can perform this operation.
//
// It delegates directly to the underlying op-log consensus and returns
// its error.
func (cc *Consensus) Rollback(state state.State) error {
	// This is unused. It *might* be used for upgrades.
	// There is rather untested magic in libp2p-raft's FSM()
	// to make this possible.
	return cc.consensus.Rollback(state)
}
// Peers returns the current list of peers in the consensus.
// The list will be sorted alphabetically (by the string form of the peer
// IDs as reported by Raft).
//
// It returns an error if the component has been shut down, if the peer
// list cannot be retrieved from Raft, or if Raft reports a peer ID that
// cannot be decoded.
func (cc *Consensus) Peers(ctx context.Context) ([]peer.ID, error) {
	ctx, span := trace.StartSpan(ctx, "consensus/Peers")
	defer span.End()

	cc.shutdownLock.RLock() // prevent shutdown while here
	defer cc.shutdownLock.RUnlock()

	if cc.shutdown { // things hang a lot in this case
		return nil, errors.New("consensus is shutdown")
	}

	raftPeers, err := cc.raft.Peers(ctx)
	if err != nil {
		return nil, fmt.Errorf("cannot retrieve list of peers: %s", err)
	}

	sort.Strings(raftPeers)

	// Always a non-nil slice, even when there are no peers.
	peers := make([]peer.ID, 0, len(raftPeers))
	for _, p := range raftPeers {
		id, err := peer.IDB58Decode(p)
		if err != nil {
			// Report the bad entry instead of panicking: this function
			// already has an error return for its callers to handle.
			return nil, fmt.Errorf("could not decode peer %q: %s", p, err)
		}
		peers = append(peers, id)
	}
	return peers, nil
}
// parsePIDFromMultiaddr extracts the value of the ma.P_IPFS protocol
// component from addr. It panics if the multiaddress has no such
// component.
func parsePIDFromMultiaddr(addr ma.Multiaddr) string {
	if pid, err := addr.ValueForProtocol(ma.P_IPFS); err == nil {
		return pid
	}
	panic("peer badly encoded")
}
// OfflineState returns a cluster state by reading the Raft data and
// writing it to the given datastore which is then wrapped as a state.State.
// Usually an in-memory datastore suffices. The given datastore should be
// thread-safe.
//
// When no snapshot exists, the freshly created (unpopulated) state is
// returned.
func OfflineState(cfg *Config, store ds.Datastore) (state.State, error) {
	raw, snapExists, err := LastStateRaw(cfg)
	if err != nil {
		return nil, err
	}

	st, err := dsstate.New(store, cfg.DatastoreNamespace, dsstate.DefaultHandle())
	if err != nil {
		return nil, err
	}

	// Only load the raw data when there actually is a snapshot.
	if snapExists {
		if err := st.Unmarshal(raw); err != nil {
			return nil, err
		}
	}
	return st, nil
}