Fix #219: WIP: Remove duplicate peer accounting

This change removes the duplicated peer accounting in the PeerManager component:

* No more committing PeerAdd and PeerRm log entries
* The Raft peer set is the source of truth
* Basic broadcasting is used to communicate peer multiaddresses
  in the cluster
* A peer can only be added in a healthy cluster
* A peer can be removed from any cluster which can still commit
* This also adds support for multiple multiaddresses per peer

License: MIT
Signed-off-by: Hector Sanjuan <hector@protocol.ai>
Hector Sanjuan 2017-11-08 20:04:04 +01:00
parent aced97cfa1
commit b852dfa892
13 changed files with 360 additions and 345 deletions
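
The following is a minimal, self-contained Go sketch of the pattern this commit moves to: membership comes from the Raft peerset (Consensus.Peers()), while the peer manager only resolves transport addresses from the libp2p peerstore. The consensusPeers interface, addressBook map, fakeConsensus type, peer IDs and addresses below are illustrative stand-ins, not ipfs-cluster APIs.

package main

import (
	"fmt"
	"sort"
)

// consensusPeers models the single membership question asked after this
// change: which peers are in the Raft peerset? (stand-in for Consensus.Peers()).
type consensusPeers interface {
	Peers() ([]string, error)
}

// addressBook models address lookup, the only job left to the peer manager
// (stand-in for the libp2p peerstore).
type addressBook map[string][]string

// clusterAddresses combines both views: membership from consensus, transport
// addresses from the address book, skipping our own ID.
func clusterAddresses(c consensusPeers, book addressBook, self string) ([]string, error) {
	peers, err := c.Peers()
	if err != nil {
		return nil, err
	}
	sort.Strings(peers) // deterministic order, as Consensus.Peers() provides
	var addrs []string
	for _, p := range peers {
		if p == self {
			continue
		}
		for _, a := range book[p] {
			// encapsulate the peer ID, mirroring peerManager.addresses()
			addrs = append(addrs, fmt.Sprintf("%s/ipfs/%s", a, p))
		}
	}
	return addrs, nil
}

type fakeConsensus struct{ peers []string }

func (f fakeConsensus) Peers() ([]string, error) { return f.peers, nil }

func main() {
	cons := fakeConsensus{peers: []string{"QmPeerB", "QmPeerA", "QmSelf"}}
	book := addressBook{
		"QmPeerA": {"/ip4/10.0.0.1/tcp/9096"},
		"QmPeerB": {"/ip4/10.0.0.2/tcp/9096"},
	}
	addrs, _ := clusterAddresses(cons, book, "QmSelf")
	fmt.Println(addrs)
	// [/ip4/10.0.0.1/tcp/9096/ipfs/QmPeerA /ip4/10.0.0.2/tcp/9096/ipfs/QmPeerB]
}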

View File

@ -3,7 +3,6 @@ package ipfscluster
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
@ -57,7 +56,7 @@ type Cluster struct {
readyB bool
wg sync.WaitGroup
paMux sync.Mutex
// paMux sync.Mutex
}
// NewCluster builds a new IPFS Cluster peer. It initializes a LibP2P host,
@ -94,27 +93,31 @@ func NewCluster(
logger.Infof(" %s/ipfs/%s", addr, host.ID().Pretty())
}
peerManager := newPeerManager(host)
peerManager.importAddresses(cfg.Peers)
peerManager.importAddresses(cfg.Bootstrap)
c := &Cluster{
ctx: ctx,
cancel: cancel,
id: host.ID(),
config: cfg,
host: host,
api: api,
ipfs: ipfs,
state: st,
tracker: tracker,
monitor: monitor,
allocator: allocator,
informer: informer,
shutdownB: false,
removed: false,
doneCh: make(chan struct{}),
readyCh: make(chan struct{}),
readyB: false,
ctx: ctx,
cancel: cancel,
id: host.ID(),
config: cfg,
host: host,
api: api,
ipfs: ipfs,
state: st,
tracker: tracker,
monitor: monitor,
allocator: allocator,
informer: informer,
peerManager: peerManager,
shutdownB: false,
removed: false,
doneCh: make(chan struct{}),
readyCh: make(chan struct{}),
readyB: false,
}
c.setupPeerManager()
err = c.setupRPC()
if err != nil {
c.Shutdown()
@ -140,18 +143,6 @@ func NewCluster(
return c, nil
}
func (c *Cluster) setupPeerManager() {
pm := newPeerManager(c)
c.peerManager = pm
if len(c.config.Peers) > 0 {
c.peerManager.setFromMultiaddrs(c.config.Peers, false)
} else {
c.peerManager.setFromMultiaddrs(c.config.Bootstrap, false)
}
}
func (c *Cluster) setupRPC() error {
rpcServer := rpc.NewServer(c.host, RPCProtocol)
err := rpcServer.RegisterName("Cluster", &RPCAPI{c})
@ -219,12 +210,21 @@ func (c *Cluster) syncWatcher() {
}
func (c *Cluster) broadcastMetric(m api.Metric) error {
peers := c.peerManager.peers()
peers, err := c.consensus.Peers()
if err != nil {
logger.Error(err)
return err
}
leader, err := c.consensus.Leader()
if err != nil {
return err
}
if m.Discard() {
logger.Warningf("discarding invalid metric: %+v", m)
return nil
}
// If a peer is down, the rpc call will get locked. Therefore,
// we need to do it async. This way we keep broadcasting
// even if someone is down. Eventually those requests will
@ -234,7 +234,6 @@ func (c *Cluster) broadcastMetric(m api.Metric) error {
// Leader needs to broadcast its metric to everyone
// in case it goes down (new leader will have to detect this node went down)
logger.Debugf("Leader %s about to broadcast metric %s to %s. Expires: %s", c.id, m.Name, peers, m.Expire)
errs := c.multiRPC(peers,
"Cluster",
"PeerMonitorLogMetric",
@ -340,6 +339,66 @@ func (c *Cluster) alertsHandler() {
}
}
// watchPeers detects any changes in the peerset and saves the configuration.
// When it detects that we have been removed from the peerset, it shuts down
// this peer.
func (c *Cluster) watchPeers() {
// TODO: Config option?
ticker := time.NewTicker(5 * time.Second)
var lastPeers []peer.ID
lastPeers, err := c.consensus.Peers()
if err != nil {
logger.Error("starting to watch peers", err)
}
for {
select {
case <-c.ctx.Done():
return
case <-ticker.C:
logger.Debugf("%s watching peers", c.id)
save := false
hasMe := false
peers, err := c.consensus.Peers()
if err != nil {
logger.Error(err)
continue
}
for _, p := range peers {
if p == c.id {
hasMe = true
break
}
}
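// Flag a save when the peerset changed since the last tick: either its
// size differs or some position holds a different peer (Peers() returns
// an alphabetically sorted list, so a position-wise comparison suffices).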
if len(peers) != len(lastPeers) {
save = true
} else {
for i := range peers {
if peers[i] != lastPeers[i] {
save = true
}
}
}
lastPeers = peers
if !hasMe {
logger.Info("this peer has been removed and will shutdown")
c.removed = true
c.config.Bootstrap = c.peerManager.addresses(peers)
c.config.savePeers([]ma.Multiaddr{})
go c.Shutdown()
return
}
if save {
logger.Info("peerset change detected")
c.config.savePeers(c.peerManager.addresses(peers))
}
}
}
}
// repinFromPeer finds all Cids pinned to a given peer and triggers re-pins on them.
func (c *Cluster) repinFromPeer(p peer.ID) {
cState, err := c.consensus.State()
@ -356,12 +415,12 @@ func (c *Cluster) repinFromPeer(p peer.ID) {
}
}
// run provides a cancellable context and launches some goroutines
// before signaling readyCh
// run launches some go-routines which live throughout the cluster's life
func (c *Cluster) run() {
go c.syncWatcher()
go c.pushPingMetrics()
go c.pushInformerMetrics()
go c.watchPeers()
go c.alertsHandler()
}
@ -381,13 +440,21 @@ func (c *Cluster) ready() {
}
// Cluster is ready.
logger.Info("Cluster Peers (not including ourselves):")
peers := c.peerManager.peersAddrs()
if len(peers) == 0 {
peers, err := c.consensus.Peers()
if err != nil {
logger.Error(err)
c.Shutdown()
return
}
logger.Info("Cluster Peers (without including ourselves):")
if len(peers) == 1 {
logger.Info(" - No other peers")
}
for _, a := range c.peerManager.peersAddrs() {
logger.Infof(" - %s", a)
for _, p := range peers {
if p != c.id {
logger.Infof(" - %s", p.Pretty())
}
}
close(c.readyCh)
c.readyB = true
@ -432,20 +499,21 @@ func (c *Cluster) Shutdown() error {
// Only attempt to leave if:
// - consensus is initialized
// - cluster was ready (no bootstrapping error)
// - We are not removed already (means PeerRemove() was called on us)
// - We are not removed already (means watchPeers() called us)
if c.consensus != nil && c.config.LeaveOnShutdown && c.readyB && !c.removed {
c.removed = true
// best effort
logger.Warning("attempting to leave the cluster. This may take some seconds")
err := c.consensus.LogRmPeer(c.id)
if err != nil {
logger.Error("leaving cluster: " + err.Error())
peers, err := c.consensus.Peers()
if err == nil {
// best effort
logger.Warning("attempting to leave the cluster. This may take some seconds")
err := c.consensus.RmPeer(c.id)
if err != nil {
logger.Error("leaving cluster: " + err.Error())
}
// save peers as bootstrappers
c.config.Bootstrap = c.peerManager.addresses(peers)
c.config.savePeers([]ma.Multiaddr{})
}
// save peers as bootstrappers
c.config.Bootstrap = c.peerManager.peersAddrs()
c.peerManager.resetPeers()
c.peerManager.savePeers()
}
// Cancel contexts
@ -460,8 +528,6 @@ func (c *Cluster) Shutdown() error {
// Do not save anything if we were not ready
if c.readyB {
// peers are saved usually on addPeer/rmPeer
// c.peerManager.savePeers()
c.backupState()
}
@ -519,11 +585,16 @@ func (c *Cluster) ID() api.ID {
addrs = append(addrs, multiaddrJoin(addr, c.id))
}
peers, _ := c.consensus.Peers()
return api.ID{
ID: c.id,
//PublicKey: c.host.Peerstore().PubKey(c.id),
Addresses: addrs,
ClusterPeers: c.peerManager.peersAddrs(),
Addresses: addrs,
// TODO: These are not peers but addresses. There could be
// several addresses for a single peer. Do we want to provide
// only PIDs? Another key in this object for addresses?
ClusterPeers: c.peerManager.addresses(peers),
Version: Version,
Commit: Commit,
RPCProtocolVersion: RPCProtocol,
@ -541,8 +612,8 @@ func (c *Cluster) PeerAdd(addr ma.Multiaddr) (api.ID, error) {
// starting 10 nodes on the same box for testing
// causes deadlock and a global lock here
// seems to help.
c.paMux.Lock()
defer c.paMux.Unlock()
// c.paMux.Lock()
// defer c.paMux.Unlock()
logger.Debugf("peerAdd called with %s", addr)
pid, decapAddr, err := multiaddrSplit(addr)
if err != nil {
@ -555,11 +626,30 @@ func (c *Cluster) PeerAdd(addr ma.Multiaddr) (api.ID, error) {
// Figure out its real address if we have one
remoteAddr := getRemoteMultiaddr(c.host, pid, decapAddr)
err = c.peerManager.addPeer(remoteAddr, false)
// whisper address to everyone, including ourselves
peers, err := c.consensus.Peers()
if err != nil {
logger.Error(err)
id := api.ID{ID: pid, Error: err.Error()}
return id, err
return api.ID{Error: err.Error()}, err
}
errs := c.multiRPC(peers, "Cluster",
"PeerManagerAddPeer",
api.MultiaddrToSerial(remoteAddr),
copyEmptyStructToIfaces(make([]struct{}, len(peers), len(peers))))
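// multiRPC returns one error slot per destination peer; any non-nil entry
// means the new peer's address did not reach that member.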
brk := false
for i, e := range errs {
if e != nil {
brk = true
logger.Errorf("%s: %s", peers[i].Pretty(), e)
}
}
if brk {
msg := "error broadcasting new peer's address: all cluster members need to be healthy for this operation to succeed. Try removing any unhealthy peers. Check the logs for more information about the error."
logger.Error(msg)
id := api.ID{ID: pid, Error: "error broadcasting new peer's address"}
return id, errors.New(msg)
}
// Figure out our address to that peer. This also
@ -570,25 +660,23 @@ func (c *Cluster) PeerAdd(addr ma.Multiaddr) (api.ID, error) {
if err != nil {
logger.Error(err)
id := api.ID{ID: pid, Error: err.Error()}
c.peerManager.rmPeer(pid, false)
return id, err
}
// Log the new peer in the log so everyone gets it.
err = c.consensus.LogAddPeer(remoteAddr) // this will save
err = c.consensus.AddPeer(pid)
if err != nil {
logger.Error(err)
id := api.ID{ID: pid, Error: err.Error()}
c.peerManager.rmPeer(pid, false)
return id, err
}
// Send cluster peers to the new peer.
clusterPeers := append(c.peerManager.peersAddrs(),
clusterPeers := append(c.peerManager.addresses(peers),
addrSerial.ToMultiaddr())
err = c.rpcClient.Call(pid,
"Cluster",
"PeerManagerSetFromMultiaddrs",
"PeerManagerImportAddresses",
api.MultiaddrsToSerial(clusterPeers),
&struct{}{})
if err != nil {
@ -611,19 +699,15 @@ func (c *Cluster) PeerAdd(addr ma.Multiaddr) (api.ID, error) {
// PeerRemove removes a peer from this Cluster.
//
// The peer will be removed from the consensus peer set,
// it will be shut down after this happens.
// The peer will be removed from the consensus peerset, all its content
// will be re-pinned and the peer will shut itself down.
func (c *Cluster) PeerRemove(pid peer.ID) error {
if !c.peerManager.isPeer(pid) {
return fmt.Errorf("%s is not a peer", pid.Pretty())
}
// We need to repin before removing the peer, otherwise, it won't
// be able to submit the pins.
logger.Infof("re-allocating all CIDs directly associated to %s", pid)
c.repinFromPeer(pid)
err := c.consensus.LogRmPeer(pid)
err := c.consensus.RmPeer(pid)
if err != nil {
logger.Error(err)
return err
@ -655,7 +739,7 @@ func (c *Cluster) Join(addr ma.Multiaddr) error {
}
// Add peer to peerstore so we can talk to it
c.peerManager.addPeer(addr, false)
c.peerManager.addPeer(addr)
// Note that PeerAdd() on the remote peer will
// figure out what our real address is (obviously not
@ -681,7 +765,7 @@ func (c *Cluster) Join(addr ma.Multiaddr) error {
}
c.StateSync()
logger.Infof("joined %s's cluster", addr)
logger.Infof("%s: joined %s's cluster", c.id.Pretty(), pid.Pretty())
return nil
}
@ -882,14 +966,20 @@ func (c *Cluster) Unpin(h *cid.Cid) error {
return nil
}
// Version returns the current IPFS Cluster version
// Version returns the current IPFS Cluster version.
func (c *Cluster) Version() string {
return Version
}
// Peers returns the IDs of the members of this Cluster
// Peers returns the IDs of the members of this Cluster.
func (c *Cluster) Peers() []api.ID {
members := c.peerManager.peers()
members, err := c.consensus.Peers()
if err != nil {
logger.Error(err)
logger.Error("an empty list of peers will be returned")
return []api.ID{}
}
peersSerial := make([]api.IDSerial, len(members), len(members))
peers := make([]api.ID, len(members), len(members))
@ -909,7 +999,7 @@ func (c *Cluster) Peers() []api.ID {
return peers
}
// makeHost makes a libp2p-host
// makeHost makes a libp2p-host.
func makeHost(ctx context.Context, cfg *Config) (host.Host, error) {
ps := peerstore.NewPeerstore()
privateKey := cfg.PrivateKey
@ -1004,7 +1094,12 @@ func (c *Cluster) globalPinInfoCid(method string, h *cid.Cid) (api.GlobalPinInfo
PeerMap: make(map[peer.ID]api.PinInfo),
}
members := c.peerManager.peers()
members, err := c.consensus.Peers()
if err != nil {
logger.Error(err)
return api.GlobalPinInfo{}, err
}
replies := make([]api.PinInfoSerial, len(members), len(members))
arg := api.Pin{
Cid: h,
@ -1055,7 +1150,12 @@ func (c *Cluster) globalPinInfoSlice(method string) ([]api.GlobalPinInfo, error)
var infos []api.GlobalPinInfo
fullMap := make(map[string]api.GlobalPinInfo)
members := c.peerManager.peers()
members, err := c.consensus.Peers()
if err != nil {
logger.Error(err)
return []api.GlobalPinInfo{}, err
}
replies := make([][]api.PinInfoSerial, len(members), len(members))
errs := c.multiRPC(members,
"Cluster",

View File

@ -8,6 +8,7 @@ import (
"encoding/json"
"errors"
"fmt"
"sync"
"time"
"github.com/ipfs/ipfs-cluster/config"
@ -36,6 +37,7 @@ const (
// config.ComponentConfig interface.
type Config struct {
config.Saver
lock sync.Mutex
// Libp2p ID and private key for Cluster communication (including)
// the Consensus component.
@ -343,6 +345,13 @@ func (cfg *Config) ToJSON() (raw []byte, err error) {
return
}
func (cfg *Config) savePeers(addrs []ma.Multiaddr) {
cfg.lock.Lock()
cfg.Peers = addrs
cfg.lock.Unlock()
cfg.NotifySave()
}
// DecodeClusterSecret parses a hex-encoded string, checks that it is exactly
// 32 bytes long and returns its value as a byte-slice.
func DecodeClusterSecret(hexSecret string) ([]byte, error) {

View File

@ -30,6 +30,9 @@ var (
type Config struct {
config.Saver
// hostShutdown, when set, shuts down the libp2p host on Shutdown(). Useful for testing.
hostShutdown bool
// A Hashicorp Raft's configuration object.
RaftConfig *hraft.Config
// A folder to store Raft's data.
@ -227,7 +230,7 @@ func (cfg *Config) Default() error {
cfg.RaftConfig = hraft.DefaultConfig()
// These options are imposed over any Default Raft Config.
// cfg.RaftConfig.ShutdownOnRemove = false
cfg.RaftConfig.ShutdownOnRemove = false
cfg.RaftConfig.LocalID = "will_be_set_automatically"
// Set up logging

View File

@ -6,6 +6,7 @@ import (
"context"
"errors"
"fmt"
"sort"
"sync"
"time"
@ -162,6 +163,11 @@ func (cc *Consensus) Shutdown() error {
if err != nil {
logger.Error(err)
}
if cc.config.hostShutdown {
cc.host.Close()
}
cc.shutdown = true
cc.cancel()
close(cc.rpcReady)
@ -180,20 +186,10 @@ func (cc *Consensus) Ready() <-chan struct{} {
return cc.readyCh
}
func (cc *Consensus) op(argi interface{}, t LogOpType) *LogOp {
switch argi.(type) {
case api.Pin:
return &LogOp{
Cid: argi.(api.Pin).ToSerial(),
Type: t,
}
case ma.Multiaddr:
return &LogOp{
Peer: api.MultiaddrToSerial(argi.(ma.Multiaddr)),
Type: t,
}
default:
panic("bad type")
func (cc *Consensus) op(pin api.Pin, t LogOpType) *LogOp {
return &LogOp{
Cid: pin.ToSerial(),
Type: t,
}
}
@ -281,26 +277,11 @@ func (cc *Consensus) commit(op *LogOp, rpcOp string, redirectArg interface{}) er
goto RETRY
}
// addPeer and rmPeer need to apply the change to Raft directly.
switch op.Type {
case LogOpPin:
logger.Infof("pin committed to global state: %s", op.Cid.Cid)
case LogOpUnpin:
logger.Infof("unpin committed to global state: %s", op.Cid.Cid)
case LogOpAddPeer:
pidstr := parsePIDFromMultiaddr(op.Peer.ToMultiaddr())
finalErr = cc.raft.AddPeer(pidstr)
if finalErr != nil {
goto RETRY
}
logger.Infof("peer committed to global state: %s", pidstr)
case LogOpRmPeer:
pidstr := parsePIDFromMultiaddr(op.Peer.ToMultiaddr())
finalErr = cc.raft.RemovePeer(pidstr)
if finalErr != nil {
goto RETRY
}
logger.Infof("peer removed from global state: %s", pidstr)
}
break
@ -331,24 +312,54 @@ func (cc *Consensus) LogUnpin(pin api.Pin) error {
return nil
}
// LogAddPeer submits a new peer to the shared state of the cluster. It will
// AddPeer adds a new peer to participate in this consensus. It will
// forward the operation to the leader if this is not it.
func (cc *Consensus) LogAddPeer(addr ma.Multiaddr) error {
addrS := api.MultiaddrToSerial(addr)
op := cc.op(addr, LogOpAddPeer)
return cc.commit(op, "ConsensusLogAddPeer", addrS)
func (cc *Consensus) AddPeer(pid peer.ID) error {
var finalErr error
for i := 0; i <= cc.config.CommitRetries; i++ {
logger.Debugf("attempt #%d: AddPeer %s", i, pid.Pretty())
if finalErr != nil {
logger.Errorf("retrying to add peer. Attempt #%d failed: %s", i, finalErr)
}
ok, err := cc.redirectToLeader("ConsensusAddPeer", pid)
if err != nil || ok {
return err
}
// Being here means we are the leader and can commit
finalErr = cc.raft.AddPeer(peer.IDB58Encode(pid))
if finalErr != nil {
time.Sleep(cc.config.CommitRetryDelay)
continue
}
logger.Infof("peer added to Raft: %s", pid.Pretty())
break
}
return finalErr
}
// LogRmPeer removes a peer from the shared state of the cluster. It will
// RmPeer removes a peer from this consensus. It will
// forward the operation to the leader if this is not it.
func (cc *Consensus) LogRmPeer(pid peer.ID) error {
// Create rmPeer operation for the log
addr, err := ma.NewMultiaddr("/ipfs/" + peer.IDB58Encode(pid))
if err != nil {
return err
func (cc *Consensus) RmPeer(pid peer.ID) error {
var finalErr error
for i := 0; i <= cc.config.CommitRetries; i++ {
logger.Debugf("attempt #%d: RmPeer %s", i, pid.Pretty())
if finalErr != nil {
logger.Errorf("retrying to remove peer. Attempt #%d failed: %s", i, finalErr)
}
ok, err := cc.redirectToLeader("ConsensusRmPeer", pid)
if err != nil || ok {
return err
}
// Being here means we are the leader and can commit
finalErr = cc.raft.RemovePeer(peer.IDB58Encode(pid))
if finalErr != nil {
time.Sleep(cc.config.CommitRetryDelay)
continue
}
logger.Infof("peer removed from Raft: %s", pid.Pretty())
break
}
op := cc.op(addr, LogOpRmPeer)
return cc.commit(op, "ConsensusLogRmPeer", pid)
return finalErr
}
// State retrieves the current consensus State. It may error
@ -400,6 +411,30 @@ func (cc *Consensus) Rollback(state state.State) error {
return cc.consensus.Rollback(state)
}
// Peers returns the current list of peers in the consensus.
// The list will be sorted alphabetically.
func (cc *Consensus) Peers() ([]peer.ID, error) {
if cc.shutdown { // things hang a lot in this case
return nil, errors.New("consensus is shutdown")
}
peers := []peer.ID{}
raftPeers, err := cc.raft.Peers()
if err != nil {
return nil, fmt.Errorf("cannot retrieve list of peers: %s", err)
}
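// Raft tracks peers as base58-encoded ID strings; sort them for a stable,
// alphabetical order before decoding back into peer.IDs.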
sort.Strings(raftPeers)
for _, p := range raftPeers {
id, err := peer.IDB58Decode(p)
if err != nil {
panic("could not decode peer")
}
peers = append(peers, id)
}
return peers, nil
}
func parsePIDFromMultiaddr(addr ma.Multiaddr) string {
pidstr, err := addr.ValueForProtocol(ma.P_IPFS)
if err != nil {

View File

@ -55,6 +55,7 @@ func testingConsensus(t *testing.T, port int) *Consensus {
cfg := &Config{}
cfg.Default()
cfg.DataFolder = fmt.Sprintf("raftFolderFromTests%d", port)
cfg.hostShutdown = true
cc, err := NewConsensus([]peer.ID{h.ID()}, h, cfg, st)
if err != nil {
@ -122,7 +123,7 @@ func TestConsensusUnpin(t *testing.T) {
}
}
func TestConsensusLogAddPeer(t *testing.T) {
func TestConsensusAddPeer(t *testing.T) {
cc := testingConsensus(t, p2pPort)
cc2 := testingConsensus(t, p2pPortAlt)
t.Log(cc.host.ID().Pretty())
@ -133,10 +134,9 @@ func TestConsensusLogAddPeer(t *testing.T) {
defer cc2.Shutdown()
addr, _ := ma.NewMultiaddr(fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", p2pPortAlt))
haddr, _ := ma.NewMultiaddr(fmt.Sprintf("/ipfs/%s", cc2.host.ID().Pretty()))
cc.host.Peerstore().AddAddr(cc2.host.ID(), addr, peerstore.TempAddrTTL)
err := cc.LogAddPeer(addr.Encapsulate(haddr))
err := cc.AddPeer(cc2.host.ID())
if err != nil {
t.Error("the operation did not make it to the log:", err)
}
@ -157,7 +157,7 @@ func TestConsensusLogAddPeer(t *testing.T) {
}
}
func TestConsensusLogRmPeer(t *testing.T) {
func TestConsensusRmPeer(t *testing.T) {
cc := testingConsensus(t, p2pPort)
cc2 := testingConsensus(t, p2pPortAlt)
defer cleanRaft(p2pPort)
@ -166,10 +166,9 @@ func TestConsensusLogRmPeer(t *testing.T) {
defer cc2.Shutdown()
addr, _ := ma.NewMultiaddr(fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", p2pPortAlt))
haddr, _ := ma.NewMultiaddr(fmt.Sprintf("/ipfs/%s", cc2.host.ID().Pretty()))
cc.host.Peerstore().AddAddr(cc2.host.ID(), addr, peerstore.TempAddrTTL)
err := cc.LogAddPeer(addr.Encapsulate(haddr))
err := cc.AddPeer(cc2.host.ID())
if err != nil {
t.Error("could not add peer:", err)
}
@ -190,14 +189,14 @@ func TestConsensusLogRmPeer(t *testing.T) {
time.Sleep(2 * time.Second)
// Remove a nonexistent peer
err = cc.LogRmPeer(test.TestPeerID1)
err = cc.RmPeer(test.TestPeerID1)
if err != nil {
t.Error("the operation did not make it to the log:", err)
}
// Remove real peer. At least the leader can succeed
err = cc2.LogRmPeer(cc.host.ID())
err2 := cc.LogRmPeer(cc2.host.ID())
err = cc2.RmPeer(cc.host.ID())
err2 := cc.RmPeer(cc2.host.ID())
if err != nil && err2 != nil {
t.Error("could not remove peer:", err, err2)
}

View File

@ -1,23 +1,18 @@
package raft
import (
"context"
"errors"
"time"
"github.com/ipfs/ipfs-cluster/api"
"github.com/ipfs/ipfs-cluster/state"
consensus "github.com/libp2p/go-libp2p-consensus"
peer "github.com/libp2p/go-libp2p-peer"
)
// Type of consensus operation
const (
LogOpPin = iota + 1
LogOpUnpin
LogOpAddPeer
LogOpRmPeer
)
// LogOpType expresses the type of a consensus Operation
@ -28,7 +23,6 @@ type LogOpType int
// Consensus component.
type LogOp struct {
Cid api.PinSerial
Peer api.MultiaddrSerial
Type LogOpType
consensus *Consensus
}
@ -67,46 +61,6 @@ func (op *LogOp) ApplyTo(cstate consensus.State) (consensus.State, error) {
op.Cid,
&struct{}{},
nil)
case LogOpAddPeer:
// pidstr := parsePIDFromMultiaddr(op.Peer.ToMultiaddr())
op.consensus.rpcClient.Call("",
"Cluster",
"PeerManagerAddPeer",
op.Peer,
&struct{}{})
case LogOpRmPeer:
pidstr := parsePIDFromMultiaddr(op.Peer.ToMultiaddr())
pid, err := peer.IDB58Decode(pidstr)
if err != nil {
panic("could not decode a PID we ourselves encoded")
}
// Asynchronously wait for peer to be removed from raft
// and remove it from the peerset. Otherwise do nothing
go func() {
ctx, cancel := context.WithTimeout(op.consensus.ctx,
10*time.Second)
defer cancel()
// Do not wait if we are being removed
// as it may just hang waiting for a future.
if pid != op.consensus.host.ID() {
err = op.consensus.raft.WaitForPeer(ctx, pidstr, true)
if err != nil {
if err.Error() != errWaitingForSelf.Error() {
logger.Warningf("Peer has not been removed from raft: %s: %s", pidstr, err)
}
return
}
}
op.consensus.rpcClient.Call("",
"Cluster",
"PeerManagerRmPeer",
pid,
&struct{}{})
}()
default:
logger.Error("unknown LogOp type. Ignoring")

View File

@ -12,7 +12,7 @@ func init() {
//SetFacilityLogLevel("consensus", l)
//SetFacilityLogLevel("monitor", "INFO")
//SetFacilityLogLevel("raft", l)
//SetFacilityLogLevel("p2p-gorpc", l)
SetFacilityLogLevel("p2p-gorpc", l)
//SetFacilityLogLevel("swarm2", l)
//SetFacilityLogLevel("libp2p-raft", l)
}

View File

@ -16,7 +16,6 @@ import (
cid "github.com/ipfs/go-cid"
peer "github.com/libp2p/go-libp2p-peer"
protocol "github.com/libp2p/go-libp2p-protocol"
ma "github.com/multiformats/go-multiaddr"
)
// RPCProtocol is used to send libp2p messages between cluster peers
@ -45,8 +44,8 @@ type Consensus interface {
LogPin(c api.Pin) error
// Logs an unpin operation
LogUnpin(c api.Pin) error
LogAddPeer(addr ma.Multiaddr) error
LogRmPeer(p peer.ID) error
AddPeer(p peer.ID) error
RmPeer(p peer.ID) error
State() (state.State, error)
// Provide a node which is responsible to perform
// specific tasks which must only run in 1 cluster peer
@ -56,6 +55,8 @@ type Consensus interface {
WaitForSync() error
// Clean removes all consensus data
Clean() error
// Peers returns the peerset participating in the Consensus
Peers() ([]peer.ID, error)
}
// API is a component which offers an API for Cluster. This is

View File

@ -31,7 +31,7 @@ import (
//TestClusters*
var (
// number of clusters to create
nClusters = 6
nClusters = 10
// number of pins to pin/unpin/check
nPins = 500

View File

@ -1,173 +1,81 @@
package ipfscluster
import (
"sync"
"time"
"fmt"
host "github.com/libp2p/go-libp2p-host"
peer "github.com/libp2p/go-libp2p-peer"
peerstore "github.com/libp2p/go-libp2p-peerstore"
ma "github.com/multiformats/go-multiaddr"
)
// peerManager is our own local peerstore
// peerManager provides wrappers around the host peerstore for handling cluster peer addresses
type peerManager struct {
cluster *Cluster
ps peerstore.Peerstore
self peer.ID
peermap map[peer.ID]ma.Multiaddr
m sync.RWMutex
host host.Host
}
func newPeerManager(c *Cluster) *peerManager {
pm := &peerManager{
cluster: c,
ps: c.host.Peerstore(),
self: c.host.ID(),
}
pm.resetPeers()
return pm
func newPeerManager(h host.Host) *peerManager {
return &peerManager{h}
}
func (pm *peerManager) addPeer(addr ma.Multiaddr, save bool) error {
logger.Debugf("adding peer %s", addr)
func (pm *peerManager) addPeer(addr ma.Multiaddr) error {
logger.Debugf("adding peer address %s", addr)
pid, decapAddr, err := multiaddrSplit(addr)
if err != nil {
return err
}
pm.ps.AddAddr(pid, decapAddr, peerstore.PermanentAddrTTL)
// Only log these when we are not starting cluster (rpcClient == nil)
// They pollute the start up logs redundantly.
if !pm.isPeer(pid) && pm.cluster.rpcClient != nil {
logger.Infof("new peer: %s", addr.String())
}
pm.m.Lock()
pm.peermap[pid] = addr
pm.m.Unlock()
if save {
pm.savePeers()
}
logger.Debugf("peers after adding %s", pm.peersAddrs())
pm.host.Peerstore().AddAddr(pid, decapAddr, peerstore.PermanentAddrTTL)
return nil
}
func (pm *peerManager) rmPeer(pid peer.ID, save bool) error {
logger.Debugf("removing peer %s", pid.Pretty())
// Seeing our own departure during bootstrap. Ignore that.
if pid == pm.self && !pm.cluster.readyB {
return nil
}
// remove ourselves, unless:
// - we are not ready yet (means we are boostrapping)
// - we have been removed (means Shutdown() with LeaveOnShutdown flag)
if pid == pm.self && pm.cluster.readyB && !pm.cluster.removed {
logger.Info("this peer has been removed and will shutdown")
pm.cluster.removed = true
// we are removing ourselves. Therefore we need to:
// - convert cluster peers to bootstrapping peers
// - shut ourselves down if we are not in the process
//
// Note that, if we are here, we have already been
// removed from the raft.
// save peers as boostrappers
pm.cluster.config.Bootstrap = pm.peersAddrs()
pm.resetPeers()
pm.savePeers()
time.Sleep(1 * time.Second)
// should block and do nothing if already doing it
pm.cluster.Shutdown()
return nil
}
// Removing a different peer
if pm.isPeer(pid) {
logger.Infof("removing Cluster peer %s", pid.Pretty())
}
pm.m.Lock()
delete(pm.peermap, pid)
pm.m.Unlock()
if save {
pm.savePeers()
}
func (pm *peerManager) rmPeer(pid peer.ID) error {
logger.Debugf("forgetting peer %s", pid.Pretty())
pm.host.Peerstore().ClearAddrs(pid)
return nil
}
func (pm *peerManager) savePeers() {
peers := pm.peersAddrs()
logger.Debugf("saving peers: %s", peers)
pm.cluster.config.Peers = peers
pm.cluster.config.NotifySave()
}
// func (pm *peerManager) savePeers() {
// peers := pm.peersAddrs()
// logger.Debugf("saving peers: %s", peers)
// pm.cluster.config.Peers = peers
// pm.cluster.config.NotifySave()
// }
func (pm *peerManager) resetPeers() {
pm.m.Lock()
pm.peermap = make(map[peer.ID]ma.Multiaddr)
pm.peermap[pm.self] = pm.cluster.config.ListenAddr
pm.m.Unlock()
}
// func (pm *peerManager) isPeer(p peer.ID) bool {
// if p == pm.cluster.id {
// return true
// }
func (pm *peerManager) isPeer(p peer.ID) bool {
if p == pm.self {
return true
}
pm.m.RLock()
_, ok := pm.peermap[p]
pm.m.RUnlock()
return ok
}
// peers including ourselves
func (pm *peerManager) peers() []peer.ID {
pm.m.RLock()
defer pm.m.RUnlock()
var peers []peer.ID
for k := range pm.peermap {
peers = append(peers, k)
}
return peers
}
// peers := pm.cluster.consensus.Peers()
// for _, pid := range peers {
// if p == pid {
// return true
// }
// }
// return false
// }
// cluster peer addresses (NOT including ourselves)
func (pm *peerManager) peersAddrs() []ma.Multiaddr {
pm.m.RLock()
defer pm.m.RUnlock()
func (pm *peerManager) addresses(peers []peer.ID) []ma.Multiaddr {
addrs := []ma.Multiaddr{}
for k, addr := range pm.peermap {
if k != pm.self {
addrs = append(addrs, addr)
if peers == nil {
return addrs
}
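// For each peer other than ourselves, encapsulate /ipfs/<peerID> onto every
// address known to the peerstore so the result can be used directly as
// cluster/bootstrap multiaddrs.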
for _, p := range peers {
if p == pm.host.ID() {
continue
}
peerAddr, _ := ma.NewMultiaddr(fmt.Sprintf("/ipfs/%s", peer.IDB58Encode(p)))
for _, a := range pm.host.Peerstore().Addrs(p) {
addrs = append(addrs, a.Encapsulate(peerAddr))
}
}
return addrs
}
// func (pm *peerManager) addFromConfig(cfg *Config) error {
// return pm.setFromMultiaddrs(cfg.ClusterPeers)
// }
// this resets peers!
func (pm *peerManager) setFromMultiaddrs(addrs []ma.Multiaddr, save bool) error {
pm.resetPeers()
for _, m := range addrs {
err := pm.addPeer(m, false)
if err != nil {
logger.Error(err)
return err
}
}
if save {
pm.savePeers()
func (pm *peerManager) importAddresses(addrs []ma.Multiaddr) error {
for _, a := range addrs {
pm.addPeer(a)
}
return nil
}

View File

@ -53,7 +53,7 @@ func TestClustersPeerAdd(t *testing.T) {
if len(id.ClusterPeers) != i {
// ClusterPeers is originally empty and contains nodes as we add them
t.Log(id.ClusterPeers)
t.Log(i, id.ClusterPeers)
t.Fatal("cluster peers should be up to date with the cluster")
}
}
@ -455,10 +455,20 @@ func TestClustersPeerRejoin(t *testing.T) {
}
}
clusters[0].Shutdown()
clusters[0].config.LeaveOnShutdown = true
err = clusters[0].Shutdown()
if err != nil {
t.Fatal(err)
}
mocks[0].Close()
//delay()
delay()
// Forget peer so we can re-add one in same address/port
f := func(t *testing.T, c *Cluster) {
c.peerManager.rmPeer(clusters[0].id)
}
runF(t, clusters[1:], f)
// Pin something on the rest
pin2, _ := cid.Decode(test.TestCid2)

View File

@ -272,15 +272,14 @@ func (rpcapi *RPCAPI) ConsensusLogUnpin(in api.PinSerial, out *struct{}) error {
return rpcapi.c.consensus.LogUnpin(c)
}
// ConsensusLogAddPeer runs Consensus.LogAddPeer().
func (rpcapi *RPCAPI) ConsensusLogAddPeer(in api.MultiaddrSerial, out *struct{}) error {
addr := in.ToMultiaddr()
return rpcapi.c.consensus.LogAddPeer(addr)
// ConsensusAddPeer runs Consensus.AddPeer().
func (rpcapi *RPCAPI) ConsensusAddPeer(in peer.ID, out *struct{}) error {
return rpcapi.c.consensus.AddPeer(in)
}
// ConsensusLogRmPeer runs Consensus.LogRmPeer().
func (rpcapi *RPCAPI) ConsensusLogRmPeer(in peer.ID, out *struct{}) error {
return rpcapi.c.consensus.LogRmPeer(in)
// ConsensusRmPeer runs Consensus.RmPeer().
func (rpcapi *RPCAPI) ConsensusRmPeer(in peer.ID, out *struct{}) error {
return rpcapi.c.consensus.RmPeer(in)
}
/*
@ -290,26 +289,27 @@ func (rpcapi *RPCAPI) ConsensusLogRmPeer(in peer.ID, out *struct{}) error {
// PeerManagerAddPeer runs peerManager.addPeer().
func (rpcapi *RPCAPI) PeerManagerAddPeer(in api.MultiaddrSerial, out *struct{}) error {
addr := in.ToMultiaddr()
err := rpcapi.c.peerManager.addPeer(addr, true)
err := rpcapi.c.peerManager.addPeer(addr)
return err
}
// PeerManagerSetFromMultiaddrs runs peerManager.setFromMultiaddrs().
func (rpcapi *RPCAPI) PeerManagerSetFromMultiaddrs(in api.MultiaddrsSerial, out *struct{}) error {
// PeerManagerImportAddresses runs peerManager.importAddresses().
func (rpcapi *RPCAPI) PeerManagerImportAddresses(in api.MultiaddrsSerial, out *struct{}) error {
addrs := in.ToMultiaddrs()
err := rpcapi.c.peerManager.setFromMultiaddrs(addrs, true)
err := rpcapi.c.peerManager.importAddresses(addrs)
return err
}
// PeerManagerRmPeer runs peerManager.rmPeer().
func (rpcapi *RPCAPI) PeerManagerRmPeer(in peer.ID, out *struct{}) error {
return rpcapi.c.peerManager.rmPeer(in, true)
return rpcapi.c.peerManager.rmPeer(in)
}
// PeerManagerPeers runs peerManager.peers().
// PeerManagerPeers runs cluster.consensus.Peers().
func (rpcapi *RPCAPI) PeerManagerPeers(in struct{}, out *[]peer.ID) error {
*out = rpcapi.c.peerManager.peers()
return nil
peers, err := rpcapi.c.consensus.Peers()
*out = peers
return err
}
/*

View File

@ -224,10 +224,6 @@ func (mock *mockService) PeerManagerAddPeer(in api.MultiaddrSerial, out *struct{
return nil
}
func (mock *mockService) PeerManagerRmPeer(in peer.ID, out *struct{}) error {
return nil
}
/* IPFSConnector methods */
func (mock *mockService) IPFSPin(in api.PinSerial, out *struct{}) error {
@ -282,10 +278,10 @@ func (mock *mockService) IPFSFreeSpace(in struct{}, out *uint64) error {
return nil
}
func (mock *mockService) ConsensusLogAddPeer(in api.MultiaddrSerial, out *struct{}) error {
func (mock *mockService) ConsensusAddPeer(in peer.ID, out *struct{}) error {
return errors.New("mock rpc cannot redirect")
}
func (mock *mockService) ConsensusLogRmPeer(in peer.ID, out *struct{}) error {
func (mock *mockService) ConsensusRmPeer(in peer.ID, out *struct{}) error {
return errors.New("mock rpc cannot redirect")
}