ipfs-cluster/monitor/metrics/checker.go

package metrics

import (
	"context"
	"errors"
	"sync"
	"time"

	"github.com/ipfs/ipfs-cluster/api"
	"github.com/ipfs/ipfs-cluster/observations"

	peer "github.com/libp2p/go-libp2p-core/peer"

	"go.opencensus.io/stats"
	"go.opencensus.io/tag"
)

// AlertChannelCap specifies how much buffer the alerts channel has.
var AlertChannelCap = 256

// MaxAlertThreshold specifies how many alerts will be sent for a peer's
// metric before it is removed from the list of monitored peers.
var MaxAlertThreshold = 1
// ErrAlertChannelFull is returned if the alert channel is full.
var ErrAlertChannelFull = errors.New("alert channel is full")

// accrualMetricsNum represents the number of metrics required for
// accrual detection to function appropriately; below this number, the
// metric TTL is used to determine whether a peer may have failed.
var accrualMetricsNum = 6

// Checker provides utilities to find expired metrics
// for a given peerset and to send alerts when peers are
// considered to have failed.
type Checker struct {
	ctx       context.Context
	alertCh   chan *api.Alert
	metrics   *Store
	threshold float64

	failedPeersMu sync.Mutex
	failedPeers   map[peer.ID]map[string]int
}

// NewChecker creates a Checker using the given
// metrics Store. The threshold value indicates when a
// monitored component should be considered to have failed.
// The greater the threshold value, the more leniency is granted.
//
// A value between 2.0 and 4.0 is suggested for the threshold.
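//
// Usage sketch (illustrative only; ctx, store and peersFn are
// placeholders supplied by the caller, not part of this package):
//
//	checker := NewChecker(ctx, store, 3.0) // threshold in the suggested 2.0-4.0 range
//	go checker.Watch(ctx, peersFn, 15*time.Second)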
func NewChecker(ctx context.Context, metrics *Store, threshold float64) *Checker {
	return &Checker{
		ctx:         ctx,
		alertCh:     make(chan *api.Alert, AlertChannelCap),
		metrics:     metrics,
		threshold:   threshold,
		failedPeers: make(map[peer.ID]map[string]int),
	}
}
// CheckPeers will trigger alerts based on the latest metrics from the given peerset
// when they have expired and no alert has been sent before.
func (mc *Checker) CheckPeers(peers []peer.ID) error {
	for _, name := range mc.metrics.MetricNames() {
		for _, peer := range peers {
			for _, metric := range mc.metrics.PeerMetricAll(name, peer) {
				if mc.FailedMetric(metric.Name, peer) {
					err := mc.alert(peer, metric.Name)
					if err != nil {
						return err
					}
				}
			}
		}
	}
	return nil
}
// CheckAll will trigger alerts for all latest metrics when they have expired
// and no alert has been sent before.
func (mc *Checker) CheckAll() error {
	for _, metric := range mc.metrics.AllMetrics() {
		if mc.FailedMetric(metric.Name, metric.Peer) {
			err := mc.alert(metric.Peer, metric.Name)
			if err != nil {
				return err
			}
		}
	}
	return nil
}
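
// alert bumps the failure count for the given peer and metric and pushes
// an api.Alert on the alerts channel. Once the count reaches
// MaxAlertThreshold, the peer's metrics for that name are removed instead
// and no further alert is sent. ErrAlertChannelFull is returned when the
// channel buffer is full.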
func (mc *Checker) alert(pid peer.ID, metricName string) error {
	mc.failedPeersMu.Lock()
	defer mc.failedPeersMu.Unlock()

	if _, ok := mc.failedPeers[pid]; !ok {
		mc.failedPeers[pid] = make(map[string]int)
	}
	failedMetrics := mc.failedPeers[pid]

	lastMetric := mc.metrics.PeerLatest(metricName, pid)
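	// When no metric is stored for this peer, build an empty placeholder
	// so that the alert still carries the metric name and peer ID.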
	if lastMetric == nil {
		lastMetric = &api.Metric{
			Name: metricName,
			Peer: pid,
		}
	}

	// If above threshold, remove all metrics for that peer
	// and clean up failedPeers when no failed metrics are left.
	if failedMetrics[metricName] >= MaxAlertThreshold {
		mc.metrics.RemovePeerMetrics(pid, metricName)
		delete(failedMetrics, metricName)
		if len(mc.failedPeers[pid]) == 0 {
			delete(mc.failedPeers, pid)
		}
		return nil
	}
	failedMetrics[metricName]++

	alrt := &api.Alert{
		Metric:      *lastMetric,
		TriggeredAt: time.Now(),
	}
	select {
	case mc.alertCh <- alrt:
		stats.RecordWithTags(
			mc.ctx,
			[]tag.Mutator{tag.Upsert(observations.RemotePeerKey, pid.Pretty())},
			observations.Alerts.M(1),
		)
	default:
		return ErrAlertChannelFull
	}
	return nil
}

// Alerts returns a channel which gets notified by CheckPeers and CheckAll.
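//
// Consuming alerts (illustrative sketch; handleAlert is a hypothetical
// callback, not part of this package):
//
//	go func() {
//		for alrt := range checker.Alerts() {
//			handleAlert(alrt) // alrt.Metric identifies the failed peer and metric
//		}
//	}()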
func (mc *Checker) Alerts() <-chan *api.Alert {
	return mc.alertCh
}
// Watch will trigger regular CheckPeers on the given interval. It will call
// peersF to obtain a peerset. It can be stopped by cancelling the context.
// Usually you want to launch this in a goroutine.
func (mc *Checker) Watch(ctx context.Context, peersF func(context.Context) ([]peer.ID, error), interval time.Duration) {
	ticker := time.NewTicker(interval)
	for {
		select {
		case <-ticker.C:
			if peersF != nil {
				peers, err := peersF(ctx)
				if err != nil {
					continue
				}
				mc.CheckPeers(peers)
			} else {
				mc.CheckAll()
			}
		case <-ctx.Done():
			ticker.Stop()
			return
		}
	}
}

// FailedMetric reports whether a peer is marked as failed for a particular metric.
func (mc *Checker) FailedMetric(metric string, pid peer.ID) bool {
	_, _, _, result := mc.failed(metric, pid)
	return result
}

// failed returns all the values involved in making the decision
// as to whether a peer has failed or not: the time elapsed since the
// latest metric, the stored distribution, the computed phi value and
// the final verdict.
func (mc *Checker) failed(metric string, pid peer.ID) (float64, []float64, float64, bool) {
	latest := mc.metrics.PeerLatest(metric, pid)
	if latest == nil {
		return 0.0, nil, 0.0, true
	}

	// A peer is never failed if the latest metric from it has
	// not expired or we do not have enough metrics
	// for accrual detection.
	if !latest.Expired() {
		return 0.0, nil, 0.0, false
	}

	// The latest metric has expired.
	pmtrs := mc.metrics.PeerMetricAll(metric, pid)

	// Not enough values for accrual and metric expired. Peer failed.
	if len(pmtrs) < accrualMetricsNum {
		return 0.0, nil, 0.0, true
	}
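
	// Accrual detection: v is the time elapsed since the latest metric
	// was received, dv the stored distribution for this peer and metric,
	// and phi the resulting suspicion level. The peer is considered
	// failed once phi reaches the configured threshold.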
	v := time.Now().UnixNano() - latest.ReceivedAt
	dv := mc.metrics.Distribution(metric, pid)
	phiv := phi(float64(v), dv)
	return float64(v), dv, phiv, phiv >= mc.threshold
}