diff --git a/consensus/raft/logging.go b/consensus/raft/logging.go index 3273d21d..4178817b 100644 --- a/consensus/raft/logging.go +++ b/consensus/raft/logging.go @@ -65,6 +65,7 @@ func (log *hcLogToLogger) Info(msg string, args ...interface{}) { } func (log *hcLogToLogger) Warn(msg string, args ...interface{}) { + fmt.Println(msg) raftLogger.Warning(log.format(msg, args)) } diff --git a/monitor/metrics/checker.go b/monitor/metrics/checker.go index 3248aa96..e1f17fb6 100644 --- a/monitor/metrics/checker.go +++ b/monitor/metrics/checker.go @@ -35,7 +35,7 @@ type Checker struct { alertThreshold int failedPeersMu sync.Mutex - failedPeers map[peer.ID]int + failedPeers map[peer.ID]map[string]int } // NewChecker creates a Checker using the given @@ -50,7 +50,7 @@ func NewChecker(ctx context.Context, metrics *Store, threshold float64) *Checker alertCh: make(chan *api.Alert, AlertChannelCap), metrics: metrics, threshold: threshold, - failedPeers: make(map[peer.ID]int), + failedPeers: make(map[peer.ID]map[string]int), } } @@ -102,12 +102,25 @@ func (mc *Checker) alertIfExpired(metric *api.Metric) error { func (mc *Checker) alert(pid peer.ID, metricName string) error { mc.failedPeersMu.Lock() defer mc.failedPeersMu.Unlock() - if mc.failedPeers[pid] >= MaxAlertThreshold { - mc.metrics.RemovePeer(pid) - delete(mc.failedPeers, pid) + + _, ok := mc.failedPeers[pid] + if !ok { + mc.failedPeers[pid] = make(map[string]int) + } + failedMetrics := mc.failedPeers[pid] + + // If above threshold, remove all metrics for that peer + // and clean up failedPeers when no failed metrics are left. + if failedMetrics[metricName] >= MaxAlertThreshold { + mc.metrics.RemovePeerMetrics(pid, metricName) + delete(failedMetrics, metricName) + if len(mc.failedPeers[pid]) == 0 { + delete(mc.failedPeers, pid) + } return nil } - mc.failedPeers[pid]++ + + failedMetrics[metricName]++ alrt := &api.Alert{ Peer: pid, diff --git a/monitor/metrics/store.go b/monitor/metrics/store.go index d8a7dde3..4b13c713 100644 --- a/monitor/metrics/store.go +++ b/monitor/metrics/store.go @@ -50,12 +50,20 @@ func (mtrs *Store) Add(m *api.Metric) { // RemovePeer removes all metrics related to a peer from the Store. func (mtrs *Store) RemovePeer(pid peer.ID) { mtrs.mux.Lock() - for _, mtrs := range mtrs.byName { - delete(mtrs, pid) + for _, metrics := range mtrs.byName { + delete(metrics, pid) } mtrs.mux.Unlock() } +// RemovePeerMetrics removes all metrics of a given name for a given peer ID. +func (mtrs *Store) RemovePeerMetrics(pid peer.ID, name string) { + mtrs.mux.Lock() + metrics := mtrs.byName[name] + delete(metrics, pid) + mtrs.mux.Unlock() +} + // LatestValid returns all the last known valid metrics of a given type. A metric // is valid if it has not expired. func (mtrs *Store) LatestValid(name string) []*api.Metric {