Fix: alert at most once PER METRIC

Previously the checker alerted at most once per peer, which prevented some
metrics from alerting at all.
This commit is contained in:
Hector Sanjuan 2019-06-11 10:53:12 +02:00
parent a0d93fc62c
commit 27368ab077
3 changed files with 30 additions and 8 deletions

View File

@ -65,6 +65,7 @@ func (log *hcLogToLogger) Info(msg string, args ...interface{}) {
}
// Warn forwards a warning-level message to raftLogger, rendering msg together
// with its args through log.format.
//
// The message is emitted only through raftLogger. It is intentionally not
// also written to stdout with fmt.Println: that would print every warning a
// second time, without its args and outside of the logger.
func (log *hcLogToLogger) Warn(msg string, args ...interface{}) {
	raftLogger.Warning(log.format(msg, args))
}

View File

@ -35,7 +35,7 @@ type Checker struct {
alertThreshold int
failedPeersMu sync.Mutex
failedPeers map[peer.ID]int
failedPeers map[peer.ID]map[string]int
}
// NewChecker creates a Checker using the given
@ -50,7 +50,7 @@ func NewChecker(ctx context.Context, metrics *Store, threshold float64) *Checker
alertCh: make(chan *api.Alert, AlertChannelCap),
metrics: metrics,
threshold: threshold,
failedPeers: make(map[peer.ID]int),
failedPeers: make(map[peer.ID]map[string]int),
}
}
@ -102,12 +102,25 @@ func (mc *Checker) alertIfExpired(metric *api.Metric) error {
func (mc *Checker) alert(pid peer.ID, metricName string) error {
mc.failedPeersMu.Lock()
defer mc.failedPeersMu.Unlock()
if mc.failedPeers[pid] >= MaxAlertThreshold {
mc.metrics.RemovePeer(pid)
delete(mc.failedPeers, pid)
_, ok := mc.failedPeers[pid]
if !ok {
mc.failedPeers[pid] = make(map[string]int)
}
failedMetrics := mc.failedPeers[pid]
// If at or above the threshold, remove this peer's metrics of that name
// and clean up failedPeers when no failed metrics are left.
if failedMetrics[metricName] >= MaxAlertThreshold {
mc.metrics.RemovePeerMetrics(pid, metricName)
delete(failedMetrics, metricName)
if len(mc.failedPeers[pid]) == 0 {
delete(mc.failedPeers, pid)
}
return nil
}
mc.failedPeers[pid]++
failedMetrics[metricName]++
alrt := &api.Alert{
Peer: pid,

View File

@ -50,12 +50,20 @@ func (mtrs *Store) Add(m *api.Metric) {
// RemovePeer removes all metrics related to a peer from the Store.
func (mtrs *Store) RemovePeer(pid peer.ID) {
mtrs.mux.Lock()
for _, mtrs := range mtrs.byName {
delete(mtrs, pid)
for _, metrics := range mtrs.byName {
delete(metrics, pid)
}
mtrs.mux.Unlock()
}
// RemovePeerMetrics drops whatever the Store holds under the given metric
// name for the peer identified by pid. Metrics stored under other names for
// that peer are left untouched.
//
// When nothing is stored under name, the lookup yields a nil map and the
// delete is a no-op.
func (mtrs *Store) RemovePeerMetrics(pid peer.ID, name string) {
	mtrs.mux.Lock()
	defer mtrs.mux.Unlock()

	delete(mtrs.byName[name], pid)
}
// LatestValid returns all the last known valid metrics of a given type. A metric
// is valid if it has not expired.
func (mtrs *Store) LatestValid(name string) []*api.Metric {