Fix: alert at most once PER METRIC
Before it would alert at most once per peer, which prevented some metrics from alerting at all.
This commit is contained in:
parent
a0d93fc62c
commit
27368ab077
|
@ -65,6 +65,7 @@ func (log *hcLogToLogger) Info(msg string, args ...interface{}) {
|
|||
}
|
||||
|
||||
func (log *hcLogToLogger) Warn(msg string, args ...interface{}) {
|
||||
fmt.Println(msg)
|
||||
raftLogger.Warning(log.format(msg, args))
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ type Checker struct {
|
|||
alertThreshold int
|
||||
|
||||
failedPeersMu sync.Mutex
|
||||
failedPeers map[peer.ID]int
|
||||
failedPeers map[peer.ID]map[string]int
|
||||
}
|
||||
|
||||
// NewChecker creates a Checker using the given
|
||||
|
@ -50,7 +50,7 @@ func NewChecker(ctx context.Context, metrics *Store, threshold float64) *Checker
|
|||
alertCh: make(chan *api.Alert, AlertChannelCap),
|
||||
metrics: metrics,
|
||||
threshold: threshold,
|
||||
failedPeers: make(map[peer.ID]int),
|
||||
failedPeers: make(map[peer.ID]map[string]int),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -102,12 +102,25 @@ func (mc *Checker) alertIfExpired(metric *api.Metric) error {
|
|||
func (mc *Checker) alert(pid peer.ID, metricName string) error {
|
||||
mc.failedPeersMu.Lock()
|
||||
defer mc.failedPeersMu.Unlock()
|
||||
if mc.failedPeers[pid] >= MaxAlertThreshold {
|
||||
mc.metrics.RemovePeer(pid)
|
||||
delete(mc.failedPeers, pid)
|
||||
|
||||
_, ok := mc.failedPeers[pid]
|
||||
if !ok {
|
||||
mc.failedPeers[pid] = make(map[string]int)
|
||||
}
|
||||
failedMetrics := mc.failedPeers[pid]
|
||||
|
||||
// If above threshold, remove all metrics for that peer
|
||||
// and clean up failedPeers when no failed metrics are left.
|
||||
if failedMetrics[metricName] >= MaxAlertThreshold {
|
||||
mc.metrics.RemovePeerMetrics(pid, metricName)
|
||||
delete(failedMetrics, metricName)
|
||||
if len(mc.failedPeers[pid]) == 0 {
|
||||
delete(mc.failedPeers, pid)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
mc.failedPeers[pid]++
|
||||
|
||||
failedMetrics[metricName]++
|
||||
|
||||
alrt := &api.Alert{
|
||||
Peer: pid,
|
||||
|
|
|
@ -50,12 +50,20 @@ func (mtrs *Store) Add(m *api.Metric) {
|
|||
// RemovePeer removes all metrics related to a peer from the Store.
|
||||
func (mtrs *Store) RemovePeer(pid peer.ID) {
|
||||
mtrs.mux.Lock()
|
||||
for _, mtrs := range mtrs.byName {
|
||||
delete(mtrs, pid)
|
||||
for _, metrics := range mtrs.byName {
|
||||
delete(metrics, pid)
|
||||
}
|
||||
mtrs.mux.Unlock()
|
||||
}
|
||||
|
||||
// RemovePeerMetrics removes all metrics of a given name for a given peer ID.
|
||||
func (mtrs *Store) RemovePeerMetrics(pid peer.ID, name string) {
|
||||
mtrs.mux.Lock()
|
||||
metrics := mtrs.byName[name]
|
||||
delete(metrics, pid)
|
||||
mtrs.mux.Unlock()
|
||||
}
|
||||
|
||||
// LatestValid returns all the last known valid metrics of a given type. A metric
|
||||
// is valid if it has not expired.
|
||||
func (mtrs *Store) LatestValid(name string) []*api.Metric {
|
||||
|
|
Loading…
Reference in New Issue
Block a user