Monitor: do not clean up metrics immediately after an alert

This commit is contained in:
Hector Sanjuan 2022-01-28 17:03:08 +01:00
parent d4591b8442
commit 000dccc1cc
3 changed files with 28 additions and 32 deletions

View File

@ -84,6 +84,22 @@ func (mc *Checker) CheckAll() error {
return nil
}
// ResetAlerts clears up how many time a peer alerted for a given metric.
// Thus, if it was over the threshold, it will start alerting again.
func (mc *Checker) ResetAlerts(pid peer.ID, metricName string) {
mc.failedPeersMu.Lock()
defer mc.failedPeersMu.Unlock()
failedMetrics, ok := mc.failedPeers[pid]
if !ok {
return
}
delete(failedMetrics, metricName)
if len(mc.failedPeers[pid]) == 0 {
delete(mc.failedPeers, pid)
}
}
func (mc *Checker) alert(pid peer.ID, metricName string) error {
mc.failedPeersMu.Lock()
defer mc.failedPeersMu.Unlock()
@ -100,19 +116,19 @@ func (mc *Checker) alert(pid peer.ID, metricName string) error {
}
}
// If above threshold, remove all metrics for that peer
// and clean up failedPeers when no failed metrics are left.
if failedMetrics[metricName] >= MaxAlertThreshold {
mc.metrics.RemovePeerMetrics(pid, metricName)
delete(failedMetrics, metricName)
if len(mc.failedPeers[pid]) == 0 {
delete(mc.failedPeers, pid)
failedMetrics[metricName]++
// If above threshold, do not send alert
if failedMetrics[metricName] > MaxAlertThreshold {
// Cleanup old metrics eventually
if failedMetrics[metricName] >= 300 {
delete(failedMetrics, metricName)
if len(mc.failedPeers[pid]) == 0 {
delete(mc.failedPeers, pid)
}
}
return nil
}
failedMetrics[metricName]++
alrt := &api.Alert{
Metric: *lastMetric,
TriggeredAt: time.Now(),

View File

@ -206,6 +206,9 @@ func (mon *Monitor) LogMetric(ctx context.Context, m *api.Metric) error {
mon.metrics.Add(m)
debug("logged", m)
if !m.Discard() { // We received a valid metric so avoid alerting.
mon.checker.ResetAlerts(m.Peer, m.Name)
}
return nil
}

View File

@ -311,26 +311,3 @@ func TestPeerMonitorAlerts(t *testing.T) {
}
}
}
func TestMetricsGetsDeleted(t *testing.T) {
ctx := context.Background()
pm, _, shutdown := testPeerMonitor(t)
defer shutdown()
mf := newMetricFactory()
pm.LogMetric(ctx, mf.newMetric("test", test.PeerID1))
metrics := pm.metrics.PeerMetrics(test.PeerID1)
if len(metrics) == 0 {
t.Error("expected metrics")
}
// TODO: expiry time + checkInterval is 7 sec
// Why does it need 9 or more?
time.Sleep(9 * time.Second)
metrics = pm.metrics.PeerMetrics(test.PeerID1)
if len(metrics) > 0 {
t.Error("expected no metrics")
}
}