From 000dccc1ccf9ac657b584cd59e54a3a084c80e90 Mon Sep 17 00:00:00 2001 From: Hector Sanjuan Date: Fri, 28 Jan 2022 17:03:08 +0100 Subject: [PATCH] Monitor: do not clean up metrics immediately after an alert --- monitor/metrics/checker.go | 34 +++++++++++++++++++++-------- monitor/pubsubmon/pubsubmon.go | 3 +++ monitor/pubsubmon/pubsubmon_test.go | 23 ------------------- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/monitor/metrics/checker.go b/monitor/metrics/checker.go index fe914ee3..807b234f 100644 --- a/monitor/metrics/checker.go +++ b/monitor/metrics/checker.go @@ -84,6 +84,22 @@ func (mc *Checker) CheckAll() error { return nil } +// ResetAlerts clears up how many time a peer alerted for a given metric. +// Thus, if it was over the threshold, it will start alerting again. +func (mc *Checker) ResetAlerts(pid peer.ID, metricName string) { + mc.failedPeersMu.Lock() + defer mc.failedPeersMu.Unlock() + + failedMetrics, ok := mc.failedPeers[pid] + if !ok { + return + } + delete(failedMetrics, metricName) + if len(mc.failedPeers[pid]) == 0 { + delete(mc.failedPeers, pid) + } +} + func (mc *Checker) alert(pid peer.ID, metricName string) error { mc.failedPeersMu.Lock() defer mc.failedPeersMu.Unlock() @@ -100,19 +116,19 @@ func (mc *Checker) alert(pid peer.ID, metricName string) error { } } - // If above threshold, remove all metrics for that peer - // and clean up failedPeers when no failed metrics are left. - if failedMetrics[metricName] >= MaxAlertThreshold { - mc.metrics.RemovePeerMetrics(pid, metricName) - delete(failedMetrics, metricName) - if len(mc.failedPeers[pid]) == 0 { - delete(mc.failedPeers, pid) + failedMetrics[metricName]++ + // If above threshold, do not send alert + if failedMetrics[metricName] > MaxAlertThreshold { + // Cleanup old metrics eventually + if failedMetrics[metricName] >= 300 { + delete(failedMetrics, metricName) + if len(mc.failedPeers[pid]) == 0 { + delete(mc.failedPeers, pid) + } } return nil } - failedMetrics[metricName]++ - alrt := &api.Alert{ Metric: *lastMetric, TriggeredAt: time.Now(), diff --git a/monitor/pubsubmon/pubsubmon.go b/monitor/pubsubmon/pubsubmon.go index 61e2bb54..3ddad0a4 100644 --- a/monitor/pubsubmon/pubsubmon.go +++ b/monitor/pubsubmon/pubsubmon.go @@ -206,6 +206,9 @@ func (mon *Monitor) LogMetric(ctx context.Context, m *api.Metric) error { mon.metrics.Add(m) debug("logged", m) + if !m.Discard() { // We received a valid metric so avoid alerting. + mon.checker.ResetAlerts(m.Peer, m.Name) + } return nil } diff --git a/monitor/pubsubmon/pubsubmon_test.go b/monitor/pubsubmon/pubsubmon_test.go index 0dbfcbdb..66a28d57 100644 --- a/monitor/pubsubmon/pubsubmon_test.go +++ b/monitor/pubsubmon/pubsubmon_test.go @@ -311,26 +311,3 @@ func TestPeerMonitorAlerts(t *testing.T) { } } } - -func TestMetricsGetsDeleted(t *testing.T) { - ctx := context.Background() - - pm, _, shutdown := testPeerMonitor(t) - defer shutdown() - mf := newMetricFactory() - - pm.LogMetric(ctx, mf.newMetric("test", test.PeerID1)) - metrics := pm.metrics.PeerMetrics(test.PeerID1) - if len(metrics) == 0 { - t.Error("expected metrics") - } - - // TODO: expiry time + checkInterval is 7 sec - // Why does it need 9 or more? - time.Sleep(9 * time.Second) - - metrics = pm.metrics.PeerMetrics(test.PeerID1) - if len(metrics) > 0 { - t.Error("expected no metrics") - } -}