Monitor: do not clean up metrics immediately after an alert
This commit is contained in:
parent
d4591b8442
commit
000dccc1cc
|
@ -84,6 +84,22 @@ func (mc *Checker) CheckAll() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// ResetAlerts clears how many times a peer has alerted for a given metric.
|
||||
// Thus, if the count was over the threshold, the peer will start alerting
// again for that metric.
|
||||
func (mc *Checker) ResetAlerts(pid peer.ID, metricName string) {
|
||||
// failedPeers is read and modified below, so hold its mutex for the whole call.
mc.failedPeersMu.Lock()
|
||||
defer mc.failedPeersMu.Unlock()
|
||||
|
||||
failedMetrics, ok := mc.failedPeers[pid]
|
||||
// No entry for this peer: there is nothing to reset.
if !ok {
|
||||
return
|
||||
}
|
||||
// Drop the alert count for this metric only; other metrics of the peer are kept.
delete(failedMetrics, metricName)
|
||||
// Remove the peer's entry altogether once it has no failed metrics left.
if len(mc.failedPeers[pid]) == 0 {
|
||||
delete(mc.failedPeers, pid)
|
||||
}
|
||||
}
|
||||
|
||||
func (mc *Checker) alert(pid peer.ID, metricName string) error {
|
||||
mc.failedPeersMu.Lock()
|
||||
defer mc.failedPeersMu.Unlock()
|
||||
|
@ -100,19 +116,19 @@ func (mc *Checker) alert(pid peer.ID, metricName string) error {
|
|||
}
|
||||
}
|
||||
|
||||
// If above threshold, remove all metrics for that peer
|
||||
// and clean up failedPeers when no failed metrics are left.
|
||||
if failedMetrics[metricName] >= MaxAlertThreshold {
|
||||
mc.metrics.RemovePeerMetrics(pid, metricName)
|
||||
delete(failedMetrics, metricName)
|
||||
if len(mc.failedPeers[pid]) == 0 {
|
||||
delete(mc.failedPeers, pid)
|
||||
failedMetrics[metricName]++
|
||||
// If above threshold, do not send alert
|
||||
if failedMetrics[metricName] > MaxAlertThreshold {
|
||||
// Clean up old metrics eventually
|
||||
if failedMetrics[metricName] >= 300 {
|
||||
delete(failedMetrics, metricName)
|
||||
if len(mc.failedPeers[pid]) == 0 {
|
||||
delete(mc.failedPeers, pid)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
failedMetrics[metricName]++
|
||||
|
||||
alrt := &api.Alert{
|
||||
Metric: *lastMetric,
|
||||
TriggeredAt: time.Now(),
|
||||
|
|
|
@ -206,6 +206,9 @@ func (mon *Monitor) LogMetric(ctx context.Context, m *api.Metric) error {
|
|||
|
||||
mon.metrics.Add(m)
|
||||
debug("logged", m)
|
||||
if !m.Discard() { // We received a valid metric so avoid alerting.
|
||||
mon.checker.ResetAlerts(m.Peer, m.Name)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -311,26 +311,3 @@ func TestPeerMonitorAlerts(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestMetricsGetsDeleted logs a single metric for test.PeerID1, checks that
// the monitor reports metrics for that peer, waits 9 seconds and then checks
// that no metrics remain for that peer.
func TestMetricsGetsDeleted(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
pm, _, shutdown := testPeerMonitor(t)
|
||||
defer shutdown()
|
||||
mf := newMetricFactory()
|
||||
|
||||
// The error returned by LogMetric is not checked here.
pm.LogMetric(ctx, mf.newMetric("test", test.PeerID1))
|
||||
metrics := pm.metrics.PeerMetrics(test.PeerID1)
|
||||
// The metric that was just logged must be visible.
if len(metrics) == 0 {
|
||||
t.Error("expected metrics")
|
||||
}
|
||||
|
||||
// TODO: expiry time + checkInterval is 7 sec
|
||||
// Why does it need 9 or more?
|
||||
time.Sleep(9 * time.Second)
|
||||
|
||||
// After the wait, the peer is expected to have no metrics left.
metrics = pm.metrics.PeerMetrics(test.PeerID1)
|
||||
if len(metrics) > 0 {
|
||||
t.Error("expected no metrics")
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user