Do alert for all metric types

2019-06-23 09:05:32 +01:00 · 2019-06-23 09:05:32 +01:00 · 563a0da9ae
commit 563a0da9ae
parent 27295c10ac
3 changed files with 23 additions and 18 deletions
--- a/monitor/metrics/checker.go
+++ b/monitor/metrics/checker.go
@ -58,12 +58,14 @@ func NewChecker(ctx context.Context, metrics *Store, threshold float64) *Checker
 // CheckPeers will trigger alerts based on the latest metrics from the given peerset
 // when they have expired and no alert has been sent before.
 func (mc *Checker) CheckPeers(peers []peer.ID) error {
-	for _, peer := range peers {
+	for _, name := range mc.metrics.MetricNames() {
-		for _, metric := range mc.metrics.PeerMetricAll("ping", peer) {
+		for _, peer := range peers {
-			if mc.FailedMetric(metric.Name, peer) {
+			for _, metric := range mc.metrics.PeerMetricAll(name, peer) {
-				err := mc.alert(peer, metric.Name)
+				if mc.FailedMetric(metric.Name, peer) {
-				if err != nil {
+					err := mc.alert(peer, metric.Name)
-					return err
+					if err != nil {
 						return err
 					}
 				}
 			}
 		}
@ -75,7 +77,7 @@ func (mc *Checker) CheckPeers(peers []peer.ID) error {
 // and no alert has been sent before.
 func (mc *Checker) CheckAll() error {
 	for _, metric := range mc.metrics.AllMetrics() {
-		if metric.Name == "ping" && mc.Failed(metric.Peer) {
+		if mc.FailedMetric(metric.Name, metric.Peer) {
 			err := mc.alert(metric.Peer, metric.Name)
 			if err != nil {
 				return err
@ -168,16 +170,7 @@ func (mc *Checker) Watch(ctx context.Context, peersF func(context.Context) ([]pe
 	}
 }
-// Failed returns true if a peer has potentially failed.
+// FailedMetric returns true if a peer is marked as failed for a particular metric.
 // Peers that are not present in the metrics store will return
 // as failed.
 func (mc *Checker) Failed(pid peer.ID) bool {
 	// Failed is the "ping"-specific form of FailedMetric.
 	return mc.FailedMetric("ping", pid)
 }
 // FailedMetric is the same as Failed but can use any metric type,
 // not just ping.
 func (mc *Checker) FailedMetric(metric string, pid peer.ID) bool {
 	_, _, _, result := mc.failed(metric, pid)
 	return result
--- a/monitor/metrics/checker_test.go
+++ b/monitor/metrics/checker_test.go
@ -145,7 +145,7 @@ func TestChecker_Failed(t *testing.T) {
 		}
 		for i := 0; i < 10; i++ {
 			metrics.Add(makePeerMetric(test.PeerID1, "1", 3*time.Millisecond))
-			got := checker.Failed(test.PeerID1)
+			got := checker.FailedMetric("ping", test.PeerID1)
 			// the magic number 17 represents the point at which
 			// the time between metrics addition has gotten
 			// so large that the probability that the service
--- a/monitor/metrics/store.go
+++ b/monitor/metrics/store.go
@ -189,3 +189,15 @@ func (mtrs *Store) Distribution(name string, pid peer.ID) []float64 {
 	return window.Distribution()
 }
 // MetricNames returns the name of every metric currently registered in the
 // store. The result is built from the keys of the byName map while holding
 // the read lock, so the order of the names is unspecified.
 func (mtrs *Store) MetricNames() []string {
 	mtrs.mux.RLock()
 	defer mtrs.mux.RUnlock()
 	// The final size is known up front, so fill the slice by index.
 	names := make([]string, len(mtrs.byName))
 	idx := 0
 	for name := range mtrs.byName {
 		names[idx] = name
 		idx++
 	}
 	return names
 }