Add accrual failure detection method
License: MIT
Signed-off-by: Adrian Lanzafame <adrianlanzafame92@gmail.com>
This commit is contained in:
parent
13ed78786c
commit
3d6eb64db6
|
@ -21,14 +21,16 @@ var ErrAlertChannelFull = errors.New("alert channel is full")
|
||||||
type Checker struct {
	// alertCh carries alerts; NewChecker creates it as a buffered
	// channel with capacity AlertChannelCap.
	alertCh chan *api.Alert
	// metrics is the store that Failed reads "ping" metrics and
	// their distribution from.
	metrics *Store
	// threshold is the phi value at or above which Failed reports
	// a peer as failed.
	threshold float64
}
|
||||||
|
|
||||||
// NewChecker creates a Checker using the given
|
// NewChecker creates a Checker using the given
|
||||||
// MetricsStore.
|
// MetricsStore.
|
||||||
func NewChecker(metrics *Store) *Checker {
|
func NewChecker(metrics *Store, threshold float64) *Checker {
|
||||||
return &Checker{
|
return &Checker{
|
||||||
alertCh: make(chan *api.Alert, AlertChannelCap),
|
alertCh: make(chan *api.Alert, AlertChannelCap),
|
||||||
metrics: metrics,
|
metrics: metrics,
|
||||||
|
threshold: threshold,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,3 +116,17 @@ func (mc *Checker) Watch(ctx context.Context, peersF func(context.Context) ([]pe
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Failed returns if a peer has potentially failed. Peers
|
||||||
|
// that are not present in the metrics store will return
|
||||||
|
// as failed.
|
||||||
|
func (mc *Checker) Failed(pid peer.ID) bool {
|
||||||
|
latest := mc.metrics.PeerLatest("ping", pid)
|
||||||
|
if latest == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
v := time.Now().UnixNano() - latest.TS
|
||||||
|
dv := mc.metrics.Distribution("ping", pid)
|
||||||
|
phiv := phi(float64(v), dv)
|
||||||
|
return phiv >= mc.threshold
|
||||||
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@ import (
|
||||||
|
|
||||||
func TestChecker(t *testing.T) {
|
func TestChecker(t *testing.T) {
|
||||||
metrics := NewStore()
|
metrics := NewStore()
|
||||||
checker := NewChecker(metrics)
|
checker := NewChecker(metrics, 2.0)
|
||||||
|
|
||||||
metr := &api.Metric{
|
metr := &api.Metric{
|
||||||
Name: "test",
|
Name: "test",
|
||||||
|
@ -52,12 +52,12 @@ func TestChecker(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCheckerWatch(t *testing.T) {
|
func TestChecker_Watch(t *testing.T) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
metrics := NewStore()
|
metrics := NewStore()
|
||||||
checker := NewChecker(metrics)
|
checker := NewChecker(metrics, 2.0)
|
||||||
|
|
||||||
metr := &api.Metric{
|
metr := &api.Metric{
|
||||||
Name: "test",
|
Name: "test",
|
||||||
|
@ -81,3 +81,31 @@ func TestCheckerWatch(t *testing.T) {
|
||||||
t.Fatal("should have received an alert")
|
t.Fatal("should have received an alert")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestChecker_Failed adds "ping" metrics for test.PeerID1 to a Store,
// first ten times at a fixed 2s interval and then ten more times with
// a delay that grows by 500ms per iteration, calling Checker.Failed
// after each of the growing delays.
//
// NOTE(review): the assertion in the second loop can never fire: i
// only ranges over 0..9 there, so `i >= 17` is always false and the
// test passes whatever Failed returns. The bound looks like a leftover
// from a single 20-iteration loop; confirm the intended condition.
//
// NOTE(review): the sleeps add up to about 42.5s of wall-clock time
// (10 x 2s, plus 500ms x (0+1+...+9) = 22.5s).
func TestChecker_Failed(t *testing.T) {
	metrics := NewStore()
	checker := NewChecker(metrics, 2.0)

	// Phase 1: regular arrivals, one metric every 2s.
	for i := 0; i < 10; i++ {
		metrics.Add(makePeerMetric(test.PeerID1, "1"))
		time.Sleep(time.Duration(2) * time.Second)
	}
	// Phase 2: wait 0ms, 500ms, ..., 4500ms after each new metric
	// before asking the checker about the peer.
	for i := 0; i < 10; i++ {
		metrics.Add(makePeerMetric(test.PeerID1, "1"))
		time.Sleep(time.Duration(500*i) * time.Millisecond)
		got := checker.Failed(test.PeerID1)
		// Unreachable as written (see the review note above).
		if i >= 17 && !got {
			t.Fatal("threshold should have been passed by now")
		}
	}
}
|
||||||
|
|
||||||
|
func makePeerMetric(pid peer.ID, value string) *api.Metric {
|
||||||
|
metr := &api.Metric{
|
||||||
|
Name: "ping",
|
||||||
|
Peer: pid,
|
||||||
|
Value: value,
|
||||||
|
Valid: true,
|
||||||
|
}
|
||||||
|
return metr
|
||||||
|
}
|
||||||
|
|
86
monitor/metrics/prob.go
Normal file
86
monitor/metrics/prob.go
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
/*
|
||||||
|
Copyright (©) 2015 Timothée Peignier <timothee.peignier@tryphon.org>
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"math/big"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phi returns the φ-failure for the given value and distribution.
|
||||||
|
func phi(v float64, d []int64) float64 {
|
||||||
|
u := mean(d)
|
||||||
|
o := standardDeviation(d)
|
||||||
|
cdf := cdf(u, o, big.NewFloat(v))
|
||||||
|
phi := -math.Log10(1 - cdf)
|
||||||
|
if math.IsInf(phi, 1) {
|
||||||
|
phi = 0
|
||||||
|
}
|
||||||
|
return phi
|
||||||
|
}
|
||||||
|
|
||||||
|
// cdf returns the cumulative distribution function of the normal
// distribution with mean u and standard deviation o, evaluated at v:
// 0.5 * (1 + erf((v - u) / (o * sqrt(2)))).
//
// A zero standard deviation describes a distribution concentrated
// entirely at u, whose CDF is a step: 0 below u and 1 from u upwards.
// That case is handled explicitly because for v == u the general
// formula would divide zero by zero, which makes big.Float.Quo panic.
func cdf(u, o, v *big.Float) float64 {
	if o.Sign() == 0 {
		if v.Cmp(u) < 0 {
			return 0
		}
		return 1
	}

	var diff, scale, z big.Float
	z.Quo(diff.Sub(v, u), scale.Mul(o, big.NewFloat(math.Sqrt2)))
	// Only the float64 value is needed; its accuracy flag is irrelevant here.
	zf, _ := z.Float64()
	return 0.5 * (1 + math.Erf(zf))
}
|
||||||
|
|
||||||
|
// mean returns the arithmetic mean of the given sample, or 0 for an
// empty sample.
//
// The sum is accumulated in a big.Int so that it cannot wrap around,
// which an int64 accumulator does silently once the total exceeds
// its range. The sum is then rounded to 53 bits, the precision of a
// float64, so the result is a 53-bit big.Float.
func mean(values []int64) *big.Float {
	if len(values) == 0 {
		return big.NewFloat(0.0)
	}

	var total, term big.Int
	for _, v := range values {
		total.Add(&total, term.SetInt64(v))
	}

	sum := new(big.Float).SetPrec(53).SetInt(&total)
	return sum.Quo(sum, big.NewFloat(float64(len(values))))
}
|
||||||
|
|
||||||
|
// StandardDeviation returns standard deviation of the given sample.
|
||||||
|
func standardDeviation(v []int64) *big.Float {
|
||||||
|
var z big.Float
|
||||||
|
z.Sqrt(variance(v)).Float64()
|
||||||
|
return &z
|
||||||
|
}
|
||||||
|
|
||||||
|
// Variance returns variance if the given sample.
|
||||||
|
func variance(values []int64) *big.Float {
|
||||||
|
if len(values) == 0 {
|
||||||
|
return big.NewFloat(0.0)
|
||||||
|
}
|
||||||
|
m := mean(values)
|
||||||
|
var sum, pwr, res big.Float
|
||||||
|
for _, v := range values {
|
||||||
|
d := big.NewFloat(float64(v))
|
||||||
|
d.Sub(d, m)
|
||||||
|
pwr.Mul(d, d)
|
||||||
|
sum.Add(&sum, &pwr)
|
||||||
|
}
|
||||||
|
return res.Quo(&sum, big.NewFloat(float64(len(values))))
|
||||||
|
}
|
|
@ -61,6 +61,7 @@ func (mtrs *Store) LatestValid(name string) []*api.Metric {
|
||||||
metrics := make([]*api.Metric, 0, len(byPeer))
|
metrics := make([]*api.Metric, 0, len(byPeer))
|
||||||
for _, window := range byPeer {
|
for _, window := range byPeer {
|
||||||
m, err := window.Latest()
|
m, err := window.Latest()
|
||||||
|
// TODO(ajl): for accrual, does it matter if a ping has expired?
|
||||||
if err != nil || m.Discard() {
|
if err != nil || m.Discard() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -110,3 +111,45 @@ func (mtrs *Store) PeerMetrics(pid peer.ID) []*api.Metric {
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PeerLatest returns the latest of a particular metric for a
|
||||||
|
// particular peer. It may return an expired metric.
|
||||||
|
func (mtrs *Store) PeerLatest(name string, pid peer.ID) *api.Metric {
|
||||||
|
mtrs.mux.RLock()
|
||||||
|
defer mtrs.mux.RUnlock()
|
||||||
|
|
||||||
|
byPeer, ok := mtrs.byName[name]
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
window, ok := byPeer[pid]
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
m, err := window.Latest()
|
||||||
|
if err != nil {
|
||||||
|
// ignoring error, as nil metric is indicative enough
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// Distribution returns the distribution of a particular metrics
|
||||||
|
// for a particular peer.
|
||||||
|
func (mtrs *Store) Distribution(name string, pid peer.ID) []int64 {
|
||||||
|
mtrs.mux.RLock()
|
||||||
|
defer mtrs.mux.RUnlock()
|
||||||
|
|
||||||
|
byPeer, ok := mtrs.byName[name]
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
window, ok := byPeer[pid]
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return window.Distribution()
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user