2018-05-08 09:38:12 +00:00
|
|
|
package metrics
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2019-04-17 06:39:00 +00:00
|
|
|
"fmt"
|
|
|
|
"math"
|
2018-05-08 09:38:12 +00:00
|
|
|
"testing"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
peer "github.com/libp2p/go-libp2p-peer"
|
2019-04-17 06:39:00 +00:00
|
|
|
"gonum.org/v1/plot"
|
|
|
|
"gonum.org/v1/plot/plotter"
|
|
|
|
"gonum.org/v1/plot/plotutil"
|
|
|
|
"gonum.org/v1/plot/vg"
|
2018-05-08 09:38:12 +00:00
|
|
|
|
|
|
|
"github.com/ipfs/ipfs-cluster/api"
|
|
|
|
"github.com/ipfs/ipfs-cluster/test"
|
|
|
|
)
|
|
|
|
|
2019-04-23 10:30:26 +00:00
|
|
|
func TestCheckPeers(t *testing.T) {
|
2018-05-08 09:38:12 +00:00
|
|
|
metrics := NewStore()
|
2019-04-26 06:33:01 +00:00
|
|
|
checker := NewChecker(context.Background(), metrics, 2.0)
|
2018-05-08 09:38:12 +00:00
|
|
|
|
2019-02-27 17:04:35 +00:00
|
|
|
metr := &api.Metric{
|
2019-04-23 10:30:26 +00:00
|
|
|
Name: "ping",
|
2019-02-27 20:19:10 +00:00
|
|
|
Peer: test.PeerID1,
|
2018-05-08 09:38:12 +00:00
|
|
|
Value: "1",
|
|
|
|
Valid: true,
|
|
|
|
}
|
|
|
|
metr.SetTTL(2 * time.Second)
|
|
|
|
|
|
|
|
metrics.Add(metr)
|
|
|
|
|
2019-02-27 20:19:10 +00:00
|
|
|
checker.CheckPeers([]peer.ID{test.PeerID1})
|
2018-05-08 09:38:12 +00:00
|
|
|
select {
|
|
|
|
case <-checker.Alerts():
|
|
|
|
t.Error("there should not be an alert yet")
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
|
|
|
|
time.Sleep(3 * time.Second)
|
2019-02-27 20:19:10 +00:00
|
|
|
err := checker.CheckPeers([]peer.ID{test.PeerID1})
|
2018-05-08 09:38:12 +00:00
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-checker.Alerts():
|
|
|
|
default:
|
|
|
|
t.Error("an alert should have been triggered")
|
|
|
|
}
|
|
|
|
|
2019-02-27 20:19:10 +00:00
|
|
|
checker.CheckPeers([]peer.ID{test.PeerID2})
|
2018-05-08 09:38:12 +00:00
|
|
|
select {
|
|
|
|
case <-checker.Alerts():
|
|
|
|
t.Error("there should not be alerts for different peer")
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-03-11 09:00:21 +00:00
|
|
|
func TestChecker_Watch(t *testing.T) {
|
2018-05-08 09:38:12 +00:00
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
metrics := NewStore()
|
2019-04-26 06:33:01 +00:00
|
|
|
checker := NewChecker(context.Background(), metrics, 2.0)
|
2018-05-08 09:38:12 +00:00
|
|
|
|
2019-02-27 17:04:35 +00:00
|
|
|
metr := &api.Metric{
|
2018-05-08 09:38:12 +00:00
|
|
|
Name: "test",
|
2019-02-27 20:19:10 +00:00
|
|
|
Peer: test.PeerID1,
|
2018-05-08 09:38:12 +00:00
|
|
|
Value: "1",
|
|
|
|
Valid: true,
|
|
|
|
}
|
|
|
|
metr.SetTTL(100 * time.Millisecond)
|
|
|
|
metrics.Add(metr)
|
|
|
|
|
2018-06-27 04:03:15 +00:00
|
|
|
peersF := func(context.Context) ([]peer.ID, error) {
|
2019-02-27 20:19:10 +00:00
|
|
|
return []peer.ID{test.PeerID1}, nil
|
2018-05-08 09:38:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
go checker.Watch(ctx, peersF, 200*time.Millisecond)
|
|
|
|
|
|
|
|
select {
|
|
|
|
case a := <-checker.Alerts():
|
|
|
|
t.Log("received alert:", a)
|
|
|
|
case <-ctx.Done():
|
|
|
|
t.Fatal("should have received an alert")
|
|
|
|
}
|
|
|
|
}
|
2019-03-11 09:00:21 +00:00
|
|
|
|
|
|
|
func TestChecker_Failed(t *testing.T) {
|
2019-03-21 10:42:56 +00:00
|
|
|
t.Run("standard failure check", func(t *testing.T) {
|
|
|
|
metrics := NewStore()
|
2019-04-26 06:33:01 +00:00
|
|
|
checker := NewChecker(context.Background(), metrics, 2.0)
|
2019-03-11 09:00:21 +00:00
|
|
|
|
2019-03-21 10:42:56 +00:00
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
2019-03-21 11:59:46 +00:00
|
|
|
time.Sleep(time.Duration(2) * time.Millisecond)
|
2019-03-11 09:00:21 +00:00
|
|
|
}
|
2019-03-21 10:42:56 +00:00
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
|
|
|
got := checker.Failed(test.PeerID1)
|
2019-03-21 11:59:46 +00:00
|
|
|
// the magic number 17 represents the point at which
|
|
|
|
// the time between metrics addition has gotten
|
|
|
|
// so large that the probability that the service
|
|
|
|
// has failed goes over the threshold.
|
2019-03-21 10:42:56 +00:00
|
|
|
if i >= 17 && !got {
|
|
|
|
t.Fatal("threshold should have been passed by now")
|
|
|
|
}
|
2019-04-17 06:39:00 +00:00
|
|
|
time.Sleep(time.Duration(i) * time.Millisecond)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-04-23 10:30:26 +00:00
|
|
|
//////////////////
|
|
|
|
// HELPER TESTS //
|
|
|
|
//////////////////
|
|
|
|
|
2019-04-17 06:39:00 +00:00
|
|
|
func TestThresholdValues(t *testing.T) {
|
|
|
|
t.Log("TestThresholdValues is useful for testing out different threshold values")
|
|
|
|
t.Log("It doesn't actually perform any 'tests', so it is skipped by default")
|
|
|
|
t.SkipNow()
|
|
|
|
|
|
|
|
thresholds := []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8, 0.9, 1.3, 1.4, 1.5, 1.8, 2.0, 3.0, 4.0, 5.0, 7.0, 10.0, 20.0}
|
|
|
|
|
|
|
|
t.Run("linear threshold test", func(t *testing.T) {
|
|
|
|
dists := make([]timeseries, 0)
|
|
|
|
phivs := make([]timeseries, 0)
|
|
|
|
for _, v := range thresholds {
|
|
|
|
metrics := NewStore()
|
2019-04-26 06:33:01 +00:00
|
|
|
checker := NewChecker(context.Background(), metrics, v)
|
2019-04-17 06:39:00 +00:00
|
|
|
tsName := fmt.Sprintf("%f", v)
|
|
|
|
distTS := newTS(tsName)
|
|
|
|
phivTS := newTS(tsName)
|
|
|
|
|
|
|
|
var failed bool
|
|
|
|
output := false
|
|
|
|
|
|
|
|
check := func(i int) bool {
|
2019-04-23 10:30:26 +00:00
|
|
|
inputv, dist, phiv, got := checker.failed("ping", test.PeerID1)
|
2019-04-17 06:39:00 +00:00
|
|
|
if output {
|
|
|
|
fmt.Println(i)
|
|
|
|
fmt.Printf("phiv: %f\n", phiv)
|
|
|
|
fmt.Printf("threshold: %f\n", v)
|
|
|
|
fmt.Printf("latest: %f\n", inputv)
|
|
|
|
fmt.Printf("distribution: %v\n", dist)
|
|
|
|
}
|
|
|
|
phivTS.record(phiv)
|
|
|
|
failed = got
|
|
|
|
if failed {
|
|
|
|
distTS.record(float64(i))
|
|
|
|
}
|
|
|
|
return got
|
|
|
|
}
|
|
|
|
|
|
|
|
// set baseline static distribution of ping intervals
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
check(i)
|
|
|
|
distTS.record(float64(10))
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
|
|
|
time.Sleep(time.Duration(10) * time.Millisecond)
|
|
|
|
}
|
|
|
|
// start linearly increasing the interval values
|
|
|
|
for i := 10; i < 100 && !check(i); i++ {
|
|
|
|
distTS.record(float64(i))
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
|
|
|
time.Sleep(time.Duration(i) * time.Millisecond)
|
|
|
|
}
|
|
|
|
|
|
|
|
if failed {
|
|
|
|
dists = append(dists, *distTS)
|
|
|
|
phivs = append(phivs, *phivTS)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
dists = append(dists, *distTS)
|
|
|
|
phivs = append(phivs, *phivTS)
|
|
|
|
|
|
|
|
// if a threshold has made it to this point, all greater
|
|
|
|
// thresholds will make it here so print the threshold and skip
|
|
|
|
t.Log("threshold that was not meet by linear increase: ", v)
|
|
|
|
break
|
|
|
|
}
|
|
|
|
plotDistsPhivs("linear_", dists, phivs)
|
|
|
|
})
|
|
|
|
|
|
|
|
t.Run("cubic threshold test", func(t *testing.T) {
|
|
|
|
dists := make([]timeseries, 0)
|
|
|
|
phivs := make([]timeseries, 0)
|
|
|
|
for _, v := range thresholds {
|
|
|
|
metrics := NewStore()
|
2019-04-26 06:33:01 +00:00
|
|
|
checker := NewChecker(context.Background(), metrics, v)
|
2019-04-17 06:39:00 +00:00
|
|
|
tsName := fmt.Sprintf("%f", v)
|
|
|
|
distTS := newTS(tsName)
|
|
|
|
phivTS := newTS(tsName)
|
|
|
|
|
|
|
|
var failed bool
|
|
|
|
output := false
|
|
|
|
|
|
|
|
check := func(i int) bool {
|
2019-04-23 10:30:26 +00:00
|
|
|
inputv, dist, phiv, got := checker.failed("ping", test.PeerID1)
|
2019-04-17 06:39:00 +00:00
|
|
|
if output {
|
|
|
|
fmt.Println(i)
|
|
|
|
fmt.Printf("phiv: %f\n", phiv)
|
|
|
|
fmt.Printf("threshold: %f\n", v)
|
|
|
|
fmt.Printf("latest: %f\n", inputv)
|
|
|
|
fmt.Printf("distribution: %v\n", dist)
|
|
|
|
}
|
|
|
|
phivTS.record(phiv)
|
|
|
|
failed = got
|
|
|
|
if failed {
|
|
|
|
diff := math.Pow(float64(i), float64(i))
|
|
|
|
distTS.record(diff)
|
|
|
|
}
|
|
|
|
return got
|
|
|
|
}
|
|
|
|
|
|
|
|
// set baseline static distribution of ping intervals
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
check(i)
|
|
|
|
distTS.record(float64(8))
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
|
|
|
time.Sleep(time.Duration(8) * time.Millisecond)
|
|
|
|
}
|
|
|
|
for i := 2; !check(i) && i < 20; i++ {
|
|
|
|
diff := math.Pow(float64(i), 3)
|
|
|
|
distTS.record(diff)
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
|
|
|
time.Sleep(time.Duration(diff) * time.Millisecond)
|
|
|
|
}
|
|
|
|
|
|
|
|
if failed {
|
|
|
|
dists = append(dists, *distTS)
|
|
|
|
phivs = append(phivs, *phivTS)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
dists = append(dists, *distTS)
|
|
|
|
phivs = append(phivs, *phivTS)
|
|
|
|
|
|
|
|
// if a threshold has made it to this point, all greater
|
|
|
|
// thresholds will make it here so print the threshold and skip
|
|
|
|
t.Log("threshold that was not meet by cubic increase: ", v)
|
|
|
|
break
|
|
|
|
}
|
|
|
|
plotDistsPhivs("cubic_", dists, phivs)
|
|
|
|
})
|
|
|
|
|
|
|
|
t.Run("14x threshold test", func(t *testing.T) {
|
|
|
|
dists := make([]timeseries, 0)
|
|
|
|
phivs := make([]timeseries, 0)
|
|
|
|
for _, v := range thresholds {
|
|
|
|
metrics := NewStore()
|
2019-04-26 06:33:01 +00:00
|
|
|
checker := NewChecker(context.Background(), metrics, v)
|
2019-04-17 06:39:00 +00:00
|
|
|
tsName := fmt.Sprintf("%f", v)
|
|
|
|
distTS := newTS(tsName)
|
|
|
|
phivTS := newTS(tsName)
|
|
|
|
|
|
|
|
var failed bool
|
|
|
|
output := false
|
|
|
|
|
|
|
|
check := func(i int) bool {
|
2019-04-23 10:30:26 +00:00
|
|
|
inputv, dist, phiv, got := checker.failed("ping", test.PeerID1)
|
2019-04-17 06:39:00 +00:00
|
|
|
if output {
|
|
|
|
fmt.Println(i)
|
|
|
|
fmt.Printf("phiv: %f\n", phiv)
|
|
|
|
fmt.Printf("threshold: %f\n", v)
|
|
|
|
fmt.Printf("latest: %f\n", inputv)
|
|
|
|
fmt.Printf("distribution: %v\n", dist)
|
|
|
|
}
|
|
|
|
phivTS.record(phiv)
|
|
|
|
failed = got
|
|
|
|
if failed {
|
|
|
|
diff := i * 50
|
|
|
|
distTS.record(float64(diff))
|
|
|
|
}
|
|
|
|
return got
|
|
|
|
}
|
|
|
|
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
check(i)
|
|
|
|
distTS.record(float64(i))
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
|
|
|
time.Sleep(time.Duration(i) * time.Millisecond)
|
|
|
|
}
|
|
|
|
for i := 10; !check(i) && i < 30; i++ {
|
|
|
|
diff := i * 50
|
|
|
|
distTS.record(float64(diff))
|
|
|
|
metrics.Add(makePeerMetric(test.PeerID1, "1"))
|
|
|
|
time.Sleep(time.Duration(diff) * time.Millisecond)
|
|
|
|
}
|
|
|
|
|
|
|
|
if failed {
|
|
|
|
dists = append(dists, *distTS)
|
|
|
|
phivs = append(phivs, *phivTS)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
dists = append(dists, *distTS)
|
|
|
|
phivs = append(phivs, *phivTS)
|
|
|
|
|
|
|
|
// if a threshold has made it to this point, all greater
|
|
|
|
// thresholds will make it here so print the threshold and skip
|
|
|
|
t.Log("threshold that was not meet by 14x increase: ", v)
|
|
|
|
break
|
2019-03-21 10:42:56 +00:00
|
|
|
}
|
2019-04-17 06:39:00 +00:00
|
|
|
plotDistsPhivs("14x_", dists, phivs)
|
2019-03-21 10:42:56 +00:00
|
|
|
})
|
2019-03-11 09:00:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func makePeerMetric(pid peer.ID, value string) *api.Metric {
|
|
|
|
metr := &api.Metric{
|
|
|
|
Name: "ping",
|
|
|
|
Peer: pid,
|
|
|
|
Value: value,
|
|
|
|
Valid: true,
|
|
|
|
}
|
|
|
|
return metr
|
|
|
|
}
|
2019-04-17 06:39:00 +00:00
|
|
|
|
|
|
|
type timeseries struct {
|
|
|
|
name string
|
|
|
|
values []float64
|
|
|
|
}
|
|
|
|
|
|
|
|
func newTS(name string) *timeseries {
|
|
|
|
v := make([]float64, 0)
|
|
|
|
return ×eries{name: name, values: v}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ts *timeseries) record(v float64) {
|
|
|
|
ts.values = append(ts.values, v)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ts *timeseries) ToXYs() plotter.XYs {
|
|
|
|
pts := make(plotter.XYs, len(ts.values))
|
|
|
|
for i, v := range ts.values {
|
|
|
|
pts[i].X = float64(i)
|
|
|
|
if math.IsInf(v, 0) {
|
|
|
|
pts[i].Y = -5
|
|
|
|
} else {
|
|
|
|
pts[i].Y = v
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return pts
|
|
|
|
}
|
|
|
|
|
|
|
|
func toPlotLines(ts []timeseries) ([]string, [][]plot.Plotter) {
|
|
|
|
labels := make([]string, 0)
|
|
|
|
plotters := make([][]plot.Plotter, 0)
|
|
|
|
for i, t := range ts {
|
|
|
|
l, s, err := plotter.NewLinePoints(t.ToXYs())
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
l.Width = vg.Points(2)
|
|
|
|
l.Color = plotutil.Color(i)
|
|
|
|
l.Dashes = plotutil.Dashes(i)
|
|
|
|
s.Color = plotutil.Color(i)
|
|
|
|
s.Shape = plotutil.Shape(i)
|
|
|
|
|
|
|
|
labels = append(labels, t.name)
|
|
|
|
plotters = append(plotters, []plot.Plotter{l, s})
|
|
|
|
}
|
|
|
|
return labels, plotters
|
|
|
|
}
|
|
|
|
|
|
|
|
func plotDistsPhivs(prefix string, dists, phivs []timeseries) {
|
|
|
|
plotTS(prefix+"dists", dists)
|
|
|
|
plotTS(prefix+"phivs", phivs)
|
|
|
|
}
|
|
|
|
|
|
|
|
func plotTS(name string, ts []timeseries) {
|
|
|
|
p, err := plot.New()
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
p.Title.Text = name
|
|
|
|
p.Add(plotter.NewGrid())
|
|
|
|
|
|
|
|
labels, plotters := toPlotLines(ts)
|
|
|
|
for i := range labels {
|
|
|
|
l, pts := plotters[i][0], plotters[i][1]
|
|
|
|
p.Add(l, pts)
|
|
|
|
p.Legend.Add(labels[i], l.(*plotter.Line), pts.(*plotter.Scatter))
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := p.Save(20*vg.Inch, 15*vg.Inch, name+".png"); err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
}
|