Adrian Lanzafame 3b3f786d68
add opencensus tracing and metrics
This commit adds support for OpenCensus tracing
and metrics collection. This required support for
context.Context propagation throughout the cluster
codebase, and in particular, the ipfscluster component.

The tracing propagates across RPC and HTTP boundaries.
The current default tracing backend is Jaeger.

The metrics component currently exports the metrics exposed by
the OpenCensus HTTP plugin, as well as the pprof metrics,
to a Prometheus endpoint for scraping.
The current default metrics backend is Prometheus.

Metrics are currently exposed by default due to their low
overhead, and can be turned off if desired, whereas tracing
is off by default as it has a much higher performance
overhead, though the extent of the performance hit can be
reduced by using smaller sampling rates.

License: MIT
Signed-off-by: Adrian Lanzafame <adrianlanzafame92@gmail.com>
2019-02-04 18:53:21 +10:00

300 lines
6.3 KiB

package pubsubmon
import (
libp2p "github.com/libp2p/go-libp2p"
peer "github.com/libp2p/go-libp2p-peer"
peerstore "github.com/libp2p/go-libp2p-peerstore"
pubsub "github.com/libp2p/go-libp2p-pubsub"
func init() {
// GossipSub needs to heartbeat to discover newly connected hosts
// This speeds things up a little.
pubsub.GossipSubHeartbeatInterval = 50 * time.Millisecond
// metricFactory produces test metrics stamped with a shared counter.
// Use it through a pointer: it embeds a mutex and must not be copied.
type metricFactory struct {
	l       sync.Mutex // guards counter
	counter int        // written into the Value of each metric built by newMetric
}
func newMetricFactory() *metricFactory {
return &metricFactory{
counter: 0,
func (mf *metricFactory) newMetric(n string, p peer.ID) api.Metric {
defer mf.l.Unlock()
m := api.Metric{
Name: n,
Peer: p,
Value: fmt.Sprintf("%d", mf.counter),
Valid: true,
m.SetTTL(5 * time.Second)
return m
func (mf *metricFactory) count() int {
defer mf.l.Unlock()
return mf.counter
// testPeerMonitor builds a Monitor for use in tests and returns it together
// with a function value (shutdownF) that callers defer.
//
// The visible steps are: create a libp2p host, create a mock RPC client bound
// to that host, and construct the Monitor from the host and a Config whose
// CheckInterval is 2 seconds.
//
// NOTE(review): several pieces are not visible in this view and must be
// confirmed against the full file: the arguments passed to libp2p.New, the
// bodies of both error checks, where ctx and mock are consumed, and the body
// of shutdownF.
func testPeerMonitor(t *testing.T) (*Monitor, func()) {
ctx := context.Background()
h, err := libp2p.New(
if err != nil {
mock := test.NewMockRPCClientWithHost(t, h)
// Start from an empty Config; only CheckInterval is set explicitly here.
cfg := &Config{}
cfg.CheckInterval = 2 * time.Second
mon, err := New(h, cfg)
if err != nil {
shutdownF := func() {
return mon, shutdownF
func TestPeerMonitorShutdown(t *testing.T) {
ctx := context.Background()
pm, shutdown := testPeerMonitor(t)
defer shutdown()
err := pm.Shutdown(ctx)
if err != nil {
err = pm.Shutdown(ctx)
if err != nil {
func TestLogMetricConcurrent(t *testing.T) {
ctx := context.Background()
pm, shutdown := testPeerMonitor(t)
defer shutdown()
var wg sync.WaitGroup
// Insert 25 metrics
f := func() {
defer wg.Done()
for i := 0; i < 25; i++ {
mt := api.Metric{
Name: "test",
Peer: test.TestPeerID1,
Value: fmt.Sprintf("%d", time.Now().UnixNano()),
Valid: true,
mt.SetTTL(150 * time.Millisecond)
pm.LogMetric(ctx, mt)
time.Sleep(75 * time.Millisecond)
go f()
go f()
go f()
// Wait for at least two metrics to be inserted
time.Sleep(200 * time.Millisecond)
last := time.Now().Add(-500 * time.Millisecond)
for i := 0; i <= 20; i++ {
lastMtrcs := pm.LatestMetrics(ctx, "test")
// There should always 1 valid LatestMetric "test"
if len(lastMtrcs) != 1 {
t.Error("no valid metrics", len(lastMtrcs), i)
time.Sleep(75 * time.Millisecond)
n, err := strconv.Atoi(lastMtrcs[0].Value)
if err != nil {
// The timestamp of the metric cannot be older than
// the timestamp from the last
current := time.Unix(0, int64(n))
if current.Before(last) {
t.Errorf("expected newer metric: Current: %s, Last: %s", current, last)
last = current
time.Sleep(75 * time.Millisecond)
func TestPeerMonitorLogMetric(t *testing.T) {
ctx := context.Background()
pm, shutdown := testPeerMonitor(t)
defer shutdown()
mf := newMetricFactory()
// dont fill window
pm.LogMetric(ctx, mf.newMetric("test", test.TestPeerID1))
pm.LogMetric(ctx, mf.newMetric("test", test.TestPeerID2))
pm.LogMetric(ctx, mf.newMetric("test", test.TestPeerID3))
// fill window
pm.LogMetric(ctx, mf.newMetric("test2", test.TestPeerID3))
pm.LogMetric(ctx, mf.newMetric("test2", test.TestPeerID3))
pm.LogMetric(ctx, mf.newMetric("test2", test.TestPeerID3))
pm.LogMetric(ctx, mf.newMetric("test2", test.TestPeerID3))
latestMetrics := pm.LatestMetrics(ctx, "testbad")
if len(latestMetrics) != 0 {
t.Logf("%+v", latestMetrics)
t.Error("metrics should be empty")
latestMetrics = pm.LatestMetrics(ctx, "test")
if len(latestMetrics) != 3 {
t.Error("metrics should correspond to 3 hosts")
for _, v := range latestMetrics {
switch v.Peer {
case test.TestPeerID1:
if v.Value != "0" {
t.Error("bad metric value")
case test.TestPeerID2:
if v.Value != "1" {
t.Error("bad metric value")
case test.TestPeerID3:
if v.Value != "2" {
t.Error("bad metric value")
t.Error("bad peer")
latestMetrics = pm.LatestMetrics(ctx, "test2")
if len(latestMetrics) != 1 {
t.Fatal("should only be one metric")
if latestMetrics[0].Value != fmt.Sprintf("%d", mf.count()-1) {
t.Error("metric is not last")
// TestPeerMonitorPublishMetric creates two monitors, connects the first
// monitor's host to the second one's, publishes a metric through the first
// monitor and then checks that both monitors report that same metric as
// their only latest "test" metric.
//
// NOTE(review): the opening of the Connect argument list and the bodies of
// the error checks are not visible in this view; confirm them against the
// full file.
func TestPeerMonitorPublishMetric(t *testing.T) {
ctx := context.Background()
pm, shutdown := testPeerMonitor(t)
defer shutdown()
pm2, shutdown2 := testPeerMonitor(t)
defer shutdown2()
// Pause before connecting — presumably to let both monitors finish
// starting up; TODO confirm.
time.Sleep(200 * time.Millisecond)
// Connect pm's host to pm2's host, addressed by pm2's ID and listen addresses.
err := pm.host.Connect(
ID: pm2.host.ID(),
Addrs: pm2.host.Addrs(),
if err != nil {
// Pause again after connecting — presumably to let the connection
// settle before publishing; TODO confirm.
time.Sleep(200 * time.Millisecond)
mf := newMetricFactory()
metric := mf.newMetric("test", test.TestPeerID1)
err = pm.PublishMetric(ctx, metric)
if err != nil {
// Pause before inspecting the monitors — presumably to allow the
// published metric to be delivered; TODO confirm.
time.Sleep(500 * time.Millisecond)
// checkMetric asserts that the given monitor holds exactly one latest
// "test" metric and that it matches the published metric field by field
// (Peer, Expire, Value, Valid, Name).
checkMetric := func(t *testing.T, pm *Monitor) {
latestMetrics := pm.LatestMetrics(ctx, "test")
if len(latestMetrics) != 1 {
t.Fatal(pm.host.ID(), "expected 1 published metric")
t.Log(pm.host.ID(), "received metric")
receivedMetric := latestMetrics[0]
if receivedMetric.Peer != metric.Peer ||
receivedMetric.Expire != metric.Expire ||
receivedMetric.Value != metric.Value ||
receivedMetric.Valid != metric.Valid ||
receivedMetric.Name != metric.Name {
t.Fatal("it should be exactly the same metric we published")
// Run the same check on the publishing monitor and on the connected one.
checkMetric(t, pm)
checkMetric(t, pm2)
// TestPeerMonitorAlerts logs a single "test" metric for TestPeerID1 and then
// expects to receive two alerts from pm.Alerts(), each naming that metric and
// that peer, before a 5-second timer fires.
//
// NOTE(review): the end of this function is not visible in this view, and
// statements may be missing between the visible lines; confirm against the
// full file.
func TestPeerMonitorAlerts(t *testing.T) {
ctx := context.Background()
pm, shutdown := testPeerMonitor(t)
defer shutdown()
mf := newMetricFactory()
mtr := mf.newMetric("test", test.TestPeerID1)
pm.LogMetric(ctx, mtr)
// A single timer is created outside the loop, so the 5 seconds bound the
// total wait for both alerts rather than the wait for each one.
timeout := time.NewTimer(time.Second * 5)
// It should alert at least twice: the alert recurs.
for i := 0; i < 2; i++ {
select {
case <-timeout.C:
t.Fatal("should have thrown an alert by now")
case alrt := <-pm.Alerts():
if alrt.MetricName != "test" {
t.Error("Alert should be for test")
if alrt.Peer != test.TestPeerID1 {
t.Error("Peer should be TestPeerID1")