diff --git a/api/rest/client/client.go b/api/rest/client/client.go index 4b121fe7..3e6bed83 100644 --- a/api/rest/client/client.go +++ b/api/rest/client/client.go @@ -106,6 +106,9 @@ type Client interface { // Otherwise, it happens everywhere. RecoverAll(ctx context.Context, local bool) ([]*api.GlobalPinInfo, error) + // Alerts returns things that are wrong with cluster. + Alerts(ctx context.Context) (map[string]api.Alert, error) + // Version returns the ipfs-cluster peer's version. Version(context.Context) (*api.Version, error) diff --git a/api/rest/client/lbclient.go b/api/rest/client/lbclient.go index 629cb472..7bf38550 100644 --- a/api/rest/client/lbclient.go +++ b/api/rest/client/lbclient.go @@ -331,6 +331,19 @@ func (lc *loadBalancingClient) RecoverAll(ctx context.Context, local bool) ([]*a return pinInfos, err } +// Alerts returns things that are wrong with cluster. +func (lc *loadBalancingClient) Alerts(ctx context.Context) (map[string]api.Alert, error) { + var alerts map[string]api.Alert + call := func(c Client) error { + var err error + alerts, err = c.Alerts(ctx) + return err + } + + err := lc.retry(0, call) + return alerts, err +} + // Version returns the ipfs-cluster peer's version. func (lc *loadBalancingClient) Version(ctx context.Context) (*api.Version, error) { var v *api.Version diff --git a/api/rest/client/methods.go b/api/rest/client/methods.go index 532aba8b..a94f6f1a 100644 --- a/api/rest/client/methods.go +++ b/api/rest/client/methods.go @@ -306,6 +306,16 @@ func (c *defaultClient) RecoverAll(ctx context.Context, local bool) ([]*api.Glob return gpis, err } +// Alerts returns things that are wrong with cluster. +func (c *defaultClient) Alerts(ctx context.Context) (map[string]api.Alert, error) { + ctx, span := trace.StartSpan(ctx, "client/RecoverAll") + defer span.End() + + var alerts map[string]api.Alert + err := c.do(ctx, "GET", "/alerts", nil, nil, &alerts) + return alerts, err +} + // Version returns the ipfs-cluster peer's version. func (c *defaultClient) Version(ctx context.Context) (*api.Version, error) { ctx, span := trace.StartSpan(ctx, "client/Version") diff --git a/api/rest/restapi.go b/api/rest/restapi.go index 62beafc9..7e9f5595 100644 --- a/api/rest/restapi.go +++ b/api/rest/restapi.go @@ -487,6 +487,12 @@ func (api *API) routes() []route { "/monitor/metrics", api.metricNamesHandler, }, + { + "Alerts", + "GET", + "/alerts", + api.alertsHandler, + }, } } @@ -659,6 +665,19 @@ func (api *API) metricNamesHandler(w http.ResponseWriter, r *http.Request) { api.sendResponse(w, autoStatus, err, metricNames) } +func (api *API) alertsHandler(w http.ResponseWriter, r *http.Request) { + var alerts map[string]types.Alert + err := api.rpcClient.CallContext( + r.Context(), + "", + "Cluster", + "Alerts", + struct{}{}, + &alerts, + ) + api.sendResponse(w, autoStatus, err, alerts) +} + func (api *API) addHandler(w http.ResponseWriter, r *http.Request) { reader, err := r.MultipartReader() if err != nil { diff --git a/api/types.go b/api/types.go index a7aea655..b48eae5e 100644 --- a/api/types.go +++ b/api/types.go @@ -943,8 +943,10 @@ func (es MetricSlice) Less(i, j int) bool { // Alert carries alerting information about a peer. WIP. type Alert struct { - Peer peer.ID - MetricName string + Peer peer.ID `json:"peer" codec:"p"` + MetricName string `json:"metric_name" codec:"m"` + Expiry int64 `json:"expiry" codec:"e"` + Value string `json:"value" codec:"v"` } // Error can be used by APIs to return errors. diff --git a/cluster.go b/cluster.go index e76ada25..102f27da 100644 --- a/cluster.go +++ b/cluster.go @@ -75,6 +75,9 @@ type Cluster struct { informers []Informer tracer Tracer + alerts map[string]api.Alert + alertsMux sync.Mutex + doneCh chan struct{} readyCh chan struct{} readyB bool @@ -161,6 +164,7 @@ func NewCluster( allocator: allocator, informers: informers, tracer: tracer, + alerts: make(map[string]api.Alert), peerManager: peerManager, shutdownB: false, removed: false, @@ -391,6 +395,19 @@ func (c *Cluster) pushPingMetrics(ctx context.Context) { } } +// Alerts returns things that are wrong with the cluster. +func (c *Cluster) Alerts() map[string]api.Alert { + alerts := make(map[string]api.Alert) + + c.alertsMux.Lock() + defer c.alertsMux.Unlock() + for i, alert := range c.alerts { + alerts[i] = alert + } + + return alerts +} + // read the alerts channel from the monitor and triggers repins func (c *Cluster) alertsHandler() { for { @@ -399,6 +416,10 @@ func (c *Cluster) alertsHandler() { return case alrt := <-c.monitor.Alerts(): logger.Warningf("metric alert for %s: Peer: %s.", alrt.MetricName, alrt.Peer) + c.alertsMux.Lock() + c.alerts[peer.IDB58Encode(alrt.Peer)] = *alrt + c.alertsMux.Unlock() + if alrt.MetricName != pingMetricName { continue // only handle ping alerts } diff --git a/cmd/ipfs-cluster-ctl/formatters.go b/cmd/ipfs-cluster-ctl/formatters.go index 317da9fe..0780df2c 100644 --- a/cmd/ipfs-cluster-ctl/formatters.go +++ b/cmd/ipfs-cluster-ctl/formatters.go @@ -97,6 +97,10 @@ func textFormatObject(resp interface{}) { for _, item := range resp.([]string) { textFormatObject(item) } + case map[string]api.Alert: + for i := range resp.(map[string]api.Alert) { + fmt.Printf("peer is down: %s\n", i) + } default: checkErr("", errors.New("unsupported type returned")) } diff --git a/cmd/ipfs-cluster-ctl/main.go b/cmd/ipfs-cluster-ctl/main.go index f9e86a5f..a5c29878 100644 --- a/cmd/ipfs-cluster-ctl/main.go +++ b/cmd/ipfs-cluster-ctl/main.go @@ -977,6 +977,16 @@ but usually are: }, }, }, + { + Name: "alerts", + Usage: "Show things which are wrong with this cluster", + Description: "Show things which are wrong with this cluster", + Action: func(c *cli.Context) error { + resp, cerr := globalClient.Alerts(ctx) + formatResponse(c, resp, cerr) + return nil + }, + }, { Name: "ipfs", Usage: "Manage IPFS daemon", diff --git a/rpc_api.go b/rpc_api.go index f6254478..0bbcb717 100644 --- a/rpc_api.go +++ b/rpc_api.go @@ -459,6 +459,13 @@ func (rpcapi *ClusterRPCAPI) SendInformersMetrics(ctx context.Context, in struct return nil } +// Alerts runs Cluster.Alerts(). +func (rpcapi *ClusterRPCAPI) Alerts(ctx context.Context, in struct{}, out *map[string]api.Alert) error { + alerts := rpcapi.c.Alerts() + *out = alerts + return nil +} + /* Tracker component methods */ diff --git a/rpc_policy.go b/rpc_policy.go index e784d3c8..ebf2378f 100644 --- a/rpc_policy.go +++ b/rpc_policy.go @@ -26,6 +26,7 @@ var DefaultRPCPolicy = map[string]RPCEndpointType{ "Cluster.RepoGCLocal": RPCTrusted, "Cluster.SendInformerMetric": RPCClosed, "Cluster.SendInformersMetrics": RPCClosed, + "Cluster.Alerts": RPCClosed, "Cluster.Status": RPCClosed, "Cluster.StatusAll": RPCClosed, "Cluster.StatusAllLocal": RPCClosed,