Fix: repo/stat gets hammered on busy cluster peers

Given that every pin and block/put writes something to IPFS and thus increases
the repo size, a while ago we added a check that lets the IPFS connector
directly trigger the sending of metrics every 10 such requests. This was meant
to update the metrics more often so that balancing happened at a finer
granularity (particularly for the freespace metric).

In practice, on a cluster that receives several hundred pin/add operations
within a few seconds, this just hammers repo/stat on the IPFS daemon.

So:

* We disable the whole thing by default.
* We add a new InformerTriggerInterval configuration option to enable it
  (a sketch of the resulting configuration follows this list).
* We fix a bug that made this always call the first informer, which may not
  have been the freespace one.
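
For illustration, a sketch of how the new option would be set in the ipfshttp
section of service.json (assuming the usual layout where ipfshttp lives under
ipfs_connector; the key name and the value 10 come from the test fixtures in
this diff, and 0, the default, keeps the trigger disabled):

"ipfs_connector": {
  "ipfshttp": {
    "node_multiaddress": "/ip4/127.0.0.1/tcp/5001",
    "informer_trigger_interval": 10
  }
}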
Hector Sanjuan 2022-01-31 17:44:35 +01:00
parent ed348f29c1
commit acde3f16d0
6 changed files with 46 additions and 33 deletions


@@ -112,7 +112,8 @@ var testingIpfsCfg = []byte(`{
"node_multiaddress": "/ip4/127.0.0.1/tcp/5001",
"connect_swarms_delay": "7s",
"pin_timeout": "30s",
"unpin_timeout": "15s"
"unpin_timeout": "15s",
"informer_trigger_interval": 10
}`)
var testingTrackerCfg = []byte(`


@@ -24,6 +24,7 @@ const (
DefaultPinTimeout = 2 * time.Minute
DefaultUnpinTimeout = 3 * time.Hour
DefaultRepoGCTimeout = 24 * time.Hour
DefaultInformerTriggerInterval = 0 // disabled
DefaultUnpinDisable = false
)
@@ -51,6 +52,11 @@ type Config struct {
// RepoGC Operation timeout
RepoGCTimeout time.Duration
// How many pin and block/put operations need to happen before we
// trigger a broadcast of the informer metrics to the network. 0 to disable.
InformerTriggerInterval int
// Disables the unpin operation and returns an error.
UnpinDisable bool
@@ -65,6 +71,7 @@ type jsonConfig struct {
PinTimeout string `json:"pin_timeout"`
UnpinTimeout string `json:"unpin_timeout"`
RepoGCTimeout string `json:"repogc_timeout"`
InformerTriggerInterval int `json:"informer_trigger_interval"`
UnpinDisable bool `json:"unpin_disable,omitempty"`
}
@@ -82,6 +89,7 @@ func (cfg *Config) Default() error {
cfg.PinTimeout = DefaultPinTimeout
cfg.UnpinTimeout = DefaultUnpinTimeout
cfg.RepoGCTimeout = DefaultRepoGCTimeout
cfg.InformerTriggerInterval = DefaultInformerTriggerInterval
cfg.UnpinDisable = DefaultUnpinDisable
return nil
@@ -130,6 +138,9 @@ func (cfg *Config) Validate() error {
if cfg.RepoGCTimeout < 0 {
err = errors.New("ipfshttp.repogc_timeout invalid")
}
if cfg.InformerTriggerInterval < 0 {
err = errors.New("ipfshttp.update_metrics_after")
}
return err
@@ -157,6 +168,7 @@ func (cfg *Config) applyJSONConfig(jcfg *jsonConfig) error {
cfg.NodeAddr = nodeAddr
cfg.UnpinDisable = jcfg.UnpinDisable
cfg.InformerTriggerInterval = jcfg.InformerTriggerInterval
err = config.ParseDurations(
"ipfshttp",
@@ -201,6 +213,7 @@ func (cfg *Config) toJSONConfig() (jcfg *jsonConfig, err error) {
jcfg.PinTimeout = cfg.PinTimeout.String()
jcfg.UnpinTimeout = cfg.UnpinTimeout.String()
jcfg.RepoGCTimeout = cfg.RepoGCTimeout.String()
jcfg.InformerTriggerInterval = cfg.InformerTriggerInterval
jcfg.UnpinDisable = cfg.UnpinDisable
return


@@ -14,7 +14,8 @@ var cfgJSON = []byte(`
"ipfs_request_timeout": "5m0s",
"pin_timeout": "2m",
"unpin_timeout": "3h",
"repogc_timeout": "24h"
"repogc_timeout": "24h",
"informer_trigger_interval": 10
}
`)
@@ -27,6 +28,11 @@ func TestLoadJSON(t *testing.T) {
j := &jsonConfig{}
json.Unmarshal(cfgJSON, j)
if cfg.InformerTriggerInterval != 10 {
t.Error("missing value")
}
j.NodeMultiaddress = "abc"
tst, _ := json.Marshal(j)
err = cfg.LoadJSON(tst)


@@ -14,6 +14,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/ipfs/ipfs-cluster/api"
@@ -41,11 +42,6 @@ var DNSTimeout = 5 * time.Second
var logger = logging.Logger("ipfshttp")
// updateMetricsMod only makes updates to informer metrics
// on the nth occasion. So, for example, for every BlockPut,
// only the 10th will trigger a SendInformerMetrics call.
var updateMetricMod = 10
// Connector implements the IPFSConnector interface
// and provides a component which is used to perform
// on-demand requests against the configured IPFS daemon
@@ -62,8 +58,7 @@ type Connector struct {
client *http.Client // client to ipfs daemon
updateMetricMutex sync.Mutex
updateMetricCount int
updateMetricCount uint64
shutdownLock sync.Mutex
shutdown bool
@@ -970,11 +965,12 @@ func (ipfs *Connector) BlockGet(ctx context.Context, c cid.Cid) ([]byte, error)
// Returns true every InformerTriggerInterval-th time that we
// call this function, and always false when the interval is 0 (disabled).
func (ipfs *Connector) shouldUpdateMetric() bool {
ipfs.updateMetricMutex.Lock()
defer ipfs.updateMetricMutex.Unlock()
ipfs.updateMetricCount++
if ipfs.updateMetricCount%updateMetricMod == 0 {
ipfs.updateMetricCount = 0
if ipfs.config.InformerTriggerInterval <= 0 {
return false
}
curCount := atomic.AddUint64(&ipfs.updateMetricCount, 1)
if curCount%uint64(ipfs.config.InformerTriggerInterval) == 0 {
atomic.StoreUint64(&ipfs.updateMetricCount, 0)
return true
}
return false
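
The replacement drops the mutex in favour of an atomic counter. Here is a
minimal, self-contained sketch of the same modulo-trigger pattern; the type
and names below are illustrative, not the connector's actual API, and the
explicit reset mirrors the diff even though the modulo alone would suffice:

package main

import (
	"fmt"
	"sync/atomic"
)

// trigger fires once every interval calls; interval <= 0 disables it.
type trigger struct {
	count    uint64
	interval int
}

func (t *trigger) shouldFire() bool {
	if t.interval <= 0 {
		return false
	}
	cur := atomic.AddUint64(&t.count, 1)
	if cur%uint64(t.interval) == 0 {
		// Reset the counter, as the commit does; under heavy
		// concurrency this store can race with other Adds,
		// which is harmless for a best-effort trigger.
		atomic.StoreUint64(&t.count, 0)
		return true
	}
	return false
}

func main() {
	tr := &trigger{interval: 10}
	for i := 1; i <= 30; i++ {
		if tr.shouldFire() {
			fmt.Println("informer metrics triggered at call", i) // calls 10, 20, 30
		}
	}
}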


@@ -27,6 +27,7 @@ func testIPFSConnector(t *testing.T) (*Connector, *test.IpfsMock) {
cfg.Default()
cfg.NodeAddr = nodeMAddr
cfg.ConnectSwarmsDelay = 0
cfg.InformerTriggerInterval = 10
ipfs, err := NewConnector(cfg)
if err != nil {


@@ -407,11 +407,7 @@ func (rpcapi *ClusterRPCAPI) RepoGCLocal(ctx context.Context, in struct{}, out *
// SendInformerMetrics runs Cluster.sendInformersMetrics().
func (rpcapi *ClusterRPCAPI) SendInformerMetrics(ctx context.Context, in struct{}, out *struct{}) error {
_, err := rpcapi.c.sendInformerMetrics(ctx, rpcapi.c.informers[0])
if err != nil {
return err
}
return nil
return rpcapi.c.sendInformersMetrics(ctx)
}
// SendInformersMetrics runs Cluster.sendInformerMetrics() on all informers.
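
This last hunk is the bug fix from the commit message: the RPC endpoint used
to send metrics only for informers[0], which was not necessarily the
freespace informer; it now delegates to sendInformersMetrics, which covers
all registered informers.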