diff --git a/libnetwork/networkdb/cluster.go b/libnetwork/networkdb/cluster.go index b388cae83c..6a633dfeda 100644 --- a/libnetwork/networkdb/cluster.go +++ b/libnetwork/networkdb/cluster.go @@ -18,12 +18,10 @@ import ( ) const ( - reapPeriod = 5 * time.Second - rejoinClusterDuration = 10 * time.Second - rejoinInterval = 60 * time.Second - retryInterval = 1 * time.Second - nodeReapInterval = 24 * time.Hour - nodeReapPeriod = 2 * time.Hour + reapPeriod = 5 * time.Second + retryInterval = 1 * time.Second + nodeReapInterval = 24 * time.Hour + nodeReapPeriod = 2 * time.Hour // considering a cluster with > 20 nodes and a drain speed of 100 msg/s // the following is roughly 1 minute maxQueueLenBroadcastOnSync = 500 @@ -172,7 +170,7 @@ func (nDB *NetworkDB) clusterInit() error { {config.PushPullInterval, nDB.bulkSyncTables}, {retryInterval, nDB.reconnectNode}, {nodeReapPeriod, nDB.reapDeadNode}, - {rejoinInterval, nDB.rejoinClusterBootStrap}, + {nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap}, } { t := time.NewTicker(trigger.interval) go nDB.triggerFunc(trigger.interval, t.C, trigger.fn) @@ -210,7 +208,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error { if _, err := mlist.Join(members); err != nil { // In case of failure, we no longer need to explicitly call retryJoin. - // rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec + // rejoinClusterBootStrap, which runs every nDB.config.rejoinClusterInterval, + // will retryJoin for nDB.config.rejoinClusterDuration. 
return fmt.Errorf("could not join node to memberlist: %v", err) } @@ -324,7 +323,7 @@ func (nDB *NetworkDB) rejoinClusterBootStrap() { } // None of the bootStrap nodes are in the cluster, call memberlist join logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs) - ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration) + ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration) defer cancel() nDB.retryJoin(ctx, bootStrapIPs) } diff --git a/libnetwork/networkdb/networkdb.go b/libnetwork/networkdb/networkdb.go index 7655f83317..bc78e480ae 100644 --- a/libnetwork/networkdb/networkdb.go +++ b/libnetwork/networkdb/networkdb.go @@ -192,6 +192,14 @@ type Config struct { // NOTE this MUST always be higher than reapEntryInterval reapNetworkInterval time.Duration + // rejoinClusterDuration is the retryJoin timeout used by rejoinClusterBootStrap. + // Default is 10sec. + rejoinClusterDuration time.Duration + + // rejoinClusterInterval is the interval at which rejoinClusterBootStrap runs. + // Default is 60sec. 
+ rejoinClusterInterval time.Duration + // StatsPrintPeriod the period to use to print queue stats // Default is 5min StatsPrintPeriod time.Duration @@ -225,13 +233,15 @@ type entry struct { func DefaultConfig() *Config { hostname, _ := os.Hostname() return &Config{ - NodeID: stringid.TruncateID(stringid.GenerateRandomID()), - Hostname: hostname, - BindAddr: "0.0.0.0", - PacketBufferSize: 1400, - StatsPrintPeriod: 5 * time.Minute, - HealthPrintPeriod: 1 * time.Minute, - reapEntryInterval: 30 * time.Minute, + NodeID: stringid.TruncateID(stringid.GenerateRandomID()), + Hostname: hostname, + BindAddr: "0.0.0.0", + PacketBufferSize: 1400, + StatsPrintPeriod: 5 * time.Minute, + HealthPrintPeriod: 1 * time.Minute, + reapEntryInterval: 30 * time.Minute, + rejoinClusterDuration: 10 * time.Second, + rejoinClusterInterval: 60 * time.Second, } } diff --git a/libnetwork/networkdb/networkdb_test.go b/libnetwork/networkdb/networkdb_test.go index 2dc4f885b3..6691f01726 100644 --- a/libnetwork/networkdb/networkdb_test.go +++ b/libnetwork/networkdb/networkdb_test.go @@ -819,8 +819,24 @@ func TestParallelDelete(t *testing.T) { } func TestNetworkDBIslands(t *testing.T) { + pollTimeout := func() time.Duration { + const defaultTimeout = 120 * time.Second + dl, ok := t.Deadline() + if !ok { + return defaultTimeout + } + if d := time.Until(dl); d <= defaultTimeout { + return d + } + return defaultTimeout + } + logrus.SetLevel(logrus.DebugLevel) - dbs := createNetworkDBInstances(t, 5, "node", DefaultConfig()) + conf := DefaultConfig() + // Shorten durations to speed up test execution. 
+ conf.rejoinClusterDuration = conf.rejoinClusterDuration / 10 + conf.rejoinClusterInterval = conf.rejoinClusterInterval / 10 + dbs := createNetworkDBInstances(t, 5, "node", conf) // Get the node IP used currently node := dbs[0].nodes[dbs[0].config.NodeID] @@ -868,7 +884,7 @@ func TestNetworkDBIslands(t *testing.T) { } return poll.Success() } - poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(120*time.Second)) + poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout())) // Spawn again the first 3 nodes with different names but same IP:port for i := 0; i < 3; i++ { @@ -877,7 +893,7 @@ func TestNetworkDBIslands(t *testing.T) { dbs[i] = launchNode(t, *dbs[i].config) } - // Give some time for the reconnect routine to run, it runs every 60s + // Give some time for the rejoin routine (rejoinClusterBootStrap) to run; it runs every 6s. check = func(t poll.LogT) poll.Result { // Verify that the cluster is again all connected. Note that the 3 previous node did not do any join for i := 0; i < 5; i++ { @@ -908,6 +924,6 @@ func TestNetworkDBIslands(t *testing.T) { } return poll.Success() } - poll.WaitOn(t, check, poll.WithDelay(10*time.Second), poll.WithTimeout(120*time.Second)) + poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout())) closeNetworkDBInstances(t, dbs) }