libnetwork: make rejoin intervals configurable

This allows the rejoin intervals to be chosen according to the context within which the component is used, and, in particular, this allows lower intervals to be used within TestNetworkDBIslands test. Signed-off-by: Roman Volosatovs <roman.volosatovs@docker.com>
2022-11-09 12:21:53 -05:00 · 2021-07-12 11:12:56 +02:00 · 2021-07-12 11:12:56 +02:00 · d7a2635537
commit d7a2635537
parent c81abefdb1
3 changed files with 45 additions and 20 deletions
--- a/libnetwork/networkdb/cluster.go
+++ b/libnetwork/networkdb/cluster.go
@ -18,12 +18,10 @@ import (
 )

 const (
-	reapPeriod            = 5 * time.Second
-	rejoinClusterDuration = 10 * time.Second
-	rejoinInterval        = 60 * time.Second
-	retryInterval         = 1 * time.Second
-	nodeReapInterval      = 24 * time.Hour
-	nodeReapPeriod        = 2 * time.Hour
+	reapPeriod       = 5 * time.Second
+	retryInterval    = 1 * time.Second
+	nodeReapInterval = 24 * time.Hour
+	nodeReapPeriod   = 2 * time.Hour
 	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
 	// the following is roughly 1 minute
 	maxQueueLenBroadcastOnSync = 500
@ -172,7 +170,7 @@ func (nDB *NetworkDB) clusterInit() error {
 		{config.PushPullInterval, nDB.bulkSyncTables},
 		{retryInterval, nDB.reconnectNode},
 		{nodeReapPeriod, nDB.reapDeadNode},
-		{rejoinInterval, nDB.rejoinClusterBootStrap},
+		{nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap},
 	} {
 		t := time.NewTicker(trigger.interval)
 		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
@ -210,7 +208,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {

 	if _, err := mlist.Join(members); err != nil {
 		// In case of failure, we no longer need to explicitly call retryJoin.
-		// rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec
+		// rejoinClusterBootStrap, which runs every nDB.config.rejoinClusterInterval,
+		// will retryJoin for nDB.config.rejoinClusterDuration.
 		return fmt.Errorf("could not join node to memberlist: %v", err)
 	}

@ -324,7 +323,7 @@ func (nDB *NetworkDB) rejoinClusterBootStrap() {
 	}
 	// None of the bootStrap nodes are in the cluster, call memberlist join
 	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
-	ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration)
+	ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration)
 	defer cancel()
 	nDB.retryJoin(ctx, bootStrapIPs)
 }
--- a/libnetwork/networkdb/networkdb.go
+++ b/libnetwork/networkdb/networkdb.go
@ -192,6 +192,14 @@ type Config struct {
 	// NOTE this MUST always be higher than reapEntryInterval
 	reapNetworkInterval time.Duration

+	// rejoinClusterDuration represents retryJoin timeout used by rejoinClusterBootStrap.
+	// Default is 10sec.
+	rejoinClusterDuration time.Duration
+
+	// rejoinClusterInterval represents interval on which rejoinClusterBootStrap runs.
+	// Default is 60sec.
+	rejoinClusterInterval time.Duration
+
 	// StatsPrintPeriod the period to use to print queue stats
 	// Default is 5min
 	StatsPrintPeriod time.Duration
@ -225,13 +233,15 @@ type entry struct {
 func DefaultConfig() *Config {
 	hostname, _ := os.Hostname()
 	return &Config{
-		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
-		Hostname:          hostname,
-		BindAddr:          "0.0.0.0",
-		PacketBufferSize:  1400,
-		StatsPrintPeriod:  5 * time.Minute,
-		HealthPrintPeriod: 1 * time.Minute,
-		reapEntryInterval: 30 * time.Minute,
+		NodeID:                stringid.TruncateID(stringid.GenerateRandomID()),
+		Hostname:              hostname,
+		BindAddr:              "0.0.0.0",
+		PacketBufferSize:      1400,
+		StatsPrintPeriod:      5 * time.Minute,
+		HealthPrintPeriod:     1 * time.Minute,
+		reapEntryInterval:     30 * time.Minute,
+		rejoinClusterDuration: 10 * time.Second,
+		rejoinClusterInterval: 60 * time.Second,
 	}
 }

--- a/libnetwork/networkdb/networkdb_test.go
+++ b/libnetwork/networkdb/networkdb_test.go
@ -819,8 +819,24 @@ func TestParallelDelete(t *testing.T) {
 }

 func TestNetworkDBIslands(t *testing.T) {
+	pollTimeout := func() time.Duration {
+		const defaultTimeout = 120 * time.Second
+		dl, ok := t.Deadline()
+		if !ok {
+			return defaultTimeout
+		}
+		if d := time.Until(dl); d <= defaultTimeout {
+			return d
+		}
+		return defaultTimeout
+	}
+
 	logrus.SetLevel(logrus.DebugLevel)
-	dbs := createNetworkDBInstances(t, 5, "node", DefaultConfig())
+	conf := DefaultConfig()
+	// Shorten durations to speed up test execution.
+	conf.rejoinClusterDuration = conf.rejoinClusterDuration / 10
+	conf.rejoinClusterInterval = conf.rejoinClusterInterval / 10
+	dbs := createNetworkDBInstances(t, 5, "node", conf)

 	// Get the node IP used currently
 	node := dbs[0].nodes[dbs[0].config.NodeID]
@ -868,7 +884,7 @@ func TestNetworkDBIslands(t *testing.T) {
 		}
 		return poll.Success()
 	}
-	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(120*time.Second))
+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))

 	// Spawn again the first 3 nodes with different names but same IP:port
 	for i := 0; i < 3; i++ {
@ -877,7 +893,7 @@ func TestNetworkDBIslands(t *testing.T) {
 		dbs[i] = launchNode(t, *dbs[i].config)
 	}

-	// Give some time for the reconnect routine to run, it runs every 60s
+	// Give some time for the reconnect routine to run, it runs every 6s.
 	check = func(t poll.LogT) poll.Result {
 		// Verify that the cluster is again all connected. Note that the 3 previous node did not do any join
 		for i := 0; i < 5; i++ {
@ -908,6 +924,6 @@ func TestNetworkDBIslands(t *testing.T) {
 		}
 		return poll.Success()
 	}
-	poll.WaitOn(t, check, poll.WithDelay(10*time.Second), poll.WithTimeout(120*time.Second))
+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
 	closeNetworkDBInstances(t, dbs)
 }