131 lines
4.2 KiB
Go
131 lines
4.2 KiB
Go
package memberlist
|
|
|
|
import (
|
|
"math"
|
|
"sync/atomic"
|
|
"time"
|
|
)
|
|
|
|
// suspicion tracks the suspect timer for a single node. The timeout starts
// out long and can be shortened each time another independent peer confirms
// that the node is suspect.
type suspicion struct {
	// n counts the independent confirmations received so far. The timer
	// callback reads it too, so it must only be accessed with atomic
	// instructions.
	n int32

	// k is how many independent confirmations are wanted in order to drive
	// the timer down to its minimum value.
	k int32

	// min and max are the lower and upper bounds of the timer value.
	min, max time.Duration

	// start is the timestamp taken when the timer was first started. Later
	// updates subtract the time elapsed since start so that the overall
	// timeout comes out as intended.
	start time.Time

	// timer is the underlying timer that implements the timeout.
	timer *time.Timer

	// timeoutFn is the function to run when the timer expires. It is kept
	// here because in some cases it is called directly instead of by the
	// timer.
	timeoutFn func()

	// confirmations is the set of "from" nodes that have already confirmed
	// the node is suspect, which prevents counting any peer twice.
	confirmations map[string]struct{}
}
|
|
|
|
// newSuspicion returns a timer started with the max time, and that will drive
|
|
// to the min time after seeing k or more confirmations. The from node will be
|
|
// excluded from confirmations since we might get our own suspicion message
|
|
// gossiped back to us. The minimum time will be used if no confirmations are
|
|
// called for (k <= 0).
|
|
func newSuspicion(from string, k int, min time.Duration, max time.Duration, fn func(int)) *suspicion {
|
|
s := &suspicion{
|
|
k: int32(k),
|
|
min: min,
|
|
max: max,
|
|
confirmations: make(map[string]struct{}),
|
|
}
|
|
|
|
// Exclude the from node from any confirmations.
|
|
s.confirmations[from] = struct{}{}
|
|
|
|
// Pass the number of confirmations into the timeout function for
|
|
// easy telemetry.
|
|
s.timeoutFn = func() {
|
|
fn(int(atomic.LoadInt32(&s.n)))
|
|
}
|
|
|
|
// If there aren't any confirmations to be made then take the min
|
|
// time from the start.
|
|
timeout := max
|
|
if k < 1 {
|
|
timeout = min
|
|
}
|
|
s.timer = time.AfterFunc(timeout, s.timeoutFn)
|
|
|
|
// Capture the start time right after starting the timer above so
|
|
// we should always err on the side of a little longer timeout if
|
|
// there's any preemption that separates this and the step above.
|
|
s.start = time.Now()
|
|
return s
|
|
}
|
|
|
|
// remainingSuspicionTime calculates how much longer to wait before considering
// a node dead, given n confirmations seen out of the k wanted and the time
// elapsed since the timer was started. The overall timeout falls off
// logarithmically from max (n == 0) to min (n >= k), is rounded down to whole
// milliseconds and is never less than min. When k < 1 no confirmations are
// called for and the overall timeout is simply min.
//
// The return value is the overall timeout minus elapsed, so it can be
// negative; be prepared to fire the timer immediately in that case.
func remainingSuspicionTime(n, k int32, elapsed time.Duration, min, max time.Duration) time.Duration {
	// For k < 1 the ratio below would divide by log(1) == 0 (or take the
	// log of a non-positive number) and yield NaN or an infinity, and the
	// result of converting those to an integer is implementation-dependent.
	// Use the minimum directly, which is also what newSuspicion does when
	// k <= 0.
	if k < 1 {
		return min - elapsed
	}

	frac := math.Log(float64(n)+1.0) / math.Log(float64(k)+1.0)
	raw := max.Seconds() - frac*(max.Seconds()-min.Seconds())
	timeout := time.Duration(math.Floor(1000.0*raw)) * time.Millisecond
	if timeout < min {
		timeout = min
	}

	// Account for the time that has already passed so the overall timeout
	// comes out right.
	return timeout - elapsed
}
|
|
|
|
// Confirm registers that a possibly new peer has also determined the given
|
|
// node is suspect. This returns true if this was new information, and false
|
|
// if it was a duplicate confirmation, or if we've got enough confirmations to
|
|
// hit the minimum.
|
|
func (s *suspicion) Confirm(from string) bool {
|
|
// If we've got enough confirmations then stop accepting them.
|
|
if atomic.LoadInt32(&s.n) >= s.k {
|
|
return false
|
|
}
|
|
|
|
// Only allow one confirmation from each possible peer.
|
|
if _, ok := s.confirmations[from]; ok {
|
|
return false
|
|
}
|
|
s.confirmations[from] = struct{}{}
|
|
|
|
// Compute the new timeout given the current number of confirmations and
|
|
// adjust the timer. If the timeout becomes negative *and* we can cleanly
|
|
// stop the timer then we will call the timeout function directly from
|
|
// here.
|
|
n := atomic.AddInt32(&s.n, 1)
|
|
elapsed := time.Since(s.start)
|
|
remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max)
|
|
if s.timer.Stop() {
|
|
if remaining > 0 {
|
|
s.timer.Reset(remaining)
|
|
} else {
|
|
go s.timeoutFn()
|
|
}
|
|
}
|
|
return true
|
|
}
|