1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00

Update libnetwork dependencies for b66c038

Signed-off-by: Alessandro Boch <aboch@docker.com>
This commit is contained in:
Alessandro Boch 2016-05-08 00:31:30 -07:00
parent 4c6838137c
commit ebcf785f2f
59 changed files with 4049 additions and 379 deletions

View file

@ -9,7 +9,7 @@ source 'hack/.vendor-helpers.sh'
clone git github.com/Azure/go-ansiterm 388960b655244e76e24c75f48631564eaefade62
clone git github.com/Microsoft/hcsshim v0.2.2
clone git github.com/Microsoft/go-winio v0.3.4
clone git github.com/Sirupsen/logrus v0.9.0 # logrus is a common dependency among multiple deps
clone git github.com/Sirupsen/logrus v0.10.0 # logrus is a common dependency among multiple deps
clone git github.com/docker/libtrust 9cbd2a1374f46905c68a4eb3694a130610adc62a
clone git github.com/go-check/check 03a4d9dcf2f92eae8e90ed42aa2656f63fdd0b14 https://github.com/cpuguy83/check.git
clone git github.com/gorilla/context 14f550f51a
@ -30,11 +30,14 @@ clone git github.com/imdario/mergo 0.2.1
#get libnetwork packages
clone git github.com/docker/libnetwork v0.8.0-dev.1
clone git github.com/docker/go-events 2e7d352816128aa84f4d29b2a21d400133701a0d
clone git github.com/armon/go-radix e39d623f12e8e41c7b5529e9a9dd67a1e2261f80
clone git github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
clone git github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b
clone git github.com/hashicorp/memberlist 9a1e242e454d2443df330bdd51a436d5a9058fc4
clone git github.com/hashicorp/serf 7151adcef72687bf95f451a2e0ba15cb19412bf2
clone git github.com/docker/libkv c2aac5dbbaa5c872211edea7c0f32b3bd67e7410
clone git github.com/hashicorp/memberlist 88ac4de0d1a0ca6def284b571342db3b777a4c37
clone git github.com/hashicorp/go-multierror fcdddc395df1ddf4247c69bd436e84cfa0733f7e
clone git github.com/hashicorp/serf 598c54895cc5a7b1a24a398d635e8c0ea0959870
clone git github.com/docker/libkv 7283ef27ed32fe267388510a91709b307bb9942c
clone git github.com/vishvananda/netns 604eaf189ee867d8c147fafc28def2394e878d25
clone git github.com/vishvananda/netlink 631962935bff4f3d20ff32a72e8944f6d2836a26
clone git github.com/BurntSushi/toml f706d00e3de6abe700c994cdd545a1a4915af060

View file

@ -1,3 +1,10 @@
# 0.10.0
* feature: Add a test hook (#180)
* feature: `ParseLevel` is now case-insensitive (#326)
* feature: `FieldLogger` interface that generalizes `Logger` and `Entry` (#308)
* performance: avoid re-allocations on `WithFields` (#335)
# 0.9.0
* logrus/text_formatter: don't emit empty msg

View file

@ -1,4 +1,4 @@
# Logrus <img src="http://i.imgur.com/hTeVwmJ.png" width="40" height="40" alt=":walrus:" class="emoji" title=":walrus:"/>&nbsp;[![Build Status](https://travis-ci.org/Sirupsen/logrus.svg?branch=master)](https://travis-ci.org/Sirupsen/logrus)&nbsp;[![godoc reference](https://godoc.org/github.com/Sirupsen/logrus?status.png)][godoc]
# Logrus <img src="http://i.imgur.com/hTeVwmJ.png" width="40" height="40" alt=":walrus:" class="emoji" title=":walrus:"/>&nbsp;[![Build Status](https://travis-ci.org/Sirupsen/logrus.svg?branch=master)](https://travis-ci.org/Sirupsen/logrus)&nbsp;[![GoDoc](https://godoc.org/github.com/Sirupsen/logrus?status.svg)](https://godoc.org/github.com/Sirupsen/logrus)
Logrus is a structured logger for Go (golang), completely API compatible with
the standard library logger. [Godoc][godoc]. **Please note the Logrus API is not
@ -12,7 +12,7 @@ plain text):
![Colored](http://i.imgur.com/PY7qMwd.png)
With `log.Formatter = new(logrus.JSONFormatter)`, for easy parsing by logstash
With `log.SetFormatter(&log.JSONFormatter{})`, for easy parsing by logstash
or Splunk:
```json
@ -32,7 +32,7 @@ ocean","size":10,"time":"2014-03-10 19:57:38.562264131 -0400 EDT"}
"time":"2014-03-10 19:57:38.562543128 -0400 EDT"}
```
With the default `log.Formatter = new(&log.TextFormatter{})` when a TTY is not
With the default `log.SetFormatter(&log.TextFormatter{})` when a TTY is not
attached, the output is compatible with the
[logfmt](http://godoc.org/github.com/kr/logfmt) format:
@ -222,6 +222,11 @@ Note: Syslog hook also support connecting to local syslog (Ex. "/dev/log" or "/v
| [Octokit](https://github.com/dorajistyle/logrus-octokit-hook) | Hook for logging to github via octokit |
| [DeferPanic](https://github.com/deferpanic/dp-logrus) | Hook for logging to DeferPanic |
| [Redis-Hook](https://github.com/rogierlommers/logrus-redis-hook) | Hook for logging to an ELK stack (through Redis) |
| [Amqp-Hook](https://github.com/vladoatanasov/logrus_amqp) | Hook for logging to Amqp broker (Like RabbitMQ) |
| [KafkaLogrus](https://github.com/goibibo/KafkaLogrus) | Hook for logging to kafka |
| [Typetalk](https://github.com/dragon3/logrus-typetalk-hook) | Hook for logging to [Typetalk](https://www.typetalk.in/) |
| [ElasticSearch](https://github.com/sohlich/elogrus) | Hook for logging to ElasticSearch|
#### Level logging
@ -363,4 +368,21 @@ entries. It should not be a feature of the application-level logger.
| ---- | ----------- |
|[Logrus Mate](https://github.com/gogap/logrus_mate)|Logrus Mate is a tool for Logrus to manage loggers: you can initialize a logger's level, hooks and formatter from a config file, so the logger is generated with a different configuration in each environment.|
[godoc]: https://godoc.org/github.com/Sirupsen/logrus
#### Testing
Logrus has a built in facility for asserting the presence of log messages. This is implemented through the `test` hook and provides:
* decorators for existing logger (`test.NewLocal` and `test.NewGlobal`) which basically just add the `test` hook
* a test logger (`test.NewNullLogger`) that just records log messages (and does not output any):
```go
logger, hook := NewNullLogger()
logger.Error("Hello error")
assert.Equal(1, len(hook.Entries))
assert.Equal(logrus.ErrorLevel, hook.LastEntry().Level)
assert.Equal("Hello error", hook.LastEntry().Message)
hook.Reset()
assert.Nil(hook.LastEntry())
```

View file

@ -68,7 +68,7 @@ func (entry *Entry) WithField(key string, value interface{}) *Entry {
// Add a map of fields to the Entry.
func (entry *Entry) WithFields(fields Fields) *Entry {
data := Fields{}
data := make(Fields, len(entry.Data)+len(fields))
for k, v := range entry.Data {
data[k] = v
}

View file

@ -3,6 +3,7 @@ package logrus
import (
"fmt"
"log"
"strings"
)
// Fields type, used to pass to `WithFields`.
@ -33,7 +34,7 @@ func (level Level) String() string {
// ParseLevel takes a string level and returns the Logrus log level constant.
func ParseLevel(lvl string) (Level, error) {
switch lvl {
switch strings.ToLower(lvl) {
case "panic":
return PanicLevel, nil
case "fatal":
@ -52,6 +53,16 @@ func ParseLevel(lvl string) (Level, error) {
return l, fmt.Errorf("not a valid logrus Level: %q", lvl)
}
// AllLevels is a slice exposing all logging levels. It is declared with var,
// so it is a package-level variable rather than a true constant.
var AllLevels = []Level{
PanicLevel,
FatalLevel,
ErrorLevel,
WarnLevel,
InfoLevel,
DebugLevel,
}
// These are the different logging levels. You can set the logging level to log
// on your instance of logger, obtained with `logrus.New()`.
const (
@ -96,3 +107,37 @@ type StdLogger interface {
Panicf(string, ...interface{})
Panicln(...interface{})
}
// The FieldLogger interface generalizes the Entry and Logger types. It
// declares the field-attaching methods (WithField, WithFields, WithError)
// followed by three families of logging methods for each level: formatted
// (the "f" suffix), plain, and line-oriented (the "ln" suffix).
type FieldLogger interface {
// Methods returning an *Entry built from the supplied field(s) or error.
WithField(key string, value interface{}) *Entry
WithFields(fields Fields) *Entry
WithError(err error) *Entry
// Formatted variants: a format string followed by its arguments.
Debugf(format string, args ...interface{})
Infof(format string, args ...interface{})
Printf(format string, args ...interface{})
Warnf(format string, args ...interface{})
Warningf(format string, args ...interface{})
Errorf(format string, args ...interface{})
Fatalf(format string, args ...interface{})
Panicf(format string, args ...interface{})
// Plain variants: arguments only.
Debug(args ...interface{})
Info(args ...interface{})
Print(args ...interface{})
Warn(args ...interface{})
Warning(args ...interface{})
Error(args ...interface{})
Fatal(args ...interface{})
Panic(args ...interface{})
// Line variants (the "ln" suffix).
Debugln(args ...interface{})
Infoln(args ...interface{})
Println(args ...interface{})
Warnln(args ...interface{})
Warningln(args ...interface{})
Errorln(args ...interface{})
Fatalln(args ...interface{})
Panicln(args ...interface{})
}

View file

@ -0,0 +1,22 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe

View file

@ -0,0 +1,3 @@
language: go
go:
- tip

View file

@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2014 Armon Dadgar
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -0,0 +1,36 @@
go-radix [![Build Status](https://travis-ci.org/armon/go-radix.png)](https://travis-ci.org/armon/go-radix)
=========
Provides the `radix` package that implements a [radix tree](http://en.wikipedia.org/wiki/Radix_tree).
The package only provides a single `Tree` implementation, optimized for sparse nodes.
As a radix tree, it provides the following:
* O(k) operations. In many cases, this can be faster than a hash table since
the hash function is an O(k) operation, and hash tables have very poor cache locality.
* Minimum / Maximum value lookups
* Ordered iteration
Documentation
=============
The full documentation is available on [Godoc](http://godoc.org/github.com/armon/go-radix).
Example
=======
Below is a simple example of usage
```go
// Create a tree
r := radix.New()
r.Insert("foo", 1)
r.Insert("bar", 2)
r.Insert("foobar", 2)
// Find the longest prefix match
m, _, _ := r.LongestPrefix("foozip")
if m != "foo" {
panic("should be foo")
}
```

View file

@ -0,0 +1,467 @@
package radix
import (
"sort"
"strings"
)
// WalkFn is the callback invoked for each entry visited while walking the
// tree. It receives the entry's key and value; returning true terminates
// the iteration, returning false lets it continue.
type WalkFn func(s string, v interface{}) bool
// leafNode is used to represent a value stored in the tree.
type leafNode struct {
// key is the complete key the value was inserted under (Insert stores
// the full key here, not just the fragment held by the owning node).
key string
// val is the value supplied by the caller.
val interface{}
}
// edge is used to represent an edge node: a labelled link from a node to
// one of its children.
type edge struct {
// label is the byte the edge is looked up by; Insert always sets it to
// the first byte of the child's prefix.
label byte
// node is the child reached through this edge.
node *node
}
// node is a single vertex of the radix tree. It may carry a value (leaf),
// outgoing edges to children, or both.
type node struct {
// leaf holds the stored entry, or nil when no key ends at this node.
leaf *leafNode
// prefix is the key fragment consumed when descending into this node;
// lookups strip it from the remaining search key.
prefix string
// Edges should be stored in-order for iteration.
// We avoid a fully materialized slice to save memory,
// since in most cases we expect to be sparse
edges edges
}
// isLeaf reports whether this node stores a value, i.e. whether its leaf
// pointer is set. A node for which this is true may still have edges.
func (n *node) isLeaf() bool {
return n.leaf != nil
}
// addEdge attaches e to the node and then re-sorts the edge list by label
// so that the binary searches in getEdge and replaceEdge stay valid.
func (n *node) addEdge(e edge) {
	grown := append(n.edges, e)
	n.edges = grown
	sort.Sort(n.edges)
}
// replaceEdge points the existing edge carrying e.label at e.node. It
// panics with "replacing missing edge" when the node has no such edge.
func (n *node) replaceEdge(e edge) {
	// Edges are kept sorted by label, so a binary search finds the slot.
	pos := sort.Search(len(n.edges), func(i int) bool {
		return e.label <= n.edges[i].label
	})
	if pos >= len(n.edges) || n.edges[pos].label != e.label {
		panic("replacing missing edge")
	}
	n.edges[pos].node = e.node
}
// getEdge returns the child reached through the edge labelled label, or
// nil when the node has no edge with that label.
func (n *node) getEdge(label byte) *node {
	// Edges are kept sorted by label, so a binary search finds the slot.
	pos := sort.Search(len(n.edges), func(i int) bool {
		return label <= n.edges[i].label
	})
	if pos >= len(n.edges) || n.edges[pos].label != label {
		return nil
	}
	return n.edges[pos].node
}
// edges is a list of edge values. It implements sort.Interface, ordering
// the entries by ascending label.
type edges []edge

// Len reports how many edges the list holds.
func (e edges) Len() int {
	return len(e)
}

// Less reports whether the edge at index i has a smaller label than the
// edge at index j.
func (e edges) Less(i, j int) bool {
	left, right := e[i].label, e[j].label
	return left < right
}

// Swap exchanges the edges at indices i and j.
func (e edges) Swap(i, j int) {
	first, second := e[i], e[j]
	e[i], e[j] = second, first
}

// Sort orders the list in place by label.
func (e edges) Sort() {
	sort.Sort(e)
}
// Tree implements a radix tree. This can be treated as a
// Dictionary abstract data type. The main advantage over
// a standard hash map is prefix-based lookups and
// ordered iteration.
type Tree struct {
// root is the node every operation starts from; NewFromMap allocates it.
root *node
// size is the number of stored entries, as returned by Len.
size int
}
// New returns an empty Tree, i.e. the tree NewFromMap builds from a nil map.
func New() *Tree {
	var none map[string]interface{}
	return NewFromMap(none)
}
// NewFromMap builds a new tree and inserts every key/value pair of m into
// it. A nil or empty map yields an empty tree.
func NewFromMap(m map[string]interface{}) *Tree {
	tree := new(Tree)
	tree.root = new(node)
	for key, val := range m {
		tree.Insert(key, val)
	}
	return tree
}
// Len returns the number of elements in the tree, as tracked by the size
// counter that Insert and Delete maintain.
func (t *Tree) Len() int {
return t.size
}
// longestPrefix returns the number of leading bytes that k1 and k2 have in
// common. The comparison is byte-wise and never exceeds the shorter input.
func longestPrefix(k1, k2 string) int {
	limit := len(k2)
	if len(k1) < limit {
		limit = len(k1)
	}
	for pos := 0; pos < limit; pos++ {
		if k1[pos] != k2[pos] {
			return pos
		}
	}
	return limit
}
// Insert adds a new entry or updates an existing one. When the key already
// exists it returns the previous value and true; otherwise it returns
// (nil, false) after storing the new entry.
func (t *Tree) Insert(s string, v interface{}) (interface{}, bool) {
	cur := t.root
	rest := s
	for {
		// Key exhausted: the entry belongs on the current node.
		if len(rest) == 0 {
			if cur.isLeaf() {
				prev := cur.leaf.val
				cur.leaf.val = v
				return prev, true
			}
			cur.leaf = &leafNode{key: s, val: v}
			t.size++
			return nil, false
		}

		// Follow the edge labelled with the next byte of the key.
		above := cur
		cur = cur.getEdge(rest[0])

		// No such edge: hang a new node holding the remaining key off the parent.
		if cur == nil {
			fresh := &node{
				leaf:   &leafNode{key: s, val: v},
				prefix: rest,
			}
			above.addEdge(edge{label: rest[0], node: fresh})
			t.size++
			return nil, false
		}

		// The child's prefix is matched in full: consume it and keep descending.
		shared := longestPrefix(rest, cur.prefix)
		if shared == len(cur.prefix) {
			rest = rest[shared:]
			continue
		}

		// Partial match: split the child at the point of divergence. The new
		// intermediate node takes over the shared part of the prefix.
		t.size++
		split := &node{prefix: rest[:shared]}
		above.replaceEdge(edge{label: rest[0], node: split})

		// Re-attach the existing child below the split with its shortened prefix.
		split.addEdge(edge{label: cur.prefix[shared], node: cur})
		cur.prefix = cur.prefix[shared:]

		entry := &leafNode{key: s, val: v}

		// If the new key ends exactly at the split point, store it there.
		rest = rest[shared:]
		if len(rest) == 0 {
			split.leaf = entry
			return nil, false
		}

		// Otherwise the rest of the key becomes a second child of the split.
		split.addEdge(edge{
			label: rest[0],
			node:  &node{leaf: entry, prefix: rest},
		})
		return nil, false
	}
}
// Delete removes the key s from the tree. It returns the value that was
// stored under s and true, or (nil, false) when s is not present.
//
// After removing the value the tree is compacted so that lookups and
// Minimum/Maximum keep working:
//   - a node left with neither value nor edges is unlinked from its parent;
//   - a non-root node left with a single edge is merged with that child;
//   - a non-root parent left with a single edge and no value is merged too.
//
// The root is never merged: it must keep an empty prefix because every
// lookup starts there without consuming any of the key.
func (t *Tree) Delete(s string) (interface{}, bool) {
	var parent *node
	var label byte
	n := t.root
	search := s
	for len(search) > 0 {
		// Look for an edge, remembering where we came from.
		parent = n
		label = search[0]
		n = n.getEdge(label)
		if n == nil || !strings.HasPrefix(search, n.prefix) {
			return nil, false
		}

		// Consume the search prefix
		search = search[len(n.prefix):]
	}

	// Key exhausted: the node must actually hold a value.
	if !n.isLeaf() {
		return nil, false
	}

	// Delete the leaf
	leaf := n.leaf
	n.leaf = nil
	t.size--

	// Unlink the node from its parent if nothing is left in it; a dead
	// node would otherwise be visited by Minimum/Maximum.
	if parent != nil && len(n.edges) == 0 {
		num := len(parent.edges)
		idx := sort.Search(num, func(i int) bool {
			return parent.edges[i].label >= label
		})
		if idx < num && parent.edges[idx].label == label {
			copy(parent.edges[idx:], parent.edges[idx+1:])
			parent.edges[num-1] = edge{} // drop the stale pointer
			parent.edges = parent.edges[:num-1]
		}
	}

	// Merge this node with its only child (never the root, see above).
	if n != t.root && len(n.edges) == 1 {
		child := n.edges[0].node
		n.prefix = n.prefix + child.prefix
		n.leaf = child.leaf
		n.edges = child.edges
	}

	// The parent may now be a value-less pass-through node; merge it as well.
	if parent != nil && parent != t.root && len(parent.edges) == 1 && !parent.isLeaf() {
		child := parent.edges[0].node
		parent.prefix = parent.prefix + child.prefix
		parent.leaf = child.leaf
		parent.edges = child.edges
	}

	return leaf.val, true
}
// Get looks up the exact key s. It returns the stored value and true when
// the key is present, and (nil, false) otherwise.
func (t *Tree) Get(s string) (interface{}, bool) {
	cur, rest := t.root, s
	for len(rest) > 0 {
		// Follow the edge for the next byte; the child's whole prefix must
		// match the remaining key for the descent to continue.
		cur = cur.getEdge(rest[0])
		if cur == nil || !strings.HasPrefix(rest, cur.prefix) {
			return nil, false
		}
		rest = rest[len(cur.prefix):]
	}

	// Key exhausted: it is present only if this node holds a value.
	if cur.isLeaf() {
		return cur.leaf.val, true
	}
	return nil, false
}
// LongestPrefix is like Get, but instead of requiring an exact match it
// returns the entry whose key is the longest stored prefix of s. The
// results are the matching key, its value, and whether any entry matched.
func (t *Tree) LongestPrefix(s string) (string, interface{}, bool) {
	var best *leafNode
	cur, rest := t.root, s
	for {
		// Every value-bearing node on the path is a candidate; deeper wins.
		if cur.isLeaf() {
			best = cur.leaf
		}

		// Stop once the key is used up.
		if len(rest) == 0 {
			break
		}

		// Descend, giving up as soon as the path diverges from s.
		cur = cur.getEdge(rest[0])
		if cur == nil || !strings.HasPrefix(rest, cur.prefix) {
			break
		}
		rest = rest[len(cur.prefix):]
	}

	if best == nil {
		return "", nil, false
	}
	return best.key, best.val, true
}
// Minimum returns the key and value of the first entry found by starting at
// the root and repeatedly following the lowest-labelled edge, stopping at
// the first node that holds a value. The boolean is false when no entry is
// found that way.
func (t *Tree) Minimum() (string, interface{}, bool) {
	cur := t.root
	for !cur.isLeaf() {
		if len(cur.edges) == 0 {
			return "", nil, false
		}
		cur = cur.edges[0].node
	}
	return cur.leaf.key, cur.leaf.val, true
}
// Maximum returns the key and value found by starting at the root and
// repeatedly following the highest-labelled edge until a node without
// edges is reached. The boolean is false when that node holds no value.
func (t *Tree) Maximum() (string, interface{}, bool) {
	cur := t.root
	for len(cur.edges) > 0 {
		cur = cur.edges[len(cur.edges)-1].node
	}
	if !cur.isLeaf() {
		return "", nil, false
	}
	return cur.leaf.key, cur.leaf.val, true
}
// Walk visits the entries of the whole tree by running recursiveWalk from
// the root, calling fn for each entry. The walk stops early as soon as fn
// returns true.
func (t *Tree) Walk(fn WalkFn) {
recursiveWalk(t.root, fn)
}
// WalkPrefix walks the entries stored under the given prefix, calling fn
// for each of them. Nothing is visited when no key starts with prefix.
func (t *Tree) WalkPrefix(prefix string, fn WalkFn) {
	cur, rest := t.root, prefix
	for len(rest) > 0 {
		// Look for an edge matching the next byte of the prefix.
		cur = cur.getEdge(rest[0])
		if cur == nil {
			return
		}

		switch {
		case strings.HasPrefix(rest, cur.prefix):
			// The node's fragment is fully covered: consume it and descend.
			rest = rest[len(cur.prefix):]
		case strings.HasPrefix(cur.prefix, rest):
			// The prefix ends inside this node's fragment, so the whole
			// subtree lies under the search prefix.
			recursiveWalk(cur, fn)
			return
		default:
			return
		}
	}

	// Prefix exhausted exactly at a node boundary: walk that subtree.
	recursiveWalk(cur, fn)
}
// WalkPath walks the tree along a single path, visiting only the entries
// found on the way from the root down towards path. Where WalkPrefix
// visits every entry *under* a prefix, WalkPath visits the entries *above*
// it. The walk ends early when fn returns true.
func (t *Tree) WalkPath(path string, fn WalkFn) {
	cur, rest := t.root, path
	for {
		// Report the value held by the current node, if any.
		if leaf := cur.leaf; leaf != nil && fn(leaf.key, leaf.val) {
			return
		}

		// Nothing left of the path to follow.
		if len(rest) == 0 {
			return
		}

		// Descend; stop as soon as the tree diverges from the path.
		cur = cur.getEdge(rest[0])
		if cur == nil || !strings.HasPrefix(rest, cur.prefix) {
			return
		}
		rest = rest[len(cur.prefix):]
	}
}
// recursiveWalk performs a pre-order walk of the subtree rooted at n: the
// node's own entry is reported first, then each child in edge order. It
// returns true when fn asked for the walk to be aborted.
func recursiveWalk(n *node, fn WalkFn) bool {
	// Report this node's entry before looking at its children.
	if leaf := n.leaf; leaf != nil {
		if fn(leaf.key, leaf.val) {
			return true
		}
	}

	// Visit the children, propagating an abort upwards immediately.
	for _, link := range n.edges {
		if aborted := recursiveWalk(link.node, fn); aborted {
			return true
		}
	}
	return false
}
// ToMap walks the whole tree and returns its entries as a newly allocated
// map from key to value.
func (t *Tree) ToMap() map[string]interface{} {
	result := make(map[string]interface{}, t.size)
	collect := func(key string, val interface{}) bool {
		result[key] = val
		// Never abort: every entry must be copied.
		return false
	}
	t.Walk(collect)
	return result
}

View file

@ -0,0 +1,24 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof

View file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -0,0 +1,112 @@
# Docker Events Package
[![GoDoc](https://godoc.org/github.com/docker/go-events?status.svg)](https://godoc.org/github.com/docker/go-events)
[![Circle CI](https://circleci.com/gh/docker/go-events.svg?style=shield)](https://circleci.com/gh/docker/go-events)
The Docker `events` package implements a composable event distribution package
for Go.
Originally created to implement the [notifications in Docker Registry
2](https://github.com/docker/distribution/blob/master/docs/notifications.md),
we've found the pattern to be useful in other applications. This package is
most of the same code with slightly updated interfaces. Much of the internals
have been made available.
## Usage
The `events` package centers around a `Sink` type. Events are written with
calls to `Sink.Write(event Event)`. Sinks can be wired up in various
configurations to achieve interesting behavior.
The canonical example is that employed by the
[docker/distribution/notifications](https://godoc.org/github.com/docker/distribution/notifications)
package. Let's say we have a type `httpSink` where we'd like to queue
notifications. As a rule, it should send a single http request and return an
error if it fails:
```go
// Write sends the event as the JSON body of a single POST request to the
// configured URL and returns an error if marshalling, the request itself,
// or the status check fails.
func (h *httpSink) Write(event Event) error {
	p, err := json.Marshal(event)
	if err != nil {
		return err
	}
	body := bytes.NewReader(p)
	resp, err := h.client.Post(h.url, "application/json", body)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Status is the textual status line (e.g. "200 OK"); the numeric code
	// to compare against is StatusCode.
	if resp.StatusCode != http.StatusOK {
		return errors.New("unexpected status")
	}

	return nil
}
// implement (*httpSink).Close()
```
With just that, we can start using components from this package. One can call
`(*httpSink).Write` to send events as the body of a post request to a
configured URL.
### Retries
HTTP can be unreliable. The first feature we'd like is to have some retry:
```go
hs := newHTTPSink(/*...*/)
retry := NewRetryingSink(hs, NewBreaker(5, time.Second))
```
We now have a sink that will retry events against the `httpSink` until they
succeed. The retry will backoff for one second after 5 consecutive failures
using the breaker strategy.
### Queues
This isn't quite enough. We want a sink that doesn't block while we are
waiting for events to be sent. Let's add a `Queue`:
```go
queue := NewQueue(retry)
```
Now, we have an unbounded queue that will work through all events sent with
`(*Queue).Write`. Events can be added asynchronously to the queue without
blocking the current execution path. This is ideal for use in an http request.
### Broadcast
It usually turns out that you want to send to more than one listener. We can
use `Broadcaster` to support this:
```go
var broadcast = NewBroadcaster() // make it available somewhere in your application.
broadcast.Add(queue) // add your queue!
broadcast.Add(queue2) // and another!
```
With the above, we can now call `broadcast.Write` in our http handlers and have
all the events distributed to each queue. Because the events are queued, no
listener blocks another.
### Extending
For the most part, the above is sufficient for a lot of applications. However,
extending the above functionality can be done by implementing your own `Sink`. The
behavior and semantics of the sink can be completely dependent on the
application requirements. The interface is provided below for reference:
```go
type Sink interface {
Write(Event) error
Close() error
}
```
Application behavior can be controlled by how `Write` behaves. The examples
above are designed to queue the message and return as quickly as possible.
Other implementations may block until the event is committed to durable
storage.

View file

@ -0,0 +1,158 @@
package events
import "github.com/Sirupsen/logrus"
// Broadcaster sends events to multiple, reliable Sinks. The goal of this
// component is to dispatch events to configured endpoints. Reliability can be
// provided by wrapping incoming sinks.
//
// All state is owned by a single goroutine (run); the exported methods only
// communicate with it over channels.
type Broadcaster struct {
	// sinks is the current fan-out list. Once run has started it is read and
	// modified by the run goroutine only.
	sinks []Sink

	events  chan Event            // events handed from Write to run
	adds    chan configureRequest // Add requests
	removes chan configureRequest // Remove requests

	// closed carries shutdown requests from Close to run. Only run receives
	// from it, so a request can never be intercepted by another caller.
	closed chan chan struct{}

	// done is closed by run after every sink has been closed. Write,
	// configure and Close watch it to detect that the broadcaster is gone.
	done chan struct{}
}

// NewBroadcaster returns a broadcaster that dispatches to the provided sinks
// and starts its dispatch goroutine. The broadcaster behavior will be
// affected by the properties of the sinks. Generally, a sink should accept
// all messages and deal with reliability on its own; wrapping sinks in Queue
// and RetryingSink is the intended way to achieve that.
func NewBroadcaster(sinks ...Sink) *Broadcaster {
	b := Broadcaster{
		// Copy the variadic slice: run shifts elements in place when sinks
		// are removed, which must not disturb a slice the caller still owns.
		sinks:   append([]Sink(nil), sinks...),
		events:  make(chan Event),
		adds:    make(chan configureRequest),
		removes: make(chan configureRequest),
		closed:  make(chan chan struct{}),
		done:    make(chan struct{}),
	}

	// Start the broadcaster
	go b.run()

	return &b
}

// Write accepts an event to be dispatched to all sinks. It returns
// ErrSinkClosed once the broadcaster has been closed and nil otherwise. The
// caller cedes the memory to the broadcaster and should not modify it after
// calling write.
func (b *Broadcaster) Write(event Event) error {
	select {
	case b.events <- event:
	case <-b.done:
		return ErrSinkClosed
	}
	return nil
}

// Add the sink to the broadcaster.
//
// The provided sink must be comparable with equality. Typically, this just
// works with a regular pointer type.
func (b *Broadcaster) Add(sink Sink) error {
	return b.configure(b.adds, sink)
}

// Remove the provided sink.
func (b *Broadcaster) Remove(sink Sink) error {
	return b.configure(b.removes, sink)
}

// configureRequest asks the run goroutine to add or remove a sink; the
// outcome is reported on response.
type configureRequest struct {
	sink     Sink
	response chan error
}

// configure sends a request for sink on ch and waits for the run goroutine's
// answer. It returns ErrSinkClosed if the broadcaster shuts down first.
func (b *Broadcaster) configure(ch chan configureRequest, sink Sink) error {
	response := make(chan error, 1)

	for {
		select {
		case ch <- configureRequest{
			sink:     sink,
			response: response}:
			// Request delivered: a nil channel disables this case so the
			// next iteration only waits for the response or shutdown.
			ch = nil
		case err := <-response:
			return err
		case <-b.done:
			return ErrSinkClosed
		}
	}
}

// Close the broadcaster, ensuring that all messages are flushed to the
// underlying sink before returning. Close returns ErrSinkClosed if the
// broadcaster was already closed (or is closed by a concurrent call).
func (b *Broadcaster) Close() error {
	finished := make(chan struct{})

	select {
	case b.closed <- finished:
		// run accepted the request; wait until it has closed every sink.
		<-finished
		return nil
	case <-b.done:
		// already closed
		return ErrSinkClosed
	}
}

// run is the main broadcast loop, started when the broadcaster is created.
// Under normal conditions, it waits for events on the event channel. After
// Close is called, this goroutine will exit.
func (b *Broadcaster) run() {
	remove := func(target Sink) {
		for i, sink := range b.sinks {
			if sink == target {
				b.sinks = append(b.sinks[:i], b.sinks[i+1:]...)
				break
			}
		}
	}

	for {
		select {
		case event := <-b.events:
			// Closed sinks are collected and removed after the loop.
			// Removing while ranging shifts the backing array under the
			// iteration, which skips one sink and delivers the event to
			// the last sink twice.
			var closedSinks []Sink
			for _, sink := range b.sinks {
				if err := sink.Write(event); err != nil {
					if err == ErrSinkClosed {
						closedSinks = append(closedSinks, sink)
						continue
					}

					logrus.WithField("event", event).WithField("events.sink", sink).WithError(err).
						Errorf("broadcaster: dropping event")
				}
			}
			for _, sink := range closedSinks {
				remove(sink)
			}
		case request := <-b.adds:
			// while we have to iterate for add/remove, common iteration for
			// send is faster against slice.
			var found bool
			for _, sink := range b.sinks {
				if request.sink == sink {
					found = true
					break
				}
			}

			if !found {
				b.sinks = append(b.sinks, request.sink)
			}
			request.response <- nil
		case request := <-b.removes:
			remove(request.sink)
			request.response <- nil
		case closing := <-b.closed:
			// close all the underlying sinks
			for _, sink := range b.sinks {
				if err := sink.Close(); err != nil && err != ErrSinkClosed {
					logrus.WithField("events.sink", sink).WithError(err).
						Errorf("broadcaster: closing sink failed")
				}
			}

			// Publish the shutdown before releasing the Close caller so
			// any later Write, Add, Remove or Close sees ErrSinkClosed.
			close(b.done)
			close(closing)
			return
		}
	}
}

View file

@ -0,0 +1,47 @@
package events
// Channel provides a sink that can be listened on. The writer and channel
// listener must operate in separate goroutines.
//
// Consumers should receive from Channel.C until the channel returned by Done
// is closed.
type Channel struct {
// C carries the events accepted by Write to the listener.
C chan Event
// closed is closed by Close; Write selects on it and Done exposes it.
closed chan struct{}
}
// NewChannel returns a channel sink whose event channel C has the given
// buffer capacity. If buffer is zero, C is unbuffered.
func NewChannel(buffer int) *Channel {
	ch := new(Channel)
	ch.C = make(chan Event, buffer)
	ch.closed = make(chan struct{})
	return ch
}
// Done returns the channel that Close closes. A receive on it blocks while
// the sink is open and always proceeds once the sink is closed.
func (ch *Channel) Done() chan struct{} {
return ch.closed
}
// Write sends the event on C, blocking until a listener (or buffer slot)
// takes it or the sink is closed, in which case ErrSinkClosed is returned.
// It must be called from a different goroutine than the listener.
func (ch *Channel) Write(event Event) error {
	var err error

	select {
	case <-ch.closed:
		err = ErrSinkClosed
	case ch.C <- event:
	}

	return err
}
// Close marks the channel sink as closed. The first call returns nil; later
// calls return ErrSinkClosed.
func (ch *Channel) Close() error {
	select {
	case <-ch.closed:
		return ErrSinkClosed
	default:
	}

	close(ch.closed)
	return nil
}

View file

@ -0,0 +1,10 @@
package events
import "fmt"
// ErrSinkClosed is returned if a write is issued to a sink that has been
// closed. If encountered, the error should be considered terminal and
// retries will not be successful.
var ErrSinkClosed = fmt.Errorf("events: sink closed")

View file

@ -0,0 +1,15 @@
package events
// Event marks items that can be sent as events. It is the empty interface,
// so any value can be used as an event.
type Event interface{}
// Sink accepts and sends events. Sinks in this package return ErrSinkClosed
// from Write and Close once they have been closed.
type Sink interface {
// Write an event to the Sink. If no error is returned, the caller will
// assume that all events have been committed to the sink. If an error is
// received, the caller may retry sending the event.
Write(event Event) error
// Close the sink, possibly waiting for pending events to flush.
Close() error
}

View file

@ -0,0 +1,52 @@
package events
// Matcher matches events. Filter forwards an event to its destination only
// when Match returns true for it.
type Matcher interface {
// Match reports whether the event is accepted.
Match(event Event) bool
}
// MatcherFunc implements Matcher with just a function, so a plain func can
// be used wherever a Matcher is expected.
type MatcherFunc func(event Event) bool
// Match calls the wrapped function and returns its result.
func (fn MatcherFunc) Match(event Event) bool {
return fn(event)
}
// Filter provides an event sink that sends only events that are accepted by a
// Matcher. No methods on filter are goroutine safe.
type Filter struct {
// dst receives the events accepted by matcher.
dst Sink
// matcher decides which events are forwarded.
matcher Matcher
// closed is set by Close; it is a plain bool with no synchronization.
closed bool
}
// NewFilter returns a new filter that forwards to dst only those events for
// which matcher returns true.
func NewFilter(dst Sink, matcher Matcher) Sink {
	f := new(Filter)
	f.dst = dst
	f.matcher = matcher
	return f
}
// Write forwards the event to the destination sink if the matcher accepts
// it. Rejected events are dropped without error. Once the filter has been
// closed, ErrSinkClosed is returned without consulting the matcher.
func (f *Filter) Write(event Event) error {
	switch {
	case f.closed:
		return ErrSinkClosed
	case f.matcher.Match(event):
		return f.dst.Write(event)
	default:
		return nil
	}
}
// Close the filter, allowing no more events to pass through, and close the
// destination sink. Closing an already closed filter returns ErrSinkClosed.
func (f *Filter) Close() error {
	// TODO(stevvooe): Not all sinks should have Close.
	if !f.closed {
		f.closed = true
		return f.dst.Close()
	}

	return ErrSinkClosed
}

View file

@ -0,0 +1,104 @@
package events
import (
"container/list"
"sync"
"github.com/Sirupsen/logrus"
)
// Queue accepts all messages into a queue for asynchronous consumption
// by a sink. It is unbounded and thread safe but the sink must be reliable or
// events will be dropped.
type Queue struct {
// dst is the sink the run goroutine flushes queued events to.
dst Sink
// events holds the pending events in arrival order.
events *list.List
// cond is built on mu; it wakes the run goroutine when events arrive and
// wakes Close once the queue has been drained.
cond *sync.Cond
// mu guards events and closed.
mu sync.Mutex
// closed is set by Close; further writes are rejected.
closed bool
}
// NewQueue returns a queue that delivers to the provided Sink dst. The
// goroutine that flushes queued events is started before returning.
func NewQueue(dst Sink) *Queue {
	eq := &Queue{
		dst:    dst,
		events: list.New(),
	}
	eq.cond = sync.NewCond(&eq.mu)

	go eq.run()
	return eq
}
// Write appends the event to the queue and wakes the flushing goroutine. It
// fails only if the queue has been closed, returning ErrSinkClosed.
func (eq *Queue) Write(event Event) error {
	eq.mu.Lock()
	defer eq.mu.Unlock()

	if !eq.closed {
		eq.events.PushBack(event)
		eq.cond.Signal() // signal waiters
		return nil
	}

	return ErrSinkClosed
}
// Close shuts down the event queue. It marks the queue closed so that later
// writes fail with ErrSinkClosed, waits until it is woken through the
// condition variable (the run goroutine broadcasts once the queue is empty
// and closed), and then closes the destination sink. Closing an already
// closed queue returns ErrSinkClosed.
func (eq *Queue) Close() error {
eq.mu.Lock()
defer eq.mu.Unlock()
if eq.closed {
return ErrSinkClosed
}
// set closed flag
eq.closed = true
// Wake the run goroutine in case it is parked on an empty queue, then
// release the lock and wait for it to report that the queue is drained.
eq.cond.Signal() // signal flushes queue
eq.cond.Wait() // wait for signal from last flush
return eq.dst.Close()
}
// run is the main goroutine to flush events to the target sink. It exits
// once the queue has been closed and every queued event has been handed to
// the sink. Events the sink rejects are logged and dropped.
func (eq *Queue) run() {
	for {
		event, ok := eq.next()
		if !ok {
			return // queue is closed and drained.
		}

		if err := eq.dst.Write(event); err != nil {
			logrus.WithFields(logrus.Fields{
				"event": event,
				"sink":  eq.dst,
			}).WithError(err).Warnf("eventqueue: dropped event")
		}
	}
}

// next encompasses the critical section of the run loop. When the queue is
// empty, it will block on the condition. If new data arrives, it will wake
// and return the oldest event with ok set to true. When the queue is closed
// and empty it wakes any goroutine waiting in Close and returns ok == false.
//
// Shutdown is reported through ok rather than a nil event because Event is
// an empty interface and a nil event is a legal queue entry. A type
// assertion on such an entry would panic, and a nil sentinel would make it
// indistinguishable from shutdown.
func (eq *Queue) next() (event Event, ok bool) {
	eq.mu.Lock()
	defer eq.mu.Unlock()

	for eq.events.Len() < 1 {
		if eq.closed {
			eq.cond.Broadcast()
			return nil, false
		}

		eq.cond.Wait()
	}

	// Remove returns the stored value, which is assignable to Event as is.
	return eq.events.Remove(eq.events.Front()), true
}

View file

@ -0,0 +1,168 @@
package events
import (
"sync"
"time"
"github.com/Sirupsen/logrus"
)
// RetryingSink retries the write until success or an ErrSinkClosed is
// returned. Underlying sink must have p > 0 of succeeding or the sink will
// block. Retry is configured with a RetryStrategy.
//
// NOTE(review): earlier documentation stated that concurrent calls are
// serialized through the sink. This type holds no lock, so any such
// serialization would have to come from the wrapped sink or the strategy;
// confirm before relying on it.
type RetryingSink struct {
// sink is the destination that writes are retried against.
sink Sink
// strategy decides when to back off and whether to drop an event.
strategy RetryStrategy
// closed is closed by Close to abort in-flight and future writes.
closed chan struct{}
}
// NewRetryingSink returns a sink that will retry writes to sink. The
// provided strategy decides how long to back off between attempts and
// whether a failed event is dropped.
func NewRetryingSink(sink Sink, strategy RetryStrategy) *RetryingSink {
	return &RetryingSink{
		sink:     sink,
		strategy: strategy,
		closed:   make(chan struct{}),
	}
}
// Write attempts to flush the event to the downstream sink until it
// succeeds, the strategy decides to drop it, or the sink is closed.
//
// It returns ErrSinkClosed when either this sink or the downstream sink is
// closed, and nil when the event was delivered or dropped by the strategy.
func (rs *RetryingSink) Write(event Event) error {
	logger := logrus.WithField("event", event)

	var timer *time.Timer

	for {
		// Bail out early if we were closed between attempts.
		select {
		case <-rs.closed:
			return ErrSinkClosed
		default:
		}

		if backoff := rs.strategy.Proceed(event); backoff > 0 {
			// The timer is created lazily on the first backoff and reused
			// afterwards; the deferred Stop is therefore registered once.
			if timer != nil {
				timer.Reset(backoff)
			} else {
				timer = time.NewTimer(backoff)
				defer timer.Stop()
			}

			select {
			case <-timer.C:
				// Backoff elapsed: ask the strategy again.
				continue
			case <-rs.closed:
				return ErrSinkClosed
			}
		}

		err := rs.sink.Write(event)
		if err == nil {
			rs.strategy.Success(event)
			return nil
		}

		if err == ErrSinkClosed {
			// terminal!
			return err
		}

		failure := logger.WithError(err)

		if rs.strategy.Failure(event, err) {
			failure.Errorf("retryingsink: dropped event")
			return nil
		}

		failure.Errorf("retryingsink: error writing event, retrying")
	}
}
// Close closes the retrying sink and then the underlying sink, returning
// the underlying sink's result. A second call returns ErrSinkClosed.
func (rs *RetryingSink) Close() error {
	select {
	case <-rs.closed:
		return ErrSinkClosed
	default:
	}

	close(rs.closed)
	return rs.sink.Close()
}
// RetryStrategy defines a strategy for retrying event sink writes.
//
// All methods should be goroutine safe.
type RetryStrategy interface {
// Proceed is called before every event send. If proceed returns a
// positive, non-zero duration, the retryer will back off by the provided
// duration.
//
// An event is provided, but may be ignored.
Proceed(event Event) time.Duration
// Failure reports a failure to the strategy. If this method returns true,
// the event should be dropped.
Failure(event Event, err error) bool
// Success should be called when an event is sent successfully.
Success(event Event)
}
// TODO(stevvooe): We are using circuit breaker here. May want to provide
// bounded exponential backoff, as well.
// Breaker implements a circuit breaker retry strategy.
//
// The current implementation never drops events.
type Breaker struct {
// threshold is the number of recent failures at which Proceed starts
// returning a backoff.
threshold int
// recent counts failures since the last success.
recent int
// last is the time of the most recent failure; Success resets it.
last time.Time
backoff time.Duration // time after which we retry after failure.
// mu guards recent and last.
mu sync.Mutex
}
// Compile-time check that *Breaker satisfies RetryStrategy.
var _ RetryStrategy = &Breaker{}
// NewBreaker returns a breaker that backs off for the given duration once
// threshold failures have accumulated without an intervening success. A
// Breaker is thread safe and may be shared by many goroutines.
func NewBreaker(threshold int, backoff time.Duration) *Breaker {
	b := new(Breaker)
	b.threshold = threshold
	b.backoff = backoff
	return b
}
// Proceed checks the failures against the threshold. Below the threshold it
// returns 0. Otherwise it returns the time remaining until the backoff that
// started at the last failure expires, which is zero or negative once the
// backoff has elapsed.
func (b *Breaker) Proceed(event Event) time.Duration {
	b.mu.Lock()
	defer b.mu.Unlock()

	if b.recent >= b.threshold {
		return b.last.Add(b.backoff).Sub(time.Now())
	}

	return 0
}
// Success resets the breaker: the last-failure time and the failure count
// are both cleared.
func (b *Breaker) Success(event Event) {
	b.mu.Lock()
	defer b.mu.Unlock()

	b.last = time.Time{}
	b.recent = 0
}
// Failure records the failure and the time it happened. It always returns
// false because this strategy never drops events.
func (b *Breaker) Failure(event Event, err error) bool {
	b.mu.Lock()
	defer b.mu.Unlock()

	b.last = time.Now().UTC()
	b.recent++

	return false // never drop events.
}

View file

@ -1,9 +1,7 @@
language: go
go:
- 1.3
# - 1.4
# see https://github.com/moovweb/gvm/pull/116 for why Go 1.4 is currently disabled
- 1.5.3
# let us have speedy Docker-based Travis workers
sudo: false
@ -11,19 +9,18 @@ sudo: false
before_install:
# Symlink below is needed for Travis CI to work correctly on personal forks of libkv
- ln -s $HOME/gopath/src/github.com/${TRAVIS_REPO_SLUG///libkv/} $HOME/gopath/src/github.com/docker
- go get golang.org/x/tools/cmd/vet
- go get golang.org/x/tools/cmd/cover
- go get github.com/mattn/goveralls
- go get github.com/golang/lint/golint
- go get github.com/GeertJohan/fgt
before_script:
- script/travis_consul.sh 0.5.2
- script/travis_etcd.sh 2.2.0
- script/travis_zk.sh 3.4.6
- script/travis_consul.sh 0.6.3
- script/travis_etcd.sh 2.2.5
- script/travis_zk.sh 3.5.1-alpha
script:
- ./consul agent -server -bootstrap-expect 1 -data-dir /tmp/consul -config-file=./config.json 1>/dev/null &
- ./consul agent -server -bootstrap -advertise=127.0.0.1 -data-dir /tmp/consul -config-file=./config.json 1>/dev/null &
- ./etcd/etcd --listen-client-urls 'http://0.0.0.0:4001' --advertise-client-urls 'http://127.0.0.1:4001' >/dev/null 2>&1 &
- ./zk/bin/zkServer.sh start ./zk/conf/zoo.cfg 1> /dev/null
- script/validate-gofmt

View file

@ -176,7 +176,7 @@
END OF TERMS AND CONDITIONS
Copyright 2014-2015 Docker, Inc.
Copyright 2014-2016 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View file

@ -0,0 +1,46 @@
# Libkv maintainers file
#
# This file describes who runs the docker/libkv project and how.
# This is a living document - if you see something out of date or missing, speak up!
#
# It is structured to be consumable by both humans and programs.
# To extract its contents programmatically, use any TOML-compliant parser.
#
# This file is compiled into the MAINTAINERS file in docker/opensource.
#
[Org]
[Org."Core maintainers"]
people = [
"abronan",
"aluzzardi",
"sanimej",
"vieux",
]
[people]
# A reference list of all people associated with the project.
# All other sections should refer to people by their canonical key
# in the people section.
# ADD YOURSELF HERE IN ALPHABETICAL ORDER
[people.abronan]
Name = "Alexandre Beslic"
Email = "abronan@docker.com"
GitHub = "abronan"
[people.aluzzardi]
Name = "Andrea Luzzardi"
Email = "al@docker.com"
GitHub = "aluzzardi"
[people.sanimej]
Name = "Santhosh Manohar"
Email = "santhosh@docker.com"
GitHub = "sanimej"
[people.vieux]
Name = "Victor Vieux"
Email = "vieux@docker.com"
GitHub = "vieux"

View file

@ -3,6 +3,7 @@
[![GoDoc](https://godoc.org/github.com/docker/libkv?status.png)](https://godoc.org/github.com/docker/libkv)
[![Build Status](https://travis-ci.org/docker/libkv.svg?branch=master)](https://travis-ci.org/docker/libkv)
[![Coverage Status](https://coveralls.io/repos/docker/libkv/badge.svg)](https://coveralls.io/r/docker/libkv)
[![Go Report Card](https://goreportcard.com/badge/github.com/docker/libkv)](https://goreportcard.com/report/github.com/docker/libkv)
`libkv` provides a `Go` native library to store metadata.
@ -10,7 +11,7 @@ The goal of `libkv` is to abstract common store operations for multiple distribu
For example, you can use it to store your metadata or for service discovery to register machines and endpoints inside your cluster.
You can also easily implement a generic *Leader Election* on top of it (see the [swarm/leadership](https://github.com/docker/swarm/tree/master/leadership) package).
You can also easily implement a generic *Leader Election* on top of it (see the [docker/leadership](https://github.com/docker/leadership) repository).
As of now, `libkv` offers support for `Consul`, `Etcd`, `Zookeeper` (**Distributed** store) and `BoltDB` (**Local** store).
@ -30,7 +31,7 @@ You can find examples of usage for `libkv` under in `docs/examples.go`. Optional
`libkv` supports:
- Consul versions >= `0.5.1` because it uses Sessions with `Delete` behavior for the use of `TTLs` (mimics zookeeper's Ephemeral node support), If you don't plan to use `TTLs`: you can use Consul version `0.4.0+`.
- Etcd versions >= `2.0` because it uses the new `coreos/etcd/client`, this might change in the future as the support for `APIv3` comes along and adds mor capabilities.
- Etcd versions >= `2.0` because it uses the new `coreos/etcd/client`, this might change in the future as the support for `APIv3` comes along and adds more capabilities.
- Zookeeper versions >= `3.4.5`. Although this might work with previous version but this remains untested as of now.
- Boltdb, which shouldn't be subject to any version dependencies.
@ -83,7 +84,7 @@ Please refer to the `docs/compatibility.md` to see what are the special cases fo
Other than those special cases, you should expect the same experience for basic operations like `Get`/`Put`, etc.
Calls like `WatchTree` may return different events (or number of events) depending on the backend (for now, `Etcd` and `Consul` will likely return more events than `Zookeeper` that you should triage properly). Although you should be able to use it successfully to watch on events in an interchangeable way (see the **swarm/leadership** or **swarm/discovery** packages in **docker/swarm**).
Calls like `WatchTree` may return different events (or number of events) depending on the backend (for now, `Etcd` and `Consul` will likely return more events than `Zookeeper` that you should triage properly). Although you should be able to use it successfully to watch on events in an interchangeable way (see the **docker/leadership** repository or the **pkg/discovery/kv** package in **docker/docker**).
## TLS
@ -103,4 +104,4 @@ Want to hack on libkv? [Docker's contributions guidelines](https://github.com/do
##Copyright and license
Copyright © 2014-2015 Docker, Inc. All rights reserved, except as follows. Code is released under the Apache 2.0 license. The README.md file, and files in the "docs" folder are licensed under the Creative Commons Attribution 4.0 International License under the terms and conditions set forth in the file "LICENSE.docs". You may obtain a duplicate copy of the same license, titled CC-BY-SA-4.0, at http://creativecommons.org/licenses/by/4.0/.
Copyright © 2014-2016 Docker, Inc. All rights reserved, except as follows. Code is released under the Apache 2.0 license. The README.md file, and files in the "docs" folder are licensed under the Creative Commons Attribution 4.0 International License under the terms and conditions set forth in the file "LICENSE.docs". You may obtain a duplicate copy of the same license, titled CC-BY-SA-4.0, at http://creativecommons.org/licenses/by/4.0/.

View file

@ -25,7 +25,7 @@ var (
}()
)
// NewStore creates a an instance of store
// NewStore creates an instance of store
func NewStore(backend store.Backend, addrs []string, options *store.Config) (store.Store, error) {
if init, exists := initializers[backend]; exists {
return init(addrs, options)

View file

@ -19,8 +19,6 @@ var (
// ErrMultipleEndpointsUnsupported is thrown when multiple endpoints specified for
// BoltDB. Endpoint has to be a local file path
ErrMultipleEndpointsUnsupported = errors.New("boltdb supports one endpoint and should be a file path")
// ErrBoltBucketNotFound is thrown when specified BoltBD bucket doesn't exist in the DB
ErrBoltBucketNotFound = errors.New("boltdb bucket doesn't exist")
// ErrBoltBucketOptionMissing is thrown when boltBcuket config option is missing
ErrBoltBucketOptionMissing = errors.New("boltBucket config option missing")
)
@ -141,7 +139,7 @@ func (b *BoltDB) Get(key string) (*store.KVPair, error) {
err = db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
return store.ErrKeyNotFound
}
v := bucket.Get([]byte(key))
@ -217,7 +215,7 @@ func (b *BoltDB) Delete(key string) error {
err = db.Update(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
return store.ErrKeyNotFound
}
err := bucket.Delete([]byte(key))
return err
@ -243,7 +241,7 @@ func (b *BoltDB) Exists(key string) (bool, error) {
err = db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
return store.ErrKeyNotFound
}
val = bucket.Get([]byte(key))
@ -276,7 +274,7 @@ func (b *BoltDB) List(keyPrefix string) ([]*store.KVPair, error) {
err = db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
return store.ErrKeyNotFound
}
cursor := bucket.Cursor()
@ -326,7 +324,7 @@ func (b *BoltDB) AtomicDelete(key string, previous *store.KVPair) (bool, error)
err = db.Update(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
return store.ErrKeyNotFound
}
val = bucket.Get([]byte(key))
@ -370,7 +368,7 @@ func (b *BoltDB) AtomicPut(key string, value []byte, previous *store.KVPair, opt
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
if previous != nil {
return ErrBoltBucketNotFound
return store.ErrKeyNotFound
}
bucket, err = tx.CreateBucket(b.boltBucket)
if err != nil {
@ -381,7 +379,7 @@ func (b *BoltDB) AtomicPut(key string, value []byte, previous *store.KVPair, opt
// doesn't exist in the DB.
val = bucket.Get([]byte(key))
if previous == nil && len(val) != 0 {
return store.ErrKeyModified
return store.ErrKeyExists
}
if previous != nil {
if len(val) == 0 {
@ -440,7 +438,7 @@ func (b *BoltDB) DeleteTree(keyPrefix string) error {
err = db.Update(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
return store.ErrKeyNotFound
}
cursor := bucket.Cursor()

View file

@ -22,6 +22,14 @@ const (
// RenewSessionRetryMax is the number of time we should try
// to renew the session before giving up and throwing an error
RenewSessionRetryMax = 5
// MaxSessionDestroyAttempts is the maximum times we will try
// to explicitely destroy the session attached to a lock after
// the connectivity to the store has been lost
MaxSessionDestroyAttempts = 5
// defaultLockTTL is the default ttl for the consul lock
defaultLockTTL = 20 * time.Second
)
var (
@ -186,6 +194,7 @@ func (s *Consul) Put(key string, value []byte, opts *store.WriteOptions) error {
p := &api.KVPair{
Key: key,
Value: value,
Flags: api.LockFlagValue,
}
if opts != nil && opts.TTL > 0 {
@ -378,44 +387,99 @@ func (s *Consul) NewLock(key string, options *store.LockOptions) (store.Locker,
lock := &consulLock{}
ttl := defaultLockTTL
if options != nil {
// Set optional TTL on Lock
if options.TTL != 0 {
entry := &api.SessionEntry{
Behavior: api.SessionBehaviorRelease, // Release the lock when the session expires
TTL: (options.TTL / 2).String(), // Consul multiplies the TTL by 2x
LockDelay: 1 * time.Millisecond, // Virtually disable lock delay
}
// Create the key session
session, _, err := s.client.Session().Create(entry, nil)
if err != nil {
return nil, err
}
// Place the session on lock
lockOpts.Session = session
// Renew the session ttl lock periodically
go s.client.Session().RenewPeriodic(entry.TTL, session, nil, options.RenewLock)
lock.renewCh = options.RenewLock
ttl = options.TTL
}
// Set optional value on Lock
if options.Value != nil {
lockOpts.Value = options.Value
}
}
entry := &api.SessionEntry{
Behavior: api.SessionBehaviorRelease, // Release the lock when the session expires
TTL: (ttl / 2).String(), // Consul multiplies the TTL by 2x
LockDelay: 1 * time.Millisecond, // Virtually disable lock delay
}
// Create the key session
session, _, err := s.client.Session().Create(entry, nil)
if err != nil {
return nil, err
}
// Place the session and renew chan on lock
lockOpts.Session = session
lock.renewCh = options.RenewLock
l, err := s.client.LockOpts(lockOpts)
if err != nil {
return nil, err
}
// Renew the session ttl lock periodically
s.renewLockSession(entry.TTL, session, options.RenewLock)
lock.lock = l
return lock, nil
}
// renewLockSession is used to renew a session Lock, it takes
// a stopRenew chan which is used to explicitly stop the session
// renew process. The renew routine never stops until a signal is
// received on this channel or the store reports that the session
// no longer exists. Once stopped, it tries to destroy the session;
// if that fails because the connection to the store is lost, it
// keeps retrying periodically, up to MaxSessionDestroyAttempts
// retries. This ensures that the lock is not maintained
// indefinitely, favoring liveness over safety for the lock when
// the store becomes unavailable.
//
// initialTTL is the session TTL in time.ParseDuration syntax; if it
// cannot be parsed no renew routine is started. id is the session ID.
func (s *Consul) renewLockSession(initialTTL string, id string, stopRenew chan struct{}) {
	ttl, err := time.ParseDuration(initialTTL)
	if err != nil {
		return
	}

	go func() {
		for {
			select {
			case <-time.After(ttl / 2):
				entry, _, err := s.client.Session().Renew(id, nil)
				if err != nil {
					// If an error occurs, continue until the
					// session gets destroyed explicitly or
					// the session ttl times out
					continue
				}
				if entry == nil {
					return
				}

				// Handle the server updating the TTL. An unparsable or
				// non-positive value is ignored: adopting it would set
				// ttl to zero and turn this loop into a busy loop that
				// hammers the store with renew requests.
				if updated, err := time.ParseDuration(entry.TTL); err == nil && updated > 0 {
					ttl = updated
				}

			case <-stopRenew:
				// Once asked to stop we never renew again, even when
				// stopRenew was signalled with a single send rather
				// than a close. Only destroying the session is retried.
				for attempts := 0; ; attempts++ {
					// Attempt a session destroy
					if _, err := s.client.Session().Destroy(id, nil); err == nil {
						return
					}

					if attempts >= MaxSessionDestroyAttempts {
						return
					}

					// We can't destroy the session because the store
					// is unavailable, wait for the session renew period
					time.Sleep(ttl / 2)
				}
			}
		}
	}()
}
// Lock attempts to acquire the lock and blocks while
// doing so. It returns a channel that is closed if our
// lock is lost or if an error occurs
@ -436,7 +500,7 @@ func (l *consulLock) Unlock() error {
// modified in the meantime, throws an error if this is the case
func (s *Consul) AtomicPut(key string, value []byte, previous *store.KVPair, options *store.WriteOptions) (bool, *store.KVPair, error) {
p := &api.KVPair{Key: s.normalize(key), Value: value}
p := &api.KVPair{Key: s.normalize(key), Value: value, Flags: api.LockFlagValue}
if previous == nil {
// Consul interprets ModifyIndex = 0 as new key.
@ -445,9 +509,14 @@ func (s *Consul) AtomicPut(key string, value []byte, previous *store.KVPair, opt
p.ModifyIndex = previous.LastIndex
}
if work, _, err := s.client.KV().CAS(p, nil); err != nil {
ok, _, err := s.client.KV().CAS(p, nil)
if err != nil {
return false, nil, err
} else if !work {
}
if !ok {
if previous == nil {
return false, nil, store.ErrKeyExists
}
return false, nil, store.ErrKeyModified
}
@ -466,7 +535,7 @@ func (s *Consul) AtomicDelete(key string, previous *store.KVPair) (bool, error)
return false, store.ErrPreviousNotSpecified
}
p := &api.KVPair{Key: s.normalize(key), ModifyIndex: previous.LastIndex}
p := &api.KVPair{Key: s.normalize(key), ModifyIndex: previous.LastIndex, Flags: api.LockFlagValue}
// Extra Get operation to check on the key
_, err := s.Get(key)

View file

@ -75,6 +75,9 @@ func New(addrs []string, options *store.Config) (store.Store, error) {
if options.ConnectionTimeout != 0 {
setTimeout(cfg, options.ConnectionTimeout)
}
if options.Username != "" {
setCredentials(cfg, options.Username, options.Password)
}
}
c, err := etcd.New(*cfg)
@ -119,6 +122,12 @@ func setTimeout(cfg *etcd.Config, time time.Duration) {
cfg.HeaderTimeoutPerRequest = time
}
// setCredentials stores the username and password used to authenticate
// against Etcd in the client configuration.
func setCredentials(cfg *etcd.Config, username, password string) {
	cfg.Password = password
	cfg.Username = username
}
// Normalize the key for usage in Etcd
func (s *Etcd) normalize(key string) string {
key = store.Normalize(key)
@ -335,6 +344,10 @@ func (s *Etcd) AtomicPut(key string, value []byte, previous *store.KVPair, opts
if etcdError.Code == etcd.ErrorCodeTestFailed {
return false, nil, store.ErrKeyModified
}
// Node exists error (when PrevNoExist)
if etcdError.Code == etcd.ErrorCodeNodeExist {
return false, nil, store.ErrKeyExists
}
}
return false, nil, err
}
@ -508,15 +521,15 @@ func (l *etcdLock) Lock(stopChan chan struct{}) (<-chan struct{}, error) {
// Wait for the key to be available or for
// a signal to stop trying to lock the key
select {
case _ = <-free:
case <-free:
break
case err := <-errorCh:
return nil, err
case _ = <-stopChan:
case <-stopChan:
return nil, ErrAbortTryLock
}
// Delete or Expire event occured
// Delete or Expire event occurred
// Retry
}
}

View file

@ -35,6 +35,8 @@ var (
ErrKeyNotFound = errors.New("Key not found in store")
// ErrPreviousNotSpecified is thrown when the previous value is not specified for an atomic operation
ErrPreviousNotSpecified = errors.New("Previous K/V pair should be provided for the Atomic operation")
// ErrKeyExists is thrown when the previous value exists in the case of an AtomicPut
ErrKeyExists = errors.New("Previous K/V pair exists, cannot complete Atomic operation")
)
// Config contains the options for a storage client
@ -44,6 +46,8 @@ type Config struct {
ConnectionTimeout time.Duration
Bucket string
PersistConnection bool
Username string
Password string
}
// ClientTLSConfig contains data for a Client TLS configuration in the form

View file

@ -291,8 +291,8 @@ func (s *Zookeeper) DeleteTree(directory string) error {
// AtomicPut put a value at "key" if the key has not been
// modified in the meantime, throws an error if this is the case
func (s *Zookeeper) AtomicPut(key string, value []byte, previous *store.KVPair, _ *store.WriteOptions) (bool, *store.KVPair, error) {
var lastIndex uint64
if previous != nil {
meta, err := s.client.Set(s.normalize(key), value, int32(previous.LastIndex))
if err != nil {
@ -307,8 +307,9 @@ func (s *Zookeeper) AtomicPut(key string, value []byte, previous *store.KVPair,
// Interpret previous == nil as create operation.
_, err := s.client.Create(s.normalize(key), value, 0, zk.WorldACL(zk.PermAll))
if err != nil {
// Zookeeper will complain if the directory doesn't exist.
// Directory does not exist
if err == zk.ErrNoNode {
// Create the directory
parts := store.SplitKey(strings.TrimSuffix(key, "/"))
parts = parts[:len(parts)-1]
@ -316,11 +317,22 @@ func (s *Zookeeper) AtomicPut(key string, value []byte, previous *store.KVPair,
// Failed to create the directory.
return false, nil, err
}
// Create the node
if _, err := s.client.Create(s.normalize(key), value, 0, zk.WorldACL(zk.PermAll)); err != nil {
// Node exist error (when previous nil)
if err == zk.ErrNodeExists {
return false, nil, store.ErrKeyExists
}
return false, nil, err
}
} else {
// Node Exists error (when previous nil)
if err == zk.ErrNodeExists {
return false, nil, store.ErrKeyExists
}
// Unhandled error
return false, nil, err
}

View file

@ -0,0 +1,353 @@
Mozilla Public License, version 2.0
1. Definitions
1.1. “Contributor”
means each individual or legal entity that creates, contributes to the
creation of, or owns Covered Software.
1.2. “Contributor Version”
means the combination of the Contributions of others (if any) used by a
Contributor and that particular Contributor’s Contribution.
1.3. “Contribution”
means Covered Software of a particular Contributor.
1.4. “Covered Software”
means Source Code Form to which the initial Contributor has attached the
notice in Exhibit A, the Executable Form of such Source Code Form, and
Modifications of such Source Code Form, in each case including portions
thereof.
1.5. “Incompatible With Secondary Licenses”
means
a. that the initial Contributor has attached the notice described in
Exhibit B to the Covered Software; or
b. that the Covered Software was made available under the terms of version
1.1 or earlier of the License, but not also under the terms of a
Secondary License.
1.6. “Executable Form”
means any form of the work other than Source Code Form.
1.7. “Larger Work”
means a work that combines Covered Software with other material, in a separate
file or files, that is not Covered Software.
1.8. “License”
means this document.
1.9. “Licensable”
means having the right to grant, to the maximum extent possible, whether at the
time of the initial grant or subsequently, any and all of the rights conveyed by
this License.
1.10. “Modifications”
means any of the following:
a. any file in Source Code Form that results from an addition to, deletion
from, or modification of the contents of Covered Software; or
b. any new file in Source Code Form that contains any Covered Software.
1.11. “Patent Claims” of a Contributor
means any patent claim(s), including without limitation, method, process,
and apparatus claims, in any patent Licensable by such Contributor that
would be infringed, but for the grant of the License, by the making,
using, selling, offering for sale, having made, import, or transfer of
either its Contributions or its Contributor Version.
1.12. “Secondary License”
means either the GNU General Public License, Version 2.0, the GNU Lesser
General Public License, Version 2.1, the GNU Affero General Public
License, Version 3.0, or any later versions of those licenses.
1.13. “Source Code Form”
means the form of the work preferred for making modifications.
1.14. “You” (or “Your”)
means an individual or a legal entity exercising rights under this
License. For legal entities, “You” includes any entity that controls, is
controlled by, or is under common control with You. For purposes of this
definition, “control” means (a) the power, direct or indirect, to cause
the direction or management of such entity, whether by contract or
otherwise, or (b) ownership of more than fifty percent (50%) of the
outstanding shares or beneficial ownership of such entity.
2. License Grants and Conditions
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:
a. under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available,
modify, display, perform, distribute, and otherwise exploit its
Contributions, either on an unmodified basis, with Modifications, or as
part of a Larger Work; and
b. under Patent Claims of such Contributor to make, use, sell, offer for
sale, have made, import, and otherwise transfer either its Contributions
or its Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution become
effective for each Contribution on the date the Contributor first distributes
such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under this
License. No additional rights or licenses will be implied from the distribution
or licensing of Covered Software under this License. Notwithstanding Section
2.1(b) above, no patent license is granted by a Contributor:
a. for any code that a Contributor has removed from Covered Software; or
b. for infringements caused by: (i) Your and any other third party’s
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
c. under Patent Claims infringed by Covered Software in the absence of its
Contributions.
This License does not grant any rights in the trademarks, service marks, or
logos of any Contributor (except as may be necessary to comply with the
notice requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this License
(see Section 10.2) or under the terms of a Secondary License (if permitted
under the terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its Contributions
are its original creation(s) or it has sufficient rights to grant the
rights to its Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under applicable
copyright doctrines of fair use, fair dealing, or other equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in
Section 2.1.
3. Responsibilities
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under the
terms of this License. You must inform recipients that the Source Code Form
of the Covered Software is governed by the terms of this License, and how
they can obtain a copy of this License. You may not attempt to alter or
restrict the recipients’ rights in the Source Code Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
a. such Covered Software must also be made available in Source Code Form,
as described in Section 3.1, and You must inform recipients of the
Executable Form how they can obtain a copy of such Source Code Form by
reasonable means in a timely manner, at a charge no more than the cost
of distribution to the recipient; and
b. You may distribute such Executable Form under the terms of this License,
or sublicense it under different terms, provided that the license for
the Executable Form does not attempt to limit or alter the recipients’
rights in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for the
Covered Software. If the Larger Work is a combination of Covered Software
with a work governed by one or more Secondary Licenses, and the Covered
Software is not Incompatible With Secondary Licenses, this License permits
You to additionally distribute such Covered Software under the terms of
such Secondary License(s), so that the recipient of the Larger Work may, at
their option, further distribute the Covered Software under the terms of
either this License or such Secondary License(s).
3.4. Notices
You may not remove or alter the substance of any license notices (including
copyright notices, patent notices, disclaimers of warranty, or limitations
of liability) contained within the Source Code Form of the Covered
Software, except that You may alter any license notices to the extent
required to remedy known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on behalf
of any Contributor. You must make it absolutely clear that any such
warranty, support, indemnity, or liability obligation is offered by You
alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.
4. Inability to Comply Due to Statute or Regulation
If it is impossible for You to comply with any of the terms of this License
with respect to some or all of the Covered Software due to statute, judicial
order, or regulation then You must: (a) comply with the terms of this License
to the maximum extent possible; and (b) describe the limitations and the code
they affect. Such description must be placed in a text file included with all
distributions of the Covered Software under this License. Except to the
extent prohibited by statute or regulation, such description must be
sufficiently detailed for a recipient of ordinary skill to be able to
understand it.
5. Termination
5.1. The rights granted under this License will terminate automatically if You
fail to comply with any of its terms. However, if You become compliant,
then the rights granted under this License from a particular Contributor
are reinstated (a) provisionally, unless and until such Contributor
explicitly and finally terminates Your grants, and (b) on an ongoing basis,
if such Contributor fails to notify You of the non-compliance by some
reasonable means prior to 60 days after You have come back into compliance.
Moreover, Your grants from a particular Contributor are reinstated on an
ongoing basis if such Contributor notifies You of the non-compliance by
some reasonable means, this is the first time You have received notice of
non-compliance with this License from such Contributor, and You become
compliant prior to 30 days after Your receipt of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions, counter-claims,
and cross-claims) alleging that a Contributor Version directly or
indirectly infringes any patent, then the rights granted to You by any and
all Contributors for the Covered Software under Section 2.1 of this License
shall terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user
license agreements (excluding distributors and resellers) which have been
validly granted by You or Your distributors under this License prior to
termination shall survive termination.
6. Disclaimer of Warranty
Covered Software is provided under this License on an “as is” basis, without
warranty of any kind, either expressed, implied, or statutory, including,
without limitation, warranties that the Covered Software is free of defects,
merchantable, fit for a particular purpose or non-infringing. The entire
risk as to the quality and performance of the Covered Software is with You.
Should any Covered Software prove defective in any respect, You (not any
Contributor) assume the cost of any necessary servicing, repair, or
correction. This disclaimer of warranty constitutes an essential part of this
License. No use of any Covered Software is authorized under this License
except under this disclaimer.
7. Limitation of Liability
Under no circumstances and under no legal theory, whether tort (including
negligence), contract, or otherwise, shall any Contributor, or anyone who
distributes Covered Software as permitted above, be liable to You for any
direct, indirect, special, incidental, or consequential damages of any
character including, without limitation, damages for lost profits, loss of
goodwill, work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses, even if such party shall have been
informed of the possibility of such damages. This limitation of liability
shall not apply to liability for death or personal injury resulting from such
party’s negligence to the extent applicable law prohibits such limitation.
Some jurisdictions do not allow the exclusion or limitation of incidental or
consequential damages, so this exclusion and limitation may not apply to You.
8. Litigation
Any litigation relating to this License may be brought only in the courts of
a jurisdiction where the defendant maintains its principal place of business
and such litigation shall be governed by laws of that jurisdiction, without
reference to its conflict-of-law provisions. Nothing in this Section shall
prevent a party’s ability to bring cross-claims or counter-claims.
9. Miscellaneous
This License represents the complete agreement concerning the subject matter
hereof. If any provision of this License is held to be unenforceable, such
provision shall be reformed only to the extent necessary to make it
enforceable. Any law or regulation which provides that the language of a
contract shall be construed against the drafter shall not be used to construe
this License against a Contributor.
10. Versions of the License
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version of
the License under which You originally received the Covered Software, or
under the terms of any subsequent version published by the license
steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a modified
version of this License if you rename the license and remove any
references to the name of the license steward (except to note that such
modified license differs from this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.
Exhibit A - Source Code Form License Notice
This Source Code Form is subject to the
terms of the Mozilla Public License, v.
2.0. If a copy of the MPL was not
distributed with this file, You can
obtain one at
http://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular file, then
You may include the notice in a location (such as a LICENSE file in a relevant
directory) where a recipient would be likely to look for such a notice.
You may add additional accurate notices of copyright ownership.
Exhibit B - “Incompatible With Secondary Licenses” Notice
This Source Code Form is “Incompatible
With Secondary Licenses”, as defined by
the Mozilla Public License, v. 2.0.

View file

@ -0,0 +1,91 @@
# go-multierror
`go-multierror` is a package for Go that provides a mechanism for
representing a list of `error` values as a single `error`.
This allows a function in Go to return an `error` that might actually
be a list of errors. If the caller knows this, they can unwrap the
list and access the errors. If the caller doesn't know, the error
formats to a nice human-readable format.
`go-multierror` implements the
[errwrap](https://github.com/hashicorp/errwrap) interface so that it can
be used with that library, as well.
## Installation and Docs
Install using `go get github.com/hashicorp/go-multierror`.
Full documentation is available at
http://godoc.org/github.com/hashicorp/go-multierror
## Usage
go-multierror is easy to use and purposely built to be unobtrusive in
existing Go applications/libraries that may not be aware of it.
**Building a list of errors**
The `Append` function is used to create a list of errors. This function
behaves a lot like the Go built-in `append` function: it doesn't matter
if the first argument is nil, a `multierror.Error`, or any other `error`,
the function behaves as you would expect.
```go
var result error
if err := step1(); err != nil {
result = multierror.Append(result, err)
}
if err := step2(); err != nil {
result = multierror.Append(result, err)
}
return result
```
**Customizing the formatting of the errors**
By specifying a custom `ErrorFormat`, you can customize the format
of the `Error() string` function:
```go
var result *multierror.Error
// ... accumulate errors here, maybe using Append
if result != nil {
result.ErrorFormat = func([]error) string {
return "errors!"
}
}
```
**Accessing the list of errors**
`multierror.Error` implements `error` so if the caller doesn't know about
multierror, it will work just fine. But if you're aware a multierror might
be returned, you can use a type assertion to access the list of errors:
```go
if err := something(); err != nil {
if merr, ok := err.(*multierror.Error); ok {
// Use merr.Errors
}
}
```
**Returning a multierror only if there are errors**
If you build a `multierror.Error`, you can use the `ErrorOrNil` function
to return an `error` implementation only if there are errors to return:
```go
var result *multierror.Error
// ... accumulate errors here
// Return the `error` only if errors were added to the multierror, otherwise
// return nil since there are no errors.
return result.ErrorOrNil()
```

View file

@ -0,0 +1,30 @@
package multierror
// Append is a helper function that will append more errors
// onto an Error in order to create a larger multi-error.
//
// If err is not a multierror.Error, then it will be turned into
// one. If any of the errs are multierror.Error, they will be flattened
// one level into err. Nil entries in errs (including typed-nil *Error
// values) are skipped, so the resulting list only ever holds real errors.
func Append(err error, errs ...error) *Error {
	switch err := err.(type) {
	case *Error:
		// Typed nils can reach here, so initialize if we are nil
		if err == nil {
			err = new(Error)
		}

		// Walk the new errors, flattening nested multi-errors one level
		// and dropping nils so they cannot show up as bogus entries.
		for _, e := range errs {
			switch e := e.(type) {
			case *Error:
				if e != nil {
					err.Errors = append(err.Errors, e.Errors...)
				}
			default:
				if e != nil {
					err.Errors = append(err.Errors, e)
				}
			}
		}
		return err
	default:
		// err is a plain error (or nil): make it the first element and
		// reuse the *Error branch so flattening and nil handling are
		// applied uniformly.
		newErrs := make([]error, 0, len(errs)+1)
		if err != nil {
			newErrs = append(newErrs, err)
		}
		newErrs = append(newErrs, errs...)
		return Append(&Error{}, newErrs...)
	}
}

View file

@ -0,0 +1,23 @@
package multierror
import (
"fmt"
"strings"
)
// ErrorFormatFunc is a function callback that is called by Error to
// turn the list of errors into a string. It receives the accumulated
// errors and returns the text used as the combined error message.
type ErrorFormatFunc func([]error) string
// ListFormatFunc renders a list of errors as a short header stating how
// many errors occurred, followed by one "* "-prefixed bullet per error,
// each on its own line.
func ListFormatFunc(es []error) string {
	bullets := make([]string, 0, len(es))
	for _, e := range es {
		bullets = append(bullets, fmt.Sprintf("* %s", e))
	}
	body := strings.Join(bullets, "\n")
	return fmt.Sprintf("%d error(s) occurred:\n\n%s", len(es), body)
}

View file

@ -0,0 +1,51 @@
package multierror
import (
"fmt"
)
// Error is an error type to track multiple errors. This is used to
// accumulate errors in cases and return them as a single "error".
type Error struct {
// Errors is the list of accumulated errors.
Errors []error
// ErrorFormat, when non-nil, is the function Error() uses to render
// Errors as a string; when nil, ListFormatFunc is used instead.
ErrorFormat ErrorFormatFunc
}
// Error implements the error interface. The accumulated errors are
// rendered with the custom ErrorFormat when one is set, and with
// ListFormatFunc otherwise.
func (e *Error) Error() string {
	if e.ErrorFormat != nil {
		return e.ErrorFormat(e.Errors)
	}
	return ListFormatFunc(e.Errors)
}
// ErrorOrNil converts the receiver into a plain error value: it yields
// nil when the receiver is nil or holds no errors, and the receiver
// itself otherwise. Call it at the end of accumulation so that the
// returned value is non-nil only when errors were actually recorded.
func (e *Error) ErrorOrNil() error {
	if e == nil || len(e.Errors) == 0 {
		return nil
	}
	return e
}
// GoString returns a Go-syntax representation of the Error: the %#v
// rendering of the pointed-to struct, prefixed with "*".
// The receiver is dereferenced before formatting.
// NOTE(review): passing e itself to %#v would presumably re-enter this
// method through fmt.GoStringer — confirm before changing.
func (e *Error) GoString() string {
return fmt.Sprintf("*%#v", *e)
}
// WrappedErrors returns the list of errors that this Error is wrapping.
// It is an implementation of the errwrap.Wrapper interface so that
// multierror.Error can be used with that library.
//
// This method is not safe to be called concurrently and is no different
// than accessing the Errors field directly. It is implemented only to
// satisfy the errwrap.Wrapper interface.
func (e *Error) WrappedErrors() []error {
return e.Errors
}

View file

@ -1,4 +1,4 @@
# memberlist
# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist)
memberlist is a [Go](http://www.golang.org) library that manages cluster
membership and member failure detection using a gossip based protocol.
@ -64,7 +64,7 @@ For complete documentation, see the associated [Godoc](http://godoc.org/github.c
## Protocol
memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf),
with a few minor adaptations, mostly to increase propogation speed and
with a few minor adaptations, mostly to increase propagation speed and
convergence rate.
A high level overview of the memberlist protocol (based on SWIM) is
@ -93,15 +93,22 @@ be disabled entirely.
Failure detection is done by periodic random probing using a configurable interval.
If the node fails to ack within a reasonable time (typically some multiple
of RTT), then an indirect probe is attempted. An indirect probe asks a
configurable number of random nodes to probe the same node, in case there
are network issues causing our own node to fail the probe. If both our
probe and the indirect probes fail within a reasonable time, then the
node is marked "suspicious" and this knowledge is gossiped to the cluster.
A suspicious node is still considered a member of cluster. If the suspect member
of the cluster does not disputes the suspicion within a configurable period of
time, the node is finally considered dead, and this state is then gossiped
to the cluster.
of RTT), then an indirect probe as well as a direct TCP probe are attempted. An
indirect probe asks a configurable number of random nodes to probe the same node,
in case there are network issues causing our own node to fail the probe. The direct
TCP probe is used to help identify the common situation where networking is
misconfigured to allow TCP but not UDP. Without the TCP probe, a UDP-isolated node
would think all other nodes were suspect and could cause churn in the cluster when
it attempts a TCP-based state exchange with another node. It is not desirable to
operate with only TCP connectivity because convergence will be much slower, but it
is enabled so that memberlist can detect this situation and alert operators.
If our probe, the indirect probes, and the direct TCP probe all fail within a
configurable time, then the node is marked "suspicious" and this knowledge is
gossiped to the cluster. A suspicious node is still considered a member of
cluster. If the suspect member of the cluster does not dispute the suspicion
within a configurable period of time, the node is finally considered dead,
and this state is then gossiped to the cluster.
This is a brief and incomplete description of the protocol. For a better idea,
please read the
@ -111,7 +118,7 @@ in its entirety, along with the memberlist source code.
### Changes from SWIM
As mentioned earlier, the memberlist protocol is based on SWIM but includes
minor changes, mostly to increase propogation speed and convergence rates.
minor changes, mostly to increase propagation speed and convergence rates.
The changes from SWIM are noted here:
@ -127,7 +134,7 @@ The changes from SWIM are noted here:
also will periodically send out dedicated gossip messages on their own. This
feature lets you have a higher gossip rate (for example once per 200ms)
and a slower failure detection rate (such as once per second), resulting
in overall faster convergence rates and data propogation speeds. This feature
in overall faster convergence rates and data propagation speeds. This feature
can be totally disabled as well, if you wish.
* memberlist stores around the state of dead nodes for a set amount of time,

View file

@ -0,0 +1,14 @@
package memberlist
// AliveDelegate is used to involve a client in processing
// a node "alive" message. When a node joins, either through
// a UDP gossip or TCP push/pull, we update the state of
// that node via an alive message. This can be used to filter
// a node out and prevent it from being considered a peer
// using application specific logic.
type AliveDelegate interface {
// NotifyAlive is invoked when an "alive" message for a node is
// being processed; peer describes that node. Going by the
// description above, returning a non-nil error is how the client
// filters the node out so it is not considered a peer.
// NOTE(review): the caller is not visible here — confirm.
NotifyAlive(peer *Node) error
}

View file

@ -2,6 +2,7 @@ package memberlist
import (
"io"
"log"
"os"
"time"
)
@ -85,6 +86,11 @@ type Config struct {
ProbeInterval time.Duration
ProbeTimeout time.Duration
// DisableTcpPings will turn off the fallback TCP pings that are attempted
// if the direct UDP ping fails. These get pipelined along with the
// indirect UDP pings.
DisableTcpPings bool
// GossipInterval and GossipNodes are used to configure the gossip
// behavior of memberlist.
//
@ -111,6 +117,8 @@ type Config struct {
// the first key used while attempting to decrypt messages. Providing a
// value for this primary key will enable message-level encryption and
// verification, and automatically install the key onto the keyring.
// The value should be either 16, 24, or 32 bytes to select AES-128,
// AES-192, or AES-256.
SecretKey []byte
// The keyring holds all of the encryption keys used internally. It is
@ -132,16 +140,29 @@ type Config struct {
Events EventDelegate
Conflict ConflictDelegate
Merge MergeDelegate
Ping PingDelegate
Alive AliveDelegate
// DNSConfigPath points to the system's DNS config file, usually located
// at /etc/resolv.conf. It can be overridden via config for easier testing.
DNSConfigPath string
// LogOutput is the writer where logs should be sent. If this is not
// set, logging will go to stderr by default.
// set, logging will go to stderr by default. You cannot specify both LogOutput
// and Logger at the same time.
LogOutput io.Writer
// Logger is a custom logger which you provide. If Logger is set, it will use
// this for the internal logger. If Logger is not set, it will fall back to the
// behavior for using LogOutput. You cannot specify both LogOutput and Logger
// at the same time.
Logger *log.Logger
}
// DefaultLANConfig returns a sane set of configurations for Memberlist.
// It uses the hostname as the node name, and otherwise sets very conservative
// values that are sane for most LAN environments. The default configuration
// errs on the side on the side of caution, choosing values that are optimized
// errs on the side of caution, choosing values that are optimized
// for higher convergence at the cost of higher bandwidth usage. Regardless,
// these values are a good starting point when getting started with memberlist.
func DefaultLANConfig() *Config {
@ -152,7 +173,7 @@ func DefaultLANConfig() *Config {
BindPort: 7946,
AdvertiseAddr: "",
AdvertisePort: 7946,
ProtocolVersion: ProtocolVersionMax,
ProtocolVersion: ProtocolVersion2Compatible,
TCPTimeout: 10 * time.Second, // Timeout after 10 seconds
IndirectChecks: 3, // Use 3 nodes for the indirect ping
RetransmitMult: 4, // Retransmit a message 4 * log(N+1) nodes
@ -160,6 +181,7 @@ func DefaultLANConfig() *Config {
PushPullInterval: 30 * time.Second, // Low frequency
ProbeTimeout: 500 * time.Millisecond, // Reasonable RTT time for LAN
ProbeInterval: 1 * time.Second, // Failure check every second
DisableTcpPings: false, // TCP pings are safe, even with mixed versions
GossipNodes: 3, // Gossip to 3 nodes
GossipInterval: 200 * time.Millisecond, // Gossip more rapidly
@ -167,8 +189,9 @@ func DefaultLANConfig() *Config {
EnableCompression: true, // Enable compression by default
SecretKey: nil,
Keyring: nil,
Keyring: nil,
DNSConfigPath: "/etc/resolv.conf",
}
}

View file

@ -19,7 +19,8 @@ type Delegate interface {
// It can return a list of buffers to send. Each buffer should assume an
// overhead as provided with a limit on the total byte size allowed.
// The total byte size of the resulting data to send must not exceed
// the limit.
// the limit. Care should be taken that this method does not block,
// since doing so would block the entire UDP packet receive loop.
GetBroadcasts(overhead, limit int) [][]byte
// LocalState is used for a TCP Push/Pull. This is sent to

View file

@ -34,6 +34,9 @@ func (k *Keyring) init() {
// keyring. If creating a keyring with multiple keys, one key must be designated
// primary by passing it as the primaryKey. If the primaryKey does not exist in
// the list of secondary keys, it will be automatically added at position 0.
//
// A key should be either 16, 24, or 32 bytes to select AES-128,
// AES-192, or AES-256.
func NewKeyring(keys [][]byte, primaryKey []byte) (*Keyring, error) {
keyring := &Keyring{}
keyring.init()
@ -58,10 +61,12 @@ func NewKeyring(keys [][]byte, primaryKey []byte) (*Keyring, error) {
// AddKey will install a new key on the ring. Adding a key to the ring will make
// it available for use in decryption. If the key already exists on the ring,
// this function will just return noop.
//
// key should be either 16, 24, or 32 bytes to select AES-128,
// AES-192, or AES-256.
func (k *Keyring) AddKey(key []byte) error {
// Encorce 16-byte key size
if len(key) != 16 {
return fmt.Errorf("key size must be 16 bytes")
if l := len(key); l != 16 && l != 24 && l != 32 {
return fmt.Errorf("key size must be 16, 24 or 32 bytes")
}
// No-op if key is already installed

View file

@ -0,0 +1,22 @@
package memberlist
import (
"fmt"
"net"
)
// LogAddress formats addr as a "from=<address>" fragment for log
// messages. A nil addr yields a placeholder instead of dereferencing it.
func LogAddress(addr net.Addr) string {
	if addr != nil {
		return fmt.Sprintf("from=%s", addr.String())
	}
	return "from=<unknown address>"
}
// LogConn formats the remote address of conn as a "from=<address>"
// fragment for log messages, delegating to LogAddress. A nil conn is
// reported the same way as a nil address.
func LogConn(conn net.Conn) string {
	var remote net.Addr
	if conn != nil {
		remote = conn.RemoteAddr()
	}
	return LogAddress(remote)
}

View file

@ -20,11 +20,19 @@ import (
"net"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/hashicorp/go-multierror"
"github.com/miekg/dns"
)
type Memberlist struct {
sequenceNum uint32 // Local sequence number
incarnation uint32 // Local incarnation number
numNodes uint32 // Number of known nodes (estimate)
config *Config
shutdown bool
shutdownCh chan struct{}
@ -35,9 +43,6 @@ type Memberlist struct {
tcpListener *net.TCPListener
handoff chan msgHandoff
sequenceNum uint32 // Local sequence number
incarnation uint32 // Local incarnation number
nodeLock sync.RWMutex
nodes []*nodeState // Known nodes
nodeMap map[string]*nodeState // Maps Addr.String() -> NodeState
@ -52,8 +57,6 @@ type Memberlist struct {
broadcasts *TransmitLimitedQueue
startStopLock sync.Mutex
logger *log.Logger
}
@ -90,6 +93,9 @@ func newMemberlist(conf *Config) (*Memberlist, error) {
if err != nil {
return nil, fmt.Errorf("Failed to start TCP listener. Err: %s", err)
}
if conf.BindPort == 0 {
conf.BindPort = tcpLn.Addr().(*net.TCPAddr).Port
}
udpAddr := &net.UDPAddr{IP: net.ParseIP(conf.BindAddr), Port: conf.BindPort}
udpLn, err := net.ListenUDP("udp", udpAddr)
@ -101,10 +107,19 @@ func newMemberlist(conf *Config) (*Memberlist, error) {
// Set the UDP receive window size
setUDPRecvBuf(udpLn)
if conf.LogOutput == nil {
conf.LogOutput = os.Stderr
if conf.LogOutput != nil && conf.Logger != nil {
return nil, fmt.Errorf("Cannot specify both LogOutput and Logger. Please choose a single log configuration setting.")
}
logDest := conf.LogOutput
if logDest == nil {
logDest = os.Stderr
}
logger := conf.Logger
if logger == nil {
logger = log.New(logDest, "", log.LstdFlags)
}
logger := log.New(conf.LogOutput, "", log.LstdFlags)
m := &Memberlist{
config: conf,
@ -118,7 +133,9 @@ func newMemberlist(conf *Config) (*Memberlist, error) {
broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult},
logger: logger,
}
m.broadcasts.NumNodes = func() int { return len(m.nodes) }
m.broadcasts.NumNodes = func() int {
return m.estNumNodes()
}
go m.tcpListen()
go m.udpListen()
go m.udpHandler()
@ -153,79 +170,158 @@ func Create(conf *Config) (*Memberlist, error) {
// none could be reached. If an error is returned, the node did not successfully
// join the cluster.
func (m *Memberlist) Join(existing []string) (int, error) {
// Attempt to join any of them
numSuccess := 0
var retErr error
var errs error
for _, exist := range existing {
addrs, port, err := m.resolveAddr(exist)
addrs, err := m.resolveAddr(exist)
if err != nil {
m.logger.Printf("[WARN] memberlist: Failed to resolve %s: %v", exist, err)
retErr = err
err = fmt.Errorf("Failed to resolve %s: %v", exist, err)
errs = multierror.Append(errs, err)
m.logger.Printf("[WARN] memberlist: %v", err)
continue
}
for _, addr := range addrs {
if err := m.pushPullNode(addr, port, true); err != nil {
retErr = err
if err := m.pushPullNode(addr.ip, addr.port, true); err != nil {
err = fmt.Errorf("Failed to join %s: %v", addr.ip, err)
errs = multierror.Append(errs, err)
m.logger.Printf("[DEBUG] memberlist: %v", err)
continue
}
numSuccess++
}
}
if numSuccess > 0 {
retErr = nil
errs = nil
}
return numSuccess, errs
}
// ipPort holds information about a node we want to try to join.
type ipPort struct {
// ip is the address of the candidate node.
ip net.IP
// port is the port paired with ip for the join attempt.
port uint16
}
// tcpLookupIP is a helper to initiate a TCP-based DNS lookup for the given host.
// The built-in Go resolver will do a UDP lookup first, and will only use TCP if
// the response has the truncate bit set, which isn't common on DNS servers like
// Consul's. By doing the TCP lookup directly, we get the best chance for the
// largest list of hosts to join. Since joins are relatively rare events, it's ok
// to do this rather expensive operation.
//
// Every A and AAAA record in the answer is returned paired with defaultPort.
// The result is (nil, nil) when host contains no dot or when the DNS client
// configuration lists no servers. An error is returned when the DNS config
// file cannot be read or when the exchange with the server fails. Only the
// first configured server is queried.
func (m *Memberlist) tcpLookupIP(host string, defaultPort uint16) ([]ipPort, error) {
// Don't attempt any TCP lookups against non-fully qualified domain
// names, since those will likely come from the resolv.conf file.
if !strings.Contains(host, ".") {
return nil, nil
}
// NOTE(review): the next line references numSuccess/retErr, which are not
// declared in this function; it looks like residue of the old Join return
// statement left behind by the diff rendering - confirm and drop.
return numSuccess, retErr
// Make sure the domain name is terminated with a dot (we know there's
// at least one character at this point).
dn := host
if dn[len(dn)-1] != '.' {
dn = dn + "."
}
// See if we can find a server to try.
cc, err := dns.ClientConfigFromFile(m.config.DNSConfigPath)
if err != nil {
return nil, err
}
if len(cc.Servers) > 0 {
// We support host:port in the DNS config, but need to add the
// default port if one is not supplied.
server := cc.Servers[0]
if !hasPort(server) {
server = net.JoinHostPort(server, cc.Port)
}
// Do the lookup.
c := new(dns.Client)
c.Net = "tcp"
msg := new(dns.Msg)
msg.SetQuestion(dn, dns.TypeANY)
in, _, err := c.Exchange(msg, server)
if err != nil {
return nil, err
}
// Handle any IPs we get back that we can attempt to join.
var ips []ipPort
for _, r := range in.Answer {
switch rr := r.(type) {
case (*dns.A):
ips = append(ips, ipPort{rr.A, defaultPort})
case (*dns.AAAA):
ips = append(ips, ipPort{rr.AAAA, defaultPort})
case (*dns.CNAME):
// CNAME targets are not followed here; the record is only logged.
m.logger.Printf("[DEBUG] memberlist: Ignoring CNAME RR in TCP-first answer for '%s'", host)
}
}
return ips, nil
}
// No DNS server configured, so there is nothing to query.
return nil, nil
}
// resolveAddr is used to resolve the address into an address,
// port, and error. If no port is given, use the default
func (m *Memberlist) resolveAddr(hostStr string) ([][]byte, uint16, error) {
ips := make([][]byte, 0)
func (m *Memberlist) resolveAddr(hostStr string) ([]ipPort, error) {
// Normalize the incoming string to host:port so we can apply Go's
// parser to it.
port := uint16(0)
if !hasPort(hostStr) {
hostStr += ":" + strconv.Itoa(m.config.BindPort)
}
host, sport, err := net.SplitHostPort(hostStr)
if ae, ok := err.(*net.AddrError); ok && ae.Err == "missing port in address" {
// error, port missing - we can solve this
port = uint16(m.config.BindPort)
host = hostStr
} else if err != nil {
// error, but not missing port
return ips, port, err
} else if lport, err := strconv.ParseUint(sport, 10, 16); err != nil {
// error, when parsing port
return ips, port, err
} else {
// no error
port = uint16(lport)
if err != nil {
return nil, err
}
// Get the addresses that hostPort might resolve to
// ResolveTcpAddr requres ipv6 brackets to separate
// port numbers whereas ParseIP doesn't, but luckily
// SplitHostPort takes care of the brackets
if ip := net.ParseIP(host); ip == nil {
if pre, err := net.LookupIP(host); err == nil {
for _, ip := range pre {
ips = append(ips, ip)
}
} else {
return ips, port, err
}
} else {
ips = append(ips, ip)
// This will capture the supplied port, or the default one added above.
lport, err := strconv.ParseUint(sport, 10, 16)
if err != nil {
return nil, err
}
port = uint16(lport)
// If it looks like an IP address we are done. The SplitHostPort() above
// will make sure the host part is in good shape for parsing, even for
// IPv6 addresses.
if ip := net.ParseIP(host); ip != nil {
return []ipPort{ipPort{ip, port}}, nil
}
return ips, port, nil
// First try TCP so we have the best chance for the largest list of
// hosts to join. If this fails it's not fatal since this isn't a standard
// way to query DNS, and we have a fallback below.
ips, err := m.tcpLookupIP(host, port)
if err != nil {
m.logger.Printf("[DEBUG] memberlist: TCP-first lookup failed for '%s', falling back to UDP: %s", hostStr, err)
}
if len(ips) > 0 {
return ips, nil
}
// If TCP didn't yield anything then use the normal Go resolver which
// will try UDP, then might possibly try TCP again if the UDP response
// indicates it was truncated.
ans, err := net.LookupIP(host)
if err != nil {
return nil, err
}
ips = make([]ipPort, 0, len(ans))
for _, ip := range ans {
ips = append(ips, ipPort{ip, port})
}
return ips, nil
}
// setAlive is used to mark this node as being alive. This is the same
// as if we received an alive notification our own network channel for
// ourself.
func (m *Memberlist) setAlive() error {
var advertiseAddr []byte
var advertisePort int
if m.config.AdvertiseAddr != "" {
@ -268,7 +364,7 @@ func (m *Memberlist) setAlive() error {
if ip.To4() == nil {
continue
}
if !isPrivateIP(ip.String()) {
if !IsPrivateIP(ip.String()) {
continue
}
@ -286,12 +382,14 @@ func (m *Memberlist) setAlive() error {
addr := m.tcpListener.Addr().(*net.TCPAddr)
advertiseAddr = addr.IP
}
advertisePort = m.config.BindPort
// Use the port we are bound to.
advertisePort = m.tcpListener.Addr().(*net.TCPAddr).Port
}
// Check if this is a public address without encryption
addrStr := net.IP(advertiseAddr).String()
if !isPrivateIP(addrStr) && !isLoopbackIP(addrStr) && !m.config.EncryptionEnabled() {
if !IsPrivateIP(addrStr) && !isLoopbackIP(addrStr) && !m.config.EncryptionEnabled() {
m.logger.Printf("[WARN] memberlist: Binding to public address without encryption!")
}
@ -385,7 +483,8 @@ func (m *Memberlist) UpdateNode(timeout time.Duration) error {
// user-data message, which a delegate will receive through NotifyMsg
// The actual data is transmitted over UDP, which means this is a
// best-effort transmission mechanism, and the maximum size of the
// message is the size of a single UDP datagram, after compression
// message is the size of a single UDP datagram, after compression.
// This method is DEPRECATED in favor of SendToUDP
func (m *Memberlist) SendTo(to net.Addr, msg []byte) error {
// Encode as a user message
buf := make([]byte, 1, len(msg)+1)
@ -393,7 +492,36 @@ func (m *Memberlist) SendTo(to net.Addr, msg []byte) error {
buf = append(buf, msg...)
// Send the message
return m.rawSendMsg(to, buf)
return m.rawSendMsgUDP(to, buf)
}
// SendToUDP delivers msg straight to the given node, bypassing the gossip
// mechanism. The payload is framed as a user-data message, which a delegate
// will receive through NotifyMsg. Transport is UDP, so delivery is
// best-effort and the message, after compression, must fit within a single
// UDP datagram.
func (m *Memberlist) SendToUDP(to *Node, msg []byte) error {
	// Frame layout: one userMsg type byte followed by the raw payload.
	packet := make([]byte, 0, len(msg)+1)
	packet = append(packet, byte(userMsg))
	packet = append(packet, msg...)

	return m.rawSendMsgUDP(&net.UDPAddr{IP: to.Addr, Port: int(to.Port)}, packet)
}
// SendToTCP delivers msg straight to the given node, bypassing the gossip
// mechanism. The payload is encoded as a user-data message, which a delegate
// will receive through NotifyMsg. Transport is TCP, which means delivery is
// guaranteed if no error is returned, and there is no limit to the size of
// the message.
func (m *Memberlist) SendToTCP(to *Node, msg []byte) error {
	return m.sendTCPUserMsg(&net.TCPAddr{IP: to.Addr, Port: int(to.Port)}, msg)
}
// Members returns a list of all known live nodes. The node structures
@ -441,10 +569,12 @@ func (m *Memberlist) NumMembers() (alive int) {
// This method is safe to call multiple times, but must not be called
// after the cluster is already shut down.
func (m *Memberlist) Leave(timeout time.Duration) error {
m.startStopLock.Lock()
defer m.startStopLock.Unlock()
m.nodeLock.Lock()
// We can't defer m.nodeLock.Unlock() because m.deadNode will also try to
// acquire a lock so we need to Unlock before that.
if m.shutdown {
m.nodeLock.Unlock()
panic("leave after shutdown")
}
@ -452,6 +582,7 @@ func (m *Memberlist) Leave(timeout time.Duration) error {
m.leave = true
state, ok := m.nodeMap[m.config.Name]
m.nodeLock.Unlock()
if !ok {
m.logger.Printf("[WARN] memberlist: Leave but we're not in the node map.")
return nil
@ -475,6 +606,8 @@ func (m *Memberlist) Leave(timeout time.Duration) error {
return fmt.Errorf("timeout waiting for leave broadcast")
}
}
} else {
m.nodeLock.Unlock()
}
return nil
@ -509,8 +642,8 @@ func (m *Memberlist) ProtocolVersion() uint8 {
//
// This method is safe to call multiple times.
func (m *Memberlist) Shutdown() error {
m.startStopLock.Lock()
defer m.startStopLock.Unlock()
m.nodeLock.Lock()
defer m.nodeLock.Unlock()
if m.shutdown {
return nil

View file

@ -8,6 +8,7 @@ package memberlist
// as part of the push-pull anti-entropy.
type MergeDelegate interface {
// NotifyMerge is invoked when a merge could take place.
// Provides a list of the nodes known by the peer.
NotifyMerge(peers []*Node) (cancel bool)
// Provides a list of the nodes known by the peer. If
// the return value is non-nil, the merge is canceled.
NotifyMerge(peers []*Node) error
}

View file

@ -18,7 +18,15 @@ import (
// range. This range is inclusive.
const (
ProtocolVersionMin uint8 = 1
ProtocolVersionMax = 2
// Version 3 added support for TCP pings but we kept the default
// protocol version at 2 to ease transition to this new feature.
// A memberlist speaking version 2 of the protocol will attempt
// to TCP ping another memberlist who understands version 3 or
// greater.
ProtocolVersion2Compatible = 2
ProtocolVersionMax = 3
)
// messageType is an integer ID of a type of message that can be received
@ -79,7 +87,8 @@ type indirectPingReq struct {
// ack response is sent for a ping
type ackResp struct {
SeqNo uint32
SeqNo uint32
Payload []byte
}
// suspect is broadcast when we suspect a node is dead
@ -119,6 +128,11 @@ type pushPullHeader struct {
Join bool // Is this a join request or a anti-entropy run
}
// userMsgHeader is used to encapsulate a userMsg
type userMsgHeader struct {
UserMsgLen int // Encodes the byte length of the user message that follows the header
}
// pushNodeState is used for pushPullReq when we are
// transferring out node states
type pushNodeState struct {
@ -185,54 +199,65 @@ func (m *Memberlist) tcpListen() {
// handleConn handles a single incoming TCP connection
func (m *Memberlist) handleConn(conn *net.TCPConn) {
m.logger.Printf("[DEBUG] memberlist: Responding to push/pull sync with: %s", conn.RemoteAddr())
m.logger.Printf("[DEBUG] memberlist: TCP connection %s", LogConn(conn))
defer conn.Close()
metrics.IncrCounter([]string{"memberlist", "tcp", "accept"}, 1)
join, remoteNodes, userState, err := m.readRemoteState(conn)
conn.SetDeadline(time.Now().Add(m.config.TCPTimeout))
msgType, bufConn, dec, err := m.readTCP(conn)
if err != nil {
m.logger.Printf("[ERR] memberlist: Failed to receive remote state: %s", err)
m.logger.Printf("[ERR] memberlist: failed to receive: %s %s", err, LogConn(conn))
return
}
if err := m.sendLocalState(conn, join); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to push local state: %s", err)
}
if err := m.verifyProtocol(remoteNodes); err != nil {
m.logger.Printf("[ERR] memberlist: Push/pull verification failed: %s", err)
return
}
// Invoke the merge delegate if any
if join && m.config.Merge != nil {
nodes := make([]*Node, len(remoteNodes))
for idx, n := range remoteNodes {
nodes[idx] = &Node{
Name: n.Name,
Addr: n.Addr,
Port: n.Port,
Meta: n.Meta,
PMin: n.Vsn[0],
PMax: n.Vsn[1],
PCur: n.Vsn[2],
DMin: n.Vsn[3],
DMax: n.Vsn[4],
DCur: n.Vsn[5],
}
switch msgType {
case userMsg:
if err := m.readUserMsg(bufConn, dec); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to receive user message: %s %s", err, LogConn(conn))
}
if m.config.Merge.NotifyMerge(nodes) {
m.logger.Printf("[WARN] memberlist: Cluster merge canceled")
case pushPullMsg:
join, remoteNodes, userState, err := m.readRemoteState(bufConn, dec)
if err != nil {
m.logger.Printf("[ERR] memberlist: Failed to read remote state: %s %s", err, LogConn(conn))
return
}
}
// Merge the membership state
m.mergeState(remoteNodes)
if err := m.sendLocalState(conn, join); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to push local state: %s %s", err, LogConn(conn))
return
}
// Invoke the delegate for user state
if m.config.Delegate != nil {
m.config.Delegate.MergeRemoteState(userState, join)
if err := m.mergeRemoteState(join, remoteNodes, userState); err != nil {
m.logger.Printf("[ERR] memberlist: Failed push/pull merge: %s %s", err, LogConn(conn))
return
}
case pingMsg:
var p ping
if err := dec.Decode(&p); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode TCP ping: %s %s", err, LogConn(conn))
return
}
if p.Node != "" && p.Node != m.config.Name {
m.logger.Printf("[WARN] memberlist: Got ping for unexpected node %s %s", p.Node, LogConn(conn))
return
}
ack := ackResp{p.SeqNo, nil}
out, err := encode(ackRespMsg, &ack)
if err != nil {
m.logger.Printf("[ERR] memberlist: Failed to encode TCP ack: %s", err)
return
}
err = m.rawSendMsgTCP(conn, out.Bytes())
if err != nil {
m.logger.Printf("[ERR] memberlist: Failed to send TCP ack: %s %s", err, LogConn(conn))
return
}
default:
m.logger.Printf("[ERR] memberlist: Received invalid msgType (%d) %s", msgType, LogConn(conn))
}
}
@ -265,29 +290,30 @@ func (m *Memberlist) udpListen() {
continue
}
// Capture the reception time of the packet as close to the
// system calls as possible.
lastPacket = time.Now()
// Check the length
if n < 1 {
m.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes). From: %s",
len(buf), addr)
m.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes) %s",
len(buf), LogAddress(addr))
continue
}
// Capture the current time
lastPacket = time.Now()
// Ingest this packet
metrics.IncrCounter([]string{"memberlist", "udp", "received"}, float32(n))
m.ingestPacket(buf[:n], addr)
m.ingestPacket(buf[:n], addr, lastPacket)
}
}
func (m *Memberlist) ingestPacket(buf []byte, from net.Addr) {
func (m *Memberlist) ingestPacket(buf []byte, from net.Addr, timestamp time.Time) {
// Check if encryption is enabled
if m.config.EncryptionEnabled() {
// Decrypt the payload
plain, err := decryptPayload(m.config.Keyring.GetKeys(), buf, nil)
if err != nil {
m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v", err)
m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v %s", err, LogAddress(from))
return
}
@ -296,10 +322,10 @@ func (m *Memberlist) ingestPacket(buf []byte, from net.Addr) {
}
// Handle the command
m.handleCommand(buf, from)
m.handleCommand(buf, from, timestamp)
}
func (m *Memberlist) handleCommand(buf []byte, from net.Addr) {
func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Time) {
// Decode the message type
msgType := messageType(buf[0])
buf = buf[1:]
@ -307,16 +333,16 @@ func (m *Memberlist) handleCommand(buf []byte, from net.Addr) {
// Switch on the msgType
switch msgType {
case compoundMsg:
m.handleCompound(buf, from)
m.handleCompound(buf, from, timestamp)
case compressMsg:
m.handleCompressed(buf, from)
m.handleCompressed(buf, from, timestamp)
case pingMsg:
m.handlePing(buf, from)
case indirectPingMsg:
m.handleIndirectPing(buf, from)
case ackRespMsg:
m.handleAck(buf, from)
m.handleAck(buf, from, timestamp)
case suspectMsg:
fallthrough
@ -328,11 +354,11 @@ func (m *Memberlist) handleCommand(buf []byte, from net.Addr) {
select {
case m.handoff <- msgHandoff{msgType, buf, from}:
default:
m.logger.Printf("[WARN] memberlist: UDP handler queue full, dropping message (%d)", msgType)
m.logger.Printf("[WARN] memberlist: UDP handler queue full, dropping message (%d) %s", msgType, LogAddress(from))
}
default:
m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported. From: %s", msgType, from)
m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported %s", msgType, LogAddress(from))
}
}
@ -357,7 +383,7 @@ func (m *Memberlist) udpHandler() {
case userMsg:
m.handleUser(buf, from)
default:
m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported. From: %s (handler)", msgType, from)
m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported %s (handler)", msgType, LogAddress(from))
}
case <-m.shutdownCh:
@ -366,46 +392,50 @@ func (m *Memberlist) udpHandler() {
}
}
func (m *Memberlist) handleCompound(buf []byte, from net.Addr) {
func (m *Memberlist) handleCompound(buf []byte, from net.Addr, timestamp time.Time) {
// Decode the parts
trunc, parts, err := decodeCompoundMessage(buf)
if err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode compound request: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to decode compound request: %s %s", err, LogAddress(from))
return
}
// Log any truncation
if trunc > 0 {
m.logger.Printf("[WARN] memberlist: Compound request had %d truncated messages", trunc)
m.logger.Printf("[WARN] memberlist: Compound request had %d truncated messages %s", trunc, LogAddress(from))
}
// Handle each message
for _, part := range parts {
m.handleCommand(part, from)
m.handleCommand(part, from, timestamp)
}
}
func (m *Memberlist) handlePing(buf []byte, from net.Addr) {
var p ping
if err := decode(buf, &p); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode ping request: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to decode ping request: %s %s", err, LogAddress(from))
return
}
// If node is provided, verify that it is for us
if p.Node != "" && p.Node != m.config.Name {
m.logger.Printf("[WARN] memberlist: Got ping for unexpected node '%s'", p.Node)
m.logger.Printf("[WARN] memberlist: Got ping for unexpected node '%s' %s", p.Node, LogAddress(from))
return
}
ack := ackResp{p.SeqNo}
var ack ackResp
ack.SeqNo = p.SeqNo
if m.config.Ping != nil {
ack.Payload = m.config.Ping.AckPayload()
}
if err := m.encodeAndSendMsg(from, ackRespMsg, &ack); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to send ack: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogAddress(from))
}
}
func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) {
var ind indirectPingReq
if err := decode(buf, &ind); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode indirect ping request: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to decode indirect ping request: %s %s", err, LogAddress(from))
return
}
@ -421,33 +451,33 @@ func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) {
destAddr := &net.UDPAddr{IP: ind.Target, Port: int(ind.Port)}
// Setup a response handler to relay the ack
respHandler := func() {
ack := ackResp{ind.SeqNo}
respHandler := func(payload []byte, timestamp time.Time) {
ack := ackResp{ind.SeqNo, nil}
if err := m.encodeAndSendMsg(from, ackRespMsg, &ack); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to forward ack: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to forward ack: %s %s", err, LogAddress(from))
}
}
m.setAckHandler(localSeqNo, respHandler, m.config.ProbeTimeout)
// Send the ping
if err := m.encodeAndSendMsg(destAddr, pingMsg, &ping); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to send ping: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to send ping: %s %s", err, LogAddress(from))
}
}
func (m *Memberlist) handleAck(buf []byte, from net.Addr) {
func (m *Memberlist) handleAck(buf []byte, from net.Addr, timestamp time.Time) {
var ack ackResp
if err := decode(buf, &ack); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode ack response: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to decode ack response: %s %s", err, LogAddress(from))
return
}
m.invokeAckHandler(ack.SeqNo)
m.invokeAckHandler(ack, timestamp)
}
func (m *Memberlist) handleSuspect(buf []byte, from net.Addr) {
var sus suspect
if err := decode(buf, &sus); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode suspect message: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to decode suspect message: %s %s", err, LogAddress(from))
return
}
m.suspectNode(&sus)
@ -456,7 +486,7 @@ func (m *Memberlist) handleSuspect(buf []byte, from net.Addr) {
func (m *Memberlist) handleAlive(buf []byte, from net.Addr) {
var live alive
if err := decode(buf, &live); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode alive message: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to decode alive message: %s %s", err, LogAddress(from))
return
}
@ -472,7 +502,7 @@ func (m *Memberlist) handleAlive(buf []byte, from net.Addr) {
func (m *Memberlist) handleDead(buf []byte, from net.Addr) {
var d dead
if err := decode(buf, &d); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decode dead message: %s", err)
m.logger.Printf("[ERR] memberlist: Failed to decode dead message: %s %s", err, LogAddress(from))
return
}
m.deadNode(&d)
@ -487,16 +517,16 @@ func (m *Memberlist) handleUser(buf []byte, from net.Addr) {
}
// handleCompressed is used to unpack a compressed message
func (m *Memberlist) handleCompressed(buf []byte, from net.Addr) {
func (m *Memberlist) handleCompressed(buf []byte, from net.Addr, timestamp time.Time) {
// Try to decode the payload
payload, err := decompressPayload(buf)
if err != nil {
m.logger.Printf("[ERR] memberlist: Failed to decompress payload: %v", err)
m.logger.Printf("[ERR] memberlist: Failed to decompress payload: %v %s", err, LogAddress(from))
return
}
// Recursively handle the payload
m.handleCommand(payload, from)
m.handleCommand(payload, from, timestamp)
}
// encodeAndSendMsg is used to combine the encoding and sending steps
@ -523,7 +553,7 @@ func (m *Memberlist) sendMsg(to net.Addr, msg []byte) error {
// Fast path if nothing to piggyback
if len(extra) == 0 {
return m.rawSendMsg(to, msg)
return m.rawSendMsgUDP(to, msg)
}
// Join all the messages
@ -535,11 +565,11 @@ func (m *Memberlist) sendMsg(to net.Addr, msg []byte) error {
compound := makeCompoundMessage(msgs)
// Send the message
return m.rawSendMsg(to, compound.Bytes())
return m.rawSendMsgUDP(to, compound.Bytes())
}
// rawSendMsg is used to send a UDP message to another host without modification
func (m *Memberlist) rawSendMsg(to net.Addr, msg []byte) error {
// rawSendMsgUDP is used to send a UDP message to another host without modification
func (m *Memberlist) rawSendMsgUDP(to net.Addr, msg []byte) error {
// Check if we have compression enabled
if m.config.EnableCompression {
buf, err := compressPayload(msg)
@ -571,7 +601,72 @@ func (m *Memberlist) rawSendMsg(to net.Addr, msg []byte) error {
return err
}
// sendState is used to initiate a push/pull over TCP with a remote node
// rawSendMsgTCP writes sendBuf to conn, applying compression and encryption
// first when the configuration enables them. A compression failure is logged
// and the uncompressed bytes are sent instead; an encryption failure aborts
// the send. An error is also returned when the connection accepts fewer
// bytes than were offered.
func (m *Memberlist) rawSendMsgTCP(conn net.Conn, sendBuf []byte) error {
	payload := sendBuf

	// Compression is best-effort: on failure we keep the original bytes.
	if m.config.EnableCompression {
		if compressed, err := compressPayload(payload); err == nil {
			payload = compressed.Bytes()
		} else {
			m.logger.Printf("[ERROR] memberlist: Failed to compress payload: %v", err)
		}
	}

	// Encryption is mandatory once configured, so a failure is fatal here.
	if m.config.EncryptionEnabled() {
		encrypted, err := m.encryptLocalState(payload)
		if err != nil {
			m.logger.Printf("[ERROR] memberlist: Failed to encrypt local state: %v", err)
			return err
		}
		payload = encrypted
	}

	// Record the on-the-wire size, then push everything out in one write.
	metrics.IncrCounter([]string{"memberlist", "tcp", "sent"}, float32(len(payload)))
	written, err := conn.Write(payload)
	switch {
	case err != nil:
		return err
	case written != len(payload):
		return fmt.Errorf("only %d of %d bytes written", written, len(payload))
	}
	return nil
}
// sendTCPUserMsg opens a TCP connection to the given address and sends
// sendBuf as a userMsg. The frame is the userMsg type byte, a msgpack-encoded
// userMsgHeader carrying the payload length, and then the payload itself.
// The connection is closed before returning.
func (m *Memberlist) sendTCPUserMsg(to net.Addr, sendBuf []byte) error {
	conn, err := (&net.Dialer{Timeout: m.config.TCPTimeout}).Dial("tcp", to.String())
	if err != nil {
		return err
	}
	defer conn.Close()

	// Assemble the whole frame in memory so it goes out in a single send.
	frame := bytes.NewBuffer(nil)
	if err := frame.WriteByte(byte(userMsg)); err != nil {
		return err
	}

	// The header tells the receiver how many payload bytes follow it.
	handle := codec.MsgpackHandle{}
	hdr := userMsgHeader{UserMsgLen: len(sendBuf)}
	if err := codec.NewEncoder(frame, &handle).Encode(&hdr); err != nil {
		return err
	}

	if _, err := frame.Write(sendBuf); err != nil {
		return err
	}

	return m.rawSendMsgTCP(conn, frame.Bytes())
}
// sendAndReceiveState is used to initiate a push/pull over TCP with a remote node
func (m *Memberlist) sendAndReceiveState(addr []byte, port uint16, join bool) ([]pushNodeState, []byte, error) {
// Attempt to connect
dialer := net.Dialer{Timeout: m.config.TCPTimeout}
@ -589,15 +684,21 @@ func (m *Memberlist) sendAndReceiveState(addr []byte, port uint16, join bool) ([
return nil, nil, err
}
// Read remote state
_, remote, userState, err := m.readRemoteState(conn)
conn.SetDeadline(time.Now().Add(m.config.TCPTimeout))
msgType, bufConn, dec, err := m.readTCP(conn)
if err != nil {
err := fmt.Errorf("Reading remote state failed: %v", err)
return nil, nil, err
}
// Return the remote state
return remote, userState, nil
// Quit if not push/pull
if msgType != pushPullMsg {
err := fmt.Errorf("received invalid msgType (%d), expected pushPullMsg (%d) %s", msgType, pushPullMsg, LogConn(conn))
return nil, nil, err
}
// Read remote state
_, remoteNodes, userState, err := m.readRemoteState(bufConn, dec)
return remoteNodes, userState, err
}
// sendLocalState is invoked to send our local state over a tcp connection
@ -658,34 +759,7 @@ func (m *Memberlist) sendLocalState(conn net.Conn, join bool) error {
}
// Get the send buffer
sendBuf := bufConn.Bytes()
// Check if compresion is enabled
if m.config.EnableCompression {
compBuf, err := compressPayload(bufConn.Bytes())
if err != nil {
m.logger.Printf("[ERROR] memberlist: Failed to compress local state: %v", err)
} else {
sendBuf = compBuf.Bytes()
}
}
// Check if encryption is enabled
if m.config.EncryptionEnabled() {
crypt, err := m.encryptLocalState(sendBuf)
if err != nil {
m.logger.Printf("[ERROR] memberlist: Failed to encrypt local state: %v", err)
return err
}
sendBuf = crypt
}
// Write out the entire send buffer
metrics.IncrCounter([]string{"memberlist", "tcp", "sent"}, float32(len(sendBuf)))
if _, err := conn.Write(sendBuf); err != nil {
return err
}
return nil
return m.rawSendMsgTCP(conn, bufConn.Bytes())
}
// encryptLocalState is used to help encrypt local state before sending
@ -743,38 +817,36 @@ func (m *Memberlist) decryptRemoteState(bufConn io.Reader) ([]byte, error) {
return decryptPayload(keys, cipherBytes, dataBytes)
}
// recvRemoteState is used to read the remote state from a connection
func (m *Memberlist) readRemoteState(conn net.Conn) (bool, []pushNodeState, []byte, error) {
// Setup a deadline
conn.SetDeadline(time.Now().Add(m.config.TCPTimeout))
// readTCP is used to read the start of a TCP stream.
// it decrypts and decompresses the stream if necessary
func (m *Memberlist) readTCP(conn net.Conn) (messageType, io.Reader, *codec.Decoder, error) {
// Created a buffered reader
var bufConn io.Reader = bufio.NewReader(conn)
// Read the message type
buf := [1]byte{0}
if _, err := bufConn.Read(buf[:]); err != nil {
return false, nil, nil, err
return 0, nil, nil, err
}
msgType := messageType(buf[0])
// Check if the message is encrypted
if msgType == encryptMsg {
if !m.config.EncryptionEnabled() {
return false, nil, nil,
return 0, nil, nil,
fmt.Errorf("Remote state is encrypted and encryption is not configured")
}
plain, err := m.decryptRemoteState(bufConn)
if err != nil {
return false, nil, nil, err
return 0, nil, nil, err
}
// Reset message type and bufConn
msgType = messageType(plain[0])
bufConn = bytes.NewReader(plain[1:])
} else if m.config.EncryptionEnabled() {
return false, nil, nil,
return 0, nil, nil,
fmt.Errorf("Encryption is configured but remote state is not encrypted")
}
@ -786,11 +858,11 @@ func (m *Memberlist) readRemoteState(conn net.Conn) (bool, []pushNodeState, []by
if msgType == compressMsg {
var c compress
if err := dec.Decode(&c); err != nil {
return false, nil, nil, err
return 0, nil, nil, err
}
decomp, err := decompressBuffer(&c)
if err != nil {
return false, nil, nil, err
return 0, nil, nil, err
}
// Reset the message type
@ -803,12 +875,11 @@ func (m *Memberlist) readRemoteState(conn net.Conn) (bool, []pushNodeState, []by
dec = codec.NewDecoder(bufConn, &hd)
}
// Quit if not push/pull
if msgType != pushPullMsg {
err := fmt.Errorf("received invalid msgType (%d)", msgType)
return false, nil, nil, err
}
return msgType, bufConn, dec, nil
}
// readRemoteState is used to read the remote state from a connection
func (m *Memberlist) readRemoteState(bufConn io.Reader, dec *codec.Decoder) (bool, []pushNodeState, []byte, error) {
// Read the push/pull header
var header pushPullHeader
if err := dec.Decode(&header); err != nil {
@ -821,7 +892,7 @@ func (m *Memberlist) readRemoteState(conn net.Conn) (bool, []pushNodeState, []by
// Try to decode all the states
for i := 0; i < header.Nodes; i++ {
if err := dec.Decode(&remoteNodes[i]); err != nil {
return false, remoteNodes, nil, err
return false, nil, nil, err
}
}
@ -836,7 +907,7 @@ func (m *Memberlist) readRemoteState(conn net.Conn) (bool, []pushNodeState, []by
bytes, header.UserStateLen)
}
if err != nil {
return false, remoteNodes, nil, err
return false, nil, nil, err
}
}
@ -850,3 +921,119 @@ func (m *Memberlist) readRemoteState(conn net.Conn) (bool, []pushNodeState, []by
return header.Join, remoteNodes, userBuf, nil
}
// mergeRemoteState folds a peer's state into our own. The remote nodes are
// first checked with verifyProtocol. On a join, the merge delegate (if
// configured) is shown the peer's node list and may cancel the merge by
// returning an error. After that the membership state is merged, and any
// user state is handed to the delegate.
func (m *Memberlist) mergeRemoteState(join bool, remoteNodes []pushNodeState, userBuf []byte) error {
	if err := m.verifyProtocol(remoteNodes); err != nil {
		return err
	}

	// Only joins are subject to the merge delegate's veto.
	if merge := m.config.Merge; join && merge != nil {
		peers := make([]*Node, 0, len(remoteNodes))
		for i := range remoteNodes {
			rn := &remoteNodes[i]
			peers = append(peers, &Node{
				Name: rn.Name,
				Addr: rn.Addr,
				Port: rn.Port,
				Meta: rn.Meta,
				PMin: rn.Vsn[0],
				PMax: rn.Vsn[1],
				PCur: rn.Vsn[2],
				DMin: rn.Vsn[3],
				DMax: rn.Vsn[4],
				DCur: rn.Vsn[5],
			})
		}
		if err := merge.NotifyMerge(peers); err != nil {
			return err
		}
	}

	// Membership first, then the opaque user state.
	m.mergeState(remoteNodes)

	if delegate := m.config.Delegate; userBuf != nil && delegate != nil {
		delegate.MergeRemoteState(userBuf, join)
	}
	return nil
}
// readUserMsg decodes a userMsg from a TCP stream. It reads the
// userMsgHeader through dec, then pulls exactly UserMsgLen payload bytes
// from bufConn and passes them to the configured delegate's NotifyMsg.
// A header with a non-positive length is accepted and nothing is delivered.
func (m *Memberlist) readUserMsg(bufConn io.Reader, dec *codec.Decoder) error {
	var hdr userMsgHeader
	if err := dec.Decode(&hdr); err != nil {
		return err
	}

	// Nothing to read or deliver for an empty message.
	if hdr.UserMsgLen <= 0 {
		return nil
	}

	// Fill the buffer completely or fail.
	payload := make([]byte, hdr.UserMsgLen)
	got, err := io.ReadFull(bufConn, payload)
	if err == nil && got != hdr.UserMsgLen {
		err = fmt.Errorf(
			"Failed to read full user message (%d / %d)",
			got, hdr.UserMsgLen)
	}
	if err != nil {
		return err
	}

	if delegate := m.config.Delegate; delegate != nil {
		delegate.NotifyMsg(payload)
	}
	return nil
}
// sendPingAndWaitForAck makes a TCP connection to the given address, sends
// a ping, and waits for an ack. Every step blocks and is bounded by the
// given deadline. The bool result is true if we were able to round trip a
// ping to the other node. A failure to connect is reported as (false, nil);
// any later failure is returned as an error.
func (m *Memberlist) sendPingAndWaitForAck(destAddr net.Addr, ping ping, deadline time.Time) (bool, error) {
	conn, err := (&net.Dialer{Deadline: deadline}).Dial("tcp", destAddr.String())
	if err != nil {
		// If the node is actually dead we expect this to fail, so we
		// shouldn't spam the logs with it. After this point, errors
		// with the connection are real, unexpected errors and should
		// get propagated up.
		return false, nil
	}
	defer conn.Close()
	conn.SetDeadline(deadline)

	// Send the ping.
	request, err := encode(pingMsg, &ping)
	if err != nil {
		return false, err
	}
	if err := m.rawSendMsgTCP(conn, request.Bytes()); err != nil {
		return false, err
	}

	// Read the reply and make sure it is an ack.
	msgType, _, dec, err := m.readTCP(conn)
	switch {
	case err != nil:
		return false, err
	case msgType != ackRespMsg:
		return false, fmt.Errorf("Unexpected msgType (%d) from TCP ping %s", msgType, LogConn(conn))
	}

	// The ack must echo the sequence number of the ping we sent.
	var reply ackResp
	if err := dec.Decode(&reply); err != nil {
		return false, err
	}
	if reply.SeqNo != ping.SeqNo {
		return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d) from TCP ping %s", reply.SeqNo, ping.SeqNo, LogConn(conn))
	}

	return true, nil
}

View file

@ -0,0 +1,14 @@
package memberlist
import "time"
// PingDelegate is used to notify an observer how long it took for a ping message to
// complete a round trip. It can also be used for writing arbitrary byte slices
// into ack messages. Note that in order to be meaningful for RTT estimates, this
// delegate does not apply to indirect pings, nor fallback pings sent over TCP.
type PingDelegate interface {
	// AckPayload is invoked when an ack is being sent; the returned bytes will be appended to the ack
	AckPayload() []byte
	// NotifyPingComplete is invoked when an ack for a ping is received. other is
	// the node that was pinged, rtt is the measured round-trip time, and payload
	// is the byte slice carried by the ack.
	NotifyPingComplete(other *Node, rtt time.Duration, payload []byte)
}

View file

@ -44,10 +44,20 @@ type nodeState struct {
// ackHandler is used to register handlers for incoming acks
type ackHandler struct {
handler func()
handler func([]byte, time.Time)
timer *time.Timer
}
// NoPingResponseError reports that a 'ping' packet was issued successfully
// but no response to it was received.
type NoPingResponseError struct {
	// node is the name of the node that failed to respond.
	node string
}

// Error implements the error interface, naming the unresponsive node.
func (e NoPingResponseError) Error() string {
	const format = "No response from node %s"
	return fmt.Sprintf(format, e.node)
}
// Schedule is used to ensure the Tick is performed periodically. This
// function is safe to call multiple times. If the memberlist is already
// scheduled, then it won't do anything.
@ -128,9 +138,7 @@ func (m *Memberlist) pushPullTrigger(stop <-chan struct{}) {
// Tick using a dynamic timer
for {
m.nodeLock.RLock()
tickTime := pushPullScale(interval, len(m.nodes))
m.nodeLock.RUnlock()
tickTime := pushPullScale(interval, m.estNumNodes())
select {
case <-time.After(tickTime):
m.pushPull()
@ -207,46 +215,55 @@ START:
m.probeNode(&node)
}
// probeNode handles a single round of failure checking on a node
// probeNode handles a single round of failure checking on a node.
func (m *Memberlist) probeNode(node *nodeState) {
defer metrics.MeasureSince([]string{"memberlist", "probeNode"}, time.Now())
// Send a ping to the node
// Prepare a ping message and setup an ack handler.
ping := ping{SeqNo: m.nextSeqNo(), Node: node.Name}
destAddr := &net.UDPAddr{IP: node.Addr, Port: int(node.Port)}
// Setup an ack handler
ackCh := make(chan bool, m.config.IndirectChecks+1)
ackCh := make(chan ackMessage, m.config.IndirectChecks+1)
m.setAckChannel(ping.SeqNo, ackCh, m.config.ProbeInterval)
// Send the ping message
// Send a ping to the node.
deadline := time.Now().Add(m.config.ProbeInterval)
destAddr := &net.UDPAddr{IP: node.Addr, Port: int(node.Port)}
if err := m.encodeAndSendMsg(destAddr, pingMsg, &ping); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to send ping: %s", err)
return
}
// Wait for response or round-trip-time
// Mark the sent time here, which should be after any pre-processing and
// system calls to do the actual send. This probably under-reports a bit,
// but it's the best we can do.
sent := time.Now()
// Wait for response or round-trip-time.
select {
case v := <-ackCh:
if v == true {
if v.Complete == true {
if m.config.Ping != nil {
rtt := v.Timestamp.Sub(sent)
m.config.Ping.NotifyPingComplete(&node.Node, rtt, v.Payload)
}
return
}
// As an edge case, if we get a timeout, we need to re-enqueue it
// here to break out of the select below
if v == false {
// here to break out of the select below.
if v.Complete == false {
ackCh <- v
}
case <-time.After(m.config.ProbeTimeout):
m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node.Name)
}
// Get some random live nodes
// Get some random live nodes.
m.nodeLock.RLock()
excludes := []string{m.config.Name, node.Name}
kNodes := kRandomNodes(m.config.IndirectChecks, excludes, m.nodes)
m.nodeLock.RUnlock()
// Attempt an indirect ping
// Attempt an indirect ping.
ind := indirectPingReq{SeqNo: ping.SeqNo, Target: node.Addr, Port: node.Port, Node: node.Name}
for _, peer := range kNodes {
destAddr := &net.UDPAddr{IP: peer.Addr, Port: int(peer.Port)}
@ -255,10 +272,49 @@ func (m *Memberlist) probeNode(node *nodeState) {
}
}
// Wait for the acks or timeout
// Also make an attempt to contact the node directly over TCP. This
// helps prevent confused clients who get isolated from UDP traffic
// but can still speak TCP (which also means they can possibly report
// misinformation to other nodes via anti-entropy), avoiding flapping in
// the cluster.
//
// This is a little unusual because we will attempt a TCP ping to any
// member who understands version 3 of the protocol, regardless of
// which protocol version we are speaking. That's why we've included a
// config option to turn this off if desired.
fallbackCh := make(chan bool, 1)
if (!m.config.DisableTcpPings) && (node.PMax >= 3) {
destAddr := &net.TCPAddr{IP: node.Addr, Port: int(node.Port)}
go func() {
defer close(fallbackCh)
didContact, err := m.sendPingAndWaitForAck(destAddr, ping, deadline)
if err != nil {
m.logger.Printf("[ERR] memberlist: Failed TCP fallback ping: %s", err)
} else {
fallbackCh <- didContact
}
}()
} else {
close(fallbackCh)
}
// Wait for the acks or timeout. Note that we don't check the fallback
// channel here because we want to issue a warning below if that's the
// *only* way we hear back from the peer, so we have to let this time
// out first to allow the normal UDP-based acks to come in.
select {
case v := <-ackCh:
if v == true {
if v.Complete == true {
return
}
}
// Finally, poll the fallback channel. The timeouts are set such that
// the channel will have something or be closed without having to wait
// any additional time here.
for didContact := range fallbackCh {
if didContact {
m.logger.Printf("[WARN] memberlist: Was able to reach %s via TCP but not UDP, network may be misconfigured and not allowing bidirectional UDP", node.Name)
return
}
}
@ -269,6 +325,37 @@ func (m *Memberlist) probeNode(node *nodeState) {
m.suspectNode(&s)
}
// Ping sends a single ping for the node with the given name to addr and
// waits for the matching ack.
//
// On success it returns the round-trip time, computed as the ack's timestamp
// minus the moment the ping was handed off for sending. If sending fails the
// send error is returned. If the ack channel reports a timeout, or
// ProbeTimeout elapses first, a NoPingResponseError is returned.
func (m *Memberlist) Ping(node string, addr net.Addr) (time.Duration, error) {
	// Build the request and register a channel for its ack.
	req := ping{SeqNo: m.nextSeqNo(), Node: node}
	acks := make(chan ackMessage, m.config.IndirectChecks+1)
	m.setAckChannel(req.SeqNo, acks, m.config.ProbeInterval)

	if err := m.encodeAndSendMsg(addr, pingMsg, &req); err != nil {
		return 0, err
	}

	// Taken after encoding and the send call, so the measured RTT
	// probably under-reports slightly.
	sent := time.Now()

	select {
	case ack := <-acks:
		if ack.Complete {
			return ack.Timestamp.Sub(sent), nil
		}
	case <-time.After(m.config.ProbeTimeout):
		// Fall through to the failure path below.
	}

	m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node)
	return 0, NoPingResponseError{req.Node}
}
// resetNodes is used when the tick wraps around. It will reap the
// dead nodes and shuffle the node list.
func (m *Memberlist) resetNodes() {
@ -287,6 +374,9 @@ func (m *Memberlist) resetNodes() {
// Trim the nodes to exclude the dead nodes
m.nodes = m.nodes[0:deadIdx]
// Update numNodes after we've trimmed the dead nodes
atomic.StoreUint32(&m.numNodes, uint32(deadIdx))
// Shuffle live nodes
shuffleNodes(m.nodes)
}
@ -320,7 +410,7 @@ func (m *Memberlist) gossip() {
// Send the compound message
destAddr := &net.UDPAddr{IP: node.Addr, Port: int(node.Port)}
if err := m.rawSendMsg(destAddr, compound.Bytes()); err != nil {
if err := m.rawSendMsgUDP(destAddr, compound.Bytes()); err != nil {
m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", destAddr, err)
}
}
@ -359,40 +449,9 @@ func (m *Memberlist) pushPullNode(addr []byte, port uint16, join bool) error {
return err
}
if err := m.verifyProtocol(remote); err != nil {
if err := m.mergeRemoteState(join, remote, userState); err != nil {
return err
}
// Invoke the merge delegate if any
if join && m.config.Merge != nil {
nodes := make([]*Node, len(remote))
for idx, n := range remote {
nodes[idx] = &Node{
Name: n.Name,
Addr: n.Addr,
Port: n.Port,
Meta: n.Meta,
PMin: n.Vsn[0],
PMax: n.Vsn[1],
PCur: n.Vsn[2],
DMin: n.Vsn[3],
DMax: n.Vsn[4],
DCur: n.Vsn[5],
}
}
if m.config.Merge.NotifyMerge(nodes) {
m.logger.Printf("[WARN] memberlist: Cluster merge canceled")
return fmt.Errorf("Merge canceled")
}
}
// Merge the state
m.mergeState(remote)
// Invoke the delegate
if m.config.Delegate != nil {
m.config.Delegate.MergeRemoteState(userState, join)
}
return nil
}
@ -525,14 +584,24 @@ func (m *Memberlist) nextIncarnation() uint32 {
return atomic.AddUint32(&m.incarnation, 1)
}
// setAckChannel is used to attach a channel to receive a message when
// an ack with a given sequence number is received. The channel gets sent
// false on timeout
func (m *Memberlist) setAckChannel(seqNo uint32, ch chan bool, timeout time.Duration) {
// estNumNodes returns the current estimate of the number of nodes, read
// atomically from m.numNodes so callers do not need to hold the node lock.
func (m *Memberlist) estNumNodes() int {
	count := atomic.LoadUint32(&m.numNodes)
	return int(count)
}
// ackMessage is the value delivered on a channel registered with
// setAckChannel.
type ackMessage struct {
	// Complete is true when an ack was received and false when the wait
	// timed out.
	Complete bool
	// Payload is the payload carried by the ack; nil on timeout.
	Payload []byte
	// Timestamp is the time associated with the ack, or the time at which
	// the timeout fired.
	Timestamp time.Time
}
// setAckChannel is used to attach a channel to receive a message when an ack with a given
// sequence number is received. The `Complete` field of the message will be false on timeout
func (m *Memberlist) setAckChannel(seqNo uint32, ch chan ackMessage, timeout time.Duration) {
// Create a handler function
handler := func() {
handler := func(payload []byte, timestamp time.Time) {
select {
case ch <- true:
case ch <- ackMessage{true, payload, timestamp}:
default:
}
}
@ -549,7 +618,7 @@ func (m *Memberlist) setAckChannel(seqNo uint32, ch chan bool, timeout time.Dura
delete(m.ackHandlers, seqNo)
m.ackLock.Unlock()
select {
case ch <- false:
case ch <- ackMessage{false, nil, time.Now()}:
default:
}
})
@ -558,7 +627,7 @@ func (m *Memberlist) setAckChannel(seqNo uint32, ch chan bool, timeout time.Dura
// setAckHandler is used to attach a handler to be invoked when an
// ack with a given sequence number is received. If a timeout is reached,
// the handler is deleted
func (m *Memberlist) setAckHandler(seqNo uint32, handler func(), timeout time.Duration) {
func (m *Memberlist) setAckHandler(seqNo uint32, handler func([]byte, time.Time), timeout time.Duration) {
// Add the handler
ah := &ackHandler{handler, nil}
m.ackLock.Lock()
@ -574,16 +643,16 @@ func (m *Memberlist) setAckHandler(seqNo uint32, handler func(), timeout time.Du
}
// Invokes an Ack handler if any is associated, and reaps the handler immediately
func (m *Memberlist) invokeAckHandler(seqNo uint32) {
func (m *Memberlist) invokeAckHandler(ack ackResp, timestamp time.Time) {
m.ackLock.Lock()
ah, ok := m.ackHandlers[seqNo]
delete(m.ackHandlers, seqNo)
ah, ok := m.ackHandlers[ack.SeqNo]
delete(m.ackHandlers, ack.SeqNo)
m.ackLock.Unlock()
if !ok {
return
}
ah.timer.Stop()
ah.handler()
ah.handler(ack.Payload, timestamp)
}
// aliveNode is invoked by the network layer when we get a message about a
@ -601,6 +670,30 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) {
return
}
// Invoke the Alive delegate if any. This can be used to filter out
// alive messages based on custom logic. For example, using a cluster name.
// Using a merge delegate is not enough, as it is possible for passive
// cluster merging to still occur.
if m.config.Alive != nil {
node := &Node{
Name: a.Node,
Addr: a.Addr,
Port: a.Port,
Meta: a.Meta,
PMin: a.Vsn[0],
PMax: a.Vsn[1],
PCur: a.Vsn[2],
DMin: a.Vsn[3],
DMax: a.Vsn[4],
DCur: a.Vsn[5],
}
if err := m.config.Alive.NotifyAlive(node); err != nil {
m.logger.Printf("[WARN] memberlist: ignoring alive message for '%s': %s",
a.Node, err)
return
}
}
// Check if we've never seen this node before, and if not, then
// store this node in our node map.
if !ok {
@ -627,6 +720,9 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) {
// Add at the end and swap with the node at the offset
m.nodes = append(m.nodes, state)
m.nodes[offset], m.nodes[n] = m.nodes[n], m.nodes[offset]
// Update numNodes after we've added a new node
atomic.AddUint32(&m.numNodes, 1)
}
// Check if this address is different than the existing node
@ -658,9 +754,6 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) {
return
}
// Update metrics
metrics.IncrCounter([]string{"memberlist", "msg", "alive"}, 1)
// Store the old state and meta data
oldState := state.State
oldMeta := state.Meta
@ -728,6 +821,9 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) {
}
}
// Update metrics
metrics.IncrCounter([]string{"memberlist", "msg", "alive"}, 1)
// Notify the delegate of any relevant updates
if m.config.Events != nil {
if oldState == stateDead {
@ -799,7 +895,7 @@ func (m *Memberlist) suspectNode(s *suspect) {
state.StateChange = changeTime
// Setup a timeout for this
timeout := suspicionTimeout(m.config.SuspicionMult, len(m.nodes), m.config.ProbeInterval)
timeout := suspicionTimeout(m.config.SuspicionMult, m.estNumNodes(), m.config.ProbeInterval)
time.AfterFunc(timeout, func() {
m.nodeLock.Lock()
state, ok := m.nodeMap[s.Node]

View file

@ -5,12 +5,14 @@ import (
"compress/lzw"
"encoding/binary"
"fmt"
"github.com/hashicorp/go-msgpack/codec"
"io"
"math"
"math/rand"
"net"
"strings"
"time"
"github.com/hashicorp/go-msgpack/codec"
)
// pushPullScale is the minimum number of nodes
@ -23,8 +25,11 @@ const pushPullScaleThreshold = 32
/*
* Contains an entry for each private block:
* 10.0.0.0/8
* 100.64.0.0/10
* 127.0.0.0/8
* 169.254.0.0/16
* 172.16.0.0/12
* 192.168/16
* 192.168.0.0/16
*/
var privateBlocks []*net.IPNet
@ -40,25 +45,44 @@ func init() {
rand.Seed(time.Now().UnixNano())
// Add each private block
privateBlocks = make([]*net.IPNet, 3)
privateBlocks = make([]*net.IPNet, 6)
_, block, err := net.ParseCIDR("10.0.0.0/8")
if err != nil {
panic(fmt.Sprintf("Bad cidr. Got %v", err))
}
privateBlocks[0] = block
_, block, err = net.ParseCIDR("172.16.0.0/12")
_, block, err = net.ParseCIDR("100.64.0.0/10")
if err != nil {
panic(fmt.Sprintf("Bad cidr. Got %v", err))
}
privateBlocks[1] = block
_, block, err = net.ParseCIDR("192.168.0.0/16")
_, block, err = net.ParseCIDR("127.0.0.0/8")
if err != nil {
panic(fmt.Sprintf("Bad cidr. Got %v", err))
}
privateBlocks[2] = block
_, block, err = net.ParseCIDR("169.254.0.0/16")
if err != nil {
panic(fmt.Sprintf("Bad cidr. Got %v", err))
}
privateBlocks[3] = block
_, block, err = net.ParseCIDR("172.16.0.0/12")
if err != nil {
panic(fmt.Sprintf("Bad cidr. Got %v", err))
}
privateBlocks[4] = block
_, block, err = net.ParseCIDR("192.168.0.0/16")
if err != nil {
panic(fmt.Sprintf("Bad cidr. Got %v", err))
}
privateBlocks[5] = block
_, block, err = net.ParseCIDR("127.0.0.0/8")
if err != nil {
panic(fmt.Sprintf("Bad cidr. Got %v", err))
@ -84,6 +108,42 @@ func encode(msgType messageType, in interface{}) (*bytes.Buffer, error) {
return buf, err
}
// GetPrivateIP returns the private IPv4 address found in a list of
// addresses.
//
// Only *net.IPAddr and *net.IPNet entries are considered; entries that are
// not IPv4 or that IsPrivateIP rejects are skipped. An error is returned when
// no candidate is found, and also when more than one is found, since the
// choice would be ambiguous.
func GetPrivateIP(addresses []net.Addr) (net.IP, error) {
	var private []net.IP

	for _, a := range addresses {
		// Pull the IP out of the address types we understand.
		var ip net.IP
		if ipAddr, ok := a.(*net.IPAddr); ok {
			ip = ipAddr.IP
		} else if ipNet, ok := a.(*net.IPNet); ok {
			ip = ipNet.IP
		} else {
			continue
		}

		// Keep only private IPv4 addresses.
		if ip.To4() != nil && IsPrivateIP(ip.String()) {
			private = append(private, ip)
		}
	}

	if len(private) == 0 {
		return nil, fmt.Errorf("No private IP address found")
	}
	if len(private) > 1 {
		return nil, fmt.Errorf("Multiple private IPs found. Please configure one.")
	}
	return private[0], nil
}
// Returns a random offset between 0 and n
func randomOffset(n int) int {
if n == 0 {
@ -107,9 +167,10 @@ func retransmitLimit(retransmitMult, n int) int {
return limit
}
// shuffleNodes randomly shuffles the input nodes
// shuffleNodes randomly shuffles the input nodes using the Fisher-Yates shuffle
func shuffleNodes(nodes []*nodeState) {
for i := range nodes {
n := len(nodes)
for i := n - 1; i > 0; i-- {
j := rand.Intn(i + 1)
nodes[i], nodes[j] = nodes[j], nodes[i]
}
@ -250,7 +311,7 @@ func decodeCompoundMessage(buf []byte) (trunc int, parts [][]byte, err error) {
}
// Returns if the given IP is in a private block
func isPrivateIP(ip_str string) bool {
func IsPrivateIP(ip_str string) bool {
ip := net.ParseIP(ip_str)
for _, priv := range privateBlocks {
if priv.Contains(ip) {
@ -266,6 +327,12 @@ func isLoopbackIP(ip_str string) bool {
return loopbackBlock.Contains(ip)
}
// hasPort reports whether s includes a port. s is expected to be of the form
// "host", "host:port", "[ipv6::address]" or "[ipv6::address]:port".
//
// An unbracketed IPv6 literal such as "::1" contains colons but cannot carry
// a port, so it is reported as having none.
func hasPort(s string) bool {
	// Bracketed IPv6 literal: a port is present only if a colon follows
	// the closing bracket.
	if strings.HasPrefix(s, "[") {
		return strings.LastIndex(s, ":") > strings.LastIndex(s, "]")
	}

	// Otherwise a single colon separates host from port. More than one
	// colon means an IPv6 address outside of brackets, which has no port.
	return strings.Count(s, ":") == 1
}
// compressPayload takes an opaque input buffer, compresses it
// and wraps it in a compress{} message that is encoded.
func compressPayload(inp []byte) (*bytes.Buffer, error) {

View file

@ -0,0 +1,180 @@
package coordinate
import (
"fmt"
"math"
"sort"
"sync"
"time"
)
// Client manages the estimated network coordinate for a given node, and adjusts
// it as the node observes round trip times and estimated coordinates from other
// nodes. The core algorithm is based on Vivaldi, see the documentation for Config
// for more details.
type Client struct {
	// coord is the current estimate of the client's network coordinate.
	coord *Coordinate
	// origin is a coordinate sitting at the origin.
	origin *Coordinate
	// config contains the tuning parameters that govern the performance of
	// the algorithm.
	config *Config
	// adjustmentIndex is the current index into the adjustmentSamples slice.
	adjustmentIndex uint
	// adjustmentSamples is used to store samples for the adjustment calculation.
	adjustmentSamples []float64
	// latencyFilterSamples is used to store the last several RTT samples,
	// keyed by node name. We will use the config's LatencyFilterSize
	// value to determine how many samples we keep, per node.
	latencyFilterSamples map[string][]float64
	// mutex enables safe concurrent access to the client.
	mutex sync.RWMutex
}
// NewClient builds a Client from config after checking that the configuration
// is usable. It returns an error when config.Dimensionality is zero.
//
// The new client starts with both its coordinate and its origin at
// NewCoordinate(config), an adjustment window of
// config.AdjustmentWindowSize samples, and no latency samples.
func NewClient(config *Config) (*Client, error) {
	if config.Dimensionality == 0 {
		return nil, fmt.Errorf("dimensionality must be >0")
	}

	client := &Client{
		config:               config,
		coord:                NewCoordinate(config),
		origin:               NewCoordinate(config),
		adjustmentSamples:    make([]float64, config.AdjustmentWindowSize),
		latencyFilterSamples: make(map[string][]float64),
	}
	return client, nil
}
// GetCoordinate returns an independent copy of this client's current
// coordinate, taken while holding the read lock.
func (c *Client) GetCoordinate() *Coordinate {
	c.mutex.RLock()
	defer c.mutex.RUnlock()

	snapshot := c.coord.Clone()
	return snapshot
}
// SetCoordinate overwrites the client's coordinate with a copy of coord,
// under the write lock. The caller's coordinate is not retained.
func (c *Client) SetCoordinate(coord *Coordinate) {
	c.mutex.Lock()
	defer c.mutex.Unlock()

	replacement := coord.Clone()
	c.coord = replacement
}
// ForgetNode removes any client state for the given node. The only per-node
// state held by the client is the node's latency filter samples; forgetting
// a node that has none is a no-op.
func (c *Client) ForgetNode(node string) {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	delete(c.latencyFilterSamples, node)
}
// latencyFilter records rttSeconds as the newest sample for node and returns
// the median of the samples currently retained for that node. At most
// config.LatencyFilterSize samples are kept; the oldest is dropped when the
// window overflows. The caller must already hold the mutex.
func (c *Client) latencyFilter(node string, rttSeconds float64) float64 {
	limit := int(c.config.LatencyFilterSize)

	window, seen := c.latencyFilterSamples[node]
	if !seen {
		window = make([]float64, 0, c.config.LatencyFilterSize)
	}

	// Push the new sample, evicting the oldest if the window is full.
	window = append(window, rttSeconds)
	if len(window) > limit {
		window = window[1:]
	}
	c.latencyFilterSamples[node] = window

	// Sort a copy so the stored samples keep their arrival order.
	ordered := make([]float64, len(window))
	copy(ordered, window)
	sort.Float64s(ordered)

	middle := len(ordered) / 2
	return ordered[middle]
}
// updateVivaldi updates the Vivaldi portion of the client's coordinate from
// one observation: other is the remote node's coordinate and rttSeconds the
// (filtered) round trip time to it. It refreshes the client's error estimate,
// capped at config.VivaldiErrorMax, and then moves the coordinate by applying
// a force relative to other. The caller must already hold the mutex.
func (c *Client) updateVivaldi(other *Coordinate, rttSeconds float64) {
	// floor keeps the divisions below away from zero.
	const floor = 1.0e-6

	estimated := c.coord.DistanceTo(other).Seconds()
	if rttSeconds < floor {
		rttSeconds = floor
	}
	relativeError := math.Abs(estimated-rttSeconds) / rttSeconds

	combined := c.coord.Error + other.Error
	if combined < floor {
		combined = floor
	}
	weight := c.coord.Error / combined

	// Blend the new relative error into the running error estimate.
	c.coord.Error = c.config.VivaldiCE*weight*relativeError + c.coord.Error*(1.0-c.config.VivaldiCE*weight)
	if c.coord.Error > c.config.VivaldiErrorMax {
		c.coord.Error = c.config.VivaldiErrorMax
	}

	// Move proportionally to how far off the estimate was.
	step := c.config.VivaldiCC * weight
	push := step * (rttSeconds - estimated)
	c.coord = c.coord.ApplyForce(c.config, push, other)
}
// updateAdjustment refreshes the adjustment term of the client's coordinate.
// It is a no-op when config.AdjustmentWindowSize is zero. Otherwise the
// difference between rttSeconds and the raw (unadjusted) distance to other is
// written into the circular sample window, and the adjustment becomes the sum
// of all samples divided by twice the window size. The caller must already
// hold the mutex.
func (c *Client) updateAdjustment(other *Coordinate, rttSeconds float64) {
	window := c.config.AdjustmentWindowSize
	if window == 0 {
		return
	}

	// Existing adjustment factors must not feed back into this
	// calculation, hence the raw distance.
	rawDist := c.coord.rawDistanceTo(other)
	c.adjustmentSamples[c.adjustmentIndex] = rttSeconds - rawDist
	c.adjustmentIndex = (c.adjustmentIndex + 1) % window

	var total float64
	for i := range c.adjustmentSamples {
		total += c.adjustmentSamples[i]
	}
	c.coord.Adjustment = total / (2.0 * float64(window))
}
// updateGravity pulls the client's coordinate towards the origin to combat
// drift. The force is the negated square of the distance from the origin
// divided by config.GravityRho. The caller must already hold the mutex.
func (c *Client) updateGravity() {
	fromOrigin := c.origin.DistanceTo(c.coord).Seconds()
	ratio := fromOrigin / c.config.GravityRho
	pull := -1.0 * math.Pow(ratio, 2.0)
	c.coord = c.coord.ApplyForce(c.config, pull, c.origin)
}
// Update folds one observation into the client's coordinate. node names the
// remote node, other is that node's coordinate, and rtt is the round trip
// time observed for a ping to it.
//
// The RTT is first passed through the per-node latency filter; the filtered
// value then drives the Vivaldi update and the adjustment update, after which
// gravity is applied. A copy of the updated coordinate is returned.
func (c *Client) Update(node string, other *Coordinate, rtt time.Duration) *Coordinate {
	c.mutex.Lock()
	defer c.mutex.Unlock()

	filtered := c.latencyFilter(node, rtt.Seconds())

	c.updateVivaldi(other, filtered)
	c.updateAdjustment(other, filtered)
	c.updateGravity()

	updated := c.coord.Clone()
	return updated
}
// DistanceTo returns the estimated round trip time between this client's
// coordinate and other, the coordinate of another node. The client's
// coordinate is read under the read lock.
func (c *Client) DistanceTo(other *Coordinate) time.Duration {
	c.mutex.RLock()
	defer c.mutex.RUnlock()

	estimate := c.coord.DistanceTo(other)
	return estimate
}

View file

@ -0,0 +1,70 @@
package coordinate
// Config is used to set the parameters of the Vivaldi-based coordinate mapping
// algorithm.
//
// The following references are called out at various points in the documentation
// here:
//
// [1] Dabek, Frank, et al. "Vivaldi: A decentralized network coordinate system."
// ACM SIGCOMM Computer Communication Review. Vol. 34. No. 4. ACM, 2004.
// [2] Ledlie, Jonathan, Paul Gardner, and Margo I. Seltzer. "Network Coordinates
// in the Wild." NSDI. Vol. 7. 2007.
// [3] Lee, Sanghwan, et al. "On suitability of Euclidean embedding for
// host-based network coordinate systems." Networking, IEEE/ACM Transactions
// on 18.1 (2010): 27-40.
type Config struct {
	// The dimensionality of the coordinate system. As discussed in [2], more
	// dimensions improves the accuracy of the estimates up to a point. Per [2]
	// we chose 4 dimensions plus a non-Euclidean height.
	Dimensionality uint
	// VivaldiErrorMax is the default error value when a node hasn't yet made
	// any observations. It also serves as an upper limit on the error value in
	// case observations cause the error value to increase without bound.
	VivaldiErrorMax float64
	// VivaldiCE is a tuning factor that controls the maximum impact an
	// observation can have on a node's confidence. See [1] for more details.
	VivaldiCE float64
	// VivaldiCC is a tuning factor that controls the maximum impact an
	// observation can have on a node's coordinate. See [1] for more details.
	VivaldiCC float64
	// AdjustmentWindowSize is a tuning factor that determines how many samples
	// we retain to calculate the adjustment factor as discussed in [3]. Setting
	// this to zero disables this feature.
	AdjustmentWindowSize uint
	// HeightMin is the minimum value of the height parameter. Since this
	// always must be positive, it will introduce a small amount of error, so
	// the chosen value should be relatively small compared to "normal"
	// coordinates.
	HeightMin float64
	// LatencyFilterSize is the maximum number of samples that are retained
	// per node, in order to compute a median. The intent is to ride out blips
	// but still keep the delay low, since our time to probe any given node is
	// pretty infrequent. See [2] for more details.
	LatencyFilterSize uint
	// GravityRho is a tuning factor that sets how much gravity has an effect
	// to try to re-center coordinates. See [2] for more details.
	GravityRho float64
}
// DefaultConfig returns a freshly allocated Config populated with default
// values suitable for basic testing of the algorithm. The values are not
// tuned to any particular type of cluster.
func DefaultConfig() *Config {
	cfg := new(Config)

	cfg.Dimensionality = 8
	cfg.VivaldiErrorMax = 1.5
	cfg.VivaldiCE = 0.25
	cfg.VivaldiCC = 0.25
	cfg.AdjustmentWindowSize = 20
	cfg.HeightMin = 10.0e-6
	cfg.LatencyFilterSize = 3
	cfg.GravityRho = 150.0

	return cfg
}

View file

@ -0,0 +1,183 @@
package coordinate
import (
"math"
"math/rand"
"time"
)
// Coordinate is a specialized structure for holding network coordinates for the
// Vivaldi-based coordinate mapping algorithm. All of the fields should be public
// to enable this to be serialized. All values in here are in units of seconds.
type Coordinate struct {
	// Vec is the Euclidean portion of the coordinate. This is used along
	// with the other fields to provide an overall distance estimate. The
	// units here are seconds.
	Vec []float64
	// Error reflects the confidence in the given coordinate and is updated
	// dynamically by the Vivaldi Client. This is dimensionless.
	Error float64
	// Adjustment is a distance offset computed based on a calculation over
	// observations from all other nodes over a fixed window and is updated
	// dynamically by the Vivaldi Client. The units here are seconds.
	Adjustment float64
	// Height is a distance offset that accounts for non-Euclidean effects
	// which model the access links from nodes to the core Internet. The access
	// links are usually set by bandwidth and congestion, and the core links
	// usually follow distance based on geography.
	Height float64
}
const (
	// secondsToNanoseconds is used to convert float seconds to nanoseconds
	// when building a time.Duration from a distance.
	secondsToNanoseconds = 1.0e9
	// zeroThreshold is used to decide if two coordinates are on top of each
	// other: a separation at or below this magnitude (in seconds) is treated
	// as zero.
	zeroThreshold = 1.0e-6
)
// DimensionalityConflictError is the value panicked with when an operation is
// attempted on coordinates whose dimensions are incompatible.
type DimensionalityConflictError struct{}

// Error implements the error interface.
func (DimensionalityConflictError) Error() string {
	const msg = "coordinate dimensionality does not match"
	return msg
}
// NewCoordinate returns a coordinate sitting at the origin. The vector has
// config.Dimensionality zeroed components, the error starts at
// config.VivaldiErrorMax, the adjustment at zero and the height at
// config.HeightMin.
func NewCoordinate(config *Config) *Coordinate {
	origin := new(Coordinate)
	origin.Vec = make([]float64, config.Dimensionality)
	origin.Error = config.VivaldiErrorMax
	origin.Adjustment = 0.0
	origin.Height = config.HeightMin
	return origin
}
// Clone returns an independent copy of this coordinate. The vector is copied
// into fresh storage so that later changes to either coordinate do not affect
// the other.
func (c *Coordinate) Clone() *Coordinate {
	// Copy the scalar fields, then detach the vector from the original.
	dup := *c
	dup.Vec = make([]float64, len(c.Vec))
	copy(dup.Vec, c.Vec)
	return &dup
}
// IsCompatibleWith reports whether the two coordinates have the same number
// of dimensions. When it returns true, operations combining the two
// coordinates will not hit a dimensionality mismatch at runtime.
func (c *Coordinate) IsCompatibleWith(other *Coordinate) bool {
	mine, theirs := len(c.Vec), len(other.Vec)
	return mine == theirs
}
// ApplyForce returns a new coordinate that results from applying force to c
// from the direction of other; c itself is not modified.
//
// The vector moves by force along the unit vector between the two positions.
// When the positions are further apart than zeroThreshold the height is
// adjusted as well and clamped to be at least config.HeightMin.
//
// Panics with DimensionalityConflictError if the coordinates are not
// dimensionally compatible.
func (c *Coordinate) ApplyForce(config *Config, force float64, other *Coordinate) *Coordinate {
	if !c.IsCompatibleWith(other) {
		panic(DimensionalityConflictError{})
	}

	moved := c.Clone()
	direction, separation := unitVectorAt(c.Vec, other.Vec)
	moved.Vec = add(moved.Vec, mul(direction, force))

	// Height is only adjusted when the two points are actually apart.
	if separation > zeroThreshold {
		height := (moved.Height+other.Height)*force/separation + moved.Height
		moved.Height = math.Max(height, config.HeightMin)
	}
	return moved
}
// DistanceTo returns the distance between this coordinate and other as a
// time.Duration. Both coordinates' adjustment terms are included whenever the
// adjusted distance is positive; otherwise the raw distance is used.
//
// Panics with DimensionalityConflictError if the coordinates are not
// dimensionally compatible.
func (c *Coordinate) DistanceTo(other *Coordinate) time.Duration {
	if !c.IsCompatibleWith(other) {
		panic(DimensionalityConflictError{})
	}

	seconds := c.rawDistanceTo(other)
	if adjusted := seconds + c.Adjustment + other.Adjustment; adjusted > 0.0 {
		seconds = adjusted
	}
	return time.Duration(seconds * secondsToNanoseconds)
}
// rawDistanceTo returns the Vivaldi distance in seconds between this
// coordinate and other: the Euclidean distance between the vectors plus both
// heights, with no adjustment terms. The dimensions are assumed to have been
// checked for compatibility already.
func (c *Coordinate) rawDistanceTo(other *Coordinate) float64 {
	euclidean := magnitude(diff(c.Vec, other.Vec))
	return euclidean + c.Height + other.Height
}
// add returns a new vector holding the element-wise sum of vec1 and vec2.
// Neither input is modified. The dimensions are assumed to have been checked
// for compatibility already.
func add(vec1 []float64, vec2 []float64) []float64 {
	sum := make([]float64, len(vec1))
	for i, v := range vec1 {
		sum[i] = v + vec2[i]
	}
	return sum
}
// diff returns a new vector holding vec1 minus vec2, element by element.
// Neither input is modified. The dimensions are assumed to have been checked
// for compatibility already.
func diff(vec1 []float64, vec2 []float64) []float64 {
	delta := make([]float64, len(vec1))
	for i, v := range vec1 {
		delta[i] = v - vec2[i]
	}
	return delta
}
// mul returns a new vector holding every component of vec multiplied by
// factor. The input is not modified.
func mul(vec []float64, factor float64) []float64 {
	scaled := make([]float64, len(vec))
	for i, v := range vec {
		scaled[i] = v * factor
	}
	return scaled
}
// magnitude returns the Euclidean length of vec: the square root of the sum
// of its squared components. An empty vector has magnitude zero.
func magnitude(vec []float64) float64 {
	squares := 0.0
	for _, v := range vec {
		squares += v * v
	}
	return math.Sqrt(squares)
}
// unitVectorAt returns a unit vector pointing at vec1 from vec2, together
// with the distance between the two points for use in the later height
// calculation.
//
// If the points are closer than zeroThreshold a random unit vector is
// returned instead and the reported distance is zero. Should the random
// vector itself be too short to normalize, a unit vector along the first
// dimension is used.
func unitVectorAt(vec1 []float64, vec2 []float64) ([]float64, float64) {
	delta := diff(vec1, vec2)

	// Normal case: the points are distinct, so normalize the difference.
	if dist := magnitude(delta); dist > zeroThreshold {
		return mul(delta, 1.0/dist), dist
	}

	// The points coincide; pick a random direction.
	for i := range delta {
		delta[i] = rand.Float64() - 0.5
	}
	if norm := magnitude(delta); norm > zeroThreshold {
		return mul(delta, 1.0/norm), 0.0
	}

	// Exceedingly rare: give up and point along the first dimension.
	fallback := make([]float64, len(delta))
	fallback[0] = 1.0
	return fallback, 0.0
}

View file

@ -0,0 +1,187 @@
package coordinate
import (
"fmt"
"math"
"math/rand"
"time"
)
// GenerateClients returns a slice of nodes clients, each created with
// NewClient from the given config. If any client cannot be created, the error
// is returned and no clients are.
func GenerateClients(nodes int, config *Config) ([]*Client, error) {
	clients := make([]*Client, nodes)
	for i := 0; i < nodes; i++ {
		c, err := NewClient(config)
		if err != nil {
			return nil, err
		}
		clients[i] = c
	}
	return clients, nil
}
// GenerateLine returns a symmetric truth matrix of round trip times as if all
// the nodes were laid out on a straight line with the given spacing between
// neighbors. The diagonal is left at zero.
func GenerateLine(nodes int, spacing time.Duration) [][]time.Duration {
	matrix := make([][]time.Duration, nodes)
	for row := range matrix {
		matrix[row] = make([]time.Duration, nodes)
	}

	for near := 0; near < nodes; near++ {
		for far := near + 1; far < nodes; far++ {
			// Nodes that are k positions apart are k spacings apart.
			hops := time.Duration(far - near)
			matrix[near][far] = hops * spacing
			matrix[far][near] = matrix[near][far]
		}
	}
	return matrix
}
// GenerateGrid returns a symmetric truth matrix of round trip times as if the
// nodes were placed row by row on a two dimensional grid with the given
// spacing between adjacent cells. The grid width is the integer part of the
// square root of the node count, and the RTT between two nodes is their
// straight-line grid distance times the spacing.
func GenerateGrid(nodes int, spacing time.Duration) [][]time.Duration {
	matrix := make([][]time.Duration, nodes)
	for row := range matrix {
		matrix[row] = make([]time.Duration, nodes)
	}

	width := int(math.Sqrt(float64(nodes)))
	for a := 0; a < nodes; a++ {
		ax, ay := float64(a%width), float64(a/width)
		for b := a + 1; b < nodes; b++ {
			bx, by := float64(b%width), float64(b/width)
			dx, dy := bx-ax, by-ay
			cells := math.Sqrt(dx*dx + dy*dy)
			matrix[a][b] = time.Duration(cells * float64(spacing))
			matrix[b][a] = matrix[a][b]
		}
	}
	return matrix
}
// GenerateSplit returns a symmetric truth matrix of round trip times as if the
// nodes were clustered in two locations. Every pair of distinct nodes is lan
// apart; pairs that straddle the split point (index nodes/2, which itself
// belongs to the first group) are additionally separated by wan.
func GenerateSplit(nodes int, lan time.Duration, wan time.Duration) [][]time.Duration {
	matrix := make([][]time.Duration, nodes)
	for row := range matrix {
		matrix[row] = make([]time.Duration, nodes)
	}

	boundary := nodes / 2
	for a := 0; a < nodes; a++ {
		for b := a + 1; b < nodes; b++ {
			delay := lan
			// The pair crosses the WAN link exactly when the two nodes
			// fall on different sides of the boundary.
			if (a <= boundary) != (b <= boundary) {
				delay += wan
			}
			matrix[a][b] = delay
			matrix[b][a] = delay
		}
	}
	return matrix
}
// GenerateCircle returns a symmetric truth matrix of round trip times for
// nodes spread evenly around a circle of the given radius. Node 0 is special:
// it is treated as sitting at the "center", equidistant from every other
// node, but at double the radius so that it should end up above the other
// nodes in height.
func GenerateCircle(nodes int, radius time.Duration) [][]time.Duration {
	matrix := make([][]time.Duration, nodes)
	for row := range matrix {
		matrix[row] = make([]time.Duration, nodes)
	}

	// onUnitCircle gives the position of node k on the unit circle.
	onUnitCircle := func(k int) (float64, float64) {
		angle := 2.0 * math.Pi * float64(k) / float64(nodes)
		return math.Cos(angle), math.Sin(angle)
	}

	for a := 0; a < nodes; a++ {
		for b := a + 1; b < nodes; b++ {
			var delay time.Duration
			switch a {
			case 0:
				delay = 2 * radius
			default:
				ax, ay := onUnitCircle(a)
				bx, by := onUnitCircle(b)
				dx, dy := bx-ax, by-ay
				chord := math.Sqrt(dx*dx + dy*dy)
				delay = time.Duration(chord * float64(radius))
			}
			matrix[a][b] = delay
			matrix[b][a] = delay
		}
	}
	return matrix
}
// GenerateRandom returns a symmetric truth matrix of round trip times drawn
// from a normal distribution with the given mean and deviation. The global
// RNG is re-seeded with a fixed value first, so a given size always yields
// the same matrix.
func GenerateRandom(nodes int, mean time.Duration, deviation time.Duration) [][]time.Duration {
	rand.Seed(1)

	matrix := make([][]time.Duration, nodes)
	for row := range matrix {
		matrix[row] = make([]time.Duration, nodes)
	}

	meanSec, devSec := mean.Seconds(), deviation.Seconds()
	for a := 0; a < nodes; a++ {
		for b := a + 1; b < nodes; b++ {
			sampleSec := rand.NormFloat64()*devSec + meanSec
			delay := time.Duration(sampleSec * secondsToNanoseconds)
			matrix[a][b] = delay
			matrix[b][a] = delay
		}
	}
	return matrix
}
// Simulate drives the given clients for the requested number of cycles. In
// every cycle each client picks one random node; unless it picked itself, it
// observes the truth RTT to that node and feeds the node's current coordinate
// and that RTT into its own estimate. The global RNG is re-seeded at the
// start of each run so results are deterministic, both for the peer selection
// here and for the underlying algorithm, which uses random numbers for
// position vectors when everything starts at the origin.
func Simulate(clients []*Client, truth [][]time.Duration, cycles int) {
	rand.Seed(1)

	total := len(clients)
	for round := 0; round < cycles; round++ {
		for self, observer := range clients {
			peer := rand.Intn(total)
			if peer == self {
				// Picked ourselves; nothing to observe this cycle.
				continue
			}

			peerCoord := clients[peer].GetCoordinate()
			observed := truth[self][peer]
			peerName := fmt.Sprintf("node_%d", peer)
			observer.Update(peerName, peerCoord, observed)
		}
	}
}
// Stats is returned from the Evaluate function with a summary of the algorithm
// performance. Both fields are relative errors, computed by Evaluate as
// |estimated - actual| / actual with distances expressed in seconds.
type Stats struct {
// ErrorMax is the largest relative error Evaluate saw over all node pairs.
ErrorMax float64
// ErrorAvg is the mean relative error Evaluate saw over all node pairs.
ErrorAvg float64
}
// Evaluate uses the coordinates of the given clients to calculate estimated
// distances and compares them with the given truth matrix, returning summary
// stats. For every unordered pair of clients the relative error
// |estimated - actual| / actual is computed (in seconds); ErrorMax is the
// largest of these and ErrorAvg their mean. The summary is also printed to
// standard output.
//
// With fewer than two clients there are no pairs to compare, and both fields
// are reported as zero rather than dividing by a zero pair count.
func Evaluate(clients []*Client, truth [][]time.Duration) (stats Stats) {
	nodes := len(clients)
	count := 0
	for i := 0; i < nodes; i++ {
		for j := i + 1; j < nodes; j++ {
			est := clients[i].DistanceTo(clients[j].GetCoordinate()).Seconds()
			actual := truth[i][j].Seconds()
			// Named relErr so the builtin error type is not shadowed.
			relErr := math.Abs(est-actual) / actual
			stats.ErrorMax = math.Max(stats.ErrorMax, relErr)
			stats.ErrorAvg += relErr
			count++
		}
	}

	// Only average when at least one pair was evaluated; 0/0 would
	// otherwise turn ErrorAvg into NaN.
	if count > 0 {
		stats.ErrorAvg /= float64(count)
	}

	fmt.Printf("Error avg=%9.6f max=%9.6f\n", stats.ErrorAvg, stats.ErrorMax)
	return stats
}

View file

@ -149,6 +149,14 @@ type Config struct {
//
QueryTimeoutMult int
// QueryResponseSizeLimit and QuerySizeLimit limit the inbound and
// outbound payload sizes for queries, respectively. These must fit
// in a UDP packet with some additional overhead, so tuning these
// past the default values of 1024 will depend on your network
// configuration.
QueryResponseSizeLimit int
QuerySizeLimit int
// MemberlistConfig is the memberlist configuration that Serf will
// use to do the underlying membership management and gossip. Some
// fields in the MemberlistConfig will be overwritten by Serf no
@ -189,6 +197,12 @@ type Config struct {
// node stays while the other node will leave the cluster and exit.
EnableNameConflictResolution bool
// DisableCoordinates controls if Serf will maintain an estimate of this
// node's network coordinate internally. A network coordinate is useful
// for estimating the network distance (i.e. round trip time) between
// two nodes. Enabling this option adds some overhead to ping messages.
DisableCoordinates bool
// KeyringFile provides the location of a writable file where Serf can
// persist changes to the encryption keyring.
KeyringFile string
@ -229,6 +243,9 @@ func DefaultConfig() *Config {
TombstoneTimeout: 24 * time.Hour,
MemberlistConfig: memberlist.DefaultLANConfig(),
QueryTimeoutMult: 16,
QueryResponseSizeLimit: 1024,
QuerySizeLimit: 1024,
EnableNameConflictResolution: true,
DisableCoordinates: false,
}
}

View file

@ -2,6 +2,7 @@ package serf
import (
"fmt"
"github.com/armon/go-metrics"
)
@ -170,6 +171,12 @@ func (d *delegate) LocalState(join bool) []byte {
}
func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) {
// Ensure we have a message
if len(buf) == 0 {
d.serf.logger.Printf("[ERR] serf: Remote state is zero bytes")
return
}
// Check the message type
if messageType(buf[0]) != messagePushPullType {
d.serf.logger.Printf("[ERR] serf: Remote state has bad type prefix: %v", buf[0])

View file

@ -152,8 +152,8 @@ func (q *Query) Respond(buf []byte) error {
}
// Check the size limit
if len(raw) > QueryResponseSizeLimit {
return fmt.Errorf("response exceeds limit of %d bytes", QueryResponseSizeLimit)
if len(raw) > q.serf.config.QueryResponseSizeLimit {
return fmt.Errorf("response exceeds limit of %d bytes", q.serf.config.QueryResponseSizeLimit)
}
// Send the response

View file

@ -7,29 +7,38 @@ import (
)
type MergeDelegate interface {
NotifyMerge([]*Member) (cancel bool)
NotifyMerge([]*Member) error
}
type mergeDelegate struct {
serf *Serf
}
func (m *mergeDelegate) NotifyMerge(nodes []*memberlist.Node) (cancel bool) {
func (m *mergeDelegate) NotifyMerge(nodes []*memberlist.Node) error {
members := make([]*Member, len(nodes))
for idx, n := range nodes {
members[idx] = &Member{
Name: n.Name,
Addr: net.IP(n.Addr),
Port: n.Port,
Tags: m.serf.decodeTags(n.Meta),
Status: StatusNone,
ProtocolMin: n.PMin,
ProtocolMax: n.PMax,
ProtocolCur: n.PCur,
DelegateMin: n.DMin,
DelegateMax: n.DMax,
DelegateCur: n.DCur,
}
members[idx] = m.nodeToMember(n)
}
return m.serf.config.Merge.NotifyMerge(members)
}
// NotifyAlive converts the given memberlist node into a Member and passes it,
// as a single-element list, to the user-configured merge delegate. Whatever
// error that delegate's NotifyMerge returns is returned unchanged.
func (m *mergeDelegate) NotifyAlive(peer *memberlist.Node) error {
	single := []*Member{m.nodeToMember(peer)}
	return m.serf.config.Merge.NotifyMerge(single)
}
// nodeToMember builds a Member from a memberlist node: identity, address and
// protocol/delegate version fields are copied across, the node metadata is
// decoded into tags, and the status is set to StatusNone.
func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member {
	member := new(Member)

	// Identity and addressing.
	member.Name = n.Name
	member.Addr = net.IP(n.Addr)
	member.Port = n.Port
	member.Tags = m.serf.decodeTags(n.Meta)
	member.Status = StatusNone

	// Protocol and delegate version ranges advertised by the node.
	member.ProtocolMin = n.PMin
	member.ProtocolMax = n.PMax
	member.ProtocolCur = n.PCur
	member.DelegateMin = n.DMin
	member.DelegateMax = n.DMax
	member.DelegateCur = n.DCur

	return member
}

View file

@ -0,0 +1,89 @@
package serf
import (
"bytes"
"log"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-msgpack/codec"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/serf/coordinate"
)
// pingDelegate is notified when memberlist successfully completes a direct ping
// of a peer node. We use this to update our estimated network coordinate, as
// well as cache the coordinate of the peer.
type pingDelegate struct {
// serf is the Serf instance whose coordinate client and coordinate cache
// this delegate reads and updates.
serf *Serf
}
const (
// PingVersion is an internal version for the ping message, above the normal
// versioning we get from the protocol version. This enables small updates
// to the ping message without a full protocol bump. AckPayload writes it as
// the first byte of the payload, and NotifyPingComplete rejects payloads
// whose first byte differs from it.
PingVersion = 1
)
// AckPayload produces the payload sent back in response to a ping request:
// a one-byte header holding PingVersion followed by the msgpack-encoded
// local coordinate. If encoding fails the error is logged and whatever was
// written so far is still returned.
func (p *pingDelegate) AckPayload() []byte {
	var payload bytes.Buffer

	// Simple header: a single version byte.
	payload.WriteByte(PingVersion)

	// Body: the serialized coordinate of this node.
	local := p.serf.coordClient.GetCoordinate()
	encoder := codec.NewEncoder(&payload, &codec.MsgpackHandle{})
	if err := encoder.Encode(local); err != nil {
		log.Printf("[ERR] serf: Failed to encode coordinate: %v\n", err)
	}
	return payload.Bytes()
}
// NotifyPingComplete is called when this node successfully completes a direct
// ping of a peer node. The payload is expected to be a one-byte PingVersion
// header followed by the peer's msgpack-encoded coordinate. On success the
// local coordinate estimate is updated from the peer's coordinate and the
// measured rtt, an adjustment metric is published, and both the peer's and
// our own coordinates are stored in the coordinate cache.
//
// Empty payloads are ignored silently. Payloads with an unsupported version,
// an undecodable coordinate, or a coordinate incompatible with ours are
// logged and dropped without touching any state.
func (p *pingDelegate) NotifyPingComplete(other *memberlist.Node, rtt time.Duration, payload []byte) {
	// len of a nil slice is zero, so this covers both nil and empty.
	if len(payload) == 0 {
		return
	}

	// Verify ping version in the header.
	version := payload[0]
	if version != PingVersion {
		log.Printf("[ERR] serf: Unsupported ping version: %v", version)
		return
	}

	// Process the remainder of the message as a coordinate.
	r := bytes.NewReader(payload[1:])
	dec := codec.NewDecoder(r, &codec.MsgpackHandle{})
	var coord coordinate.Coordinate
	if err := dec.Decode(&coord); err != nil {
		log.Printf("[ERR] serf: Failed to decode coordinate from ping: %v", err)
		// Bail out: coord may be zero-valued or only partially filled in,
		// so it must not be fed into the coordinate client or the cache.
		return
	}

	// Since this is a coordinate coming from some place else we harden this
	// and look for dimensionality problems proactively.
	before := p.serf.coordClient.GetCoordinate()
	if !before.IsCompatibleWith(&coord) {
		log.Printf("[ERR] serf: Rejected bad coordinate: %v\n", coord)
		return
	}

	// Apply the update.
	after := p.serf.coordClient.Update(other.Name, &coord, rtt)

	// Publish some metrics to give us an idea of how much we are
	// adjusting each time we update.
	d := float32(before.DistanceTo(after).Seconds() * 1.0e3)
	metrics.AddSample([]string{"serf", "coordinate", "adjustment-ms"}, d)

	// Cache the coordinate for the other node, and add our own
	// to the cache as well since it just got updated. This lets
	// users call GetCachedCoordinate with our node name, which is
	// more friendly.
	p.serf.coordCacheLock.Lock()
	p.serf.coordCache[other.Name] = &coord
	p.serf.coordCache[p.serf.config.NodeName] = p.serf.coordClient.GetCoordinate()
	p.serf.coordCacheLock.Unlock()
}

View file

@ -17,6 +17,7 @@ import (
"github.com/armon/go-metrics"
"github.com/hashicorp/go-msgpack/codec"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/serf/coordinate"
)
// These are the protocol versions that Serf can _understand_. These are
@ -91,6 +92,10 @@ type Serf struct {
snapshotter *Snapshotter
keyManager *KeyManager
coordClient *coordinate.Client
coordCache map[string]*coordinate.Coordinate
coordCacheLock sync.RWMutex
}
// SerfState is the state of the Serf instance.
@ -209,10 +214,8 @@ type queries struct {
}
const (
UserEventSizeLimit = 512 // Maximum byte size for event name and payload
QuerySizeLimit = 1024 // Maximum byte size for query
QueryResponseSizeLimit = 1024 // Maximum bytes size for response
snapshotSizeLimit = 128 * 1024 // Maximum 128 KB snapshot
UserEventSizeLimit = 512 // Maximum byte size for event name and payload
snapshotSizeLimit = 128 * 1024 // Maximum 128 KB snapshot
)
// Create creates a new Serf instance, starting all the background tasks
@ -274,15 +277,25 @@ func Create(conf *Config) (*Serf, error) {
}
conf.EventCh = outCh
// Set up network coordinate client.
if !conf.DisableCoordinates {
serf.coordClient, err = coordinate.NewClient(coordinate.DefaultConfig())
if err != nil {
return nil, fmt.Errorf("Failed to create coordinate client: %v", err)
}
}
// Try access the snapshot
var oldClock, oldEventClock, oldQueryClock LamportTime
var prev []*PreviousNode
if conf.SnapshotPath != "" {
eventCh, snap, err := NewSnapshotter(conf.SnapshotPath,
eventCh, snap, err := NewSnapshotter(
conf.SnapshotPath,
snapshotSizeLimit,
conf.RejoinAfterLeave,
serf.logger,
&serf.clock,
serf.coordClient,
conf.EventCh,
serf.shutdownCh)
if err != nil {
@ -298,6 +311,13 @@ func Create(conf *Config) (*Serf, error) {
serf.queryMinTime = oldQueryClock + 1
}
// Set up the coordinate cache. We do this after we read the snapshot to
// make sure we get a good initial value from there, if we got one.
if !conf.DisableCoordinates {
serf.coordCache = make(map[string]*coordinate.Coordinate)
serf.coordCache[conf.NodeName] = serf.coordClient.GetCoordinate()
}
// Setup the various broadcast queues, which we use to send our own
// custom broadcasts along the gossip channel.
serf.broadcasts = &memberlist.TransmitLimitedQueue{
@ -347,17 +367,22 @@ func Create(conf *Config) (*Serf, error) {
conf.MemberlistConfig.DelegateProtocolMax = ProtocolVersionMax
conf.MemberlistConfig.Name = conf.NodeName
conf.MemberlistConfig.ProtocolVersion = ProtocolVersionMap[conf.ProtocolVersion]
if !conf.DisableCoordinates {
conf.MemberlistConfig.Ping = &pingDelegate{serf: serf}
}
// Setup a merge delegate if necessary
if conf.Merge != nil {
conf.MemberlistConfig.Merge = &mergeDelegate{serf: serf}
md := &mergeDelegate{serf: serf}
conf.MemberlistConfig.Merge = md
conf.MemberlistConfig.Alive = md
}
// Create the underlying memberlist that will manage membership
// and failure detection for the Serf instance.
memberlist, err := memberlist.Create(conf.MemberlistConfig)
if err != nil {
return nil, err
return nil, fmt.Errorf("Failed to create memberlist: %v", err)
}
serf.memberlist = memberlist
@ -486,8 +511,8 @@ func (s *Serf) Query(name string, payload []byte, params *QueryParam) (*QueryRes
}
// Check the size
if len(raw) > QuerySizeLimit {
return nil, fmt.Errorf("query exceeds limit of %d bytes", QuerySizeLimit)
if len(raw) > s.config.QuerySizeLimit {
return nil, fmt.Errorf("query exceeds limit of %d bytes", s.config.QuerySizeLimit)
}
// Register QueryResponse to track acks and responses
@ -950,6 +975,19 @@ func (s *Serf) handleNodeUpdate(n *memberlist.Node) {
member.Port = n.Port
member.Tags = s.decodeTags(n.Meta)
// Snag the latest versions. NOTE - the current memberlist code will NOT
// fire an update event if the metadata (for Serf, tags) stays the same
// and only the protocol versions change. If we make any Serf-level
// protocol changes where we want to get this event under those
// circumstances, we will need to update memberlist to do a check of
// versions as well as the metadata.
member.ProtocolMin = n.PMin
member.ProtocolMax = n.PMax
member.ProtocolCur = n.PCur
member.DelegateMin = n.DMin
member.DelegateMax = n.DMax
member.DelegateCur = n.DCur
// Update some metrics
metrics.IncrCounter([]string{"serf", "member", "update"}, 1)
@ -1016,6 +1054,17 @@ func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool {
s.failedMembers = removeOldMember(s.failedMembers, member.Name)
s.leftMembers = append(s.leftMembers, member)
// We must push a message indicating the node has now
// left to allow higher-level applications to handle the
// graceful leave.
s.logger.Printf("[INFO] serf: EventMemberLeave (forced): %s %s",
member.Member.Name, member.Member.Addr)
if s.config.EventCh != nil {
s.config.EventCh <- MemberEvent{
Type: EventMemberLeave,
Members: []Member{member.Member},
}
}
return true
default:
return false
@ -1384,6 +1433,16 @@ func (s *Serf) reap(old []*memberState, timeout time.Duration) []*memberState {
// Delete from members
delete(s.members, m.Name)
// Tell the coordinate client the node has gone away and delete
// its cached coordinates.
if !s.config.DisableCoordinates {
s.coordClient.ForgetNode(m.Name)
s.coordCacheLock.Lock()
delete(s.coordCache, m.Name)
s.coordCacheLock.Unlock()
}
// Send an event along
s.logger.Printf("[INFO] serf: EventMemberReap: %s", m.Name)
if s.config.EventCh != nil {
@ -1596,3 +1655,38 @@ func (s *Serf) writeKeyringFile() error {
// Success!
return nil
}
// GetCoordinate returns the network coordinate of the local node. An error is
// returned instead when coordinates are disabled in the configuration.
func (s *Serf) GetCoordinate() (*coordinate.Coordinate, error) {
	if s.config.DisableCoordinates {
		return nil, fmt.Errorf("Coordinates are disabled")
	}
	return s.coordClient.GetCoordinate(), nil
}
// GetCachedCoordinate looks up the cached network coordinate for the node
// with the given name. ok reports whether an entry was found; it is always
// false when DisableCoordinates is set. The cache is read under the read
// lock.
func (s *Serf) GetCachedCoordinate(name string) (coord *coordinate.Coordinate, ok bool) {
	if s.config.DisableCoordinates {
		return nil, false
	}

	s.coordCacheLock.RLock()
	defer s.coordCacheLock.RUnlock()

	cached, found := s.coordCache[name]
	if !found {
		return nil, false
	}
	return cached, true
}
// NumNodes reports how many nodes are present in the member table of the
// serf cluster, regardless of their health or status. The table is read
// under the member read lock.
func (s *Serf) NumNodes() (numNodes int) {
	s.memberLock.RLock()
	defer s.memberLock.RUnlock()
	return len(s.members)
}

View file

@ -2,6 +2,7 @@ package serf
import (
"bufio"
"encoding/json"
"fmt"
"log"
"math/rand"
@ -12,6 +13,7 @@ import (
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/serf/coordinate"
)
/*
@ -27,6 +29,7 @@ old events.
const flushInterval = 500 * time.Millisecond
const clockUpdateInterval = 500 * time.Millisecond
const coordinateUpdateInterval = 60 * time.Second
const tmpExt = ".compact"
// Snapshotter is responsible for ingesting events and persisting
@ -34,6 +37,7 @@ const tmpExt = ".compact"
type Snapshotter struct {
aliveNodes map[string]string
clock *LamportClock
coordClient *coordinate.Client
fh *os.File
buffered *bufio.Writer
inCh <-chan Event
@ -74,6 +78,7 @@ func NewSnapshotter(path string,
rejoinAfterLeave bool,
logger *log.Logger,
clock *LamportClock,
coordClient *coordinate.Client,
outCh chan<- Event,
shutdownCh <-chan struct{}) (chan<- Event, *Snapshotter, error) {
inCh := make(chan Event, 1024)
@ -96,6 +101,7 @@ func NewSnapshotter(path string,
snap := &Snapshotter{
aliveNodes: make(map[string]string),
clock: clock,
coordClient: coordClient,
fh: fh,
buffered: bufio.NewWriter(fh),
inCh: inCh,
@ -171,6 +177,12 @@ func (s *Snapshotter) Leave() {
// stream is a long running routine that is used to handle events
func (s *Snapshotter) stream() {
clockTicker := time.NewTicker(clockUpdateInterval)
defer clockTicker.Stop()
coordinateTicker := time.NewTicker(coordinateUpdateInterval)
defer coordinateTicker.Stop()
for {
select {
case <-s.leaveCh:
@ -209,9 +221,12 @@ func (s *Snapshotter) stream() {
s.logger.Printf("[ERR] serf: Unknown event to snapshot: %#v", e)
}
case <-time.After(clockUpdateInterval):
case <-clockTicker.C:
s.updateClock()
case <-coordinateTicker.C:
s.updateCoordinate()
case <-s.shutdownCh:
if err := s.buffered.Flush(); err != nil {
s.logger.Printf("[ERR] serf: failed to flush snapshot: %v", err)
@ -258,6 +273,20 @@ func (s *Snapshotter) updateClock() {
}
}
// updateCoordinate is called periodically to append the current local
// coordinate, JSON-encoded, to the snapshot as a "coordinate: " line. When
// coordinates aren't enabled (nil client) it is a no-op, and an encoding
// failure is logged rather than written.
func (s *Snapshotter) updateCoordinate() {
	if s.coordClient == nil {
		return
	}

	encoded, err := json.Marshal(s.coordClient.GetCoordinate())
	if err != nil {
		s.logger.Printf("[ERR] serf: Failed to encode coordinate: %v", err)
		return
	}
	s.tryAppend(fmt.Sprintf("coordinate: %s\n", encoded))
}
// processUserEvent is used to handle a single user event
func (s *Snapshotter) processUserEvent(e UserEvent) {
// Ignore old clocks
@ -362,6 +391,23 @@ func (s *Snapshotter) compact() error {
}
offset += int64(n)
// Write out the coordinate.
if s.coordClient != nil {
encoded, err := json.Marshal(s.coordClient.GetCoordinate())
if err != nil {
fh.Close()
return err
}
line = fmt.Sprintf("coordinate: %s\n", encoded)
n, err = buf.WriteString(line)
if err != nil {
fh.Close()
return err
}
offset += int64(n)
}
// Flush the new snapshot
err = buf.Flush()
fh.Close()
@ -473,6 +519,20 @@ func (s *Snapshotter) replay() error {
}
s.lastQueryClock = LamportTime(timeInt)
} else if strings.HasPrefix(line, "coordinate: ") {
if s.coordClient == nil {
s.logger.Printf("[WARN] serf: Ignoring snapshot coordinates since they are disabled")
continue
}
coordStr := strings.TrimPrefix(line, "coordinate: ")
var coord coordinate.Coordinate
err := json.Unmarshal([]byte(coordStr), &coord)
if err != nil {
s.logger.Printf("[WARN] serf: Failed to decode coordinate: %v", err)
continue
}
s.coordClient.SetCoordinate(&coord)
} else if line == "leave" {
// Ignore a leave if we plan on re-joining
if s.rejoinAfterLeave {