From 2f6921cbba0778d0ea7b73c84fd9ce7b325261e8 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Fri, 8 Dec 2017 11:04:44 -0800 Subject: [PATCH] Diagnostic client - the client allows to talk to the diagnostic server and decode the internal values of the overlay and service discovery - the tool also allows to remediate in case of orphans entries - added README Signed-off-by: Flavio Crisciani --- libnetwork/Makefile | 1 + libnetwork/cmd/diagnostic/Dockerfile.client | 4 + libnetwork/cmd/diagnostic/Dockerfile.dind | 4 + libnetwork/cmd/diagnostic/README.md | 252 ++++++++++++++++++++ libnetwork/cmd/diagnostic/daemon.json | 4 + libnetwork/cmd/diagnostic/main.go | 191 +++++++++++++++ 6 files changed, 456 insertions(+) create mode 100644 libnetwork/cmd/diagnostic/Dockerfile.client create mode 100644 libnetwork/cmd/diagnostic/Dockerfile.dind create mode 100644 libnetwork/cmd/diagnostic/README.md create mode 100644 libnetwork/cmd/diagnostic/daemon.json create mode 100644 libnetwork/cmd/diagnostic/main.go diff --git a/libnetwork/Makefile b/libnetwork/Makefile index e477b81702..17060bc212 100644 --- a/libnetwork/Makefile +++ b/libnetwork/Makefile @@ -28,6 +28,7 @@ build-local: @mkdir -p "bin" go build -tags experimental -o "bin/dnet" ./cmd/dnet go build -o "bin/docker-proxy" ./cmd/proxy + GOOS=linux go build -o "./cmd/diagnostic/diagnosticClient" ./cmd/diagnostic clean: @echo "🐳 $@" diff --git a/libnetwork/cmd/diagnostic/Dockerfile.client b/libnetwork/cmd/diagnostic/Dockerfile.client new file mode 100644 index 0000000000..ee8771517a --- /dev/null +++ b/libnetwork/cmd/diagnostic/Dockerfile.client @@ -0,0 +1,4 @@ +FROM alpine +RUN apk add --no-cache curl +COPY diagnosticClient /usr/local/bin/diagnosticClient +ENTRYPOINT ["/usr/local/bin/diagnosticClient"] diff --git a/libnetwork/cmd/diagnostic/Dockerfile.dind b/libnetwork/cmd/diagnostic/Dockerfile.dind new file mode 100644 index 0000000000..fa66272168 --- /dev/null +++ b/libnetwork/cmd/diagnostic/Dockerfile.dind @@ -0,0 +1,4 
@@ +FROM docker:17.12-dind +RUN apk add --no-cache curl +COPY daemon.json /etc/docker/daemon.json +COPY diagnosticClient /usr/local/bin/diagnosticClient diff --git a/libnetwork/cmd/diagnostic/README.md b/libnetwork/cmd/diagnostic/README.md new file mode 100644 index 0000000000..4c6ce6c35f --- /dev/null +++ b/libnetwork/cmd/diagnostic/README.md @@ -0,0 +1,252 @@ +--- +description: Learn to use the built-in network debugger to debug overlay networking problems +keywords: network, troubleshooting, debug +title: Debug overlay or swarm networking issues +--- + +**WARNING** +This tool can change the internal state of the libnetwork API. Be very mindful +of its use and carefully read the following guide. Improper use of it will damage +or permanently destroy the network configuration. + + +Docker CE 17.12 and higher introduce a network debugging tool designed to help +debug issues with overlay networks and swarm services running on Linux hosts. +When enabled, a network diagnostic server listens on the specified port and +provides diagnostic information. The network debugging tool should only be +started to debug specific issues, and should not be left running all the time. + +Information about networks is stored in the database, which can be examined using +the API. Currently the database contains information about the overlay network +as well as the service discovery data. + +The Docker API exposes endpoints to query and control the network debugging +tool. CLI integration is provided as a preview, but the implementation is not +yet considered stable and commands and options may change without notice. + +The tool is available in two forms: +1) client only: dockereng/network-diagnostic:onlyclient +2) docker in docker version: dockereng/network-diagnostic:17.12-dind +The latter allows the tool to be used with a cluster running an engine older than 17.12. + +## Enable the diagnostic server + +The tool currently only works on Docker hosts running on Linux. 
To enable it on a node +follow the steps below. + +1. Set the `network-diagnostic-port` to a port which is free on the Docker + host, in the `/etc/docker/daemon.json` configuration file. + + ```json + "network-diagnostic-port": <port> + ``` + +2. Get the process ID (PID) of the `dockerd` process. It is the second field in + the output, and is typically a number from 2 to 6 digits long. + + ```bash + $ ps aux |grep dockerd | grep -v grep + ``` + +3. Reload the Docker configuration without restarting Docker, by sending the + `HUP` signal to the PID you found in the previous step. + + ```bash + kill -HUP <pid-of-dockerd> + ``` + +If systemd is used, the command `systemctl reload docker` is enough. + + +A message like the following will appear in the Docker host logs: + +```none +Starting the diagnostic server listening on <port> for commands +``` + +## Disable the diagnostic tool + +Repeat these steps for each node participating in the swarm. + +1. Remove the `network-diagnostic-port` key from the `/etc/docker/daemon.json` + configuration file. + +2. Get the process ID (PID) of the `dockerd` process. It is the second field in + the output, and is typically a number from 2 to 6 digits long. + + ```bash + $ ps aux |grep dockerd | grep -v grep + ``` + +3. Reload the Docker configuration without restarting Docker, by sending the + `HUP` signal to the PID you found in the previous step. + + ```bash + kill -HUP <pid-of-dockerd> + ``` + +A message like the following will appear in the Docker host logs: + +```none +Disabling the diagnostic server +``` + +## Access the diagnostic tool's API + +The network diagnostic tool exposes its own RESTful API. To access the API, +send an HTTP request to the port where the tool is listening. The following +commands assume the tool is listening on port 2000. + +Examples are not given for every endpoint. 
+ +### Get help + +```bash +$ curl localhost:2000/help + +OK +/updateentry +/getentry +/gettable +/leavenetwork +/createentry +/help +/clusterpeers +/ready +/joinnetwork +/deleteentry +/networkpeers +/ +/join +``` + +### Join or leave the network database cluster + +```bash +$ curl localhost:2000/join?members=ip1,ip2,... +``` + +```bash +$ curl localhost:2000/leave?members=ip1,ip2,... +``` + +`ip1`, `ip2`, ... are the swarm node IPs (usually one is enough). + +### Join or leave a network + +```bash +$ curl localhost:2000/joinnetwork?nid=<network id> +``` + +```bash +$ curl localhost:2000/leavenetwork?nid=<network id> +``` + +`network id` can be retrieved on the manager with `docker network ls --no-trunc` and has +to be the full length identifier. + +### List cluster peers + +```bash +$ curl localhost:2000/clusterpeers +``` + +### List nodes connected to a given network + +```bash +$ curl localhost:2000/networkpeers?nid=<network id> +``` +`network id` can be retrieved on the manager with `docker network ls --no-trunc` and has +to be the full length identifier. + +### Dump database tables + +The tables are called `endpoint_table` and `overlay_peer_table`. +The `overlay_peer_table` contains all the overlay forwarding information. +The `endpoint_table` contains all the service discovery information. + +```bash +$ curl localhost:2000/gettable?nid=<network id>&tname=<table name> +``` + +### Interact with a specific database table + +The tables are called `endpoint_table` and `overlay_peer_table`. + +```bash +$ curl localhost:2000/<method>?nid=<network id>&tname=<table name>
&key=<key>[&value=<value>] +``` + +Note: +Operations on tables have node ownership: entries remain persistent as long as +the node that inserted them is part of the cluster. + +## Access the diagnostic tool's CLI + +The CLI is provided as a preview and is not yet stable. Commands or options may +change at any time. + +The CLI executable is called `diagnosticClient` and is made available using a +standalone container. + +`docker run --net host dockereng/network-diagnostic:onlyclient -v -net <full network id> -t sd` + +The following flags are supported: + +| Flag | Description | +|---------------|-------------------------------------------------| +| -t | The table to process, one of `sd` or `overlay`. | +| -ip | The IP address to query. Defaults to 127.0.0.1. | +| -net | The target network ID. | +| -port | The target port. (default port is 2000) | +| -r | Perform remediation, deleting orphan entries. | +| -v | Enable verbose output. | + +### Container version of the diagnostic tool + +The CLI is provided as a container with a 17.12 engine that needs to run using privileged mode. +*NOTE* +Remember that table operations have ownership, so any `create entry` will be persistent as long as +the diagnostic container is part of the swarm. + +1. Make sure that the node where the diagnostic client will run is not part of the swarm; if it is, run `docker swarm leave -f`. + +2. To run the container, use a command like the following: + + ```bash + $ docker container run --name net-diagnostic -d --privileged --network host dockereng/network-diagnostic:17.12-dind + ``` + +3. Connect to the container using `docker exec -it <container-ID> sh`, + and start the server using the following command: + + ```bash + $ kill -HUP 1 + ``` + +4. Join the diagnostic container to the swarm, then run the diagnostic CLI within the container. + + ```bash + $ ./diagnosticClient ... + ``` + +5. When finished debugging, leave the swarm and stop the container. + +### Examples + +The following commands dump the service discovery table and verify node +ownership. 
+ +*NOTE* +Remember to use the full network ID, you can easily find that with `docker network ls --no-trunc` + +**Service discovery and load balancer:** + +```bash +$ diagnostiClient -c sd -v -net n8a8ie6tb3wr2e260vxj8ncy4 +``` + +**Overlay network:** + +```bash +$ diagnostiClient -port 2001 -c overlay -v -net n8a8ie6tb3wr2e260vxj8ncy4 +``` diff --git a/libnetwork/cmd/diagnostic/daemon.json b/libnetwork/cmd/diagnostic/daemon.json new file mode 100644 index 0000000000..b5eb9889b8 --- /dev/null +++ b/libnetwork/cmd/diagnostic/daemon.json @@ -0,0 +1,4 @@ +{ + "debug": true, + "network-diagnostic-port": 2000 +} diff --git a/libnetwork/cmd/diagnostic/main.go b/libnetwork/cmd/diagnostic/main.go new file mode 100644 index 0000000000..0f3f559ec5 --- /dev/null +++ b/libnetwork/cmd/diagnostic/main.go @@ -0,0 +1,191 @@ +package main + +import ( + "bufio" + "encoding/base64" + "encoding/json" + "flag" + "fmt" + "io" + "io/ioutil" + "net/http" + "os" + "strings" + + "github.com/docker/libnetwork" + "github.com/docker/libnetwork/diagnostic" + "github.com/docker/libnetwork/drivers/overlay" + "github.com/sirupsen/logrus" +) + +const ( + readyPath = "http://%s:%d/ready" + joinNetwork = "http://%s:%d/joinnetwork?nid=%s" + leaveNetwork = "http://%s:%d/leavenetwork?nid=%s" + clusterPeers = "http://%s:%d/clusterpeers?json" + networkPeers = "http://%s:%d/networkpeers?nid=%s&json" + dumpTable = "http://%s:%d/gettable?nid=%s&tname=%s&json" + deleteEntry = "http://%s:%d/deleteentry?nid=%s&tname=%s&key=%s&json" +) + +func httpIsOk(body io.ReadCloser) { + b, err := ioutil.ReadAll(body) + if err != nil { + logrus.Fatalf("Failed the body parse %s", err) + } + if !strings.Contains(string(b), "OK") { + logrus.Fatalf("Server not ready %s", b) + } + body.Close() +} + +func main() { + ipPtr := flag.String("ip", "127.0.0.1", "ip address") + portPtr := flag.Int("port", 2000, "port") + networkPtr := flag.String("net", "", "target network") + tablePtr := flag.String("t", "", "table to process ") + 
remediatePtr := flag.Bool("r", false, "perform remediation deleting orphan entries") + verbosePtr := flag.Bool("v", false, "verbose output") + + flag.Parse() + + if *verbosePtr { + logrus.SetLevel(logrus.DebugLevel) + } + + logrus.Infof("Connecting to %s:%d checking ready", *ipPtr, *portPtr) + resp, err := http.Get(fmt.Sprintf(readyPath, *ipPtr, *portPtr)) + if err != nil { + logrus.WithError(err).Fatalf("The connection failed") + } + httpIsOk(resp.Body) + + clusterPeers := fetchNodePeers(*ipPtr, *portPtr, "") + var networkPeers map[string]string + var joinedNetwork bool + if *networkPtr != "" { + logrus.Infof("Joining the network:%s", *networkPtr) + resp, err = http.Get(fmt.Sprintf(joinNetwork, *ipPtr, *portPtr, *networkPtr)) + if err != nil { + logrus.WithError(err).Fatalf("Failed joining the network") + } + httpIsOk(resp.Body) + networkPeers = fetchNodePeers(*ipPtr, *portPtr, *networkPtr) + joinedNetwork = true + } + + switch *tablePtr { + case "sd": + fetchTable(*ipPtr, *portPtr, *networkPtr, "endpoint_table", clusterPeers, networkPeers, *remediatePtr) + case "overlay": + fetchTable(*ipPtr, *portPtr, *networkPtr, "overlay_peer_table", clusterPeers, networkPeers, *remediatePtr) + } + + if joinedNetwork { + resp, err = http.Get(fmt.Sprintf(leaveNetwork, *ipPtr, *portPtr, *networkPtr)) + if err != nil { + logrus.WithError(err).Fatalf("Failed leaving the network") + } + httpIsOk(resp.Body) + } +} + +func fetchNodePeers(ip string, port int, network string) map[string]string { + logrus.Infof("Fetch peers %s", network) + var path string + if network != "" { + path = fmt.Sprintf(networkPeers, ip, port, network) + } else { + path = fmt.Sprintf(clusterPeers, ip, port) + } + + resp, err := http.Get(path) + if err != nil { + logrus.WithError(err).Fatalf("Failed fetching path") + } + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + logrus.WithError(err).Fatalf("Failed the body parse") + } + + output := diagnostic.HTTPResult{Details: 
&diagnostic.TablePeersResult{}} + err = json.Unmarshal(body, &output) + if err != nil { + logrus.WithError(err).Fatalf("Failed the json unmarshalling") + } + + logrus.Debugf("Parsing JSON response") + result := make(map[string]string, output.Details.(*diagnostic.TablePeersResult).Length) + for _, v := range output.Details.(*diagnostic.TablePeersResult).Elements { + logrus.Debugf("name:%s ip:%s", v.Name, v.IP) + result[v.Name] = v.IP + } + return result +} + +func fetchTable(ip string, port int, network, tableName string, clusterPeers, networkPeers map[string]string, remediate bool) { + logrus.Infof("Fetch %s table and check owners", tableName) + resp, err := http.Get(fmt.Sprintf(dumpTable, ip, port, network, tableName)) + if err != nil { + logrus.WithError(err).Fatalf("Failed fetching endpoint table") + } + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + logrus.WithError(err).Fatalf("Failed the body parse") + } + + output := diagnostic.HTTPResult{Details: &diagnostic.TableEndpointsResult{}} + err = json.Unmarshal(body, &output) + if err != nil { + logrus.WithError(err).Fatalf("Failed the json unmarshalling") + } + + logrus.Debug("Parsing data structures") + var orphanKeys []string + for _, v := range output.Details.(*diagnostic.TableEndpointsResult).Elements { + decoded, err := base64.StdEncoding.DecodeString(v.Value) + if err != nil { + logrus.WithError(err).Errorf("Failed decoding entry") + continue + } + switch tableName { + case "endpoint_table": + var elem libnetwork.EndpointRecord + elem.Unmarshal(decoded) + logrus.Debugf("key:%s value:%+v owner:%s", v.Key, elem, v.Owner) + case "overlay_peer_table": + var elem overlay.PeerRecord + elem.Unmarshal(decoded) + logrus.Debugf("key:%s value:%+v owner:%s", v.Key, elem, v.Owner) + } + + if _, ok := networkPeers[v.Owner]; !ok { + logrus.Warnf("The element with key:%s does not belong to any node on this network", v.Key) + orphanKeys = append(orphanKeys, v.Key) + } + if _, ok := 
clusterPeers[v.Owner]; !ok { + logrus.Warnf("The element with key:%s does not belong to any node on this cluster", v.Key) + } + } + + if len(orphanKeys) > 0 && remediate { + logrus.Warnf("The following keys:%v results as orphan, do you want to proceed with the deletion (this operation is irreversible)? [Yes/No]", orphanKeys) + reader := bufio.NewReader(os.Stdin) + text, _ := reader.ReadString('\n') + text = strings.Replace(text, "\n", "", -1) + if strings.Compare(text, "Yes") == 0 { + for _, k := range orphanKeys { + resp, err := http.Get(fmt.Sprintf(deleteEntry, ip, port, network, tableName, k)) + if err != nil { + logrus.WithError(err).Errorf("Failed deleting entry k:%s", k) + break + } + resp.Body.Close() + } + } else { + logrus.Infof("Deletion skipped") + } + } +}