Diagnostic client
- The client allows talking to the diagnostic server and decoding the internal values of the overlay and service discovery tables.
- The tool also allows remediation of orphan entries.
- Added README.

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
This commit is contained in:
parent a59ecd9537
commit 2f6921cbba
6 changed files with 456 additions and 0 deletions
@@ -28,6 +28,7 @@ build-local:
	@mkdir -p "bin"
	go build -tags experimental -o "bin/dnet" ./cmd/dnet
	go build -o "bin/docker-proxy" ./cmd/proxy
	GOOS=linux go build -o "./cmd/diagnostic/diagnosticClient" ./cmd/diagnostic

clean:
	@echo "🐳 $@"
4 libnetwork/cmd/diagnostic/Dockerfile.client Normal file
@@ -0,0 +1,4 @@
FROM alpine
RUN apk add --no-cache curl
COPY diagnosticClient /usr/local/bin/diagnosticClient
ENTRYPOINT ["/usr/local/bin/diagnosticClient"]
4 libnetwork/cmd/diagnostic/Dockerfile.dind Normal file
@@ -0,0 +1,4 @@
FROM docker:17.12-dind
RUN apk add --no-cache curl
COPY daemon.json /etc/docker/daemon.json
COPY diagnosticClient /usr/local/bin/diagnosticClient
252 libnetwork/cmd/diagnostic/README.md Normal file
@@ -0,0 +1,252 @@
---
description: Learn to use the built-in network debugger to debug overlay networking problems
keywords: network, troubleshooting, debug
title: Debug overlay or swarm networking issues
---

**WARNING**
This tool can change the internal state of the libnetwork API. Be very careful
when using it, and read the following guide first: improper use can damage or
permanently destroy the network configuration.

Docker CE 17.12 and higher introduce a network debugging tool designed to help
debug issues with overlay networks and swarm services running on Linux hosts.
When enabled, a network diagnostic server listens on the specified port and
provides diagnostic information. The network debugging tool should only be
started to debug specific issues, and should not be left running all the time.

Information about networks is stored in the database, which can be examined
using the API. Currently the database contains information about the overlay
network as well as the service discovery data.

The Docker API exposes endpoints to query and control the network debugging
tool. CLI integration is provided as a preview, but the implementation is not
yet considered stable and commands and options may change without notice.

The tool is available in two forms:
1) Client only: `dockereng/network-diagnostic:onlyclient`
2) Docker-in-Docker version: `dockereng/network-diagnostic:17.12-dind`
The latter allows using the tool with a cluster running an engine older than 17.12.

## Enable the diagnostic server

The tool currently only works on Docker hosts running on Linux. To enable it on
a node, follow the steps below.

1. In the `/etc/docker/daemon.json` configuration file, set
   `network-diagnostic-port` to a port which is free on the Docker host.

   ```json
   {
     "network-diagnostic-port": <port>
   }
   ```

2. Get the process ID (PID) of the `dockerd` process. It is the second field in
   the output, and is typically a number from 2 to 6 digits long.

   ```bash
   $ ps aux | grep dockerd | grep -v grep
   ```

3. Reload the Docker configuration without restarting Docker, by sending the
   `HUP` signal to the PID you found in the previous step.

   ```bash
   kill -HUP <pid-of-dockerd>
   ```

   If the host uses systemd, the command `systemctl reload docker` is enough.

A message like the following will appear in the Docker host logs:

```none
Starting the diagnostic server listening on <port> for commands
```

## Disable the diagnostic tool

Repeat these steps for each node participating in the swarm.

1. Remove the `network-diagnostic-port` key from the `/etc/docker/daemon.json`
   configuration file.

2. Get the process ID (PID) of the `dockerd` process. It is the second field in
   the output, and is typically a number from 2 to 6 digits long.

   ```bash
   $ ps aux | grep dockerd | grep -v grep
   ```

3. Reload the Docker configuration without restarting Docker, by sending the
   `HUP` signal to the PID you found in the previous step.

   ```bash
   kill -HUP <pid-of-dockerd>
   ```

A message like the following will appear in the Docker host logs:

```none
Disabling the diagnostic server
```

## Access the diagnostic tool's API

The network diagnostic tool exposes its own RESTful API. To access the API,
send an HTTP request to the port where the tool is listening. The following
commands assume the tool is listening on port 2000.

Examples are not given for every endpoint.
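
Before scripting against the API, it is worth verifying that the server is up.
A minimal Go sketch of that check, mirroring the `/ready` probe performed by
the `main.go` client shown later in this commit (the address `localhost:2000`
is an assumption taken from the examples in this guide):

```go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Assumption: the diagnostic server is listening on localhost:2000.
	resp, err := http.Get("http://127.0.0.1:2000/ready")
	if err != nil {
		log.Fatalf("connection failed: %v", err)
	}
	defer resp.Body.Close()

	b, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("failed reading body: %v", err)
	}
	// The server answers with a body containing "OK" when it is ready.
	if !strings.Contains(string(b), "OK") {
		log.Fatalf("server not ready: %s", b)
	}
	fmt.Println("diagnostic server is ready")
}
```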

### Get help

```bash
$ curl localhost:2000/help

OK
/updateentry
/getentry
/gettable
/leavenetwork
/createentry
/help
/clusterpeers
/ready
/joinnetwork
/deleteentry
/networkpeers
/
/join
```

### Join or leave the network database cluster

```bash
$ curl localhost:2000/join?members=ip1,ip2,...
```

```bash
$ curl localhost:2000/leave?members=ip1,ip2,...
```

`ip1`, `ip2`, ... are the swarm node IPs (usually one is enough).

### Join or leave a network

```bash
$ curl localhost:2000/joinnetwork?nid=<network id>
```

```bash
$ curl localhost:2000/leavenetwork?nid=<network id>
```

`network id` can be retrieved on the manager with `docker network ls --no-trunc`
and has to be the full-length identifier.

### List cluster peers

```bash
$ curl localhost:2000/clusterpeers
```

### List nodes connected to a given network

```bash
$ curl localhost:2000/networkpeers?nid=<network id>
```

`network id` can be retrieved on the manager with `docker network ls --no-trunc`
and has to be the full-length identifier.

### Dump database tables

The tables are called `endpoint_table` and `overlay_peer_table`.
The `overlay_peer_table` contains all the overlay forwarding information.
The `endpoint_table` contains all the service discovery information.

```bash
$ curl "localhost:2000/gettable?nid=<network id>&tname=<table name>"
```

Note the quotes around the URL: without them the shell would interpret `&` as
a background operator.
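
Appending `&json` to the query string asks the server for machine-readable
output; this is what the URL constants in the `main.go` client below do. As a
minimal Go sketch, assuming the server listens on `localhost:2000` and with
`nid` and `tname` as placeholders to be replaced with real values:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

func main() {
	// Placeholders: nid must be a full-length network ID, tname one of
	// "endpoint_table" or "overlay_peer_table".
	nid := "<network id>"
	tname := "endpoint_table"
	url := fmt.Sprintf("http://127.0.0.1:2000/gettable?nid=%s&tname=%s&json", nid, tname)

	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("failed fetching table: %v", err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("failed reading body: %v", err)
	}
	// Print the raw JSON dump; main.go below shows structured decoding via
	// the diagnostic.HTTPResult and diagnostic.TableEndpointsResult types.
	fmt.Println(string(body))
}
```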

### Interact with a specific database table

The tables are called `endpoint_table` and `overlay_peer_table`.

```bash
$ curl "localhost:2000/<method>?nid=<network id>&tname=<table name>&key=<key>[&value=<value>]"
```

Note:
operations on tables have node ownership; entries remain persistent only as
long as the node that inserted them is part of the cluster.
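
For example, the orphan remediation in the `main.go` client below uses the
`/deleteentry` method. A minimal sketch of the same call, assuming the server
listens on `localhost:2000` and with placeholder values for the network ID,
table name, and key:

```go
package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	// Placeholders: replace nid, tname, and key with real values. The URL
	// format matches the deleteEntry constant in main.go below.
	nid := "<network id>"
	tname := "endpoint_table"
	key := "<key>"
	url := fmt.Sprintf("http://127.0.0.1:2000/deleteentry?nid=%s&tname=%s&key=%s&json", nid, tname, key)

	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("failed deleting entry: %v", err)
	}
	// Only the HTTP status matters here; remember the deletion is irreversible.
	resp.Body.Close()
}
```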

## Access the diagnostic tool's CLI

The CLI is provided as a preview and is not yet stable. Commands or options may
change at any time.

The CLI executable is called `diagnosticClient` and is made available using a
standalone container.

`docker run --net host dockereng/network-diagnostic:onlyclient -v -net <full network id> -t sd`

The following flags are supported:

| Flag          | Description                                     |
|---------------|-------------------------------------------------|
| -t <string>   | The table to dump, one of `sd` or `overlay`.    |
| -ip <string>  | The IP address to query. Defaults to 127.0.0.1. |
| -net <string> | The target network ID.                          |
| -port <int>   | The target port (default 2000).                 |
| -v            | Enable verbose output.                          |

### Container version of the diagnostic tool

The CLI is provided as a container with a 17.12 engine that needs to run using
privileged mode.

*NOTE*
Remember that table operations have ownership, so any `create entry` will
persist only while the diagnostic container is part of the swarm.

1. Make sure that the node where the diagnostic client will run is not part of
   the swarm; if it is, run `docker swarm leave -f`.

2. To run the container, use a command like the following:

   ```bash
   $ docker container run --name net-diagnostic -d --privileged --network host dockereng/network-diagnostic:17.12-dind
   ```

3. Connect to the container using `docker exec -it <container-ID> sh`,
   and start the server using the following command:

   ```bash
   $ kill -HUP 1
   ```

4. Join the diagnostic container to the swarm, then run the diagnostic CLI
   within the container.

   ```bash
   $ ./diagnosticClient <flags>...
   ```

5. When finished debugging, leave the swarm and stop the container.

### Examples

The following commands dump the service discovery table and verify node
ownership.

*NOTE*
Remember to use the full network ID; you can easily find it with
`docker network ls --no-trunc`.

**Service discovery and load balancer:**

```bash
$ diagnosticClient -t sd -v -net n8a8ie6tb3wr2e260vxj8ncy4
```

**Overlay network:**

```bash
$ diagnosticClient -port 2001 -t overlay -v -net n8a8ie6tb3wr2e260vxj8ncy4
```
4 libnetwork/cmd/diagnostic/daemon.json Normal file
@@ -0,0 +1,4 @@
{
  "debug": true,
  "network-diagnostic-port": 2000
}
191 libnetwork/cmd/diagnostic/main.go Normal file
@@ -0,0 +1,191 @@
package main

import (
	"bufio"
	"encoding/base64"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"strings"

	"github.com/docker/libnetwork"
	"github.com/docker/libnetwork/diagnostic"
	"github.com/docker/libnetwork/drivers/overlay"
	"github.com/sirupsen/logrus"
)

const (
	readyPath    = "http://%s:%d/ready"
	joinNetwork  = "http://%s:%d/joinnetwork?nid=%s"
	leaveNetwork = "http://%s:%d/leavenetwork?nid=%s"
	clusterPeers = "http://%s:%d/clusterpeers?json"
	networkPeers = "http://%s:%d/networkpeers?nid=%s&json"
	dumpTable    = "http://%s:%d/gettable?nid=%s&tname=%s&json"
	deleteEntry  = "http://%s:%d/deleteentry?nid=%s&tname=%s&key=%s&json"
)
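
// httpIsOk reads the whole response body and aborts the run unless it can be
// read and contains "OK"; on success it closes the body.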
func httpIsOk(body io.ReadCloser) {
	b, err := ioutil.ReadAll(body)
	if err != nil {
		logrus.Fatalf("Failed the body parse %s", err)
	}
	if !strings.Contains(string(b), "OK") {
		logrus.Fatalf("Server not ready %s", b)
	}
	body.Close()
}
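
// main parses the flags, verifies the diagnostic server is ready, optionally
// joins the target network, dumps the requested table while checking entry
// ownership, and leaves the network again before exiting.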
func main() {
	ipPtr := flag.String("ip", "127.0.0.1", "ip address")
	portPtr := flag.Int("port", 2000, "port")
	networkPtr := flag.String("net", "", "target network")
	tablePtr := flag.String("t", "", "table to process <sd/overlay>")
	remediatePtr := flag.Bool("r", false, "perform remediation deleting orphan entries")
	verbosePtr := flag.Bool("v", false, "verbose output")

	flag.Parse()

	if *verbosePtr {
		logrus.SetLevel(logrus.DebugLevel)
	}

	logrus.Infof("Connecting to %s:%d checking ready", *ipPtr, *portPtr)
	resp, err := http.Get(fmt.Sprintf(readyPath, *ipPtr, *portPtr))
	if err != nil {
		logrus.WithError(err).Fatalf("The connection failed")
	}
	httpIsOk(resp.Body)

	clusterPeers := fetchNodePeers(*ipPtr, *portPtr, "")
	var networkPeers map[string]string
	var joinedNetwork bool
	if *networkPtr != "" {
		logrus.Infof("Joining the network:%s", *networkPtr)
		resp, err = http.Get(fmt.Sprintf(joinNetwork, *ipPtr, *portPtr, *networkPtr))
		if err != nil {
			logrus.WithError(err).Fatalf("Failed joining the network")
		}
		httpIsOk(resp.Body)
		networkPeers = fetchNodePeers(*ipPtr, *portPtr, *networkPtr)
		joinedNetwork = true
	}

	switch *tablePtr {
	case "sd":
		fetchTable(*ipPtr, *portPtr, *networkPtr, "endpoint_table", clusterPeers, networkPeers, *remediatePtr)
	case "overlay":
		fetchTable(*ipPtr, *portPtr, *networkPtr, "overlay_peer_table", clusterPeers, networkPeers, *remediatePtr)
	}

	if joinedNetwork {
		resp, err = http.Get(fmt.Sprintf(leaveNetwork, *ipPtr, *portPtr, *networkPtr))
		if err != nil {
			logrus.WithError(err).Fatalf("Failed leaving the network")
		}
		httpIsOk(resp.Body)
	}
}
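
// fetchNodePeers returns a map of peer name to IP address. With an empty
// network it queries the cluster peers endpoint, otherwise the peers joined
// to that network.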
func fetchNodePeers(ip string, port int, network string) map[string]string {
	logrus.Infof("Fetch peers %s", network)
	var path string
	if network != "" {
		path = fmt.Sprintf(networkPeers, ip, port, network)
	} else {
		path = fmt.Sprintf(clusterPeers, ip, port)
	}

	resp, err := http.Get(path)
	if err != nil {
		logrus.WithError(err).Fatalf("Failed fetching path")
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		logrus.WithError(err).Fatalf("Failed the body parse")
	}

	output := diagnostic.HTTPResult{Details: &diagnostic.TablePeersResult{}}
	err = json.Unmarshal(body, &output)
	if err != nil {
		logrus.WithError(err).Fatalf("Failed the json unmarshalling")
	}

	logrus.Debugf("Parsing JSON response")
	result := make(map[string]string, output.Details.(*diagnostic.TablePeersResult).Length)
	for _, v := range output.Details.(*diagnostic.TablePeersResult).Elements {
		logrus.Debugf("name:%s ip:%s", v.Name, v.IP)
		result[v.Name] = v.IP
	}
	return result
}
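
// fetchTable dumps the given table, base64-decodes every entry, and warns
// about entries whose owner is not a known network or cluster peer. With
// remediation enabled it interactively deletes the orphan keys.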
func fetchTable(ip string, port int, network, tableName string, clusterPeers, networkPeers map[string]string, remediate bool) {
	logrus.Infof("Fetch %s table and check owners", tableName)
	resp, err := http.Get(fmt.Sprintf(dumpTable, ip, port, network, tableName))
	if err != nil {
		logrus.WithError(err).Fatalf("Failed fetching endpoint table")
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		logrus.WithError(err).Fatalf("Failed the body parse")
	}

	output := diagnostic.HTTPResult{Details: &diagnostic.TableEndpointsResult{}}
	err = json.Unmarshal(body, &output)
	if err != nil {
		logrus.WithError(err).Fatalf("Failed the json unmarshalling")
	}

	logrus.Debug("Parsing data structures")
	var orphanKeys []string
	for _, v := range output.Details.(*diagnostic.TableEndpointsResult).Elements {
		decoded, err := base64.StdEncoding.DecodeString(v.Value)
		if err != nil {
			logrus.WithError(err).Errorf("Failed decoding entry")
			continue
		}
		switch tableName {
		case "endpoint_table":
			var elem libnetwork.EndpointRecord
			elem.Unmarshal(decoded)
			logrus.Debugf("key:%s value:%+v owner:%s", v.Key, elem, v.Owner)
		case "overlay_peer_table":
			var elem overlay.PeerRecord
			elem.Unmarshal(decoded)
			logrus.Debugf("key:%s value:%+v owner:%s", v.Key, elem, v.Owner)
		}

		if _, ok := networkPeers[v.Owner]; !ok {
			logrus.Warnf("The element with key:%s does not belong to any node on this network", v.Key)
			orphanKeys = append(orphanKeys, v.Key)
		}
		if _, ok := clusterPeers[v.Owner]; !ok {
			logrus.Warnf("The element with key:%s does not belong to any node on this cluster", v.Key)
		}
	}

	if len(orphanKeys) > 0 && remediate {
		logrus.Warnf("The following keys:%v results as orphan, do you want to proceed with the deletion (this operation is irreversible)? [Yes/No]", orphanKeys)
		reader := bufio.NewReader(os.Stdin)
		text, _ := reader.ReadString('\n')
		text = strings.Replace(text, "\n", "", -1)
		if strings.Compare(text, "Yes") == 0 {
			for _, k := range orphanKeys {
				resp, err := http.Get(fmt.Sprintf(deleteEntry, ip, port, network, tableName, k))
				if err != nil {
					logrus.WithError(err).Errorf("Failed deleting entry k:%s", k)
					break
				}
				resp.Body.Close()
			}
		} else {
			logrus.Infof("Deletion skipped")
		}
	}
}