Import the ssd tool in libnetwork
Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
commit a16d469867 (parent 2459e6fbd3)
3 changed files with 261 additions and 0 deletions
libnetwork/cmd/ssd/Dockerfile (new executable file, 34 lines)
@@ -0,0 +1,34 @@
FROM alpine:3.7

ENV PACKAGES="\
    musl \
    linux-headers \
    build-base \
    util-linux \
    bash \
    git \
    ca-certificates \
    python2 \
    python2-dev \
    py-setuptools \
    iproute2 \
    curl \
    strace \
    drill \
    ipvsadm \
    iperf \
    ethtool \
    "

RUN echo \
    && apk add --no-cache $PACKAGES \
    && if [[ ! -e /usr/bin/python ]]; then ln -sf /usr/bin/python2.7 /usr/bin/python; fi \
    && if [[ ! -e /usr/bin/python-config ]]; then ln -sf /usr/bin/python2.7-config /usr/bin/python-config; fi \
    && if [[ ! -e /usr/bin/easy_install ]]; then ln -sf /usr/bin/easy_install-2.7 /usr/bin/easy_install; fi \
    && easy_install pip \
    && pip install --upgrade pip \
    && if [[ ! -e /usr/bin/pip ]]; then ln -sf /usr/bin/pip2.7 /usr/bin/pip; fi \
    && echo

ADD ssd.py /

RUN pip install git+git://github.com/docker/docker-py.git

ENTRYPOINT [ "python", "/ssd.py"]
libnetwork/cmd/ssd/README.md (new executable file, 47 lines)
@@ -0,0 +1,47 @@
# Docker Swarm Service Driller (ssd)

ssd is a troubleshooting utility for Docker swarm networks.

### control-plane and datapath consistency check on a node

ssd checks for consistency between the docker network control-plane (the docker daemon's in-memory state) and the kernel data path programming. Currently the tool checks only the consistency of the load balancer (implemented using IPVS). A sketch of the underlying comparison follows the sample output below.

In a three node swarm cluster, this is the ssd status for an overlay network `ov2` that has three services running, each replicated to 3 instances:
````bash
vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged --net=host sanimej/ssd ov2
Verifying LB programming for containers on network ov2
Verifying container /s2.3.ltrdwef0iqf90rqauw3ehcs56...
service s2... OK
service s3... OK
service s1... OK
Verifying container /s3.3.nyhwvdvnocb4wftyhb8dr4fj8...
service s2... OK
service s3... OK
service s1... OK
Verifying container /s1.3.wwx5tuxhnvoz5vrb8ohphby0r...
service s2... OK
service s3... OK
service s1... OK
Verifying LB programming for containers on network ingress
Verifying container Ingress...
service web... OK
````
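
Each `service ... OK` line is the result of comparing the service's `LocalLBIndex` firewall mark reported by the daemon (via a verbose network inspect) against the FWM entries that IPVS actually has programmed inside the container's network namespace. The following is a minimal sketch of that comparison using the docker Python SDK (Python 2, matching `ssd.py`); the helper names and standalone structure are illustrative, not part of ssd itself:

````python
# Sketch: compare the daemon's LB indexes for a network against the FWM
# marks IPVS reports inside one container's network namespace.
import re
import subprocess

import docker

cli = docker.APIClient(base_url='unix://var/run/docker.sock', version='auto')

def daemon_lb_indexes(nw_name):
    # Service name -> LocalLBIndex, the daemon's view of the LB marks.
    data = cli.inspect_network(nw_name, verbose=True)
    return {str(s): str(v["LocalLBIndex"]) for s, v in data["Services"].items() if s}

def ipvs_fwmarks(namespace):
    # FWM marks actually programmed in the kernel for this namespace.
    out = subprocess.check_output(
        ['/usr/bin/nsenter', '--net=%s' % namespace, '/usr/sbin/ipvsadm', '-ln'])
    return set(re.findall(r'FWM\s+(\d+)', out))

def compare_lb(nw_name, namespace):
    expected = set(daemon_lb_indexes(nw_name).values())
    actual = ipvs_fwmarks(namespace)
    for mark in expected - actual:
        print('LB index %s known to the daemon but missing in IPVS' % mark)
    for mark in actual - expected:
        print('LB index %s programmed in IPVS but unknown to the daemon' % mark)
````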

ssd also verifies the iptables programming required to direct an incoming packet addressed to `<host ip>:<published port>` to the right `<backend ip>:<target port>`.
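
That check leans on `iptables -C`, which exits non-zero when the given rule is absent. Here is a minimal sketch of probing one published port's DNAT rule; the port number and sandbox IP below are placeholder values:

````python
# Sketch: does the host's DOCKER-INGRESS chain DNAT this published port
# to the ingress sandbox? iptables -C fails if the rule is not present.
import subprocess

def dnat_rule_exists(published_port, ingress_sbox_ip):
    rule = ('/sbin/iptables -t nat -C DOCKER-INGRESS -p tcp --dport {0} '
            '-j DNAT --to {1}:{0}').format(published_port, ingress_sbox_ip)
    try:
        subprocess.check_output(['/bin/bash', '-c', rule])
        return True
    except subprocess.CalledProcessError:
        return False

# Example: has port 8080 been wired to the ingress sandbox at 172.18.0.2?
print(dnat_rule_exists(8080, '172.18.0.2'))
````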

### control-plane consistency check across nodes in a cluster

Docker networking uses a gossip protocol to synchronize networking state across nodes in a cluster. ssd's `gossip-consistency` command verifies whether the state maintained by all the nodes is consistent. In a three node cluster with services running on an overlay network `ov2`, the `gossip-consistency` check shows:
````bash
vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged sanimej/ssd ov2 gossip-consistency
Node id: sjfp0ca8f43rvnab6v7f21gq0 gossip hash c57d89094dbb574a37930393278dc282
Node id: bg228r3q9095grj4wxkqs80oe gossip hash c57d89094dbb574a37930393278dc282
Node id: 6jylcraipcv2pxdricqe77j5q gossip hash c57d89094dbb574a37930393278dc282
````

This is the hash digest of the control-plane state for the network `ov2` from all the cluster nodes. If the values mismatch, `docker network inspect --verbose` on the individual nodes can help identify the specific difference.
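
Each node computes its hash as an md5 digest over the sorted control-plane entries (service names, VIPs, and per-task fields) for the network, so two nodes with identical state produce identical digests regardless of the order gossip delivered the entries. A condensed sketch of the digest, mirroring the `gossip-hash` mode of `ssd.py` below:

````python
# Sketch: flatten every service's name, VIP and task fields into one
# list, sort it, and md5 the result so ordering differences don't matter.
import hashlib

def gossip_hash(services):
    entries = []
    for name, value in services.items():
        entries.append(name)
        entries.append(value["VIP"])
        for task in value["Tasks"]:
            for val in task.values():
                if isinstance(val, dict):
                    entries.extend(val.values())
                else:
                    entries.append(val)
    md5 = hashlib.md5()
    for e in sorted(entries):
        md5.update(str(e))  # str() for safety; fields are expected strings
    return md5.hexdigest()
````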
libnetwork/cmd/ssd/ssd.py (new executable file, 180 lines)
@@ -0,0 +1,180 @@
#!/usr/bin/python

import sys, signal, time
import docker
import re
import subprocess
import json
import hashlib

# Matches a dotted-quad IPv4 address, each octet constrained to 0-255.
ipv4match = re.compile(
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
)


def check_iptables(name, plist):
    # Normalize the ':' and ',' separators in each port entry to spaces
    # and pick out the port fields.
    replace = (':', ',')
    ports = []
    for port in plist:
        for r in replace:
            port = port.replace(r, ' ')
        p = port.split()
        ports.append((p[1], p[3]))

    # get the ingress sandbox's docker_gwbridge network IP.
    # published ports get DNAT'ed to this IP.
    ip = subprocess.check_output(['/usr/bin/nsenter', '--net=/var/run/docker/netns/ingress_sbox', '/bin/bash', '-c', 'ifconfig eth1 | grep "inet\\ addr" | cut -d: -f2 | cut -d" " -f1'])
    ip = ip.rstrip()

    # iptables -C exits non-zero if the DNAT rule for a published port is
    # not programmed in the host's DOCKER-INGRESS chain.
    for p in ports:
        rule = '/sbin/iptables -t nat -C DOCKER-INGRESS -p tcp --dport {0} -j DNAT --to {1}:{2}'.format(p[1], ip, p[1])
        try:
            subprocess.check_output(["/bin/bash", "-c", rule])
        except subprocess.CalledProcessError:
            print "Service {0}: host iptables DNAT rule for port {1} -> ingress sandbox {2}:{3} missing".format(name, p[1], ip, p[1])


def get_namespaces(data, ingress=False):
    # Map container names to their network namespace paths. The ingress
    # network has a single, well-known sandbox namespace.
    if ingress is True:
        return {"Ingress": "/var/run/docker/netns/ingress_sbox"}
    else:
        sandboxes = {str(c) for c in data["Containers"]}
        containers = {}
        for s in sandboxes:
            inspect = cli.inspect_container(s)
            containers[str(inspect["Name"])] = str(inspect["NetworkSettings"]["SandboxKey"])
        return containers


def check_network(nw_name, ingress=False):
    print "Verifying LB programming for containers on network %s" % nw_name

    data = cli.inspect_network(nw_name, verbose=True)

    services = data["Services"]
    # Service name -> firewall mark (FWM) that keys its IPVS entries.
    fwmarks = {str(service): str(svalue["LocalLBIndex"]) for service, svalue in services.items()}

    # Firewall mark -> list of backend task IPs, as the daemon sees them.
    stasks = {}
    for service, svalue in services.items():
        if service == "":
            continue
        tasks = []
        for task in svalue["Tasks"]:
            tasks.append(str(task["EndpointIP"]))
        stasks[fwmarks[str(service)]] = tasks

        # for services in ingress network verify the iptables rules
        # that direct ingress (published port) to backend (target port)
        if ingress is True:
            check_iptables(service, svalue["Ports"])

    containers = get_namespaces(data, ingress)
    for container, namespace in containers.items():
        print "Verifying container %s..." % container
        ipvs = subprocess.check_output(['/usr/bin/nsenter', '--net=%s' % namespace, '/usr/sbin/ipvsadm', '-ln'])

        # Parse ipvsadm output: an "FWM" line starts a virtual service and
        # the "->" lines under it list its real (backend) servers.
        mark = ""
        realmark = {}
        for line in ipvs.splitlines():
            if "FWM" in line:
                mark = re.findall("[0-9]+", line)[0]
                realmark[str(mark)] = []
            elif "->" in line:
                if mark == "":
                    continue
                ip = ipv4match.search(line)
                if ip is not None:
                    realmark[mark].append(ip.group(0))
            else:
                mark = ""

        # Drop marks present on only one side, reporting each mismatch.
        for key in realmark.keys():
            if key not in stasks:
                print "LB Index %s" % key, "present in IPVS but missing in docker daemon"
                del realmark[key]

        for key in stasks.keys():
            if key not in realmark:
                print "LB Index %s" % key, "present in docker daemon but missing in IPVS"
                del stasks[key]

        # For marks known to both sides, the backend sets must match in size.
        for key in realmark:
            service = "--Invalid--"
            for sname, idx in fwmarks.items():
                if key == idx:
                    service = sname
            if len(set(realmark[key])) != len(set(stasks[key])):
                print "Incorrect LB Programming for service %s" % service
                print "control-plane backend tasks:"
                for task in stasks[key]:
                    print task
                print "kernel IPVS backend tasks:"
                for task in realmark[key]:
                    print task
            else:
                print "service %s... OK" % service


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'Usage: ssd.py network-name [gossip-consistency]'
        sys.exit()

    cli = docker.APIClient(base_url='unix://var/run/docker.sock', version='auto')
    if len(sys.argv) == 3:
        command = sys.argv[2]
    else:
        command = 'default'

    if command == 'gossip-consistency':
        # Run ssd itself as a global service so every node reports the
        # gossip hash of its local control-plane state.
        cspec = docker.types.ContainerSpec(
            image='sanimej/ssd',
            args=[sys.argv[1], 'gossip-hash'],
            mounts=[docker.types.Mount('/var/run/docker.sock', '/var/run/docker.sock', type='bind')]
        )
        mode = docker.types.ServiceMode(mode='global')
        task_template = docker.types.TaskTemplate(cspec)

        cli.create_service(task_template, name='gossip-hash', mode=mode)
        # TODO: change to a deterministic way to check if the service is up.
        time.sleep(5)
        output = cli.service_logs('gossip-hash', stdout=True, stderr=True, details=True)
        for line in output:
            print("Node id: %s gossip hash %s" % (line[line.find("=") + 1:line.find(",")], line[line.find(" ") + 1:]))
        if cli.remove_service('gossip-hash') is not True:
            print("Deleting gossip-hash service failed")
    elif command == 'gossip-hash':
        # Flatten this node's view of the network state, print its md5
        # digest, then block so the task stays up for log collection.
        data = cli.inspect_network(sys.argv[1], verbose=True)
        services = data["Services"]
        md5 = hashlib.md5()
        entries = []
        for service, value in services.items():
            entries.append(service)
            entries.append(value["VIP"])
            for task in value["Tasks"]:
                for key, val in task.items():
                    if isinstance(val, dict):
                        for k, v in val.items():
                            entries.append(v)
                    else:
                        entries.append(val)
        entries.sort()
        for e in entries:
            md5.update(e)
        print(md5.hexdigest())
        sys.stdout.flush()
        while True:
            signal.pause()
    elif command == 'default':
        # The ingress network is always checked; for any other network,
        # check that network first and then ingress as well.
        if sys.argv[1] == "ingress":
            check_network("ingress", ingress=True)
        else:
            check_network(sys.argv[1])
            check_network("ingress", ingress=True)