Merge pull request #2071 from fcrisciani/ssd

Import the ssd tool in libnetwork
Flavio Crisciani 2018-02-10 09:52:40 -08:00 committed by GitHub
commit 92888febdb
3 changed files with 261 additions and 0 deletions

libnetwork/cmd/ssd/Dockerfile Executable file

@@ -0,0 +1,34 @@
FROM alpine:3.7
ENV PACKAGES="\
musl \
linux-headers \
build-base \
util-linux \
bash \
git \
ca-certificates \
python2 \
python2-dev \
py-setuptools \
iproute2 \
curl \
strace \
drill \
ipvsadm \
iperf \
ethtool \
"
RUN echo \
&& apk add --no-cache $PACKAGES \
&& if [[ ! -e /usr/bin/python ]]; then ln -sf /usr/bin/python2.7 /usr/bin/python; fi \
&& if [[ ! -e /usr/bin/python-config ]]; then ln -sf /usr/bin/python2.7-config /usr/bin/python-config; fi \
&& if [[ ! -e /usr/bin/easy_install ]]; then ln -sf /usr/bin/easy_install-2.7 /usr/bin/easy_install; fi \
&& easy_install pip \
&& pip install --upgrade pip \
&& if [[ ! -e /usr/bin/pip ]]; then ln -sf /usr/bin/pip2.7 /usr/bin/pip; fi \
&& echo
ADD ssd.py /
RUN pip install git+git://github.com/docker/docker-py.git
ENTRYPOINT [ "python", "/ssd.py"]

libnetwork/cmd/ssd/README.md Executable file

@@ -0,0 +1,47 @@
# Docker Swarm Service Driller (ssd)
ssd is a troubleshooting utility for Docker swarm networks.
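To build the image locally from this directory and run the default check against a network (the `ssd` image tag below is arbitrary; the published `sanimej/ssd` image used in the examples works the same way):
````bash
# Build the ssd image from libnetwork/cmd/ssd
docker build -t ssd .

# Verify the load balancer programming for the overlay network ov2 on this node
docker run -v /var/run/docker.sock:/var/run/docker.sock \
    -v /var/run/docker/netns:/var/run/docker/netns \
    --privileged --net=host ssd ov2
````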
### control-plane and datapath consistency check on a node
ssd checks for consistency between the docker network control plane (built from the docker daemon's in-memory state) and the kernel data path programming. Currently the tool only checks the consistency of the load balancer (implemented using IPVS).
In a three-node swarm cluster, this is the ssd output for an overlay network `ov2` that has three services running, each replicated to 3 instances:
````bash
vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged --net=host sanimej/ssd ov2
Verifying LB programming for containers on network ov2
Verifying container /s2.3.ltrdwef0iqf90rqauw3ehcs56...
service s2... OK
service s3... OK
service s1... OK
Verifying container /s3.3.nyhwvdvnocb4wftyhb8dr4fj8...
service s2... OK
service s3... OK
service s1... OK
Verifying container /s1.3.wwx5tuxhnvoz5vrb8ohphby0r...
service s2... OK
service s3... OK
service s1... OK
Verifying LB programming for containers on network ingress
Verifying container Ingress...
service web... OK
````
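Under the hood ssd enters each container's network namespace (which is why `/var/run/docker/netns` is mounted and the container runs with `--privileged`) and dumps the IPVS tables programmed there. A manual spot check of the same kernel state looks roughly like the following; the sandbox name is only an example and will differ on your host:
````bash
# Run as root on a swarm node: list the network namespaces docker manages
ls /var/run/docker/netns

# Dump the IPVS tables inside one sandbox (pick a name from the listing above).
# ssd matches the firewall-mark (FWM) entries shown here against the
# LocalLBIndex values reported by the docker daemon.
nsenter --net=/var/run/docker/netns/1-abcdefghij ipvsadm -ln
````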
For services on the ingress network, ssd also checks the iptables programming required to direct an incoming packet addressed to `<host ip>:<published port>` to the right `<backend ip>:<target port>`.
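As a rough illustration of that check (the port number and ingress sandbox IP below are made-up values), the probe ssd runs for each published port is equivalent to:
````bash
# Hypothetical values: published port 8080, ingress sandbox eth1 IP 172.18.0.2.
# Exit status 0 means the DNAT rule exists in the DOCKER-INGRESS chain;
# a non-zero status is what ssd reports as a missing rule.
iptables -t nat -C DOCKER-INGRESS -p tcp --dport 8080 -j DNAT --to 172.18.0.2:8080
````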
### control-plane consistency check across nodes in a cluster
Docker networking uses a gossip protocol to synchronize the networking state across the nodes in a cluster. ssd's `gossip-consistency` command verifies whether the state maintained by all the nodes is consistent.
In a three-node cluster with services running on an overlay network `ov2`, the `gossip-consistency` check shows:
````bash
vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged sanimej/ssd ov2 gossip-consistency
Node id: sjfp0ca8f43rvnab6v7f21gq0 gossip hash c57d89094dbb574a37930393278dc282
Node id: bg228r3q9095grj4wxkqs80oe gossip hash c57d89094dbb574a37930393278dc282
Node id: 6jylcraipcv2pxdricqe77j5q gossip hash c57d89094dbb574a37930393278dc282
````
This is the hash digest of the control-plane state for the network `ov2` from each of the cluster nodes. If the values do not match, running `docker network inspect --verbose` on the individual nodes can help identify the specific difference.
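For example, one way to narrow down a mismatch (the node names below are placeholders) is to capture the verbose state on each node that reported a different hash and compare the dumps:
````bash
# On each node that reported a different hash
docker network inspect --verbose ov2 > /tmp/ov2-$(hostname).json

# Copy the dumps to one machine and compare the per-service and per-task entries
diff /tmp/ov2-node1.json /tmp/ov2-node2.json
````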

libnetwork/cmd/ssd/ssd.py Executable file

@@ -0,0 +1,180 @@
#!/usr/bin/python
import sys, signal, time
import docker
import re
import subprocess
import json
import hashlib
ipv4match = re.compile(
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
)


def check_iptables(name, plist):
    # Strip the ':' and ',' separators and pick out the two port numbers
    # from each published-port entry.
    replace = (':', ',')
    ports = []
    for port in plist:
        for r in replace:
            port = port.replace(r, ' ')
        p = port.split()
        ports.append((p[1], p[3]))

    # get the ingress sandbox's docker_gwbridge network IP.
    # published ports get DNAT'ed to this IP.
    ip = subprocess.check_output(['/usr/bin/nsenter', '--net=/var/run/docker/netns/ingress_sbox', '/bin/bash', '-c', 'ifconfig eth1 | grep \"inet\\ addr\" | cut -d: -f2 | cut -d\" \" -f1'])
    ip = ip.rstrip()

    for p in ports:
        rule = '/sbin/iptables -t nat -C DOCKER-INGRESS -p tcp --dport {0} -j DNAT --to {1}:{2}'.format(p[1], ip, p[1])
        try:
            subprocess.check_output(["/bin/bash", "-c", rule])
        except subprocess.CalledProcessError as e:
            print "Service {0}: host iptables DNAT rule for port {1} -> ingress sandbox {2}:{3} missing".format(name, p[1], ip, p[1])
def get_namespaces(data, ingress=False):
    # The ingress network is load balanced from the hidden ingress sandbox,
    # not from the individual containers.
    if ingress is True:
        return {"Ingress": "/var/run/docker/netns/ingress_sbox"}

    # Map each container name on this network to its sandbox (network namespace) path.
    containers = {}
    for c in data["Containers"]:
        inspect = cli.inspect_container(str(c))
        containers[str(inspect["Name"])] = str(inspect["NetworkSettings"]["SandboxKey"])
    return containers
def check_network(nw_name, ingress=False):
    print "Verifying LB programming for containers on network %s" % nw_name

    data = cli.inspect_network(nw_name, verbose=True)
    services = data["Services"]

    # Map each service name to its load balancer index (the IPVS firewall mark).
    fwmarks = {str(service): str(svalue["LocalLBIndex"]) for service, svalue in services.items()}

    # Collect the backend task IPs the daemon knows about, keyed by LB index.
    stasks = {}
    for service, svalue in services.items():
        if service == "":
            continue
        tasks = []
        for task in svalue["Tasks"]:
            tasks.append(str(task["EndpointIP"]))
        stasks[fwmarks[str(service)]] = tasks

        # for services in ingress network verify the iptables rules
        # that direct ingress (published port) to backend (target port)
        if ingress is True:
            check_iptables(service, svalue["Ports"])

    containers = get_namespaces(data, ingress)
    for container, namespace in containers.items():
        print "Verifying container %s..." % container

        # Dump the IPVS tables inside the container's network namespace and
        # collect the real server IPs, keyed by firewall mark (FWM).
        ipvs = subprocess.check_output(['/usr/bin/nsenter', '--net=%s' % namespace, '/usr/sbin/ipvsadm', '-ln'])

        mark = ""
        realmark = {}
        for line in ipvs.splitlines():
            if "FWM" in line:
                mark = re.findall("[0-9]+", line)[0]
                realmark[str(mark)] = []
            elif "->" in line:
                if mark == "":
                    continue
                ip = ipv4match.search(line)
                if ip is not None:
                    realmark[mark].append(format(ip.group(0)))
            else:
                mark = ""

        # Report LB indexes that exist on only one side.
        for key in realmark.keys():
            if key not in stasks:
                print "LB Index %s" % key, "present in IPVS but missing in docker daemon"
                del realmark[key]

        for key in stasks.keys():
            if key not in realmark:
                print "LB Index %s" % key, "present in docker daemon but missing in IPVS"
                del stasks[key]

        # For the indexes known to both sides, compare the backend task sets.
        for key in realmark:
            service = "--Invalid--"
            for sname, idx in fwmarks.items():
                if key == idx:
                    service = sname
            if len(set(realmark[key])) != len(set(stasks[key])):
                print "Incorrect LB Programming for service %s" % service
                print "control-plane backend tasks:"
                for task in stasks[key]:
                    print task
                print "kernel IPVS backend tasks:"
                for task in realmark[key]:
                    print task
            else:
                print "service %s... OK" % service
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'Usage: ssd.py network-name [gossip-consistency]'
        sys.exit()

    cli = docker.APIClient(base_url='unix://var/run/docker.sock', version='auto')

    if len(sys.argv) == 3:
        command = sys.argv[2]
    else:
        command = 'default'

    if command == 'gossip-consistency':
        # Launch a global service that runs "ssd <network> gossip-hash" on every
        # node and collect the per-node hashes from the service logs.
        cspec = docker.types.ContainerSpec(
            image='sanimej/ssd',
            args=[sys.argv[1], 'gossip-hash'],
            mounts=[docker.types.Mount('/var/run/docker.sock', '/var/run/docker.sock', type='bind')]
        )
        mode = docker.types.ServiceMode(
            mode='global'
        )
        task_template = docker.types.TaskTemplate(cspec)

        cli.create_service(task_template, name='gossip-hash', mode=mode)
        # TODO: change to a deterministic way to check if the service is up.
        time.sleep(5)
        output = cli.service_logs('gossip-hash', stdout=True, stderr=True, details=True)
        for line in output:
            print("Node id: %s gossip hash %s" % (line[line.find("=")+1:line.find(",")], line[line.find(" ")+1:]))
        if cli.remove_service('gossip-hash') is not True:
            print("Deleting gossip-hash service failed")
    elif command == 'gossip-hash':
        # Hash the network's control-plane state (services, VIPs and tasks) so
        # the digests can be compared across nodes.
        data = cli.inspect_network(sys.argv[1], verbose=True)
        services = data["Services"]
        md5 = hashlib.md5()
        entries = []
        for service, value in services.items():
            entries.append(service)
            entries.append(value["VIP"])
            for task in value["Tasks"]:
                for key, val in task.items():
                    if isinstance(val, dict):
                        for k, v in val.items():
                            entries.append(v)
                    else:
                        entries.append(val)

        entries.sort()
        for e in entries:
            md5.update(e)

        print(md5.hexdigest())
        sys.stdout.flush()
        # Keep the task alive so the global service stays up until it is removed.
        while True:
            signal.pause()
    elif command == 'default':
        if sys.argv[1] == "ingress":
            check_network("ingress", ingress=True)
        else:
            check_network(sys.argv[1])
            check_network("ingress", ingress=True)