From 7d1414ec3eead44c9662f450aaf9da86c5ac3cb5 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Wed, 17 Apr 2019 19:29:32 -0700
Subject: [PATCH 1/7] aufs: remove extra locking

Both mount and unmount calls are already protected by fine-grained
(per id) locks in Get()/Put() introduced in commit fc1cf1911bb
("Add more locking to storage drivers"), so there's no point in
having a global lock in mount/unmount.

The only place from which unmount is called without any locking
is Cleanup() -- this is to be addressed in the next patch.

This reverts commit 824c24e6802ad3ed7e26b4f16e5ae81869b98185.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit f93750b2c4d5f6144f0790ffa89291da3c097b80)
---
 daemon/graphdriver/aufs/aufs.go | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/daemon/graphdriver/aufs/aufs.go b/daemon/graphdriver/aufs/aufs.go
index 11ca939904..6c929fff35 100644
--- a/daemon/graphdriver/aufs/aufs.go
+++ b/daemon/graphdriver/aufs/aufs.go
@@ -72,7 +72,6 @@ func init() {
 
 // Driver contains information about the filesystem mounted.
 type Driver struct {
-	sync.Mutex
 	root          string
 	uidMaps       []idtools.IDMap
 	gidMaps       []idtools.IDMap
@@ -547,9 +546,6 @@ func (a *Driver) getParentLayerPaths(id string) ([]string, error) {
 }
 
 func (a *Driver) mount(id string, target string, mountLabel string, layers []string) error {
-	a.Lock()
-	defer a.Unlock()
-
 	// If the id is mounted or we get an error return
 	if mounted, err := a.mounted(target); err != nil || mounted {
 		return err
@@ -564,9 +560,6 @@ func (a *Driver) mount(id string, target string, mountLabel string, layers []str
 }
 
 func (a *Driver) unmount(mountPath string) error {
-	a.Lock()
-	defer a.Unlock()
-
 	if mounted, err := a.mounted(mountPath); err != nil || !mounted {
 		return err
 	}

From d1eae8959057eadc9fffe84b59e8ebc20c370e5d Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Wed, 17 Apr 2019 19:52:36 -0700
Subject: [PATCH 2/7] aufs: use mount.Unmount

1. Use mount.Unmount() which ignores EINVAL ("not mounted") error,
and provides better error diagnostics (so we don't have to explicitly
add target to error messages).

2. Since we're ignoring "not mounted" error, we can call
multiple unmounts without any locking -- but since "auplink flush"
is still involved and can produce an error in logs, let's keep
the check for fs being mounted (it's just a statfs so should be fast).

2. While at it, improve the "can't unmount" error message in Put().

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 4beee98026feabe4f4f0468215b8fd9b56f90d5e)
---
 daemon/graphdriver/aufs/aufs.go  | 8 ++++----
 daemon/graphdriver/aufs/mount.go | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/daemon/graphdriver/aufs/aufs.go b/daemon/graphdriver/aufs/aufs.go
index 6c929fff35..38df774225 100644
--- a/daemon/graphdriver/aufs/aufs.go
+++ b/daemon/graphdriver/aufs/aufs.go
@@ -326,11 +326,11 @@ func (a *Driver) Remove(id string) error {
 			break
 		}
 
-		if err != unix.EBUSY {
-			return errors.Wrapf(err, "aufs: unmount error: %s", mountpoint)
+		if errors.Cause(err) != unix.EBUSY {
+			return errors.Wrap(err, "aufs: unmount error")
 		}
 		if retries >= 5 {
-			return errors.Wrapf(err, "aufs: unmount error after retries: %s", mountpoint)
+			return errors.Wrap(err, "aufs: unmount error after retries")
 		}
 		// If unmount returns EBUSY, it could be a transient error. Sleep and retry.
 		retries++
@@ -436,7 +436,7 @@ func (a *Driver) Put(id string) error {
 
 	err := a.unmount(m)
 	if err != nil {
-		logger.Debugf("Failed to unmount %s aufs: %v", id, err)
+		logger.WithError(err).WithField("method", "Put()").Warn()
 	}
 	return err
 }
diff --git a/daemon/graphdriver/aufs/mount.go b/daemon/graphdriver/aufs/mount.go
index 9f5510380c..4e85cf235d 100644
--- a/daemon/graphdriver/aufs/mount.go
+++ b/daemon/graphdriver/aufs/mount.go
@@ -5,7 +5,7 @@ package aufs // import "github.com/docker/docker/daemon/graphdriver/aufs"
 import (
 	"os/exec"
 
-	"golang.org/x/sys/unix"
+	"github.com/docker/docker/pkg/mount"
 )
 
 // Unmount the target specified.
@@ -13,5 +13,5 @@ func Unmount(target string) error {
 	if err := exec.Command("auplink", target, "flush").Run(); err != nil {
 		logger.WithError(err).Warnf("Couldn't run auplink before unmount %s", target)
 	}
-	return unix.Unmount(target, 0)
+	return mount.Unmount(target)
 }

From 76d936ae761c56372b0a37dbc3fab2e87c6bc91c Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 18 Apr 2019 15:55:55 -0700
Subject: [PATCH 3/7] aufs: aufsMount: better errors for unix.Mount()

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 5873768dbe3be2733874b8cf68cb492817f81a94)
---
 daemon/graphdriver/aufs/aufs.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/daemon/graphdriver/aufs/aufs.go b/daemon/graphdriver/aufs/aufs.go
index 38df774225..5a95ef42e0 100644
--- a/daemon/graphdriver/aufs/aufs.go
+++ b/daemon/graphdriver/aufs/aufs.go
@@ -626,6 +626,7 @@ func (a *Driver) aufsMount(ro []string, rw, target, mountLabel string) (err erro
 	}
 	data := label.FormatMountLabel(fmt.Sprintf("%s,%s", string(b[:bp]), opts), mountLabel)
 	if err = unix.Mount("none", target, "aufs", 0, data); err != nil {
+		err = errors.Wrap(err, "mount target="+target+" data="+data)
 		return
 	}
 
@@ -633,6 +634,7 @@ func (a *Driver) aufsMount(ro []string, rw, target, mountLabel string) (err erro
 		layer := fmt.Sprintf(":%s=ro+wh", ro[index])
 		data := label.FormatMountLabel(fmt.Sprintf("append%s", layer), mountLabel)
 		if err = unix.Mount("none", target, "aufs", unix.MS_REMOUNT, data); err != nil {
+			err = errors.Wrap(err, "mount target="+target+" flags=MS_REMOUNT data="+data)
 			return
 		}
 	}

From 65ba452bb00c6f19c962c998587eda5aadb66640 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 22 Apr 2019 18:34:24 +0000
Subject: [PATCH 4/7] aufs: add lock around mount

Apparently there is some kind of race in aufs kernel module code,
which leads to the errors like:

[98221.158606] aufs au_xino_create2:186:dockerd[25801]: aufs.xino create err -17
[98221.162128] aufs au_xino_set:1229:dockerd[25801]: I/O Error, failed creating xino(-17).
[98362.239085] aufs au_xino_create2:186:dockerd[6348]: aufs.xino create err -17
[98362.243860] aufs au_xino_set:1229:dockerd[6348]: I/O Error, failed creating xino(-17).
[98373.775380] aufs au_xino_create:767:dockerd[27435]: open /dev/shm/aufs.xino(-17)
[98389.015640] aufs au_xino_create2:186:dockerd[26753]: aufs.xino create err -17
[98389.018776] aufs au_xino_set:1229:dockerd[26753]: I/O Error, failed creating xino(-17).
[98424.117584] aufs au_xino_create:767:dockerd[27105]: open /dev/shm/aufs.xino(-17)

So, we have to have a lock around mount syscall.

While at it, don't call the whole Unmount() on an error path, as
it leads to bogus error from auplink flush.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 5cd62852fa199f272b542d828c8c5d1db427ea53)
---
 daemon/graphdriver/aufs/aufs.go | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/daemon/graphdriver/aufs/aufs.go b/daemon/graphdriver/aufs/aufs.go
index 5a95ef42e0..fd2cf58063 100644
--- a/daemon/graphdriver/aufs/aufs.go
+++ b/daemon/graphdriver/aufs/aufs.go
@@ -80,6 +80,7 @@ type Driver struct {
 	pathCache     map[string]string
 	naiveDiff     graphdriver.DiffDriver
 	locker        *locker.Locker
+	mntL          sync.Mutex
 }
 
 // Init returns a new AUFS driver.
@@ -597,7 +598,7 @@ func (a *Driver) Cleanup() error {
 func (a *Driver) aufsMount(ro []string, rw, target, mountLabel string) (err error) {
 	defer func() {
 		if err != nil {
-			Unmount(target)
+			mount.Unmount(target)
 		}
 	}()
 
@@ -625,7 +626,10 @@ func (a *Driver) aufsMount(ro []string, rw, target, mountLabel string) (err erro
 		opts += ",dirperm1"
 	}
 	data := label.FormatMountLabel(fmt.Sprintf("%s,%s", string(b[:bp]), opts), mountLabel)
-	if err = unix.Mount("none", target, "aufs", 0, data); err != nil {
+	a.mntL.Lock()
+	err = unix.Mount("none", target, "aufs", 0, data)
+	a.mntL.Unlock()
+	if err != nil {
 		err = errors.Wrap(err, "mount target="+target+" data="+data)
 		return
 	}
@@ -633,7 +637,10 @@ func (a *Driver) aufsMount(ro []string, rw, target, mountLabel string) (err erro
 	for ; index < len(ro); index++ {
 		layer := fmt.Sprintf(":%s=ro+wh", ro[index])
 		data := label.FormatMountLabel(fmt.Sprintf("append%s", layer), mountLabel)
-		if err = unix.Mount("none", target, "aufs", unix.MS_REMOUNT, data); err != nil {
+		a.mntL.Lock()
+		err = unix.Mount("none", target, "aufs", unix.MS_REMOUNT, data)
+		a.mntL.Unlock()
+		if err != nil {
 			err = errors.Wrap(err, "mount target="+target+" flags=MS_REMOUNT data="+data)
 			return
 		}

From f0f7020b5d11b3fc99e6df64efdf96e7019eb5ae Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Tue, 23 Apr 2019 02:18:17 +0000
Subject: [PATCH 5/7] aufs: optimize lots of layers case

In case there are a big number of layers, so that mount data won't fit
into a single memory page (4096 bytes on most platforms, which is good
enough for about 40 layers, depending on how long graphdriver root path
is), we supply additional layers with O_REMOUNT, as described in aufs
documentation.

Problem is, the current implementation does that one layer at a time
(i.e. there is one mount syscall per each additional layer).

Optimize the code to supply as many layers as we can fit in one page
(basically reusing the same code as for the original mount).

Note, per aufs docs, "[a]t remount-time, the options are interpreted
in the given order, e.g. left to right" so we should be good.

Tested on an image with ~100 layers.

Before (35 syscalls):
> [pid 22756] 1556919088.686955 mount("none", "/mnt/volume_sfo2_09/docker-aufs/aufs/mnt/a86f8c9dd0ec2486293119c20b0ec026e19bbc4d51332c554f7cf05d777c9866", "aufs", 0, "br:/mnt/volume_sfo2_09/docker-au"...) = 0 <0.000504>
> [pid 22756] 1556919088.687643 mount("none", "/mnt/volume_sfo2_09/docker-aufs/aufs/mnt/a86f8c9dd0ec2486293119c20b0ec026e19bbc4d51332c554f7cf05d777c9866", 0xc000c451b0, MS_REMOUNT, "append:/mnt/volume_sfo2_09/docke"...) = 0 <0.000105>
> [pid 22756] 1556919088.687851 mount("none", "/mnt/volume_sfo2_09/docker-aufs/aufs/mnt/a86f8c9dd0ec2486293119c20b0ec026e19bbc4d51332c554f7cf05d777c9866", 0xc000c451ba, MS_REMOUNT, "append:/mnt/volume_sfo2_09/docke"...) = 0 <0.000098>
> ..... (~30 lines skipped for clarity)
> [pid 22756] 1556919088.696182 mount("none", "/mnt/volume_sfo2_09/docker-aufs/aufs/mnt/a86f8c9dd0ec2486293119c20b0ec026e19bbc4d51332c554f7cf05d777c9866", 0xc000c45310, MS_REMOUNT, "append:/mnt/volume_sfo2_09/docke"...) = 0 <0.000266>

After (2 syscalls):
> [pid 24352] 1556919361.799889 mount("none", "/mnt/volume_sfo2_09/docker-aufs/aufs/mnt/8e7ba189e347a834e99eea4ed568f95b86cec809c227516afdc7c70286ff9a20", "aufs", 0, "br:/mnt/volume_sfo2_09/docker-au"...) = 0 <0.001717>
> [pid 24352] 1556919361.801761 mount("none", "/mnt/volume_sfo2_09/docker-aufs/aufs/mnt/8e7ba189e347a834e99eea4ed568f95b86cec809c227516afdc7c70286ff9a20", 0xc000dbecb0, MS_REMOUNT, "append:/mnt/volume_sfo2_09/docke"...) = 0 <0.001358>

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit d58c434bffef76e48bff75ede290937874488dd3)
---
 daemon/graphdriver/aufs/aufs.go | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/daemon/graphdriver/aufs/aufs.go b/daemon/graphdriver/aufs/aufs.go
index fd2cf58063..9a0a6207f0 100644
--- a/daemon/graphdriver/aufs/aufs.go
+++ b/daemon/graphdriver/aufs/aufs.go
@@ -634,9 +634,16 @@ func (a *Driver) aufsMount(ro []string, rw, target, mountLabel string) (err erro
 		return
 	}
 
-	for ; index < len(ro); index++ {
-		layer := fmt.Sprintf(":%s=ro+wh", ro[index])
-		data := label.FormatMountLabel(fmt.Sprintf("append%s", layer), mountLabel)
+	for index < len(ro) {
+		bp = 0
+		for ; index < len(ro); index++ {
+			layer := fmt.Sprintf("append:%s=ro+wh,", ro[index])
+			if bp+len(layer) > len(b) {
+				break
+			}
+			bp += copy(b[bp:], layer)
+		}
+		data := label.FormatMountLabel(string(b[:bp]), mountLabel)
 		a.mntL.Lock()
 		err = unix.Mount("none", target, "aufs", unix.MS_REMOUNT, data)
 		a.mntL.Unlock()

From cad766f6c7ff4f75c45d82ee70d87f190f8cfd19 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Wed, 17 Apr 2019 19:56:48 -0700
Subject: [PATCH 6/7] aufs.Cleanup: optimize

Do not use filepath.Walk() as there's no requirement to recursively
go into every directory under mnt -- a (non-recursive) list of
directories in mnt is sufficient.

With filepath.Walk(), in case some container will fail to unmount,
it'll go through the whole container filesystem which is both
excessive and useless.

This is similar to commit f1a459229724f5e8e440b49f ("devmapper.shutdown:
optimize")

While at it, raise the priority of "unmount error" message from debug
to a warning. Note we don't have to explicitly add `m` as unmount error (from
pkg/mount) will have it.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 8fda12c6078ed5c86be0822a7a980c6fbc9220bf)
---
 daemon/graphdriver/aufs/aufs.go | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/daemon/graphdriver/aufs/aufs.go b/daemon/graphdriver/aufs/aufs.go
index 9a0a6207f0..bbd19a82b0 100644
--- a/daemon/graphdriver/aufs/aufs.go
+++ b/daemon/graphdriver/aufs/aufs.go
@@ -573,23 +573,20 @@ func (a *Driver) mounted(mountpoint string) (bool, error) {
 
 // Cleanup aufs and unmount all mountpoints
 func (a *Driver) Cleanup() error {
-	var dirs []string
-	if err := filepath.Walk(a.mntPath(), func(path string, info os.FileInfo, err error) error {
-		if err != nil {
-			return err
-		}
-		if !info.IsDir() {
-			return nil
-		}
-		dirs = append(dirs, path)
-		return nil
-	}); err != nil {
-		return err
+	dir := a.mntPath()
+	files, err := ioutil.ReadDir(dir)
+	if err != nil {
+		return errors.Wrap(err, "aufs readdir error")
 	}
+	for _, f := range files {
+		if !f.IsDir() {
+			continue
+		}
+
+		m := path.Join(dir, f.Name())
 
-	for _, m := range dirs {
 		if err := a.unmount(m); err != nil {
-			logger.Debugf("error unmounting %s: %s", m, err)
+			logger.WithError(err).WithField("method", "Cleanup()").Warn()
 		}
 	}
 	return mount.RecursiveUnmount(a.root)

From a81278befee73b0c6a5ceb26dafe081577ce6e5c Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Fri, 10 May 2019 17:19:49 -0700
Subject: [PATCH 7/7] aufs: retry auplink flush

Running a bundled aufs benchmark sometimes results in this warning:

> WARN[0001] Couldn't run auplink before unmount /tmp/aufs-tests/aufs/mnt/XXXXX  error="exit status 22" storage-driver=aufs

If we take a look at what aulink utility produces on stderr, we'll see:

> auplink:proc_mnt.c:96: /tmp/aufs-tests/aufs/mnt/XXXXX: Invalid argument

and auplink exits with exit code of 22 (EINVAL).

Looking into auplink source code, what happens is it tries to find a
record in /proc/self/mounts corresponding to the mount point (by using
setmntent()/getmntent_r() glibc functions), and it fails.

Some manual testing, as well as runtime testing with lots of printf
added on mount/unmount, as well as calls to check the superblock fs
magic on mount point (as in graphdriver.Mounted(graphdriver.FsMagicAufs, target)
confirmed that this record is in fact there, but sometimes auplink
can't find it. I was also able to reproduce the same error (inability
to find a mount in /proc/self/mounts that should definitely be there)
using a small C program, mocking what `auplink` does:

```c
 #include <stdio.h>
 #include <err.h>
 #include <mntent.h>
 #include <string.h>
 #include <stdlib.h>

int main(int argc, char **argv)
{
	FILE *fp;
	struct mntent m, *p;
	char a[4096];
	char buf[4096 + 1024];
	int found =0, lines = 0;

	if (argc != 2) {
		fprintf(stderr, "Usage: %s <mountpoint>\n", argv[0]);
		exit(1);
	}

	fp = setmntent("/proc/self/mounts", "r");
	if (!fp) {
		err(1, "setmntent");
	}
	setvbuf(fp, a, _IOLBF, sizeof(a));
	while ((p = getmntent_r(fp, &m, buf, sizeof(buf)))) {
		lines++;
		if (!strcmp(p->mnt_dir, argv[1])) {
			found++;
		}
	}
	printf("found %d entries for %s (%d lines seen)\n", found, argv[1], lines);
	return !found;
}
```

I have also wrote a few other C proggies -- one that reads
/proc/self/mounts directly, one that reads /proc/self/mountinfo instead.
They are also prone to the same occasional error.

It is not perfectly clear why this happens, but so far my best theory
is when a lot of mounts/unmounts happen in parallel with reading
contents of /proc/self/mounts, sometimes the kernel fails to provide
continuity (i.e. it skips some part of file or mixes it up in some
other way). In other words, this is a kernel bug (which is probably
hard to fix unless some other interface to get a mount entry is added).

Now, there is no real fix, and a workaround I was able to come up
with is to retry when we got EINVAL. It usually works on the second
attempt, although I've once seen it took two attempts to go through.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit ae431b10a9508e2bf3b1782e9d435855e3e13977)
---
 daemon/graphdriver/aufs/mount.go | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/daemon/graphdriver/aufs/mount.go b/daemon/graphdriver/aufs/mount.go
index 4e85cf235d..fc20a5eca6 100644
--- a/daemon/graphdriver/aufs/mount.go
+++ b/daemon/graphdriver/aufs/mount.go
@@ -4,14 +4,38 @@ package aufs // import "github.com/docker/docker/daemon/graphdriver/aufs"
 
 import (
 	"os/exec"
+	"syscall"
 
 	"github.com/docker/docker/pkg/mount"
 )
 
 // Unmount the target specified.
 func Unmount(target string) error {
-	if err := exec.Command("auplink", target, "flush").Run(); err != nil {
-		logger.WithError(err).Warnf("Couldn't run auplink before unmount %s", target)
+	const (
+		EINVAL  = 22 // if auplink returns this,
+		retries = 3  // retry a few times
+	)
+
+	for i := 0; ; i++ {
+		out, err := exec.Command("auplink", target, "flush").CombinedOutput()
+		if err == nil {
+			break
+		}
+		rc := 0
+		if exiterr, ok := err.(*exec.ExitError); ok {
+			if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
+				rc = status.ExitStatus()
+			}
+		}
+		if i >= retries || rc != EINVAL {
+			logger.WithError(err).WithField("method", "Unmount").Warnf("auplink flush failed: %s", out)
+			break
+		}
+		// auplink failed to find target in /proc/self/mounts because
+		// kernel can't guarantee continuity while reading from it
+		// while mounts table is being changed
+		logger.Debugf("auplink flush error (retrying %d/%d): %s", i+1, retries, out)
 	}
+
 	return mount.Unmount(target)
 }