From e07d3cd9acf14219f33e12375fb8c2e3fe02ad0c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 2 Apr 2015 16:47:14 -0400 Subject: [PATCH 1/6] devmapper: Fix libdm logging There are issues with libdm logging. Right now if docker daemon is run in debug mode, logging by libdm is too verbose. And if a device can't be removed, thousands of messages fill the console and one can not see what's going on. This patch removes devicemapper.LogInitVerbose() call as that call will only work if docker was not registering its own log handler with libdm. For some reason docker registers one with libdm and libdm hands over all the messages to docker (including debug ones). And now it is up to devmapper backend to figure out which ones should go to console and which ones should not. So by default log only fatal messages from libdm. One can easily modify the code to change it for debugging purposes. Signed-off-by: Vivek Goyal --- daemon/graphdriver/devmapper/deviceset.go | 27 ++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/daemon/graphdriver/devmapper/deviceset.go b/daemon/graphdriver/devmapper/deviceset.go index 4d35adabc0..5515df9960 100644 --- a/daemon/graphdriver/devmapper/deviceset.go +++ b/daemon/graphdriver/devmapper/deviceset.go @@ -33,6 +33,10 @@ var ( DefaultThinpBlockSize uint32 = 128 // 64K = 128 512b sectors MaxDeviceId int = 0xffffff // 24 bit, pool limit DeviceIdMapSz int = (MaxDeviceId + 1) / 8 + // We retry device removal so many a times that even error messages + // will fill up console during normal operation. So only log Fatal + // messages by default. + DMLogLevel int = devicemapper.LogLevelFatal ) const deviceSetMetaFile string = "deviceset-metadata" @@ -723,14 +727,22 @@ func setCloseOnExec(name string) { } func (devices *DeviceSet) DMLog(level int, file string, line int, dmError int, message string) { - if level >= devicemapper.LogLevelDebug { - // (vbatts) libdm debug is very verbose. If you're debugging libdm, you can - // comment out this check yourself - level = devicemapper.LogLevelInfo + // By default libdm sends us all the messages including debug ones. + // We need to filter out messages here and figure out which one + // should be printed. + if level > DMLogLevel { + return } // FIXME(vbatts) push this back into ./pkg/devicemapper/ - logrus.Debugf("libdevmapper(%d): %s:%d (%d) %s", level, file, line, dmError, message) + if level <= devicemapper.LogLevelErr { + logrus.Errorf("libdevmapper(%d): %s:%d (%d) %s", level, file, line, dmError, message) + } else if level <= devicemapper.LogLevelInfo { + logrus.Infof("libdevmapper(%d): %s:%d (%d) %s", level, file, line, dmError, message) + } else { + // FIXME(vbatts) push this back into ./pkg/devicemapper/ + logrus.Debugf("libdevmapper(%d): %s:%d (%d) %s", level, file, line, dmError, message) + } } func major(device uint64) uint64 { @@ -947,11 +959,6 @@ func (devices *DeviceSet) closeTransaction() error { } func (devices *DeviceSet) initDevmapper(doInit bool) error { - if os.Getenv("DEBUG") != "" { - devicemapper.LogInitVerbose(devicemapper.LogLevelDebug) - } else { - devicemapper.LogInitVerbose(devicemapper.LogLevelWarn) - } // give ourselves to libdm as a log handler devicemapper.LogInit(devices) From cb7c893275c32ddfa775c3f22869a9c211024c71 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 2 Apr 2015 16:47:14 -0400 Subject: [PATCH 2/6] devicemapper: Remove debug messages from RemoveDevice() devmapper graph driver retries device removal 1000 times in case of failure and if this fills up console with 1000 messages (when daemon is running in debug mode). So remove these debug messages. Signed-off-by: Vivek Goyal --- daemon/graphdriver/devmapper/deviceset.go | 3 +++ pkg/devicemapper/devmapper.go | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/daemon/graphdriver/devmapper/deviceset.go b/daemon/graphdriver/devmapper/deviceset.go index 5515df9960..5923811d26 100644 --- a/daemon/graphdriver/devmapper/deviceset.go +++ b/daemon/graphdriver/devmapper/deviceset.go @@ -1249,6 +1249,9 @@ func (devices *DeviceSet) deactivateDevice(info *DevInfo) error { func (devices *DeviceSet) removeDeviceAndWait(devname string) error { var err error + logrus.Debugf("[devmapper] removeDeviceAndWait START(%s)", devname) + defer logrus.Debugf("[devmapper] removeDeviceAndWait END(%s)", devname) + for i := 0; i < 1000; i++ { err = devicemapper.RemoveDevice(devname) if err == nil { diff --git a/pkg/devicemapper/devmapper.go b/pkg/devicemapper/devmapper.go index e8341692a4..f75031996d 100644 --- a/pkg/devicemapper/devmapper.go +++ b/pkg/devicemapper/devmapper.go @@ -349,8 +349,6 @@ func CookieSupported() bool { // Useful helper for cleanup func RemoveDevice(name string) error { - logrus.Debugf("[devmapper] RemoveDevice START(%s)", name) - defer logrus.Debugf("[devmapper] RemoveDevice END(%s)", name) task, err := TaskCreateNamed(DeviceRemove, name) if task == nil { return err From 665656afbb8932a11c69c0cd79e21a768aa46d38 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 2 Apr 2015 16:47:14 -0400 Subject: [PATCH 3/6] devmapper: Use a pointer as argument to deferred function UdevWait() UdevWait() is deferred and takes uint cookie as an argument. As arguments to deferred functions are calculated at the time of call, it is possible that any update to cookie later by libdm are not taken into account when UdevWait() is called. Hence use a pointer to uint as argument to UdevWait() function. Signed-off-by: Vivek Goyal --- pkg/devicemapper/devmapper.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/devicemapper/devmapper.go b/pkg/devicemapper/devmapper.go index f75031996d..bb89f7fac2 100644 --- a/pkg/devicemapper/devmapper.go +++ b/pkg/devicemapper/devmapper.go @@ -284,9 +284,9 @@ func FindLoopDeviceFor(file *os.File) *os.File { return nil } -func UdevWait(cookie uint) error { - if res := DmUdevWait(cookie); res != 1 { - logrus.Debugf("Failed to wait on udev cookie %d", cookie) +func UdevWait(cookie *uint) error { + if res := DmUdevWait(*cookie); res != 1 { + logrus.Debugf("Failed to wait on udev cookie %d", *cookie) return ErrUdevWait } return nil @@ -358,7 +358,7 @@ func RemoveDevice(name string) error { if err := task.SetCookie(&cookie, 0); err != nil { return fmt.Errorf("Can not set cookie: %s", err) } - defer UdevWait(cookie) + defer UdevWait(&cookie) dmSawBusy = false // reset before the task is run if err = task.Run(); err != nil { @@ -425,7 +425,7 @@ func CreatePool(poolName string, dataFile, metadataFile *os.File, poolBlockSize if err := task.SetCookie(&cookie, flags); err != nil { return fmt.Errorf("Can't set cookie %s", err) } - defer UdevWait(cookie) + defer UdevWait(&cookie) if err := task.Run(); err != nil { return fmt.Errorf("Error running DeviceCreate (CreatePool) %s", err) @@ -556,7 +556,7 @@ func ResumeDevice(name string) error { if err := task.SetCookie(&cookie, 0); err != nil { return fmt.Errorf("Can't set cookie %s", err) } - defer UdevWait(cookie) + defer UdevWait(&cookie) if err := task.Run(); err != nil { return fmt.Errorf("Error running DeviceResume %s", err) @@ -632,7 +632,7 @@ func ActivateDevice(poolName string, name string, deviceId int, size uint64) err return fmt.Errorf("Can't set cookie %s", err) } - defer UdevWait(cookie) + defer UdevWait(&cookie) if err := task.Run(); err != nil { return fmt.Errorf("Error running DeviceCreate (ActivateDevice) %s", err) From dbf04ec4e2a6b4fe73f7f300918a906c0ff1a37b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 2 Apr 2015 16:47:14 -0400 Subject: [PATCH 4/6] devmapper: Remove extra wait after device removal Currently in device removal path (device deactivation), we wait for 10 seconds for devive to actually go away. waitRemove(). In current code this is not required. If dm removal task has completed and one has done the wait on udev cookie, then device is gone and there is no need to write another loop to wait for device removal. This patch removes the waitRemove() which waits for 10 seconds after device removal. This seems unnecessary. Signed-off-by: Vivek Goyal --- daemon/graphdriver/devmapper/deviceset.go | 52 +++-------------------- 1 file changed, 7 insertions(+), 45 deletions(-) diff --git a/daemon/graphdriver/devmapper/deviceset.go b/daemon/graphdriver/devmapper/deviceset.go index 5923811d26..221c87eb4d 100644 --- a/daemon/graphdriver/devmapper/deviceset.go +++ b/daemon/graphdriver/devmapper/deviceset.go @@ -1157,7 +1157,7 @@ func (devices *DeviceSet) deleteDevice(info *DevInfo) error { devinfo, _ := devicemapper.GetInfo(info.Name()) if devinfo != nil && devinfo.Exists != 0 { - if err := devices.removeDeviceAndWait(info.Name()); err != nil { + if err := devices.removeDevice(info.Name()); err != nil { logrus.Debugf("Error removing device: %s", err) return err } @@ -1236,7 +1236,7 @@ func (devices *DeviceSet) deactivateDevice(info *DevInfo) error { return err } if devinfo.Exists != 0 { - if err := devices.removeDeviceAndWait(info.Name()); err != nil { + if err := devices.removeDevice(info.Name()); err != nil { return err } } @@ -1244,13 +1244,12 @@ func (devices *DeviceSet) deactivateDevice(info *DevInfo) error { return nil } -// Issues the underlying dm remove operation and then waits -// for it to finish. -func (devices *DeviceSet) removeDeviceAndWait(devname string) error { +// Issues the underlying dm remove operation. +func (devices *DeviceSet) removeDevice(devname string) error { var err error - logrus.Debugf("[devmapper] removeDeviceAndWait START(%s)", devname) - defer logrus.Debugf("[devmapper] removeDeviceAndWait END(%s)", devname) + logrus.Debugf("[devmapper] removeDevice START(%s)", devname) + defer logrus.Debugf("[devmapper] removeDevice END(%s)", devname) for i := 0; i < 1000; i++ { err = devicemapper.RemoveDevice(devname) @@ -1267,45 +1266,8 @@ func (devices *DeviceSet) removeDeviceAndWait(devname string) error { time.Sleep(10 * time.Millisecond) devices.Lock() } - if err != nil { - return err - } - if err := devices.waitRemove(devname); err != nil { - return err - } - return nil -} - -// waitRemove blocks until either: -// a) the device registered at - is removed, -// or b) the 10 second timeout expires. -func (devices *DeviceSet) waitRemove(devname string) error { - logrus.Debugf("[deviceset %s] waitRemove(%s)", devices.devicePrefix, devname) - defer logrus.Debugf("[deviceset %s] waitRemove(%s) END", devices.devicePrefix, devname) - i := 0 - for ; i < 1000; i++ { - devinfo, err := devicemapper.GetInfo(devname) - if err != nil { - // If there is an error we assume the device doesn't exist. - // The error might actually be something else, but we can't differentiate. - return nil - } - if i%100 == 0 { - logrus.Debugf("Waiting for removal of %s: exists=%d", devname, devinfo.Exists) - } - if devinfo.Exists == 0 { - break - } - - devices.Unlock() - time.Sleep(10 * time.Millisecond) - devices.Lock() - } - if i == 1000 { - return fmt.Errorf("Timeout while waiting for device %s to be removed", devname) - } - return nil + return err } // waitClose blocks until either: From f74d12012c21349b2bd51d9c395a99331ff0a9a5 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 2 Apr 2015 16:47:14 -0400 Subject: [PATCH 5/6] devmapper: Remove call to waitClose() During device removal, we are first waiting for device to close() in a tight loop for 10 seconds. I am not sure why do we need it. First of all we come here once the umount() is successful so device should be free. For some reason of device is temporarily busy, then removeDevice() logic retries device removal logic in a loop for 10 seconds and that should cover it. Can't see why one more 10 seoncds loop is required before attempting device removal. One loop should be able to cover all the temporary device busy conditions and if condition is not temporary then 10 seconds loop is not going to help anyway. So instead of two loops of 10 seconds each, I am converting it to a single loop of 20 seconds. May be 10 second loop is good enough but for now I am keeping it 20 seconds to avoid any regressions. Signed-off-by: Vivek Goyal --- daemon/graphdriver/devmapper/deviceset.go | 34 +---------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/daemon/graphdriver/devmapper/deviceset.go b/daemon/graphdriver/devmapper/deviceset.go index 221c87eb4d..183ad9b6f5 100644 --- a/daemon/graphdriver/devmapper/deviceset.go +++ b/daemon/graphdriver/devmapper/deviceset.go @@ -1225,12 +1225,6 @@ func (devices *DeviceSet) deactivateDevice(info *DevInfo) error { logrus.Debugf("[devmapper] deactivateDevice(%s)", info.Hash) defer logrus.Debugf("[devmapper] deactivateDevice END(%s)", info.Hash) - // Wait for the unmount to be effective, - // by watching the value of Info.OpenCount for the device - if err := devices.waitClose(info); err != nil { - logrus.Errorf("Error waiting for device %s to close: %s", info.Hash, err) - } - devinfo, err := devicemapper.GetInfo(info.Name()) if err != nil { return err @@ -1251,7 +1245,7 @@ func (devices *DeviceSet) removeDevice(devname string) error { logrus.Debugf("[devmapper] removeDevice START(%s)", devname) defer logrus.Debugf("[devmapper] removeDevice END(%s)", devname) - for i := 0; i < 1000; i++ { + for i := 0; i < 2000; i++ { err = devicemapper.RemoveDevice(devname) if err == nil { break @@ -1270,32 +1264,6 @@ func (devices *DeviceSet) removeDevice(devname string) error { return err } -// waitClose blocks until either: -// a) the device registered at - is closed, -// or b) the 10 second timeout expires. -func (devices *DeviceSet) waitClose(info *DevInfo) error { - i := 0 - for ; i < 1000; i++ { - devinfo, err := devicemapper.GetInfo(info.Name()) - if err != nil { - return err - } - if i%100 == 0 { - logrus.Debugf("Waiting for unmount of %s: opencount=%d", info.Hash, devinfo.OpenCount) - } - if devinfo.OpenCount == 0 { - break - } - devices.Unlock() - time.Sleep(10 * time.Millisecond) - devices.Lock() - } - if i == 1000 { - return fmt.Errorf("Timeout while waiting for device %s to close", info.Hash) - } - return nil -} - func (devices *DeviceSet) Shutdown() error { logrus.Debugf("[deviceset %s] Shutdown()", devices.devicePrefix) logrus.Debugf("[devmapper] Shutting down DeviceSet: %s", devices.root) From c737800b7faced4b53854c8cb6766ebe58a3c3e9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 2 Apr 2015 16:47:14 -0400 Subject: [PATCH 6/6] devmapper: Retry device removal after 100ms instead of 10ms Right now we try device removal at the interval of 10ms and keep on trying till either device is removed or 10 seconds are over. That means if device is busy, we will try 1000 times in those 10 seconds. Sounds too high a frequency of deivce removal retrial. All the logs are filled easily. I think it is a good idea to slow down a bit and retry at the interval of 100ms instead of 10ms. Signed-off-by: Vivek Goyal --- daemon/graphdriver/devmapper/deviceset.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/daemon/graphdriver/devmapper/deviceset.go b/daemon/graphdriver/devmapper/deviceset.go index 183ad9b6f5..a5ad0e676c 100644 --- a/daemon/graphdriver/devmapper/deviceset.go +++ b/daemon/graphdriver/devmapper/deviceset.go @@ -1245,7 +1245,7 @@ func (devices *DeviceSet) removeDevice(devname string) error { logrus.Debugf("[devmapper] removeDevice START(%s)", devname) defer logrus.Debugf("[devmapper] removeDevice END(%s)", devname) - for i := 0; i < 2000; i++ { + for i := 0; i < 200; i++ { err = devicemapper.RemoveDevice(devname) if err == nil { break @@ -1257,7 +1257,7 @@ func (devices *DeviceSet) removeDevice(devname string) error { // If we see EBUSY it may be a transient error, // sleep a bit a retry a few times. devices.Unlock() - time.Sleep(10 * time.Millisecond) + time.Sleep(100 * time.Millisecond) devices.Lock() }