diff --git a/components/engine/daemon/graphdriver/devmapper/deviceset.go b/components/engine/daemon/graphdriver/devmapper/deviceset.go index 1484648b95..499221ed80 100644 --- a/components/engine/daemon/graphdriver/devmapper/deviceset.go +++ b/components/engine/daemon/graphdriver/devmapper/deviceset.go @@ -40,6 +40,7 @@ var ( logLevel = devicemapper.LogLevelFatal driverDeferredRemovalSupport = false enableDeferredRemoval = false + enableDeferredDeletion = false ) const deviceSetMetaFile string = "deviceset-metadata" @@ -57,6 +58,7 @@ type devInfo struct { Size uint64 `json:"size"` TransactionID uint64 `json:"transaction_id"` Initialized bool `json:"initialized"` + Deleted bool `json:"deleted"` devices *DeviceSet mountCount int @@ -106,7 +108,10 @@ type DeviceSet struct { transaction `json:"-"` overrideUdevSyncCheck bool deferredRemove bool // use deferred removal + deferredDelete bool // use deferred deletion BaseDeviceUUID string //save UUID of base device + nrDeletedDevices uint //number of deleted devices + deletionWorkerTicker *time.Ticker } // DiskUsage contains information about disk usage and is used when reporting Status of a device. @@ -141,6 +146,13 @@ type Status struct { UdevSyncSupported bool // DeferredRemoveEnabled is true then the device is not unmounted. DeferredRemoveEnabled bool + // True if deferred deletion is enabled. This is different from + // deferred removal. "removal" means that device mapper device is + // deactivated. Thin device is still in thin pool and can be activated + // again. But "deletion" means that thin device will be deleted from + // thin pool and it can't be activated again. + DeferredDeleteEnabled bool + DeferredDeletedDeviceCount uint } // Structure used to export image/container metadata in docker inspect. @@ -375,6 +387,18 @@ func (devices *DeviceSet) lookupDeviceWithLock(hash string) (*devInfo, error) { return info, err } +// This function relies on that device hash map has been loaded in advance. 
+// Should be called with devices.Lock() held. +func (devices *DeviceSet) constructDeviceIDMap() { + logrus.Debugf("[deviceset] constructDeviceIDMap()") + defer logrus.Debugf("[deviceset] constructDeviceIDMap() END") + + for _, info := range devices.Devices { + devices.markDeviceIDUsed(info.DeviceID) + logrus.Debugf("Added deviceId=%d to DeviceIdMap", info.DeviceID) + } +} + func (devices *DeviceSet) deviceFileWalkFunction(path string, finfo os.FileInfo) error { // Skip some of the meta files which are not device files. @@ -405,19 +429,18 @@ func (devices *DeviceSet) deviceFileWalkFunction(path string, finfo os.FileInfo) hash = "" } - dinfo := devices.loadMetadata(hash) - if dinfo == nil { - return fmt.Errorf("Error loading device metadata file %s", hash) + // Include deleted devices also as cleanup delete device logic + // will go through it and see if there are any deleted devices. + if _, err := devices.lookupDevice(hash); err != nil { + return fmt.Errorf("Error looking up device %s:%v", hash, err) } - devices.markDeviceIDUsed(dinfo.DeviceID) - logrus.Debugf("Added deviceID=%d to DeviceIDMap", dinfo.DeviceID) return nil } -func (devices *DeviceSet) constructDeviceIDMap() error { - logrus.Debugf("[deviceset] constructDeviceIDMap()") - defer logrus.Debugf("[deviceset] constructDeviceIDMap() END") +func (devices *DeviceSet) loadDeviceFilesOnStart() error { + logrus.Debugf("[deviceset] loadDeviceFilesOnStart()") + defer logrus.Debugf("[deviceset] loadDeviceFilesOnStart() END") var scan = func(path string, info os.FileInfo, err error) error { if err != nil { @@ -477,9 +500,13 @@ func (devices *DeviceSet) registerDevice(id int, hash string, size uint64, trans return info, nil } -func (devices *DeviceSet) activateDeviceIfNeeded(info *devInfo) error { +func (devices *DeviceSet) activateDeviceIfNeeded(info *devInfo, ignoreDeleted bool) error { logrus.Debugf("activateDeviceIfNeeded(%v)", info.Hash) + if info.Deleted && !ignoreDeleted { + return fmt.Errorf("devmapper: 
Can't activate device %v as it is marked for deletion", info.Hash) + } + // Make sure deferred removal on device is canceled, if one was // scheduled. if err := devices.cancelDeferredRemoval(info); err != nil { @@ -553,6 +580,61 @@ func (devices *DeviceSet) migrateOldMetaData() error { return nil } +// Cleanup deleted devices. It assumes that all the devices have been +// loaded in the hash table. +func (devices *DeviceSet) cleanupDeletedDevices() error { + devices.Lock() + + // If there are no deleted devices, there is nothing to do. + if devices.nrDeletedDevices == 0 { + return nil + } + + var deletedDevices []*devInfo + + for _, info := range devices.Devices { + if !info.Deleted { + continue + } + logrus.Debugf("devmapper: Found deleted device %s.", info.Hash) + deletedDevices = append(deletedDevices, info) + } + + // Delete the deleted devices. DeleteDevice() first takes the info lock + // and then devices.Lock(). So drop it to avoid deadlock. + devices.Unlock() + + for _, info := range deletedDevices { + // This will again try deferred deletion. + if err := devices.DeleteDevice(info.Hash, false); err != nil { + logrus.Warnf("devmapper: Deletion of device %s, device_id=%v failed:%v", info.Hash, info.DeviceID, err) + } + } + + return nil +} + +func (devices *DeviceSet) countDeletedDevices() { + for _, info := range devices.Devices { + if !info.Deleted { + continue + } + devices.nrDeletedDevices++ + } +} + +func (devices *DeviceSet) startDeviceDeletionWorker() { + // Deferred deletion is not enabled. Don't do anything. 
+ if !devices.deferredDelete { + return + } + + logrus.Debugf("devmapper: Worker to cleanup deleted devices started") + for range devices.deletionWorkerTicker.C { + devices.cleanupDeletedDevices() + } +} + func (devices *DeviceSet) initMetaData() error { devices.Lock() defer devices.Unlock() @@ -568,13 +650,19 @@ func (devices *DeviceSet) initMetaData() error { devices.TransactionID = transactionID - if err := devices.constructDeviceIDMap(); err != nil { - return err + if err := devices.loadDeviceFilesOnStart(); err != nil { + return fmt.Errorf("devmapper: Failed to load device files:%v", err) } + devices.constructDeviceIDMap() + devices.countDeletedDevices() + if err := devices.processPendingTransaction(); err != nil { return err } + + // Start a goroutine to cleanup Deleted Devices + go devices.startDeviceDeletionWorker() return nil } @@ -739,7 +827,7 @@ func (devices *DeviceSet) verifyBaseDeviceUUID(baseInfo *devInfo) error { devices.Lock() defer devices.Unlock() - if err := devices.activateDeviceIfNeeded(baseInfo); err != nil { + if err := devices.activateDeviceIfNeeded(baseInfo, false); err != nil { return err } @@ -761,7 +849,7 @@ func (devices *DeviceSet) saveBaseDeviceUUID(baseInfo *devInfo) error { devices.Lock() defer devices.Unlock() - if err := devices.activateDeviceIfNeeded(baseInfo); err != nil { + if err := devices.activateDeviceIfNeeded(baseInfo, false); err != nil { return err } @@ -788,7 +876,7 @@ func (devices *DeviceSet) createBaseImage() error { logrus.Debugf("Creating filesystem on base device-mapper thin volume") - if err := devices.activateDeviceIfNeeded(info); err != nil { + if err := devices.activateDeviceIfNeeded(info, false); err != nil { return err } @@ -851,7 +939,7 @@ func (devices *DeviceSet) setupBaseImage() error { // fresh. 
if oldInfo != nil { - if oldInfo.Initialized { + if oldInfo.Initialized && !oldInfo.Deleted { if err := devices.setupVerifyBaseImageUUID(oldInfo); err != nil { return err } @@ -860,7 +948,10 @@ func (devices *DeviceSet) setupBaseImage() error { } logrus.Debugf("Removing uninitialized base image") - if err := devices.DeleteDevice(""); err != nil { + // If previous base device is in deferred delete state, + // that needs to be cleaned up first. So don't try + // deferred deletion. + if err := devices.DeleteDevice("", true); err != nil { return err } } @@ -1303,13 +1394,27 @@ func (devices *DeviceSet) initDevmapper(doInit bool) error { return graphdriver.ErrNotSupported } - // If user asked for deferred removal and both library and driver - // supports deferred removal use it. - if enableDeferredRemoval && driverDeferredRemovalSupport && devicemapper.LibraryDeferredRemovalSupport == true { + // If user asked for deferred removal then check both libdm library + // and kernel driver support deferred removal otherwise error out. + if enableDeferredRemoval { + if !driverDeferredRemovalSupport { + return fmt.Errorf("devmapper: Deferred removal can not be enabled as kernel does not support it") + } + if !devicemapper.LibraryDeferredRemovalSupport { + return fmt.Errorf("devmapper: Deferred removal can not be enabled as libdm does not support it") + } logrus.Debugf("devmapper: Deferred removal support enabled.") devices.deferredRemove = true } + if enableDeferredDeletion { + if !devices.deferredRemove { + return fmt.Errorf("devmapper: Deferred deletion can not be enabled as deferred removal is not enabled. Enable deferred removal using --storage-opt dm.use_deferred_removal=true parameter") + } + logrus.Debugf("devmapper: Deferred deletion support enabled.") + devices.deferredDelete = true + } + // https://github.com/docker/docker/issues/4036 if supported := devicemapper.UdevSetSyncSupport(true); !supported { logrus.Warn("Udev sync is not supported. 
This will lead to unexpected behavior, data loss and errors. For more information, see https://docs.docker.com/reference/commandline/daemon/#daemon-storage-driver-option") @@ -1480,19 +1585,26 @@ func (devices *DeviceSet) AddDevice(hash, baseHash string) error { logrus.Debugf("[deviceset] AddDevice(hash=%s basehash=%s)", hash, baseHash) defer logrus.Debugf("[deviceset] AddDevice(hash=%s basehash=%s) END", hash, baseHash) + // If a deleted device exists, return error. baseInfo, err := devices.lookupDeviceWithLock(baseHash) if err != nil { return err } + if baseInfo.Deleted { + return fmt.Errorf("devmapper: Base device %v has been marked for deferred deletion", baseInfo.Hash) + } + baseInfo.lock.Lock() defer baseInfo.lock.Unlock() devices.Lock() defer devices.Unlock() + // Also include deleted devices in case hash of new device is + // same as one of the deleted devices. if info, _ := devices.lookupDevice(hash); info != nil { - return fmt.Errorf("device %s already exists", hash) + return fmt.Errorf("device %s already exists. Deleted=%v", hash, info.Deleted) } if err := devices.createRegisterSnapDevice(hash, baseInfo); err != nil { @@ -1502,8 +1614,28 @@ return nil } +func (devices *DeviceSet) markForDeferredDeletion(info *devInfo) error { + // If device is already in deleted state, there is nothing to be done. + if info.Deleted { + return nil + } + + logrus.Debugf("devmapper: Marking device %s for deferred deletion.", info.Hash) + + info.Deleted = true + + // save device metadata to reflect deleted state. + if err := devices.saveMetadata(info); err != nil { + info.Deleted = false + return err + } + + devices.nrDeletedDevices++ + return nil +} + // Should be called with devices.Lock() held. 
-func (devices *DeviceSet) deleteTransaction(info *devInfo) error { +func (devices *DeviceSet) deleteTransaction(info *devInfo, syncDelete bool) error { if err := devices.openTransaction(info.Hash, info.DeviceID); err != nil { logrus.Debugf("Error opening transaction hash = %s deviceId = %d", "", info.DeviceID) return err @@ -1511,13 +1643,31 @@ defer devices.closeTransaction() - if err := devicemapper.DeleteDevice(devices.getPoolDevName(), info.DeviceID); err != nil { - logrus.Debugf("Error deleting device: %s", err) - return err + err := devicemapper.DeleteDevice(devices.getPoolDevName(), info.DeviceID) + if err != nil { + // If syncDelete is true, we want to return error. If deferred + // deletion is not enabled, we return an error. If error is + // something other than EBUSY, return an error. + if syncDelete || !devices.deferredDelete || err != devicemapper.ErrBusy { + logrus.Debugf("Error deleting device: %s", err) + return err + } } - if err := devices.unregisterDevice(info.DeviceID, info.Hash); err != nil { - return err + if err == nil { + if err := devices.unregisterDevice(info.DeviceID, info.Hash); err != nil { + return err + } + // If device was already in deferred delete state that means + // deletion was being tried again later. Reduce the deleted + // device count. + if info.Deleted { + devices.nrDeletedDevices-- + } + } else { + if err := devices.markForDeferredDeletion(info); err != nil { + return err + } } return nil @@ -1529,8 +1679,10 @@ func (devices *DeviceSet) issueDiscard(info *devInfo) error { defer logrus.Debugf("devmapper: issueDiscard(device: %s). END", info.Hash) // This is a workaround for the kernel not discarding block so // on the thin pool when we remove a thinp device, so we do it - // manually - if err := devices.activateDeviceIfNeeded(info); err != nil { + // manually. + // Even if device is deferred deleted, activate it and issue + discards. 
+ if err := devices.activateDeviceIfNeeded(info, true); err != nil { return err } @@ -1551,7 +1703,7 @@ } // Should be called with devices.Lock() held. -func (devices *DeviceSet) deleteDevice(info *devInfo) error { +func (devices *DeviceSet) deleteDevice(info *devInfo, syncDelete bool) error { if devices.doBlkDiscard { devices.issueDiscard(info) } @@ -1562,7 +1714,7 @@ return err } - if err := devices.deleteTransaction(info); err != nil { + if err := devices.deleteTransaction(info, syncDelete); err != nil { return err } @@ -1571,8 +1723,12 @@ return nil } -// DeleteDevice deletes a device from the hash. -func (devices *DeviceSet) DeleteDevice(hash string) error { +// DeleteDevice will return success if device has been marked for deferred +// removal. If one wants to override that and want DeleteDevice() to fail if +// device was busy and could not be deleted, set syncDelete=true. +func (devices *DeviceSet) DeleteDevice(hash string, syncDelete bool) error { + logrus.Debugf("devmapper: DeleteDevice(hash=%v syncDelete=%v) START", hash, syncDelete) + defer logrus.Debugf("devmapper: DeleteDevice(hash=%v syncDelete=%v) END", hash, syncDelete) info, err := devices.lookupDeviceWithLock(hash) if err != nil { return err } @@ -1591,7 +1747,7 @@ return fmt.Errorf("devmapper: Can't delete device %v as it is still mounted. mntCount=%v", info.Hash, info.mountCount) } - return devices.deleteDevice(info) + return devices.deleteDevice(info, syncDelete) } func (devices *DeviceSet) deactivatePool() error { @@ -1716,6 +1872,13 @@ var devs []*devInfo + // Stop deletion worker. This should stop delivering new events to + ticker channel. 
That means no new instance of cleanupDeletedDevice() + // will run after this call. If one instance is already running at + // the time of the call, it must be holding devices.Lock() and + // we will block on this lock till cleanup function exits. + devices.deletionWorkerTicker.Stop() + devices.Lock() // Save DeviceSet Metadata first. Docker kills all threads if they // don't finish in certain time. It is possible that Shutdown() @@ -1778,6 +1941,10 @@ func (devices *DeviceSet) MountDevice(hash, path, mountLabel string) error { return err } + if info.Deleted { + return fmt.Errorf("devmapper: Can't mount device %v as it has been marked for deferred deletion", info.Hash) + } + info.lock.Lock() defer info.lock.Unlock() @@ -1793,7 +1960,7 @@ func (devices *DeviceSet) MountDevice(hash, path, mountLabel string) error { return nil } - if err := devices.activateDeviceIfNeeded(info); err != nil { + if err := devices.activateDeviceIfNeeded(info, false); err != nil { return fmt.Errorf("Error activating devmapper device for '%s': %s", hash, err) } @@ -1913,7 +2080,7 @@ func (devices *DeviceSet) GetDeviceStatus(hash string) (*DevStatus, error) { TransactionID: info.TransactionID, } - if err := devices.activateDeviceIfNeeded(info); err != nil { + if err := devices.activateDeviceIfNeeded(info, false); err != nil { return nil, fmt.Errorf("Error activating devmapper device for '%s': %s", hash, err) } @@ -1985,6 +2152,8 @@ func (devices *DeviceSet) Status() *Status { status.MetadataLoopback = devices.metadataLoopFile status.UdevSyncSupported = devicemapper.UdevSyncSupported() status.DeferredRemoveEnabled = devices.deferredRemove + status.DeferredDeleteEnabled = devices.deferredDelete + status.DeferredDeletedDeviceCount = devices.nrDeletedDevices totalSizeInSectors, _, dataUsed, dataTotal, metadataUsed, metadataTotal, err := devices.poolStatus() if err == nil { @@ -2049,6 +2218,7 @@ func NewDeviceSet(root string, doInit bool, options []string) (*DeviceSet, error doBlkDiscard: true, 
thinpBlockSize: defaultThinpBlockSize, deviceIDMap: make([]byte, deviceIDMapSz), + deletionWorkerTicker: time.NewTicker(time.Second * 30), } foundBlkDiscard := false @@ -2117,6 +2287,12 @@ func NewDeviceSet(root string, doInit bool, options []string) (*DeviceSet, error return nil, err } + case "dm.use_deferred_deletion": + enableDeferredDeletion, err = strconv.ParseBool(val) + if err != nil { + return nil, err + } + default: return nil, fmt.Errorf("Unknown option %s\n", key) } diff --git a/components/engine/daemon/graphdriver/devmapper/driver.go b/components/engine/daemon/graphdriver/devmapper/driver.go index dc2647d11d..44dab1f965 100644 --- a/components/engine/daemon/graphdriver/devmapper/driver.go +++ b/components/engine/daemon/graphdriver/devmapper/driver.go @@ -84,6 +84,8 @@ func (d *Driver) Status() [][2]string { {"Metadata Space Available", fmt.Sprintf("%s", units.HumanSize(float64(s.Metadata.Available)))}, {"Udev Sync Supported", fmt.Sprintf("%v", s.UdevSyncSupported)}, {"Deferred Removal Enabled", fmt.Sprintf("%v", s.DeferredRemoveEnabled)}, + {"Deferred Deletion Enabled", fmt.Sprintf("%v", s.DeferredDeleteEnabled)}, + {"Deferred Deleted Device Count", fmt.Sprintf("%v", s.DeferredDeletedDeviceCount)}, } if len(s.DataLoopback) > 0 { status = append(status, [2]string{"Data loop file", s.DataLoopback}) @@ -142,7 +144,7 @@ func (d *Driver) Remove(id string) error { } // This assumes the device has been properly Get/Put:ed and thus is unmounted - if err := d.DeviceSet.DeleteDevice(id); err != nil { + if err := d.DeviceSet.DeleteDevice(id, false); err != nil { return err } diff --git a/components/engine/docs/reference/commandline/daemon.md b/components/engine/docs/reference/commandline/daemon.md index e3ed62ef42..cef19ab038 100644 --- a/components/engine/docs/reference/commandline/daemon.md +++ b/components/engine/docs/reference/commandline/daemon.md @@ -368,6 +368,46 @@ options for `zfs` start with `zfs`. 
> Otherwise, set this flag for migrating existing Docker daemons to > a daemon with a supported environment. + * `dm.use_deferred_removal` + + Enables use of deferred device removal if `libdm` and the kernel driver + support the mechanism. + + Deferred device removal means that if device is busy when devices are + being removed/deactivated, then a deferred removal is scheduled on + device. And devices automatically go away when last user of the device + exits. + + For example, when a container exits, its associated thin device is removed. + If that device has leaked into some other mount namespace and can't be + removed, the container exit still succeeds and this option causes the + system to schedule the device for deferred removal. It does not wait in a + loop trying to remove a busy device. + + Example use: `docker daemon --storage-opt dm.use_deferred_removal=true` + + * `dm.use_deferred_deletion` + + Enables use of deferred device deletion for thin pool devices. By default, + thin pool device deletion is synchronous. Before a container is deleted, + the Docker daemon removes any associated devices. If the storage driver + can not remove a device, the container deletion fails and daemon returns. + + `Error deleting container: Error response from daemon: Cannot destroy container` + + To avoid this failure, enable both deferred device deletion and deferred + device removal on the daemon. + + `docker daemon --storage-opt dm.use_deferred_deletion=true --storage-opt dm.use_deferred_removal=true` + + With these two options enabled, if a device is busy when the driver is + deleting a container, the driver marks the device as deleted. Later, when + the device isn't in use, the driver deletes it. + + In general it should be safe to enable this option by default. It will help + when unintentional leaking of mount point happens across multiple mount + namespaces. 
+ Currently supported options of `zfs`: * `zfs.fsname` diff --git a/components/engine/man/docker-daemon.8.md b/components/engine/man/docker-daemon.8.md index b37bd25e0d..59ba4d54f2 100644 --- a/components/engine/man/docker-daemon.8.md +++ b/components/engine/man/docker-daemon.8.md @@ -302,6 +302,28 @@ device. Example use: `docker daemon --storage-opt dm.use_deferred_removal=true` +#### dm.use_deferred_deletion + +Enables use of deferred device deletion for thin pool devices. By default, +thin pool device deletion is synchronous. Before a container is deleted, the +Docker daemon removes any associated devices. If the storage driver can not +remove a device, the container deletion fails and daemon returns. + +`Error deleting container: Error response from daemon: Cannot destroy container` + +To avoid this failure, enable both deferred device deletion and deferred +device removal on the daemon. + +`docker daemon --storage-opt dm.use_deferred_deletion=true --storage-opt dm.use_deferred_removal=true` + +With these two options enabled, if a device is busy when the driver is +deleting a container, the driver marks the device as deleted. Later, when the +device isn't in use, the driver deletes it. + +In general it should be safe to enable this option by default. It will help +when unintentional leaking of mount point happens across multiple mount +namespaces. + #### dm.loopdatasize **Note**: This option configures devicemapper loopback, which should not be used in production. 
diff --git a/components/engine/pkg/devicemapper/devmapper.go b/components/engine/pkg/devicemapper/devmapper.go index abbd28b5ff..136dd6bbd0 100644 --- a/components/engine/pkg/devicemapper/devmapper.go +++ b/components/engine/pkg/devicemapper/devmapper.go @@ -750,7 +750,11 @@ func DeleteDevice(poolName string, deviceID int) error { return fmt.Errorf("Can't set message %s", err) } + dmSawBusy = false if err := task.run(); err != nil { + if dmSawBusy { + return ErrBusy + } return fmt.Errorf("Error running DeleteDevice %s", err) } return nil