Files
docker-cli/components/engine/pkg/libcontainer/cgroups/systemd/apply_systemd.go
Alexander Larsson 006edb2caf libcontainer/cgroup: Use raw access to set up and join the devices cgroup
The systemd support for the devices cgroup lacks two required features:
 * Support for wildcards to allow mknod on any device
 * Support for wildcards to allow /dev/pts support

The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is in wide use.

Additionally, the current approach of letting systemd set up the devices cgroup and then adding
some devices to it doesn't work, because sometimes systemd (at least v208) re-initializes
the devices cgroup, overwriting our custom devices. See https://github.com/dotcloud/docker/issues/6009
for the details.

When wildcarded mknod support is available in systemd we should implement a pure systemd version,
but we need to keep the old one around for backwards compat.

Docker-DCO-1.1-Signed-off-by: Alexander Larsson <alexl@redhat.com> (github: alexlarsson)
Upstream-commit: 6b8f0e394b3960bba1db982c62c002221419f3f5
Component: engine
2014-06-03 11:34:14 +02:00

326 lines
8.2 KiB
Go

// +build linux
package systemd
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
systemd1 "github.com/coreos/go-systemd/dbus"
"github.com/dotcloud/docker/pkg/libcontainer/cgroups"
"github.com/dotcloud/docker/pkg/systemd"
"github.com/godbus/dbus"
)
// systemdCgroup is the value Apply returns (as a cgroups.ActiveCgroup) for a
// container whose cgroups were set up through systemd.
type systemdCgroup struct {
	// cleanupDirs lists the cgroup directories Apply created by hand, outside
	// of systemd; Cleanup removes each of them.
	cleanupDirs []string
}
var (
	// connLock is held by UseSystemd while it initializes theConn and
	// hasStartTransientUnit.
	connLock sync.Mutex
	// theConn is the package-wide systemd connection; it is assigned by
	// UseSystemd and used by Apply and GetPids.
	theConn *systemd1.Conn
	// hasStartTransientUnit caches the result of the StartTransientUnit probe
	// made by UseSystemd.
	hasStartTransientUnit bool
)
// UseSystemd reports whether cgroups should be managed through systemd.
//
// It returns false when systemd.SdBooted reports false or when a connection
// to systemd cannot be opened. On the first successful connection it probes
// for the StartTransientUnit method and stores both the connection and the
// probe result in package-level state while holding connLock; subsequent
// calls return the stored result without probing again.
func UseSystemd() bool {
	if !systemd.SdBooted() {
		return false
	}

	connLock.Lock()
	defer connLock.Unlock()

	// Already connected: the probe result from the first call is still valid.
	if theConn != nil {
		return hasStartTransientUnit
	}

	var err error
	theConn, err = systemd1.New()
	if err != nil {
		return false
	}

	// Start from the assumption that StartTransientUnit exists; only an
	// explicit UnknownMethod reply to the probe call overturns it. Any other
	// error (or no error) leaves the assumption in place.
	hasStartTransientUnit = true
	_, probeErr := theConn.StartTransientUnit("test.scope", "invalid")
	if dbusErr, isDbusErr := probeErr.(dbus.Error); isDbusErr && dbusErr.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
		hasStartTransientUnit = false
	}

	return hasStartTransientUnit
}
// getIfaceForUnit picks the interface name to query unit properties with,
// based on the unit name's suffix: "Scope" for names ending in ".scope",
// "Service" for names ending in ".service", and "Unit" for everything else.
func getIfaceForUnit(unitName string) string {
	switch {
	case strings.HasSuffix(unitName, ".scope"):
		return "Scope"
	case strings.HasSuffix(unitName, ".service"):
		return "Service"
	default:
		return "Unit"
	}
}
// cgroupArg is one setting that Apply writes straight into a cgroup
// directory rather than passing to systemd.
type cgroupArg struct {
	// File is the control file name, joined onto the cgroup directory path.
	// Value is the content written to that file.
	File, Value string
}
// Apply places the process pid into the cgroups described by c. Settings that
// systemd can express are applied through a transient scope unit; the rest are
// written directly into the cgroup filesystem.
//
// In order, it:
//   - starts a transient unit named by getUnitName in c.Slice (or
//     "system.slice" when c.Slice is empty) containing pid, with memory, CPU
//     and block-IO accounting enabled and, when set, MemoryLimit and CPUShares;
//   - asks systemd for the unit's ControlGroup path;
//   - unless c.AllowAllDevices is set, creates and joins a devices cgroup by
//     hand, denies all devices and then allows each entry of c.AllowedDevices;
//   - writes the memory+swap limit into the unit's memory cgroup;
//   - when c.CpusetCpus is set, creates and joins a top-level cpuset cgroup.
//
// On success the returned ActiveCgroup remembers the hand-made directories so
// that its Cleanup can remove them. On failure it returns a nil ActiveCgroup
// and the error; directories created before the failure are left in place.
//
// Apply uses the package-level connection theConn without checking it, so
// UseSystemd must have established it beforehand.
func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) {
	var (
		unitName   = getUnitName(c)
		slice      = "system.slice"
		properties []systemd1.Property
		cpuArgs    []cgroupArg // nothing below appends to this yet, so the cpu write loop is currently inert
		cpusetArgs []cgroupArg
		memoryArgs []cgroupArg
		res        systemdCgroup
	)

	// First collect the settings systemd cannot apply for us.

	// A negative MemorySwap disables the memory+swap limit entirely.
	if c.MemorySwap >= 0 && (c.Memory != 0 || c.MemorySwap > 0) {
		memorySwap := c.MemorySwap
		if memorySwap == 0 {
			// No explicit value: default the combined limit to twice the
			// memory limit.
			memorySwap = c.Memory * 2
		}

		memoryArgs = append(memoryArgs, cgroupArg{"memory.memsw.limit_in_bytes", strconv.FormatInt(memorySwap, 10)})
	}

	if c.CpusetCpus != "" {
		cpusetArgs = append(cpusetArgs, cgroupArg{"cpuset.cpus", c.CpusetCpus})
	}

	if c.Slice != "" {
		slice = c.Slice
	}

	properties = append(properties,
		systemd1.Property{"Slice", dbus.MakeVariant(slice)},
		systemd1.Property{"Description", dbus.MakeVariant("docker container " + c.Name)},
		systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})},
	)

	// Always enable accounting, this gets us the same behaviour as the fs implementation,
	// plus the kernel has some problems with joining the memory cgroup at a later time.
	properties = append(properties,
		systemd1.Property{"MemoryAccounting", dbus.MakeVariant(true)},
		systemd1.Property{"CPUAccounting", dbus.MakeVariant(true)},
		systemd1.Property{"BlockIOAccounting", dbus.MakeVariant(true)})

	if c.Memory != 0 {
		properties = append(properties,
			systemd1.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))})
	}
	// TODO: MemoryReservation and MemorySwap not available in systemd

	if c.CpuShares != 0 {
		properties = append(properties,
			systemd1.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))})
	}

	if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil {
		return nil, err
	}

	// Ask systemd which cgroup it placed the unit in; the direct cpu and
	// memory writes below are made relative to that path.
	props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName))
	if err != nil {
		return nil, err
	}

	// Use the comma-ok form: a bare type assertion would panic if the
	// property were absent or not a string.
	cgroup, ok := props["ControlGroup"].(string)
	if !ok {
		return nil, fmt.Errorf("systemd unit %s did not report a ControlGroup string property", unitName)
	}

	if !c.AllowAllDevices {
		// Atm we can't use the systemd device support because of two missing things:
		//  * Support for wildcards to allow mknod on any device
		//  * Support for wildcards to allow /dev/pts support
		//
		// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
		// in wide use. When both these are available we will be able to switch, but need to keep the old
		// implementation for backwards compat.
		//
		// Note: we can't use systemd to set up the initial limits, and then change the cgroup
		// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
		// This happens at least for v208 when any sibling unit is started.

		mountpoint, err := cgroups.FindCgroupMountpoint("devices")
		if err != nil {
			return nil, err
		}

		initPath, err := cgroups.GetInitCgroupDir("devices")
		if err != nil {
			return nil, err
		}

		dir := filepath.Join(mountpoint, initPath, c.Parent, c.Name)

		// Record the directory before creating it so Cleanup covers it.
		res.cleanupDirs = append(res.cleanupDirs, dir)

		if err := os.MkdirAll(dir, 0755); err != nil && !os.IsExist(err) {
			return nil, err
		}

		// Join the new devices cgroup first, then restrict it.
		if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
			return nil, err
		}

		// Deny everything, then allow the configured devices one by one.
		if err := writeFile(dir, "devices.deny", "a"); err != nil {
			return nil, err
		}

		for _, dev := range c.AllowedDevices {
			if err := writeFile(dir, "devices.allow", dev.GetCgroupAllowString()); err != nil {
				return nil, err
			}
		}
	}

	if len(cpuArgs) != 0 {
		mountpoint, err := cgroups.FindCgroupMountpoint("cpu")
		if err != nil {
			return nil, err
		}

		path := filepath.Join(mountpoint, cgroup)

		for _, arg := range cpuArgs {
			if err := writeFile(path, arg.File, arg.Value); err != nil {
				return nil, err
			}
		}
	}

	if len(memoryArgs) != 0 {
		mountpoint, err := cgroups.FindCgroupMountpoint("memory")
		if err != nil {
			return nil, err
		}

		path := filepath.Join(mountpoint, cgroup)

		for _, arg := range memoryArgs {
			if err := writeFile(path, arg.File, arg.Value); err != nil {
				return nil, err
			}
		}
	}

	if len(cpusetArgs) != 0 {
		// systemd does not atm set up the cpuset controller, so we must manually
		// join it. Additionally that is a very finicky controller where each
		// level must have a full setup as the default for a new directory is "no cpus",
		// so we avoid using any hierarchies here, creating a toplevel directory.
		mountpoint, err := cgroups.FindCgroupMountpoint("cpuset")
		if err != nil {
			return nil, err
		}

		initPath, err := cgroups.GetInitCgroupDir("cpuset")
		if err != nil {
			return nil, err
		}

		rootPath := filepath.Join(mountpoint, initPath)
		path := filepath.Join(mountpoint, initPath, c.Parent+"-"+c.Name)

		// Record the directory before creating it so Cleanup covers it.
		res.cleanupDirs = append(res.cleanupDirs, path)

		if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
			return nil, err
		}

		foundCpus := false
		foundMems := false

		for _, arg := range cpusetArgs {
			if arg.File == "cpuset.cpus" {
				foundCpus = true
			}
			if arg.File == "cpuset.mems" {
				foundMems = true
			}
			if err := writeFile(path, arg.File, arg.Value); err != nil {
				return nil, err
			}
		}

		// These are required, if not specified inherit from parent
		if !foundCpus {
			s, err := ioutil.ReadFile(filepath.Join(rootPath, "cpuset.cpus"))
			if err != nil {
				return nil, err
			}

			if err := writeFile(path, "cpuset.cpus", string(s)); err != nil {
				return nil, err
			}
		}

		// These are required, if not specified inherit from parent
		if !foundMems {
			s, err := ioutil.ReadFile(filepath.Join(rootPath, "cpuset.mems"))
			if err != nil {
				return nil, err
			}

			if err := writeFile(path, "cpuset.mems", string(s)); err != nil {
				return nil, err
			}
		}

		// Join the cpuset cgroup only after cpus and mems are both populated.
		if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil {
			return nil, err
		}
	}

	return &res, nil
}
// writeFile writes data to the file named file inside dir, creating it with
// mode 0700 if it does not exist, and returns any error from the write.
func writeFile(dir, file, data string) error {
	target := filepath.Join(dir, file)
	return ioutil.WriteFile(target, []byte(data), 0700)
}
// Cleanup removes every directory recorded in cleanupDirs. Removal is
// best-effort: errors from os.RemoveAll are discarded and the method always
// returns nil.
func (c *systemdCgroup) Cleanup() error {
	// systemd cleans up, we don't need to do much
	for i := range c.cleanupDirs {
		_ = os.RemoveAll(c.cleanupDirs[i])
	}
	return nil
}
// GetPids returns the pids found in the cgroup that systemd reports for the
// unit belonging to c, read from beneath the "cpu" cgroup mountpoint.
//
// It returns an error if the cpu mountpoint cannot be located, if the unit's
// properties cannot be fetched over theConn, if the unit does not report a
// string ControlGroup property, or if reading the procs file fails.
func GetPids(c *cgroups.Cgroup) ([]int, error) {
	unitName := getUnitName(c)

	mountpoint, err := cgroups.FindCgroupMountpoint("cpu")
	if err != nil {
		return nil, err
	}

	props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName))
	if err != nil {
		return nil, err
	}

	// Use the comma-ok form: a bare type assertion would panic if the
	// property were absent or not a string.
	cgroup, ok := props["ControlGroup"].(string)
	if !ok {
		return nil, fmt.Errorf("systemd unit %s did not report a ControlGroup string property", unitName)
	}

	return cgroups.ReadProcsFile(filepath.Join(mountpoint, cgroup))
}
// getUnitName builds the systemd unit name for c: the cgroup's Parent and
// Name joined by "-" with a ".scope" suffix.
func getUnitName(c *cgroups.Cgroup) string {
	const unitNameFormat = "%s-%s.scope"
	return fmt.Sprintf(unitNameFormat, c.Parent, c.Name)
}