src/core/cgroup.c

	cgroup.c revision cd7affaeea16d3904354b810a292e594dfef25dd
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley/***
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  This file is part of systemd.
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  Copyright 2013 Lennart Poettering
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  systemd is free software; you can redistribute it and/or modify it
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  under the terms of the GNU Lesser General Public License as published by
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  the Free Software Foundation; either version 2.1 of the License, or
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  (at your option) any later version.
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  systemd is distributed in the hope that it will be useful, but
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  WITHOUT ANY WARRANTY; without even the implied warranty of
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  Lesser General Public License for more details.
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  You should have received a copy of the GNU Lesser General Public License
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley  along with systemd; If not, see <http://www.gnu.org/licenses/>.
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley***/
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley#include <fcntl.h>
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley#include <fnmatch.h>
e9f58f98089a4e4670e5ee2d3df315561084786eRich Riley
#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = 1024;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = 1000;

        c->cpu_quota_per_sec_usec = (usec_t) -1;
        c->cpu_quota_usec = (usec_t) -1;
        c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
        assert(c);

        /* Returns the absolute CPU quota */

        if (c->cpu_quota_usec != (usec_t) -1)
                return c->cpu_quota_usec;
        else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
                return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
        else
                return (usec_t) -1;
}

usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
        assert(c);

        /* Returns the CPU quota relative to 1s */

        if (c->cpu_quota_usec != (usec_t) -1)
                return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
        else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
                return c->cpu_quota_per_sec_usec;
        else
                return (usec_t) -1;
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sCPUQuota=%s\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sCPUQuotaPeriodSec=%s\n"
                "%sBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
                prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
                prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
                prefix, c->blockio_weight,
                prefix, c->memory_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));

        return r;
}

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}

void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not support on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
                usec_t q;

                sprintf(buf, "%lu\n", c->cpu_shares);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                q = cgroup_context_get_cpu_quota_usec(c);
                if (q != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", q);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", c->blockio_weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}

CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != 1024 ||
            c->cpu_quota_usec != (usec_t) -1 ||
            c->cpu_quota_per_sec_usec != (usec_t) -1)
                mask |= CGROUP_CPUACCT | CGROUP_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != 1000 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow || c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}

CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c);
}

CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupControllerMask unit_get_siblings_mask(Unit *u) {
        CGroupControllerMask m;

        assert(u);

        if (UNIT_ISSET(u->slice))
                m = unit_get_members_mask(UNIT_DEREF(u->slice));
        else
                m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* Sibling propagation is only relevant for weight-based
         * controllers, so let's mask out everything else */
        return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
}

CGroupControllerMask unit_get_target_mask(Unit *u) {
        CGroupControllerMask mask;

        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = hashmap_put(u->manager->cgroup_unit, path, u);
        if (r < 0) {
                log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                return r;
        }
        if (r > 0) {
                u->cgroup_path = path;
                path = NULL;
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0) {
                log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
                return r;
        }

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        /* Then, possibly move things over */
        r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
        if (r < 0)
                log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        Unit *i;
        unsigned n = 0;
        int r;

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i);
                if (r < 0)
                        log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u);
}

void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;

}

pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0)  {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7.  Always enable hierarchial support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);