nspawn.c revision 840295fc1e30bb8902e8df08127bbc281318b537
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <stdio.h>
#include <errno.h>
#include <getopt.h>
#include <termios.h>
#include <grp.h>
#ifdef HAVE_SELINUX
#ifdef HAVE_SECCOMP
#include <seccomp.h>
#ifdef HAVE_BLKID
#include "sd-daemon.h"
#include "sd-bus.h"
#include "sd-id128.h"
#include "sd-rtnl.h"
#include "log.h"
#include "util.h"
#include "mkdir.h"
#include "macro.h"
#include "audit.h"
#include "missing.h"
#include "cgroup-util.h"
#include "strv.h"
#include "path-util.h"
#include "loopback-setup.h"
#include "dev-setup.h"
#include "fdset.h"
#include "build.h"
#include "fileio.h"
#include "bus-util.h"
#include "bus-error.h"
#include "ptyfwd.h"
#include "bus-kernel.h"
#include "env-util.h"
#include "def.h"
#include "rtnl-util.h"
#include "udev-util.h"
#include "eventfd-util.h"
#include "blkid-util.h"
#include "gpt.h"
#include "siphash24.h"
#include "copy.h"
#include "base-filesystem.h"
#ifdef HAVE_SECCOMP
#include "seccomp-util.h"
typedef enum ContainerStatus {
typedef enum LinkJournal {
} LinkJournal;
/* Boolean settings with file scope. The initializers below are the
 * defaults; the ARG_* cases of the option-parsing switch in this file
 * assign to them. */
/* Set to true by several network-related option cases (ARG_PRIVATE_NETWORK,
 * ARG_NETWORK_VETH, ARG_NETWORK_INTERFACE). */
static bool arg_private_network = false;
/* Set to true by ARG_READ_ONLY. */
static bool arg_read_only = false;
static bool arg_boot = false;
static bool arg_quiet = false;
/* Set to true by ARG_SHARE_SYSTEM. */
static bool arg_share_system = false;
/* The only flag in this group that defaults to true; assigned by
 * ARG_REGISTER. NOTE(review): it also appears to be forced to false when
 * arg_share_system is set -- confirm against the full option parser. */
static bool arg_register = true;
/* Set to true by ARG_KEEP_UNIT. */
static bool arg_keep_unit = false;
/* Set to true by ARG_NETWORK_VETH, together with arg_private_network. */
static bool arg_network_veth = false;
static int help(void) {
return help();
case ARG_VERSION:
if (!arg_directory) {
return -ENOMEM;
if (!arg_user)
return log_oom();
case ARG_NETWORK_BRIDGE:
case ARG_NETWORK_VETH:
arg_network_veth = true;
arg_private_network = true;
case ARG_NETWORK_INTERFACE:
return log_oom();
arg_private_network = true;
case ARG_NETWORK_MACVLAN:
return log_oom();
case ARG_PRIVATE_NETWORK:
arg_private_network = true;
arg_boot = true;
case ARG_UUID:
return -EINVAL;
if (!arg_machine)
return log_oom();
case ARG_READ_ONLY:
arg_read_only = true;
case ARG_CAPABILITY:
case ARG_DROP_CAPABILITY: {
_cleanup_free_ char *t;
return log_oom();
if (c == ARG_CAPABILITY)
return -EINVAL;
if (c == ARG_CAPABILITY)
case ARG_LINK_JOURNAL:
return -EINVAL;
case ARG_BIND:
case ARG_BIND_RO: {
return log_oom();
return -EINVAL;
r = strv_extend(x, a);
return log_oom();
r = strv_extend(x, b);
return log_oom();
case ARG_TMPFS: {
return log_oom();
if (!path_is_absolute(a)) {
return -EINVAL;
return log_oom();
a = NULL;
return log_oom();
b = NULL;
case ARG_SETENV: {
return -EINVAL;
return log_oom();
arg_setenv = n;
arg_quiet = true;
case ARG_SHARE_SYSTEM:
arg_share_system = true;
case ARG_REGISTER:
arg_register = r;
case ARG_KEEP_UNIT:
arg_keep_unit = true;
case ARG_PERSONALITY:
return -EINVAL;
return -EINVAL;
if (arg_share_system)
arg_register = false;
return -EINVAL;
return -EINVAL;
return -EINVAL;
/* Describes a single mount entry. The table entries visible in this file
 * are written as positional initializers in the order
 * { what, where, type, options, flags, fatal }, so the field order here
 * must not be changed. */
typedef struct MountPoint {
/* Mount source, e.g. "devpts". May be NULL: the /sys/fs/selinux
 * read-only remount entry supplies NULL here. */
const char *what;
/* Mount target path, e.g. "/dev/pts". */
const char *where;
/* Filesystem type string, e.g. "devpts". May be NULL (see the
 * /sys/fs/selinux remount entry). */
const char *type;
/* Comma-separated mount option string, or NULL when there is none. */
const char *options;
/* MS_* flag bits OR-ed together, e.g. MS_NOSUID|MS_NOEXEC. */
unsigned long flags;
/* NOTE(review): presumably marks whether failing to set up this entry
 * should abort instead of being tolerated -- confirm against the code
 * that walks the mount table. */
bool fatal;
} MountPoint;
{ "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
#ifdef HAVE_SELINUX
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
#ifdef HAVE_SELINUX
if (!where)
return log_oom();
#ifdef HAVE_SELINUX
if (arg_selinux_apifs_context &&
if (!options)
return log_oom();
o = options;
r = -errno;
STRV_FOREACH_PAIR(x, y, l) {
return -errno;
if (!where)
return log_oom();
return -EINVAL;
return -errno;
return -ENOTSUP;
return -errno;
if (ro) {
if (!where)
return log_oom();
return -errno;
log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
if (!where)
return log_oom();
if (y && streq(y, z))
if (!check)
return log_oom();
if (!what)
return log_oom();
if (arg_private_network)
/* Fix resolv.conf, if possible */
if (!where)
return log_oom();
if (arg_share_system)
return log_oom();
r = -errno;
static const char devnodes[] =
u = umask(0000);
return log_oom();
return -errno;
return -EIO;
return -errno;
return log_oom();
return -errno;
const char *to;
u = umask(0000);
return -errno;
return -errno;
return -errno;
int r, fd, k;
} control = {};
u = umask(0000);
return log_oom();
return -errno;
return -errno;
if (fd < 0) {
return -errno;
return -errno;
static int setup_hostname(void) {
if (arg_share_system)
return -errno;
char *id;
return log_oom();
r = read_one_line_file(p, &b);
-EEXIST;
free(p);
return log_oom();
if (path_is_mount_point(p, false) > 0) {
return -EEXIST;
if (path_is_mount_point(q, false) > 0) {
return -EEXIST;
r = readlink_and_make_absolute(p, &d);
path_equal(d, q)) {
if (unlink(p) < 0) {
return -errno;
} else if (r == -EINVAL) {
rmdir(p) < 0) {
return -errno;
} else if (r != -ENOENT) {
if (symlink(q, p) < 0) {
return -errno;
if (dir_is_empty(q) == 0)
return -errno;
if (!path)
return -errno;
return -errno;
static int drop_capabilities(void) {
if (!arg_register)
if (arg_keep_unit) {
r = sd_bus_call_method(
bus,
"/org/freedesktop/machine1",
&error,
NULL,
bus,
"/org/freedesktop/machine1",
r = sd_bus_message_close_container(m);
const char *path;
if (!arg_register)
r = sd_bus_call_method(
bus,
"/org/freedesktop/machine1",
&error,
&reply,
return bus_log_parse_error(r);
r = sd_bus_call_method(
bus,
path,
&error,
NULL,
NULL);
static int reset_audit_loginuid(void) {
if (arg_share_system)
if (r == -ENOENT)
uint8_t *v;
if (!arg_private_network)
if (!arg_network_veth)
if (arg_network_bridge)
r = sd_rtnl_message_close_container(m);
r = sd_rtnl_message_close_container(m);
r = sd_rtnl_message_close_container(m);
int r, bridge;
if (!arg_private_network)
if (!arg_network_veth)
if (!arg_network_bridge)
if (bridge <= 0) {
return -errno;
int ifi;
if (ifi <= 0) {
return -errno;
return -errno;
if (udev_device_get_is_initialized(d) <= 0) {
return -EBUSY;
return ifi;
if (!arg_private_network)
if (!udev) {
return -ENOMEM;
int ifi;
if (ifi < 0)
return ifi;
if (!arg_private_network)
if (!udev) {
return -ENOMEM;
int ifi;
if (ifi < 0)
return ifi;
return log_oom();
r = sd_rtnl_message_close_container(m);
r = sd_rtnl_message_close_container(m);
static int audit_still_doesnt_work_in_containers(void) {
#ifdef HAVE_SECCOMP
if (!seccomp)
return log_oom();
goto finish;
r = seccomp_rule_add(
goto finish;
goto finish;
int r, nr;
if (fd < 0) {
return -errno;
return -errno;
return log_oom();
*device_path = p;
r = fd;
return -EINVAL;
if (control < 0) {
return -errno;
if (nr < 0) {
return -errno;
return log_oom();
if (loop < 0) {
return -errno;
return -errno;
if (arg_read_only)
return -errno;
r = loop;
static int dissect_image(
int fd,
bool *secondary) {
#ifdef HAVE_BLKID
b = blkid_new_probe();
return log_oom();
errno = 0;
if (errno == 0)
return log_oom();
return -errno;
errno = 0;
r = blkid_do_safeprobe(b);
"Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
return -EINVAL;
if (errno == 0)
return -errno;
"Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
return -EINVAL;
errno = 0;
if (!pl) {
if (errno == 0)
return log_oom();
return -errno;
if (!udev)
return log_oom();
return -errno;
return log_oom();
return log_oom();
r = udev_enumerate_add_match_parent(e, d);
return log_oom();
r = udev_enumerate_scan_devices(e);
unsigned long long flags;
int nr;
errno = 0;
if (!errno)
return -errno;
if (!node)
if (!pp)
if (nr < 0)
if (!stype)
if (!home)
return log_oom();
if (!srv)
return log_oom();
#ifdef GPT_ROOT_NATIVE
if (!root)
return log_oom();
#ifdef GPT_ROOT_SECONDARY
if (!secondary_root)
return log_oom();
"Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
return -EINVAL;
if (root) {
*secondary = false;
} else if (secondary_root) {
*secondary = true;
if (home) {
if (srv) {
return -ENOTSUP;
#ifdef HAVE_BLKID
const char *fstype, *p;
if (arg_read_only)
rw = false;
if (directory)
p = where;
errno = 0;
if (errno == 0)
return log_oom();
return -errno;
errno = 0;
r = blkid_do_safeprobe(b);
return -EINVAL;
if (errno == 0)
return -errno;
errno = 0;
if (errno == 0)
return -errno;
return -ENOTSUP;
return -errno;
return -ENOTSUP;
static int mount_devices(
const char *where,
if (root_device) {
if (home_device) {
if (srv_device) {
if (nr < 0)
if (control < 0)
return -errno;
if (pid < 0) {
return -errno;
} else if (pid == 0) {
int nullfd;
if (nullfd < 0)
return pipe_fds[0];
unsigned n_uids = 0;
return -errno;
if (setresgid(0, 0, 0) < 0) {
return -errno;
if (setresuid(0, 0, 0) < 0) {
return -errno;
if (fd < 0)
return fd;
return log_oom();
if (!ferror(f)) {
return -ESRCH;
return -errno;
return -EIO;
return -EIO;
return -EIO;
return -EIO;
return -EIO;
return -EIO;
return -EIO;
return -EIO;
if (!home)
return log_oom();
if (fd < 0)
return fd;
fclose(f);
return log_oom();
if (!ferror(f)) {
return -ESRCH;
return -errno;
x = line;
memcpy(c, w, l);
return log_oom();
return -EIO;
if (r < 0 && r != -EEXIST) {
return -errno;
return -errno;
return -errno;
if (_home) {
case CLD_EXITED:
if (!arg_quiet)
case CLD_KILLED:
if (!arg_quiet)
if (!arg_quiet)
case CLD_DUMPED:
_cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
bool secondary = false;
log_open();
goto finish;
r = EXIT_SUCCESS;
goto finish;
if (!arg_image) {
if (arg_directory) {
arg_directory = p;
if (!arg_directory) {
goto finish;
if (!arg_machine) {
if (!arg_machine) {
log_oom();
goto finish;
goto finish;
if (geteuid() != 0) {
goto finish;
if (sd_booted() <= 0) {
goto finish;
log_close();
if (n_fd_passed > 0) {
goto finish;
log_open();
if (arg_directory) {
goto finish;
if (arg_boot) {
log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
goto finish;
log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
goto finish;
r = -errno;
goto finish;
if (!arg_directory) {
r = log_oom();
goto finish;
if (image_fd < 0) {
r = image_fd;
goto finish;
r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
goto finish;
if (master < 0) {
goto finish;
if (!console) {
goto finish;
if (!arg_quiet)
goto finish;
if (arg_share_system) {
if (!kdbus_domain) {
log_oom();
goto finish;
const char *ns;
goto finish;
goto finish;
goto finish;
if (pid < 0) {
log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
r = pid;
goto finish;
if (pid == 0) {
const char *envp[] = {
char **env_use;
n_env ++;
if (k != STDIN_FILENO) {
safe_close(k);
k = -EINVAL;
goto child_fail;
goto child_fail;
if (setsid() < 0) {
goto child_fail;
if (reset_audit_loginuid() < 0)
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
if (arg_read_only) {
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
if (audit_still_doesnt_work_in_containers() < 0)
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
goto child_fail;
if (arg_private_network)
if (drop_capabilities() < 0) {
goto child_fail;
goto child_fail;
log_oom();
goto child_fail;
if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
log_oom();
goto child_fail;
goto child_fail;
log_oom();
goto child_fail;
goto child_fail;
} else if (secondary) {
goto child_fail;
#ifdef HAVE_SELINUX
if (arg_selinux_context)
goto child_fail;
log_oom();
goto child_fail;
env_use = n;
goto child_fail;
if (arg_boot) {
size_t l;
goto finish;
goto finish;
goto finish;
goto finish;
goto finish;
goto finish;
goto finish;
goto finish;
r = EXIT_FAILURE;
if (!arg_quiet)
pid = 0;
r = EXIT_FAILURE;
if (pid > 0)