nspawn.c revision 1e41be20158a6d982c34cea20e66ff271302abc5
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
This file is part of systemd.
Copyright 2010 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <signal.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <sys/capability.h>
#include <getopt.h>
#include <termios.h>
#include <sys/signalfd.h>
#include <grp.h>
#include <systemd/sd-daemon.h>
#include "log.h"
#include "util.h"
#include "mkdir.h"
#include "audit.h"
#include "missing.h"
#include "cgroup-util.h"
#include "strv.h"
#include "path-util.h"
#include "loopback-setup.h"
#include "sd-id128.h"
/* Journal linking mode for the container (see --link-journal in the help text).
 * NOTE(review): no enumerators are visible between the braces in this view,
 * yet LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST are compared against
 * arg_link_journal in setup_journal() - confirm the enumerator list. */
typedef enum LinkJournal {
} LinkJournal;
/* Settings taken from the command line. */

/* Root directory for the container (-D / --directory). */
static char *arg_directory = NULL;
/* Controller list given with -C / --controllers; the option handling reports
 * a failure to split the list when this is still NULL afterwards. */
static char **arg_controllers = NULL;
/* Set by --private-network: adds CLONE_NEWNET to the clone() flags and makes
 * setup_resolv_conf() return early. */
static bool arg_private_network = false;
/* Set by --read-only. */
static bool arg_read_only = false;
/* Set by -b / --boot. */
static bool arg_boot = false;
/* Bit mask with one bit per CAP_* number naming the capabilities to retain.
 * drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(). The initialiser below is the default set;
 * per the help text --capability retains further ones in addition. */
static uint64_t arg_retain =
(1ULL << CAP_CHOWN) |
(1ULL << CAP_DAC_OVERRIDE) |
(1ULL << CAP_DAC_READ_SEARCH) |
(1ULL << CAP_FOWNER) |
(1ULL << CAP_FSETID) |
(1ULL << CAP_IPC_OWNER) |
(1ULL << CAP_KILL) |
(1ULL << CAP_LEASE) |
(1ULL << CAP_LINUX_IMMUTABLE) |
(1ULL << CAP_NET_BIND_SERVICE) |
(1ULL << CAP_NET_BROADCAST) |
(1ULL << CAP_NET_RAW) |
(1ULL << CAP_SETGID) |
(1ULL << CAP_SETFCAP) |
(1ULL << CAP_SETPCAP) |
(1ULL << CAP_SETUID) |
(1ULL << CAP_SYS_ADMIN) |
(1ULL << CAP_SYS_CHROOT) |
(1ULL << CAP_SYS_NICE) |
(1ULL << CAP_SYS_PTRACE) |
(1ULL << CAP_SYS_TTY_CONFIG) |
(1ULL << CAP_SYS_RESOURCE);
/* Print the command line usage summary to stdout.
 *
 * The leading %s of the usage line is filled with the short name the
 * program was invoked as.
 *
 * Returns 0 always. */
static int help(void) {
        /* Declared locally so that this block does not depend on _GNU_SOURCE
         * having been defined before <errno.h> was included; the declaration
         * is compatible with the one glibc provides. */
        extern char *program_invocation_short_name;

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=host\n",
               program_invocation_short_name);

        return 0;
}
/* Command line option dispatch.
 * NOTE(review): the enclosing function header, the option table and the
 * loop that fetches each option into c are not visible in this view, and
 * several statements inside the cases are missing as well.
 * Visible return values: 0 right after printing the help text, a negative
 * errno-style code on failure, and 1 at the very end. */
enum {
/* Codes for options without a short letter start above the char range.
 * NOTE(review): ARG_UUID, ARG_READ_ONLY, ARG_CAPABILITY and ARG_LINK_JOURNAL
 * are used below but are not visible in this enum - confirm. */
ARG_PRIVATE_NETWORK = 0x100,
};
};
int c;
switch (c) {
case 'h':
/* -h / --help: print usage and stop. */
help();
return 0;
case 'D':
/* -D / --directory. NOTE(review): the assignment to arg_directory is not visible. */
if (!arg_directory) {
log_error("Failed to canonicalize root directory.");
return -ENOMEM;
}
break;
case 'u':
/* -u / --user. NOTE(review): the duplication and its check are not visible. */
log_error("Failed to duplicate user name.");
return -ENOMEM;
}
break;
case 'C':
/* -C / --controllers. NOTE(review): the split that fills arg_controllers is not visible. */
if (!arg_controllers) {
log_error("Failed to split controllers list.");
return -ENOMEM;
}
break;
case ARG_PRIVATE_NETWORK:
arg_private_network = true;
break;
case 'b':
/* -b / --boot */
arg_boot = true;
break;
case ARG_UUID:
/* presumably --uuid; no handling is visible here */
break;
case ARG_READ_ONLY:
arg_read_only = true;
break;
case ARG_CAPABILITY: {
/* presumably --capability: t holds a capability name that is looked up
 * with cap_from_name(); t is freed on both the error and success path.
 * NOTE(review): the assignment of t and the declaration of cap are not visible. */
char *t;
if (!t)
return log_oom();
if (cap_from_name(t, &cap) < 0) {
log_error("Failed to parse capability %s.", t);
free(t);
return -EINVAL;
}
free(t);
}
break;
}
case 'j':
/* -j: per the help text equivalent to --link-journal=host; the assignment is not visible */
break;
case ARG_LINK_JOURNAL:
/* presumably --link-journal; only the rejection of an unknown mode is visible */
else {
return -EINVAL;
}
break;
case '?':
return -EINVAL;
default:
log_error("Unknown option code %c", c);
return -EINVAL;
}
}
return 1;
}
/* One entry of mount_table, describing a single mount operation.
 * The visible table entry initialises the fields positionally, so the
 * field order must not change. */
typedef struct MountPoint {
/* First field; NULL in the visible remount entry. Presumably the mount source - confirm. */
const char *what;
/* Second field; the visible entry uses an absolute path (/sys/fs/selinux). */
const char *where;
/* type, options and flags are passed on together by the mount loop;
 * flags holds MS_* bits in the visible entry. */
const char *type;
const char *options;
unsigned long flags;
/* The mount loop only records an error for a failed entry when this is set. */
bool fatal;
} MountPoint;
/* Table of mounts plus the loop that walks it.
 * NOTE(review): the enclosing function header is not visible in this view
 * (presumably mount_all(), which main calls with arg_directory), and only
 * the SELinux entry of the table is visible. */
static const MountPoint mount_table[] = {
#ifdef HAVE_SELINUX
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
#endif
};
unsigned k;
/* r keeps the first error encountered; later errors do not overwrite it. */
int r = 0;
char *where;
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
int t;
/* Out of memory: record -ENOMEM unless an earlier error is already stored, then stop.
 * NOTE(review): the allocation of where and its check are not visible. */
log_oom();
if (r == 0)
r = -ENOMEM;
break;
}
t = path_is_mount_point(where, false);
if (t < 0) {
/* The check itself failed: remember the error and move on to the next entry. */
if (r == 0)
r = t;
continue;
}
/* NOTE(review): the head of this call is not visible (presumably mount()).
 * A failure is only recorded when the entry is marked fatal. */
mount_table[k].type,
mount_table[k].flags,
mount_table[k].options) < 0 &&
mount_table[k].fatal) {
if (r == 0)
r = -errno;
}
}
return r;
}
/* Timezone setup for the container; main calls this with arg_directory as dest.
 * Visible results: the value of log_oom() on allocation failure, 0 otherwise.
 * NOTE(review): the statements doing the actual work, including the
 * allocations guarded by the two log_oom() returns, are not visible. */
static int setup_timezone(const char *dest) {
char *where;
/* Fix the timezone, if possible */
return log_oom();
return log_oom();
return 0;
}
/* resolv.conf setup for the container; main calls this with arg_directory as dest.
 * Does nothing and returns 0 when --private-network is in effect.
 * Visible results otherwise: the value of log_oom() on allocation failure, else 0.
 * NOTE(review): the statements doing the actual work are not visible. */
static int setup_resolv_conf(const char *dest) {
char *where;
if (arg_private_network)
return 0;
/* Fix resolv.conf, if possible */
return log_oom();
}
return 0;
}
/* Handle the fixed list of device node names below for the container;
 * main calls this with arg_directory as dest.
 * Returns 0 or the first error recorded (a negative errno-style code).
 * NOTE(review): the statements that build the paths and create the nodes
 * are not visible in this view. */
static int copy_devnodes(const char *dest) {
/* Names separated by NUL bytes, walked with NULSTR_FOREACH below. */
static const char devnodes[] =
"null\0"
"zero\0"
"full\0"
"random\0"
"urandom\0"
"tty\0"
"ptmx\0"
"rtc0\0";
const char *d;
/* r keeps the first error; every assignment below is guarded by r == 0. */
int r = 0;
mode_t u;
/* Clear the umask for the duration of the loop; the old value is restored before returning. */
u = umask(0000);
NULSTR_FOREACH(d, devnodes) {
/* Allocation failure stops the loop; the other failures below only record an error. */
log_error("Failed to allocate devnode path");
if (r == 0)
r = -ENOMEM;
break;
}
if (r == 0)
r = -errno;
}
if (r == 0)
r = -EIO;
if (r == 0)
r = -errno;
}
}
umask(u);
return r;
}
/* NOTE(review): the function header and most statements of this function
 * are not visible in this view (presumably the console device setup - confirm).
 * Visible behaviour: the umask is cleared and later restored from u, every
 * failure stores a negative code in r and jumps to finish, and r is returned.
 * The comment fragment further down has lost its opening lines. */
int r;
mode_t u;
u = umask(0000);
r = -errno;
goto finish;
r = -EIO;
goto finish;
}
if (r < 0) {
goto finish;
}
r = log_oom();
goto finish;
}
* ptys can only exist on pts file systems. To have something
* to bind mount things on we create a device node first, that
* doesn't actually matter here, since we mount it over
* anyway). */
r = -errno;
goto finish;
}
r = -errno;
goto finish;
}
umask(u);
return r;
}
/* NOTE(review): the function header and most statements of this function
 * are not visible in this view (presumably the kmsg setup - confirm).
 * Visible behaviour: a FIFO is opened and its descriptor is sent over
 * kmsg_socket; every failure stores a negative code in r and jumps to
 * finish; the umask is cleared at the start and restored before r is returned. */
int r, fd, k;
mode_t u;
/* NOTE(review): the members of this union are not visible. */
union {
} control;
/* kmsg_socket must be a valid descriptor; presumably a parameter of this function. */
assert(kmsg_socket >= 0);
u = umask(0000);
/* NOTE(review): the two lines below are the remains of a comment whose
 * opening and closing lines are not visible. */
* that writing blocks when nothing is reading. In order to
* avoid any problems with containers deadlocking due to this
r = log_oom();
goto finish;
}
r = log_oom();
goto finish;
}
r = -errno;
goto finish;
}
if (r < 0) {
goto finish;
}
r = -errno;
goto finish;
}
if (fd < 0) {
log_error("Failed to open fifo: %m");
r = -errno;
goto finish;
}
/* Store away the fd in the socket, so that it stays open as
* long as we run the child */
if (k < 0) {
log_error("Failed to send FIFO fd: %m");
r = -errno;
goto finish;
}
umask(u);
return r;
}
/* Hostname setup for the container.
 * Returns 0 on success, -ENOMEM on allocation failure, or a negative errno.
 * NOTE(review): hn is tested below without any visible assignment, and the
 * call whose failure sets r from errno is not visible either - the
 * statements in between are missing from this view. */
static int setup_hostname(void) {
char *hn;
int r = 0;
if (hn) {
if (!hn)
return -ENOMEM;
r = -errno;
}
return r;
}
/* Link the journal of the container with the host as selected by
 * arg_link_journal; main calls this with arg_directory as directory.
 *
 * Returns 0 on success or when there is nothing to do, otherwise a
 * negative errno-style code.
 *
 * NOTE(review): the declarations of p, q, d, b, l and machine_id, the
 * construction of the paths p and q, several conditions and the finish
 * label are not visible in this view. */
static int setup_journal(const char *directory) {
int r;
/* Linking disabled: nothing to do. */
if (arg_link_journal == LINK_NO)
return 0;
if (!p) {
r = log_oom();
goto finish;
}
/* Read the single line stored in the file named by p into b. */
r = read_one_line_file(p, &b);
r = 0;
goto finish;
} else if (r < 0) {
/* NOTE(review): this returns directly while every other error path jumps
 * to finish, where p, q, d and b are freed; p appears to be allocated at
 * this point, so this path may leak it - confirm. */
return r;
}
l = strstrip(b);
r = 0;
goto finish;
}
/* Verify validity */
r = sd_id128_from_string(l, &machine_id);
if (r < 0) {
goto finish;
}
/* The old p is released here; the test below implies p and q are then
 * freshly allocated by statements not visible in this view. */
free(p);
if (!p || !q) {
r = log_oom();
goto finish;
}
/* If either path is already a mount point: an error for explicit modes,
 * silently accepted for LINK_AUTO. */
if (path_is_mount_point(p, false) > 0 ||
path_is_mount_point(q, false) > 0) {
if (arg_link_journal != LINK_AUTO) {
log_error("Journal already a mount point, refusing.");
r = -EEXIST;
goto finish;
}
r = 0;
goto finish;
}
r = readlink_and_make_absolute(p, &d);
if (r >= 0) {
/* p is a symlink. For guest and auto mode a link that already points
 * at q is kept; q is created (result ignored) and we are done. */
if ((arg_link_journal == LINK_GUEST ||
arg_link_journal == LINK_AUTO) &&
path_equal(d, q)) {
mkdir_p(q, 0755);
r = 0;
goto finish;
}
/* Any other symlink is removed. */
if (unlink(p) < 0) {
log_error("Failed to remove symlink %s: %m", p);
r = -errno;
goto finish;
}
} else if (r == -EINVAL) {
/* p exists but is not a symlink: in guest mode try to remove it as a directory. */
if (arg_link_journal == LINK_GUEST &&
rmdir(p) < 0) {
log_error("%s already exists and is neither symlink nor directory.", p);
else {
log_error("Failed to remove %s: %m", p);
r = -errno;
}
goto finish;
}
} else if (r != -ENOENT) {
/* Any failure other than a missing p is fatal; r still holds the
 * negative code returned by readlink_and_make_absolute(). */
log_error("readlink(%s) failed: %m", p);
goto finish;
}
if (arg_link_journal == LINK_GUEST) {
/* Guest mode: create p as a symlink pointing at q, then create q (result ignored). */
if (symlink(q, p) < 0) {
log_error("Failed to symlink %s to %s: %m", q, p);
r = -errno;
goto finish;
}
mkdir_p(q, 0755);
r = 0;
goto finish;
}
if (arg_link_journal == LINK_HOST) {
r = mkdir_p(p, 0755);
if (r < 0) {
log_error("Failed to create %s: %m", p);
goto finish;
}
r = 0;
goto finish;
}
/* dir_is_empty() returning 0 is treated as q having content: refuse. */
if (dir_is_empty(q) == 0) {
log_error("%s not empty.", q);
r = -ENOTEMPTY;
goto finish;
}
r = mkdir_p(q, 0755);
if (r < 0) {
log_error("Failed to create %s: %m", q);
goto finish;
}
/* NOTE(review): the bind mount call this message belongs to is not visible. */
log_error("Failed to bind mount journal from host into guest: %m");
r = -errno;
goto finish;
}
r = 0;
/* Common cleanup: release every string this function may have allocated. */
free(p);
free(q);
free(d);
free(b);
return r;
}
/* Hand the complement of arg_retain, i.e. every capability bit that is not
 * marked for retention, to capability_bounding_set_drop().
 *
 * Returns whatever capability_bounding_set_drop() returns. */
static int drop_capabilities(void) {
        uint64_t to_drop;

        to_drop = ~arg_retain;

        return capability_bounding_set_drop(to_drop, false);
}
/* Check performed on path before a container is spawned; main refuses to
 * continue when the result is <= 0.
 * Returns -ENOMEM on allocation failure, 1 when the check result r is
 * non-negative, and 0 otherwise.
 * NOTE(review): the allocation of p, its NULL test and the call that sets r
 * are not visible in this view. */
static int is_os_tree(const char *path) {
int r;
char *p;
return -ENOMEM;
free(p);
return r < 0 ? 0 : 1;
}
/* NOTE(review): the function header, the buffer declarations and most of
 * the system calls of this function are not visible in this view.
 * From the visible names and messages this appears to be the loop that
 * shuttles data between stdin/stdout and a pty master through two buffers,
 * driven by epoll and a signalfd - confirm.
 * Every failure stores a negative code in r and jumps to finish; r is returned. */
bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
if (signal_fd < 0) {
log_error("signalfd(): %m");
r = -errno;
goto finish;
}
if (ep < 0) {
log_error("Failed to create epoll: %m");
r = -errno;
goto finish;
}
/* NOTE(review): the message below misspells register; it is runtime text and left untouched here. */
log_error("Failed to regiser fds in epoll: %m");
r = -errno;
goto finish;
}
for (;;) {
ssize_t k;
int i, nfds;
if (nfds < 0) {
/* NOTE(review): the condition guarding this continue is not visible. */
continue;
log_error("epoll_wait(): %m");
r = -errno;
goto finish;
}
/* Turn the reported events into readiness flags.
 * NOTE(review): the tests selecting which flag to set are not visible. */
for (i = 0; i < nfds; i++) {
stdin_readable = true;
stdout_writable = true;
master_readable = true;
master_writable = true;
struct signalfd_siginfo sfsi;
ssize_t n;
/* Anything but one complete siginfo record is an error: a short
 * non-negative count is reported as -EIO, a negative one via errno. */
if (n != sizeof(sfsi)) {
if (n >= 0) {
log_error("Failed to read from signalfd: invalid block size");
r = -EIO;
goto finish;
}
log_error("Failed to read from signalfd: %m");
r = -errno;
goto finish;
}
} else {
/* The window size changed, let's forward that. */
} else {
/* Any other signal ends the loop with success. */
r = 0;
goto finish;
}
}
}
}
/* Keep going while at least one direction can make progress: fill the
 * input buffer from stdin when it is empty, drain it to the master when it
 * holds data, and the same for master to stdout with the output buffer. */
while ((stdin_readable && in_buffer_full <= 0) ||
(master_writable && in_buffer_full > 0) ||
(master_readable && out_buffer_full <= 0) ||
(stdout_writable && out_buffer_full > 0)) {
if (k < 0) {
/* NOTE(review): the condition under which a failed call merely clears the
 * readiness flag is not visible; the same holds for the three copies below. */
stdin_readable = false;
else {
log_error("read(): %m");
r = -errno;
goto finish;
}
} else
in_buffer_full += (size_t) k;
}
if (master_writable && in_buffer_full > 0) {
if (k < 0) {
master_writable = false;
else {
log_error("write(): %m");
r = -errno;
goto finish;
}
} else {
in_buffer_full -= k;
}
}
if (k < 0) {
master_readable = false;
else {
log_error("read(): %m");
r = -errno;
goto finish;
}
} else
out_buffer_full += (size_t) k;
}
if (stdout_writable && out_buffer_full > 0) {
if (k < 0) {
stdout_writable = false;
else {
log_error("write(): %m");
r = -errno;
goto finish;
}
} else {
out_buffer_full -= k;
}
}
}
}
/* Cleanup. NOTE(review): the statements guarded by these two tests are not visible. */
if (ep >= 0)
if (signal_fd >= 0)
return r;
}
/* NOTE(review): the function header is not visible in this view and many
 * statements between the visible lines are missing (presumably this is
 * main - confirm).
 * Visible flow: validate the environment, acquire a pseudo tty, clone() a
 * child into new namespaces, let the child prepare and enter the container
 * root and exec the payload, then clean up and return r. */
int r = EXIT_FAILURE, k;
char **controller = NULL;
int master = -1;
/* True once terminal attributes were saved; tested again in the cleanup code. */
bool saved_attr_valid = false;
log_open();
/* A result <= 0 skips straight to cleanup.
 * NOTE(review): the call that assigns r here is not visible. */
if (r <= 0)
goto finish;
if (arg_directory) {
char *p;
arg_directory = p;
} else
if (!arg_directory) {
log_error("Failed to determine path");
goto finish;
}
/* Refuse to run without root privileges. */
if (geteuid() != 0) {
log_error("Need to be root.");
goto finish;
}
if (sd_booted() <= 0) {
log_error("Not running on a systemd system.");
goto finish;
}
log_error("Spawning container on root directory not supported.");
goto finish;
}
if (is_os_tree(arg_directory) <= 0) {
goto finish;
}
if (k < 0) {
goto finish;
}
log_error("Failed to allocate cgroup path.");
goto finish;
}
if (k < 0) {
goto finish;
}
if (k < 0)
}
if (master < 0) {
log_error("Failed to acquire pseudo tty: %m");
goto finish;
}
if (!console) {
log_error("Failed to determine tty name: %m");
goto finish;
}
log_error("Failed to unlock tty: %m");
goto finish;
}
log_error("Failed to get terminal attributes: %m");
goto finish;
}
saved_attr_valid = true;
log_error("Failed to set terminal attributes: %m");
goto finish;
}
log_error("Failed to create kmsg socket pair");
goto finish;
}
/* Create the child in new IPC, mount, PID and UTS namespaces; a new network
 * namespace is requested only when --private-network was given. */
pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
if (pid < 0) {
/* NOTE(review): the condition choosing between the two messages is not visible. */
log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
else
log_error("clone() failed: %m");
goto finish;
}
if (pid == 0) {
/* child */
/* Environment for the payload; the NULL slots are placeholders named by
 * the comment next to each of them. */
const char *envp[] = {
"container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
NULL, /* TERM */
NULL, /* HOME */
NULL, /* USER */
NULL, /* LOGNAME */
NULL, /* container_uuid */
};
goto child_fail;
if (setsid() < 0) {
log_error("setsid() failed: %m");
goto child_fail;
}
log_error("PR_SET_PDEATHSIG failed: %m");
goto child_fail;
}
/* Mark everything as slave, so that we still
* receive mounts from the real root, but don't
* propagate mounts to the real root. */
log_error("MS_SLAVE|MS_REC failed: %m");
goto child_fail;
}
/* Turn directory into bind mount */
log_error("Failed to make bind mount.");
goto child_fail;
}
if (arg_read_only)
log_error("Failed to make read-only.");
goto child_fail;
}
/* Populate the container root; any failure aborts the child. */
if (mount_all(arg_directory) < 0)
goto child_fail;
if (copy_devnodes(arg_directory) < 0)
goto child_fail;
goto child_fail;
goto child_fail;
if (setup_timezone(arg_directory) < 0)
goto child_fail;
if (setup_resolv_conf(arg_directory) < 0)
goto child_fail;
if (setup_journal(arg_directory) < 0)
goto child_fail;
/* Enter the container root: chdir into it, move the mount (call not
 * visible), chroot to the current directory and chdir to the new root. */
if (chdir(arg_directory) < 0) {
goto child_fail;
}
log_error("mount(MS_MOVE) failed: %m");
goto child_fail;
}
if (chroot(".") < 0) {
log_error("chroot() failed: %m");
goto child_fail;
}
if (chdir("/") < 0) {
log_error("chdir() failed: %m");
goto child_fail;
}
umask(0022);
if (drop_capabilities() < 0) {
log_error("drop_capabilities() failed: %m");
goto child_fail;
}
/* With a user configured, the steps named by the messages below are
 * performed; the calls themselves are not visible in this view. */
if (arg_user) {
log_error("get_user_creds() failed: %m");
goto child_fail;
}
log_error("mkdir_parents_label() failed: %m");
goto child_fail;
}
log_error("mkdir_safe_label() failed: %m");
goto child_fail;
}
log_error("initgroups() failed: %m");
goto child_fail;
}
log_error("setregid() failed: %m");
goto child_fail;
}
log_error("setreuid() failed: %m");
goto child_fail;
}
}
log_oom();
goto child_fail;
}
if (arg_uuid) {
log_oom();
goto child_fail;
}
}
if (arg_boot) {
char **a;
size_t l;
/* Automatically search for the init system */
a = newa(char*, l + 1);
else {
}
/* Only reached when the exec call did not replace the process. */
log_error("execv() failed: %m");
}
goto finish;
if (saved_attr_valid) {
saved_attr_valid = false;
}
/* Map a negative result to EXIT_FAILURE for the return value. */
if (r < 0)
r = EXIT_FAILURE;
/* Cleanup. NOTE(review): the statements guarded by the tests below are not visible. */
if (saved_attr_valid)
if (master >= 0)
if (oldcg)
if (newcg)
return r;
}