nspawn.c revision 0f0dbc46ccf5aaaf3131446d0a4d78bc97a37295
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
This file is part of systemd.
Copyright 2010 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <signal.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <sys/capability.h>
#include <getopt.h>
#include <termios.h>
#include <sys/signalfd.h>
#include <grp.h>
#include <systemd/sd-daemon.h>
#include "log.h"
#include "util.h"
#include "mkdir.h"
#include "audit.h"
#include "missing.h"
#include "cgroup-util.h"
#include "strv.h"
#include "loopback-setup.h"
/* Settings filled in by the command line option switch. */
/* Root directory of the container (-D); the option switch fails with -ENOMEM while it is unset. */
static char *arg_directory = NULL;
/* Controller list (-C); the option switch fails with -ENOMEM while it is unset. */
static char **arg_controllers = NULL;
/* Set by --private-network; adds CLONE_NEWNET to the clone flags. */
static bool arg_private_network = false;
/* Set by -b; selects the branch that searches for the init system before exec. */
static bool arg_boot = false;
/* Print the usage summary to stdout.
 *
 * The leading %s of the format is filled with the short name the program was
 * invoked under (program_invocation_short_name, provided by <errno.h> on glibc).
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --private-network Disable network in container\n",
               program_invocation_short_name);

        return 0;
}
/* Option code for --private-network, which has no single-letter form.
 * 0x100 lies above every value a single character option code can take. */
enum {
ARG_PRIVATE_NETWORK = 0x100
};
/* NOTE(review): the enclosing function header, the table closed by the brace below and the loop
 * that assigns c are not visible in this region; confirm against the full file.
 *
 * Result codes visible here: 0 after printing the help text, 1 once all options were handled,
 * -ENOMEM or -EINVAL on failure. */
};
int c;
/* Dispatch on the option code held in c. */
switch (c) {
case 'h':
/* Print the usage text and stop with 0. */
help();
return 0;
case 'D':
/* An unset arg_directory at this point is reported as an allocation failure. */
if (!arg_directory) {
log_error("Failed to canonicalize root directory.");
return -ENOMEM;
}
break;
case 'u':
/* NOTE(review): the condition guarding this error path is not visible here. */
log_error("Failed to duplicate user name.");
return -ENOMEM;
}
break;
case 'C':
/* An unset arg_controllers at this point is reported as an allocation failure. */
if (!arg_controllers) {
log_error("Failed to split controllers list.");
return -ENOMEM;
}
break;
case ARG_PRIVATE_NETWORK:
arg_private_network = true;
break;
case 'b':
arg_boot = true;
break;
case '?':
/* Rejected without logging anything here. */
return -EINVAL;
default:
log_error("Unknown option code %c", c);
return -EINVAL;
}
}
/* All options consumed without error. */
return 1;
}
/* One entry of the mount table. Entries in mount_table are written positionally,
 * so the order of the members below must not change. */
struct MountPoint {
        const char *what;       /* first string of an entry; presumably the mount source (confirm) */
        const char *where;      /* second string of an entry; presumably the mount target (confirm) */
        const char *type;       /* type string of the entry */
        const char *options;    /* option string, NULL when the entry has none */
        unsigned long flags;    /* MS_* flag mask of the entry */
        bool fatal;             /* when true, a failure of this entry is recorded as an error */
};

typedef struct MountPoint MountPoint;
/* Mount table walked in order by the loop over mount_table. Each entry lists, positionally,
 * the MountPoint members what, where, type, options, flags and fatal.
 * NOTE(review): the word Then in the entry comments suggests that earlier entries precede these;
 * none are visible in this region, confirm against the full file. */
static const MountPoint mount_table[] = {
{ "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
#ifdef HAVE_SELINUX
/* Only compiled in when HAVE_SELINUX is defined; marked as not fatal. */
{ "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
#endif
};
/* NOTE(review): the function header, the computation of where and the start of the call that
 * consumes type, flags and options are not visible in this region; confirm against the full file. */
unsigned k;
/* r keeps the first error seen; later failures never overwrite it. */
int r = 0;
char *where;
/* Visit every entry of mount_table in order. */
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
int t;
/* Allocation failure ends the loop. */
log_error("Out of memory");
if (r == 0)
r = -ENOMEM;
break;
}
/* A failed mount point check is recorded and the entry is skipped. */
t = path_is_mount_point(where, false);
if (t < 0) {
if (r == 0)
r = t;
continue;
}
mount_table[k].type,
mount_table[k].flags,
mount_table[k].options) < 0 &&
mount_table[k].fatal) {
/* Only failures of entries marked fatal are recorded. */
if (r == 0)
r = -errno;
}
}
return r;
}
/* Timezone setup for the container tree rooted at dest.
 *
 * Returns 0, or -ENOMEM after logging an out of memory error.
 * NOTE(review): the conditions guarding both error paths and the statements that use where are
 * not visible in this region; confirm against the full file. */
static int setup_timezone(const char *dest) {
char *where;
/* Fix the timezone, if possible */
log_error("Out of memory");
return -ENOMEM;
}
log_error("Out of memory");
return -ENOMEM;
}
return 0;
}
/* Process each device node named in the devnodes list for the container tree rooted at dest.
 *
 * Returns 0 when nothing failed, otherwise the first error recorded (-ENOMEM, -EIO or a negated
 * errno value).
 * NOTE(review): the conditions guarding the error paths and the calls that create the nodes are
 * not visible in this region; confirm against the full file. */
static int copy_devnodes(const char *dest) {
/* Names are stored back to back, each terminated by a NUL byte, and walked with NULSTR_FOREACH. */
static const char devnodes[] =
"null\0"
"zero\0"
"full\0"
"random\0"
"urandom\0"
"tty\0"
"ptmx\0"
"rtc0\0";
const char *d;
/* r keeps the first error seen; later failures never overwrite it. */
int r = 0;
mode_t u;
/* Clear the umask for the duration of the loop; the previous value is restored before returning. */
u = umask(0000);
NULSTR_FOREACH(d, devnodes) {
/* Allocation failure ends the loop. */
log_error("Failed to allocate devnode path");
if (r == 0)
r = -ENOMEM;
break;
}
if (r == 0)
r = -errno;
}
if (r == 0)
r = -EIO;
if (r == 0)
r = -errno;
}
}
umask(u);
return r;
}
/* NOTE(review): the function header, the conditions guarding every error path, the opening of the
 * explanatory comment about ptys further down and the finish label are not visible in this
 * region; confirm against the full file.
 *
 * What is visible: the umask is cleared on entry and restored before returning r, and every
 * failure stores a negative error code in r (negated errno, -EIO or -ENOMEM) before jumping to
 * finish. */
int r;
mode_t u;
/* Remember the previous umask so it can be restored before returning. */
u = umask(0000);
r = -errno;
goto finish;
r = -EIO;
goto finish;
}
if (r < 0) {
goto finish;
}
log_error("Out of memory");
r = -ENOMEM;
goto finish;
}
* ptys can only exist on pts file systems. To have something
* to bind mount things on we create a device node first, that
* doesn't actually matter here, since we mount it over
* anyway). */
r = -errno;
goto finish;
}
r = -errno;
goto finish;
}
umask(u);
return r;
}
/* NOTE(review): the function header, the members of the control union, the conditions guarding
 * most error paths, the opening of the explanatory comment below and the finish label are not
 * visible in this region; confirm against the full file.
 *
 * What is visible: a FIFO is opened into fd and that descriptor is sent over kmsg_socket so that
 * it stays open while the child runs. Failures store -ENOMEM or a negated errno in r and jump to
 * finish. */
int r, fd, k;
mode_t u;
union {
} control;
/* The caller must pass a valid socket descriptor. */
assert(kmsg_socket >= 0);
/* Clear the umask for the duration of the setup; restored before returning. */
u = umask(0000);
* that writing blocks when nothing is reading. In order to
* avoid any problems with containers deadlocking due to this
log_error("Out of memory");
r = -ENOMEM;
goto finish;
}
log_error("Out of memory");
r = -ENOMEM;
goto finish;
}
r = -errno;
goto finish;
}
if (r < 0) {
goto finish;
}
r = -errno;
goto finish;
}
/* fd holds the result of opening the FIFO. */
if (fd < 0) {
log_error("Failed to open fifo: %m");
r = -errno;
goto finish;
}
/* Store away the fd in the socket, so that it stays open as
* long as we run the child */
/* k holds the result of sending the descriptor. */
if (k < 0) {
log_error("Failed to send FIFO fd: %m");
r = -errno;
goto finish;
}
umask(u);
return r;
}
/* Hostname setup for the container.
 *
 * Returns 0, -ENOMEM when hn is NULL inside the branch, or a negated errno value.
 * NOTE(review): as shown here hn is tested without ever being assigned, and the calls that
 * produce, copy and apply the name are not visible; confirm against the full file. */
static int setup_hostname(void) {
char *hn;
int r = 0;
if (hn) {
/* A NULL hn at this point is reported as an allocation failure. */
if (!hn)
return -ENOMEM;
r = -errno;
}
return r;
}
/* Drop every capability from the bounding set except those listed in retain[].
 *
 * Walks all capability numbers from 0 through cap_last_cap() and issues
 * prctl(PR_CAPBSET_DROP) for each one that is not on the keep list.
 *
 * Returns 0 on success, or the negated errno of the first failing prctl() call. */
static int drop_capabilities(void) {
        /* Capabilities that stay in the bounding set. */
        static const unsigned long retain[] = {
        };

        unsigned long cap = 0;

        while (cap <= cap_last_cap()) {
                bool keep = false;
                unsigned idx;

                /* Linear search of the keep list; stops at the first match. */
                for (idx = 0; idx < ELEMENTSOF(retain) && !keep; idx++)
                        keep = (retain[idx] == cap);

                if (!keep && prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }

                cap++;
        }

        return 0;
}
/* Check whether path looks like an OS tree.
 *
 * Returns 1 when the check result r is not negative, 0 when it is negative, and -ENOMEM on
 * allocation failure.
 * NOTE(review): the allocation of p, the condition guarding the -ENOMEM return and the call that
 * sets r are not visible in this region; confirm against the full file. */
static int is_os_tree(const char *path) {
int r;
char *p;
return -ENOMEM;
/* p is released before the result is mapped to 0 or 1. */
free(p);
return r < 0 ? 0 : 1;
}
/* NOTE(review): the function header, several declarations (buffers, ep, signal_fd, r), most of the
 * conditions guarding the statements below, the read and write calls and the finish label are
 * not visible in this region; confirm against the full file.
 *
 * What is visible: an event loop that tracks readiness of stdin, stdout and the pty master and
 * moves data through two buffers. in_buffer_full grows after a read while stdin is readable and
 * shrinks after a write while the master is writable; out_buffer_full does the same for the
 * master to stdout direction. Failures store a negative error code in r and jump to finish. */
bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
log_error("signalfd(): %m");
r = -errno;
goto finish;
}
log_error("Failed to create epoll: %m");
r = -errno;
goto finish;
}
/* NOTE(review): the message below spells register as regiser. */
log_error("Failed to regiser fds in epoll: %m");
r = -errno;
goto finish;
}
/* Main loop: wait for events, update the readiness flags, then transfer data. */
for (;;) {
ssize_t k;
int i, nfds;
continue;
log_error("epoll_wait(): %m");
r = -errno;
goto finish;
}
/* Walk the nfds reported events. */
for (i = 0; i < nfds; i++) {
stdin_readable = true;
stdout_writable = true;
master_readable = true;
master_writable = true;
struct signalfd_siginfo sfsi;
ssize_t n;
/* A read that succeeds with an unexpected size is treated as an I/O error. */
if (n >= 0) {
log_error("Failed to read from signalfd: invalid block size");
r = -EIO;
goto finish;
}
log_error("Failed to read from signalfd: %m");
r = -errno;
goto finish;
}
} else {
/* The window size changed, let's forward that. */
} else {
/* Leave the loop with a success result. */
r = 0;
goto finish;
}
}
}
}
/* Keep transferring for as long as one of the buffers can be filled or drained. */
while ((stdin_readable && in_buffer_full <= 0) ||
(master_writable && in_buffer_full > 0) ||
(master_readable && out_buffer_full <= 0) ||
(stdout_writable && out_buffer_full > 0)) {
stdin_readable = false;
else {
log_error("read(): %m");
r = -errno;
goto finish;
}
} else
/* k bytes were added to the input buffer. */
in_buffer_full += (size_t) k;
}
if (master_writable && in_buffer_full > 0) {
master_writable = false;
else {
log_error("write(): %m");
r = -errno;
goto finish;
}
} else {
/* k bytes were taken out of the input buffer. */
in_buffer_full -= k;
}
}
master_readable = false;
else {
log_error("read(): %m");
r = -errno;
goto finish;
}
} else
/* k bytes were added to the output buffer. */
out_buffer_full += (size_t) k;
}
if (stdout_writable && out_buffer_full > 0) {
stdout_writable = false;
else {
log_error("write(): %m");
r = -errno;
goto finish;
}
} else {
/* k bytes were taken out of the output buffer. */
out_buffer_full -= k;
}
}
}
}
/* NOTE(review): the statements controlled by the two checks below are not visible here. */
if (ep >= 0)
if (signal_fd >= 0)
return r;
}
/* NOTE(review): the function header, several declarations (pid, oldcg, newcg, arg_user and
 * others), most of the conditions guarding the error paths, and the finish and child_fail labels
 * are not visible in this region; confirm against the full file.
 *
 * What is visible: r starts as EXIT_FAILURE, every failure in the parent jumps to finish, every
 * failure in the child jumps to child_fail, and a negative r is mapped to EXIT_FAILURE before
 * the final return. */
int r = EXIT_FAILURE, k;
char **controller = NULL;
/* -1 marks the pty master as not yet opened; cleanup checks master >= 0. */
int master = -1;
/* Becomes true once the terminal attributes were read, so that cleanup knows they are valid. */
bool saved_attr_valid = false;
log_open();
goto finish;
if (arg_directory) {
char *p;
arg_directory = p;
} else
if (!arg_directory) {
log_error("Failed to determine path");
goto finish;
}
/* Preconditions: effective uid 0, a host booted with systemd, and a target that is_os_tree() accepts. */
if (geteuid() != 0) {
log_error("Need to be root.");
goto finish;
}
if (sd_booted() <= 0) {
log_error("Not running on a systemd system.");
goto finish;
}
log_error("Spawning container on root directory not supported.");
goto finish;
}
if (is_os_tree(arg_directory) <= 0) {
goto finish;
}
goto finish;
}
log_error("Failed to allocate cgroup path.");
goto finish;
}
if (k < 0) {
goto finish;
}
if (k < 0)
}
/* Pseudo terminal setup; judging by the messages: acquire, resolve the name, unlock, then save and change the terminal attributes. */
log_error("Failed to acquire pseudo tty: %m");
goto finish;
}
log_error("Failed to determine tty name: %m");
goto finish;
}
log_error("Failed to unlock tty: %m");
goto finish;
}
log_error("Failed to get terminal attributes: %m");
goto finish;
}
saved_attr_valid = true;
log_error("Failed to set terminal attributes: %m");
goto finish;
}
log_error("Failed to create kmsg socket pair");
goto finish;
}
/* Raw clone system call: the child gets new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when arg_private_network is set. SIGCHLD is the termination signal. */
pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
if (pid < 0) {
log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
else
log_error("clone() failed: %m");
goto finish;
}
if (pid == 0) {
/* child */
/* Environment for the payload; the NULL slots are placeholders named by their comments. */
const char *envp[] = {
"container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
NULL, /* TERM */
NULL, /* HOME */
NULL, /* USER */
NULL, /* LOGNAME */
};
if (setsid() < 0)
goto child_fail;
goto child_fail;
/* Mark / as private, in case somebody marked it shared */
goto child_fail;
/* Populate the container tree: mounts, device nodes and timezone. */
if (mount_all(arg_directory) < 0)
goto child_fail;
if (copy_devnodes(arg_directory) < 0)
goto child_fail;
goto child_fail;
goto child_fail;
if (setup_timezone(arg_directory) < 0)
goto child_fail;
/* Switch the root: enter the tree, chroot into it, then change to the new root directory. */
if (chdir(arg_directory) < 0) {
goto child_fail;
}
goto child_fail;
log_error("mount(MS_MOVE) failed: %m");
goto child_fail;
}
if (chroot(".") < 0) {
log_error("chroot() failed: %m");
goto child_fail;
}
if (chdir("/") < 0) {
log_error("chdir() failed: %m");
goto child_fail;
}
umask(0022);
/* Capabilities are dropped before the optional switch to another user. */
if (drop_capabilities() < 0)
goto child_fail;
/* Optional user switch; judging by the messages: resolve credentials, prepare directories,
 * then set supplementary groups, gid and uid. */
if (arg_user) {
log_error("get_user_creds() failed: %m");
goto child_fail;
}
log_error("mkdir_parents() failed: %m");
goto child_fail;
}
log_error("safe_mkdir() failed: %m");
goto child_fail;
}
log_error("initgroups() failed: %m");
goto child_fail;
}
log_error("setregid() failed: %m");
goto child_fail;
}
log_error("setreuid() failed: %m");
goto child_fail;
}
}
log_error("Out of memory");
goto child_fail;
}
if (arg_boot) {
char **a;
size_t l;
/* Automatically search for the init system */
/* Argument vector with room for l entries plus one more slot. */
a = newa(char*, l + 1);
else {
}
/* Only reached when exec did not replace the process image. */
log_error("execv() failed: %m");
}
goto finish;
if (saved_attr_valid) {
saved_attr_valid = false;
}
/* Negative results are reported as EXIT_FAILURE. */
if (r < 0)
r = EXIT_FAILURE;
/* NOTE(review): the statements controlled by the four checks below are not visible here. */
if (saved_attr_valid)
if (master >= 0)
if (oldcg)
if (newcg)
return r;
}