/*
 * src/lxc/lxc_attach.c
 *
 * lxc_attach.c revision ab1bf971d2db43777cbf3892fb887bf71ce7d155
 */
/*
 * lxc: linux Container library
 *
 * (C) Copyright IBM Corp. 2007, 2010
 *
 * Authors:
 * Daniel Lezcano <daniel.lezcano at free.fr>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#define _GNU_SOURCE
#include <unistd.h>
#include <errno.h>
#include <pwd.h>
#include <stdlib.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>

#include "attach.h"
#include "commands.h"
#include "arguments.h"
#include "caps.h"
#include "cgroup.h"
#include "config.h"
#include "confile.h"
#include "start.h"
#include "sync.h"
#include "log.h"
#include "namespace.h"
#include "apparmor.h"

#if HAVE_SYS_PERSONALITY_H
#include <sys/personality.h>
#endif

lxc_log_define(lxc_attach_ui, lxc);

/*
 * Long options understood by lxc-attach in addition to the common LXC
 * options. Entries whose value is >= 500 have no short option equivalent
 * and are dispatched on that numeric value in my_parser().
 */
static const struct option my_longopts[] = {
	{ .name = "elevated-privileges", .has_arg = no_argument,       .flag = NULL, .val = 'e' },
	{ .name = "arch",                .has_arg = required_argument, .flag = NULL, .val = 'a' },
	{ .name = "namespaces",          .has_arg = required_argument, .flag = NULL, .val = 's' },
	{ .name = "remount-sys-proc",    .has_arg = no_argument,       .flag = NULL, .val = 'R' },
	/* TODO: decide upon short option names */
	{ .name = "clear-env",           .has_arg = no_argument,       .flag = NULL, .val = 500 },
	{ .name = "keep-env",            .has_arg = no_argument,       .flag = NULL, .val = 501 },
	LXC_COMMON_OPTIONS
};

/* Command line state filled in by my_parser() and consumed by
 * child_main() and main().
 */

/* non-zero after -e (or -s, which implies -e): privileges are not
 * dropped in the attached process and it is not added to the
 * container's cgroups
 */
static int elevated_privileges = 0;
/* value parsed from -a; while negative, child_main() falls back to the
 * personality recorded for the container's init process
 */
static signed long new_personality = -1;
/* namespace flags given with -s; -1 means "not specified", in which
 * case main() queries them via lxc_get_clone_flags()
 */
static int namespace_flags = -1;
/* non-zero after -R: remount /sys and /proc in the attached process
 * when the mount namespace is not among the attached namespaces
 */
static int remount_sys_proc = 0;
/* policy handed to lxc_attach_set_environment(); selected with
 * --clear-env / --keep-env
 */
static lxc_attach_env_policy_t env_policy = LXC_ATTACH_KEEP_ENV;

/*
 * Per-option callback for the lxc-attach specific switches.
 *
 * args: argument descriptor, only used for error reporting
 * c:    option value as listed in my_longopts
 * arg:  option argument, if the option takes one
 *
 * Records the selected option in the file-level state variables.
 * Returns 0 on success (unknown values of c are ignored) and -1 if the
 * architecture or the namespace list could not be parsed.
 */
static int my_parser(struct lxc_arguments* args, int c, char* arg)
{
	switch (c) {
	case 'e':
		elevated_privileges = 1;
		break;

	case 'R':
		remount_sys_proc = 1;
		break;

	case 'a':
		new_personality = lxc_config_parse_arch(arg);
		if (new_personality < 0) {
			lxc_error(args, "invalid architecture specified: %s", arg);
			return -1;
		}
		break;

	case 's':
		/* start from an empty set, the helper ORs the parsed flags in */
		namespace_flags = 0;
		if (lxc_fill_namespace_flags(arg, &namespace_flags))
			return -1;
		/* -s implies -e */
		elevated_privileges = 1;
		break;

	case 500: /* clear-env */
		env_policy = LXC_ATTACH_CLEAR_ENV;
		break;

	case 501: /* keep-env */
		env_policy = LXC_ATTACH_KEEP_ENV;
		break;

	default:
		break;
	}

	return 0;
}

/*
 * Argument descriptor handed to lxc_arguments_parse(): program name,
 * usage/help text, the lxc-attach specific long options and the callback
 * that records them. No additional checker is installed.
 *
 * Note: the help text is a single string literal continued with
 * backslash-newline, so leading whitespace on each line is part of the
 * text that gets printed.
 */
static struct lxc_arguments my_args = {
	.progname = "lxc-attach",
	.help     = "\
--name=NAME [-- COMMAND]\n\
\n\
Execute the specified COMMAND - enter the container NAME\n\
\n\
Options :\n\
  -n, --name=NAME   NAME for name of the container\n\
  -e, --elevated-privileges\n\
                    Use elevated privileges (capabilities, cgroup\n\
                    restrictions) instead of those of the container.\n\
                    WARNING: This may leak privileges into the container.\n\
                    Use with care.\n\
  -a, --arch=ARCH   Use ARCH for program instead of container's own\n\
                    architecture.\n\
  -s, --namespaces=FLAGS\n\
                    Don't attach to all the namespaces of the container\n\
                    but just to the following OR'd list of flags:\n\
                    MOUNT, PID, UTSNAME, IPC, USER or NETWORK\n\
                    WARNING: Using -s implies -e, it may therefore\n\
                    leak privileges into the container. Use with care.\n\
  -R, --remount-sys-proc\n\
                    Remount /sys and /proc if not attaching to the\n\
                    mount namespace when using -s in order to properly\n\
                    reflect the correct namespace context. See the\n\
                    lxc-attach(1) manual page for details.\n\
      --clear-env\n\
                    Clear all environment variables before attaching.\n\
                    The attached shell/program will start with only\n\
                    container=lxc set.\n\
      --keep-env\n\
                    Keep all current environment variables. This\n\
                    is the current default behaviour, but is likely to\n\
                    change in the future.\n",
	.options  = my_longopts,
	.parser   = my_parser,
	.checker  = NULL,
};

/* Arguments passed from main() to child_main() through lxc_clone(). */
struct child_data {
	/* context info of the container's init process */
	struct lxc_proc_context_info *init_ctx;
	/* handler carrying the sync.h synchronization state */
	struct lxc_handler *handler;
	/* cgroup IPC socket end held by the cloning process; child_main()
	 * closes its copy right away
	 */
	int ipc_socket;
};

/*
 * Body of the process that ends up running inside the container.
 *
 * data: pointer to a struct child_data prepared by main()
 *
 * Performs the per-process setup (apparmor profile or /sys + /proc
 * remount, personality, privilege drop, environment), synchronizes with
 * the cloning process and finally executes either the command given on
 * the command line, the user's login shell or /bin/sh.
 *
 * Only returns on failure, in which case the result is -1.
 */
static int child_main(void* data)
{
	struct child_data *payload = data;
	struct lxc_proc_context_info *ctx = payload->init_ctx;
	struct lxc_handler *sync = payload->handler;
	int sock = payload->ipc_socket;
	struct passwd *pw;
	char *shell;
	uid_t uid;

	/* this process neither needs the parent's end of the sync socket
	 * nor the cgroup IPC socket
	 */
	lxc_sync_fini_parent(sync);
	close(sock);

	if (namespace_flags & CLONE_NEWNS) {
		if (attach_apparmor(ctx->aa_profile) < 0) {
			ERROR("failed switching apparmor profiles");
			return -1;
		}
	} else if (remount_sys_proc) {
		/* The purpose of this functionality is described in the
		 * lxc-attach(1) manual page. The remount has to happen here
		 * rather than in the parent process, otherwise /proc may not
		 * properly reflect the new pid namespace.
		 */
		if (lxc_attach_remount_sys_proc() < 0)
			return -1;
	}

#if HAVE_SYS_PERSONALITY_H
	/* no -a given: use the personality of the container's init */
	if (new_personality < 0)
		new_personality = ctx->personality;

	if (personality(new_personality) == -1) {
		ERROR("could not ensure correct architecture: %s",
		      strerror(errno));
		return -1;
	}
#endif

	if (!elevated_privileges && lxc_attach_drop_privs(ctx)) {
		ERROR("could not drop privileges");
		return -1;
	}

	if (lxc_attach_set_environment(env_policy, NULL, NULL)) {
		ERROR("could not set environment");
		return -1;
	}

	/* report that the setup is finished and block until we have been
	 * placed into the container's cgroup, if applicable
	 */
	if (lxc_sync_barrier_parent(sync, LXC_SYNC_CONFIGURE))
		return -1;

	lxc_sync_fini(sync);

	if (namespace_flags & CLONE_NEWUSER) {
		uid_t target_uid = 0;
		gid_t target_gid = 0;

		/* errors are ignored on purpose: if the lookup fails
		 * (/proc was not mounted etc.) we fall back to root
		 */
		lxc_attach_get_init_uidgid(&target_uid, &target_gid);

		/* group first, then user */
		if (setgid(target_gid)) {
			SYSERROR("switching to container gid");
			return -1;
		}
		if (setuid(target_uid)) {
			SYSERROR("switching to container uid");
			return -1;
		}
	}

	/* an explicit command was given: run it or fail */
	if (my_args.argc) {
		execvp(my_args.argv[0], my_args.argv);
		SYSERROR("failed to exec '%s'", my_args.argv[0]);
		return -1;
	}

	/* No command: look up the login shell of the current user. A failed
	 * getpwuid() probably stems from incompatible nss implementations
	 * in host and container (this code still uses the host's glibc
	 * while the mount namespace is the container's), so in that case
	 * the information is obtained via lxc_attach_getpwshell() instead.
	 */
	uid = getuid();
	pw = getpwuid(uid);
	shell = pw ? pw->pw_shell : lxc_attach_getpwshell(uid);

	if (shell) {
		char *const shell_argv[] = { shell, NULL };

		(void) execvp(shell_argv[0], shell_argv);
	}

	/* reached if there is no shell entry or executing it failed:
	 * fall back on /bin/sh as a default shell
	 */
	{
		char *const sh_argv[] = { "/bin/sh", NULL };

		execvp(sh_argv[0], sh_argv);
		SYSERROR("failed to exec '%s'", sh_argv[0]);
	}

	return -1;
}

/*
 * lxc-attach entry point.
 *
 * Parses the command line, looks up the init process of the container and
 * then builds the grandparent / parent / grandchild process hierarchy
 * described in the comment inside the function. The grandchild runs
 * child_main().
 *
 * Return value: the error code of a failed initialization helper, the
 * exit status of the waited-for subprocess if it exited normally, and -1
 * in every other case.
 */
int main(int argc, char *argv[])
{
	int ret;
	pid_t pid, init_pid;
	struct lxc_proc_context_info *init_ctx;
	struct lxc_handler *handler;
	char *curdir;
	int cgroup_ipc_sockets[2];

	ret = lxc_caps_init();
	if (ret)
		return ret;

	ret = lxc_arguments_parse(&my_args, argc, argv);
	if (ret)
		return ret;

	ret = lxc_log_init(my_args.name, my_args.log_file, my_args.log_priority,
			   my_args.progname, my_args.quiet, my_args.lxcpath);
	if (ret)
		return ret;

	init_pid = get_init_pid(my_args.name, my_args.lxcpath);
	if (init_pid < 0) {
		ERROR("failed to get the init pid");
		return -1;
	}

	init_ctx = lxc_proc_get_context_info(init_pid);
	if (!init_ctx) {
		ERROR("failed to get context of the init process, pid = %d", init_pid);
		return -1;
	}

	/* remember the working directory so it can be restored after
	 * entering the namespaces; a NULL result is tolerated below
	 */
	curdir = getcwd(NULL, 0);

	/* determine which namespaces the container was created with
	 * by asking lxc-start
	 */
	if (namespace_flags == -1) {
		namespace_flags = lxc_get_clone_flags(my_args.name, my_args.lxcpath);
		/* call failed */
		if (namespace_flags == -1) {
			ERROR("failed to automatically determine the "
			      "namespaces which the container unshared");
			return -1;
		}
	}

	/* For the cgroup attaching logic to work in conjunction with pid and user namespaces,
	 * we need to have the following hierarchy:
	 *
	 *     lxc-attach [process executed externally]
	 *         | socketpair(cgroup_ipc_sockets)
	 *         | fork()           -> child
	 *         |                       | setns()
	 *         |                       | fork()    -> grandchild
	 *         |                       |                   | initialize
	 *         |                       |                   | signal parent
	 *         |                       |<------------------|----+
	 *         |                       | signal parent     |
	 *         |<----------------------|-----+             |
	 *         | add to cgroups        |                   |
	 *         | signal child -------->|                   |
	 *         |                       | signal child ---->|
	 *         | waitpid()             | waitpid()         | exec()
	 *         |                       |<------------------| exit()
	 *         |<----------------------| exit()
	 *         | exit()
	 *
	 * The rationale is the following: The first parent is needed because after
	 * setns() (mount + user namespace) we can't access the cgroup filesystem
	 * to add the pid to the corresponding cgroup. Therefore, we need to do that
	 * in a process executed on the host, so that's why we need to fork and wait
	 * for it to have done some initialization (cgroups may restrict certain
	 * operations so we have to do that in the end) and use IPC for signaling.
	 *
	 * Then in the child process we do the setns(). However, a process is never
	 * really attached to a pid namespace (never changes its pid, doesn't appear
	 * in the pid namespace /proc), only child processes of that process are
	 * truly inside the new pid namespace. That's why we need to fork() again
	 * after setns() before performing final initializations, then signal our
	 * parent, which signals the primary process, which does cgroup adding,
	 * which then signals to the grandchild that it can exec().
	 */
	ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets);
	if (ret < 0) {
		SYSERROR("could not set up required IPC mechanism for attaching");
		return -1;
	}

	pid = fork();
	if (pid < 0) {
		SYSERROR("failed to create first subprocess");
		return -1;
	}

	if (pid) {
		/* grandparent: stays on the host and handles the cgroups */
		int status;
		pid_t grandchild;

		close(cgroup_ipc_sockets[1]);

	gparent_reread:
		ret = read(cgroup_ipc_sockets[0], &grandchild, sizeof(grandchild));
		if (ret < 0 && (errno == EAGAIN || errno == EINTR))
			goto gparent_reread;
		/* error, end of file, or a truncated pid: in none of these
		 * cases do we have a usable pid to add to the cgroups
		 */
		if (ret <= 0 || (size_t)ret != sizeof(grandchild)) {
			ERROR("failed to get pid of attached process to add to cgroup");
			return -1;
		}

		if (!elevated_privileges) {
			ret = lxc_cgroup_attach(grandchild, my_args.name, my_args.lxcpath);
			if (ret < 0) {
				ERROR("failed to attach process to cgroup");
				return -1;
			}
		}

		/* the value itself is irrelevant, the child only waits for
		 * any data to arrive
		 */
		status = 0;
		ret = write(cgroup_ipc_sockets[0], &status, sizeof(status));
		if (ret <= 0) {
			ERROR("failed to signal child that cgroup logic has finished");
			return -1;
		}

		close(cgroup_ipc_sockets[0]);

	gparent_again:
		ret = waitpid(pid, &status, 0);
		if (ret < 0) {
			if (errno == EINTR)
				goto gparent_again;
			SYSERROR("failed to wait for process '%d'", pid);
			return -1;
		}

		if (WIFEXITED(status))
			return WEXITSTATUS(status);

		return -1;
	}

	/* at this point we are in the 'parent' process so we need to close the
	 * socket reserved for the 'grandparent' process
	 */
	close(cgroup_ipc_sockets[0]);

	/* we need to attach before we fork since certain namespaces
	 * (such as pid namespaces) only really affect children of the
	 * current process and not the process itself
	 */
	ret = lxc_attach_to_ns(init_pid, namespace_flags);
	if (ret < 0) {
		ERROR("failed to enter the namespace");
		return -1;
	}

	if (curdir && chdir(curdir))
		WARN("could not change directory to '%s'", curdir);

	free(curdir);

	/* hack: we need sync.h infrastructure - and that needs a handler
	 * FIXME: perhaps we should also just use a very simple socketpair()
	 * here? - like with the grandparent <-> parent communication?
	 */
	handler = calloc(1, sizeof(*handler));
	if (!handler) {
		/* lxc_sync_init() would otherwise be handed a NULL handler */
		ERROR("failed to allocate memory for the synchronization handler");
		return -1;
	}

	if (lxc_sync_init(handler)) {
		ERROR("failed to initialize synchronization socket");
		return -1;
	}

	{
		struct child_data child_data = {
			.init_ctx = init_ctx,
			.handler = handler,
			.ipc_socket = cgroup_ipc_sockets[1]
		};
		pid = lxc_clone(child_main, &child_data, 0);
	}

	if (pid < 0) {
		SYSERROR("failed to fork");
		return -1;
	}

	if (pid) {
		int status;

		lxc_sync_fini_child(handler);

		/* wait until the child has done configuring itself before
		 * we put it in a cgroup that potentially limits these
		 * possibilities */
		if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
			return -1;

		/* ask grandparent to add child to cgroups, the grandparent will
		 * itself check whether that's actually necessary
		 */
		ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid));
		if (ret != sizeof(pid)) {
			ERROR("error using IPC to notify main process of pid to add to the cgroups of the container");
			return -1;
		}

	parent_reread:
		/* we need some mechanism to check whether the grandparent could
		 * add us to the cgroups or not - so we await a dummy integer
		 * on the same socket (that's why we don't use a pipe - we need
		 * two-way communication). So if the parent fails and exits, that
		 * will close the socket, which will cause a read of 0 bytes for
		 * us, so we just terminate. If we read at least a byte, we don't
		 * care about the contents...
		 */
		ret = read(cgroup_ipc_sockets[1], &status, sizeof(status));
		if (ret <= 0) {
			if (ret < 0 && (errno == EAGAIN || errno == EINTR))
				goto parent_reread;
			/* only print something if we can't assume the parent already
			 * gave an error message, that will reduce confusion for the
			 * user
			 */
			if (ret != 0)
				ERROR("failed to get notification that the child process was added to the container's cgroups");
			return -1;
		}

		/* we don't need that IPC interface anymore */
		close(cgroup_ipc_sockets[1]);

		/* tell the child we are done initializing */
		if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
			return -1;

		lxc_sync_fini(handler);

	again:
		if (waitpid(pid, &status, 0) < 0) {
			if (errno == EINTR)
				goto again;
			SYSERROR("failed to wait '%d'", pid);
			return -1;
		}

		if (WIFEXITED(status))
			return WEXITSTATUS(status);

		return -1;
	}

	/* shouldn't happen, because clone should never return 0 */
	return -1;
}