clone.c revision 9acbbeaf2a1ffe5c14b244867d427714fab43c5c
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <ucontext.h>
#include <thread.h>
#include <strings.h>
#include <libintl.h>
#include <sys/regset.h>
#include <sys/syscall.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/segments.h>
#include <signal.h>
#include <sys/lx_misc.h>
#include <sys/lx_types.h>
#include <sys/lx_signal.h>
#include <sys/lx_syscall.h>
#include <sys/lx_brand.h>
#include <sys/lx_debug.h>
#include <sys/lx_thread.h>
#define LX_CSIGNAL 0x000000ff
#define LX_CLONE_VM 0x00000100
#define LX_CLONE_FS 0x00000200
#define LX_CLONE_FILES 0x00000400
#define LX_CLONE_SIGHAND 0x00000800
#define LX_CLONE_PID 0x00001000
#define LX_CLONE_PTRACE 0x00002000
#define LX_CLONE_VFORK 0x00004000
#define LX_CLONE_PARENT 0x00008000
#define LX_CLONE_THREAD 0x00010000
#define LX_CLONE_SYSVSEM 0x00040000
#define LX_CLONE_SETTLS 0x00080000
#define LX_CLONE_PARENT_SETTID 0x00100000
#define LX_CLONE_CHILD_CLEARTID 0x00200000
#define LX_CLONE_DETACH 0x00400000
#define LX_CLONE_CHILD_SETTID 0x01000000
#define SHARED_AS \
(LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND)
#define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK)
#define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH)
#define IS_FORK(f) (((f) & SHARED_AS) == 0)
#define IS_VFORK(f) (((f) & CLONE_VFORK) == CLONE_VFORK)
#define LX_EXIT 1
#define LX_EXIT_GROUP 2
/*
* This is dicey. This seems to be an internal glibc structure, and not
* part of any external interface. Thus, it is subject to change without
* notice. FWIW, clone(2) itself seems to be an internal (or at least
* unstable) interface, since strace(1) shows it differently than the man
* page.
*/
struct lx_desc
{
uint32_t entry_number;
uint32_t base_addr;
uint32_t limit;
uint32_t seg_32bit:1;
uint32_t contents:2;
uint32_t read_exec_only:1;
uint32_t limit_in_pages:1;
uint32_t seg_not_present:1;
uint32_t useable:1;
uint32_t empty:25;
};
struct clone_state {
void *c_retaddr; /* instr after clone()'s int80 */
int c_flags; /* flags to clone(2) */
int c_sig; /* signal to send on thread exit */
void *c_stk; /* %esp of new thread */
void *c_ptidp;
struct lx_desc *c_ldtinfo; /* thread-specific segment */
void *c_ctidp;
uintptr_t c_gs; /* Linux's %gs */
sigset_t c_sigmask; /* signal mask */
uint64_t c_affmask; /* CPU affinity mask */
volatile int *c_clone_res; /* pid/error returned to cloner */
};
extern void lx_setup_clone(uintptr_t, void *, void *);
/*
* Counter incremented when we vfork(2) ourselves, and decremented when the
* vfork(2)ed child exit(2)s or exec(2)s.
*/
static int is_vforked = 0;
int
lx_exit(uintptr_t p1)
{
int ret, status = (int)p1;
lx_tsd_t *lx_tsd;
/*
* If we are a vfork(2)ed child, we need to exit as quickly and
* cleanly as possible to avoid corrupting our parent.
*/
if (is_vforked != 0) {
is_vforked--;
_exit(status);
}
if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
lx_err_fatal(gettext(
"%s: unable to read thread-specific data: %s"),
"exit", strerror(ret));
assert(lx_tsd != 0);
lx_tsd->lxtsd_exit = LX_EXIT;
lx_tsd->lxtsd_exit_status = status;
/*
* Block all signals in the exit context to avoid taking any signals
* (to the degree possible) while exiting.
*/
(void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
/*
* This thread is exiting. Restore the state of the thread to
* what it was before we started running linux code.
*/
(void) setcontext(&lx_tsd->lxtsd_exit_context);
/*
* If we returned from the setcontext(2), something is very wrong.
*/
lx_err_fatal(gettext("%s: unable to set exit context: %s"),
"exit", strerror(errno));
/*NOTREACHED*/
return (0);
}
int
lx_group_exit(uintptr_t p1)
{
int ret, status = (int)p1;
lx_tsd_t *lx_tsd;
/*
* If we are a vfork(2)ed child, we need to exit as quickly and
* cleanly as possible to avoid corrupting our parent.
*/
if (is_vforked != 0) {
is_vforked--;
_exit(status);
}
if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
lx_err_fatal(gettext(
"%s: unable to read thread-specific data: %s"),
"group_exit", strerror(ret));
assert(lx_tsd != 0);
lx_tsd->lxtsd_exit = LX_EXIT_GROUP;
lx_tsd->lxtsd_exit_status = status;
/*
* Block all signals in the exit context to avoid taking any signals
* (to the degree possible) while exiting.
*/
(void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
/*
* This thread is exiting. Restore the state of the thread to
* what it was before we started running linux code.
*/
(void) setcontext(&lx_tsd->lxtsd_exit_context);
/*
* If we returned from the setcontext(2), something is very wrong.
*/
lx_err_fatal(gettext("%s: unable to set exit context: %s"),
"group_exit", strerror(errno));
/*NOTREACHED*/
return (0);
}
static void *
clone_start(void *arg)
{
int rval;
struct clone_state *cs = (struct clone_state *)arg;
lx_tsd_t lx_tsd;
/*
* Let the kernel finish setting up all the needed state for this
* new thread.
*
* We already created the thread using the thr_create(3C) library
* call, so most of the work required to emulate lx_clone(2) has
* been done by the time we get to this point. Instead of creating
* a new brandsys(2) subcommand to perform the last few bits of
* bookkeeping, we just use the lx_clone() slot in the syscall
* table.
*/
lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()");
lx_debug("\tLX_SYS_clone(0x%x, 0x%p, 0x%p, 0x%p, 0x%p)",
cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
rval = syscall(SYS_brand, B_EMULATE_SYSCALL + LX_SYS_clone,
cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp,
NULL);
/*
* At this point the parent is waiting for cs->c_clone_res to go
* non-zero to indicate the thread has been cloned. The value set
* in cs->c_clone_res will be used for the return value from
* clone().
*/
if (rval < 0) {
*(cs->c_clone_res) = -errno;
lx_debug("\tkernel clone failed, errno %d\n", errno);
return (NULL);
}
if (lx_sched_setaffinity(0, sizeof (cs->c_affmask),
(uintptr_t)&cs->c_affmask) != 0) {
*(cs->c_clone_res) = -errno;
lx_err_fatal(gettext(
"Unable to set affinity mask in child thread: %s"),
strerror(errno));
}
/* Initialize the thread specific data for this thread. */
bzero(&lx_tsd, sizeof (lx_tsd));
lx_tsd.lxtsd_gs = cs->c_gs;
/*
* Use the address of the stack-allocated lx_tsd as the
* per-thread storage area to cache various values for later
* use.
*
* This address is only used by this thread, so there is no
* danger of other threads using this storage area, nor of it
* being accessed once this stack frame has been freed.
*/
if (thr_setspecific(lx_tsd_key, &lx_tsd) != 0) {
*(cs->c_clone_res) = -errno;
lx_err_fatal(
gettext("Unable to set thread-specific ptr for clone: %s"),
strerror(rval));
}
/*
* Save the current context of this thread.
*
* We'll restore this context when this thread attempts to exit.
*/
if (getcontext(&lx_tsd.lxtsd_exit_context) != 0) {
*(cs->c_clone_res) = -errno;
lx_err_fatal(gettext(
"Unable to initialize thread-specific exit context: %s"),
strerror(errno));
}
/*
* Do the final stack twiddling, reset %gs, and return to the
* clone(2) path.
*/
if (lx_tsd.lxtsd_exit == 0) {
if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) {
*(cs->c_clone_res) = -errno;
lx_err_fatal(gettext(
"Unable to release held signals for child "
"thread: %s"), strerror(errno));
}
/*
* Let the parent know that the clone has (effectively) been
* completed.
*/
*(cs->c_clone_res) = rval;
lx_setup_clone(cs->c_gs, cs->c_retaddr, cs->c_stk);
/* lx_setup_clone() should never return. */
assert(0);
}
/*
* We are here because the Linux application called the exit() or
* exit_group() system call. In turn the brand library did a
* setcontext() to jump to the thread context state saved in
* getcontext(), above.
*/
if (lx_tsd.lxtsd_exit == LX_EXIT)
thr_exit((void *)lx_tsd.lxtsd_exit_status);
else
exit(lx_tsd.lxtsd_exit_status);
assert(0);
/*NOTREACHED*/
}
int
lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
uintptr_t p5)
{
struct clone_state *cs;
int flags = (int)p1;
void *cldstk = (void *)p2;
void *ptidp = (void *)p3;
struct lx_desc *ldtinfo = (void *)p4;
void *ctidp = (void *)p5;
thread_t tid;
volatile int clone_res;
int sig;
int rval;
lx_regs_t *rp;
sigset_t sigmask;
if (flags & LX_CLONE_SETTLS) {
lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p "
"ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp);
} else {
lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)",
flags, cldstk, ptidp);
}
/*
* Only supported for pid 0 on Linux
*/
if (flags & LX_CLONE_PID)
return (-EINVAL);
/*
* CLONE_THREAD require CLONE_SIGHAND. CLONE_THREAD and
* CLONE_DETACHED must both be either set or cleared.
*/
if ((flags & CLONE_TD) &&
(!(flags & LX_CLONE_SIGHAND) || ((flags & CLONE_TD) != CLONE_TD)))
return (-EINVAL);
rp = lx_syscall_regs();
/* See if this is a fork() operation or a thr_create(). */
if (IS_FORK(flags) || IS_VFORK(flags)) {
if (flags & LX_CLONE_PARENT) {
lx_unsupported(gettext(
"clone(2) only supports CLONE_PARENT "
"for threads.\n"));
return (-ENOTSUP);
}
if (flags & LX_CLONE_PTRACE)
lx_ptrace_fork();
if (flags & LX_CLONE_VFORK) {
is_vforked++;
rval = vfork();
if (rval != 0)
is_vforked--;
} else {
rval = fork1();
}
if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID))
*((int *)ptidp) = rval;
/* Parent just returns */
if (rval != 0)
return ((rval < 0) ? -errno : rval);
/*
* If provided, the child needs its new stack set up.
*/
if (cldstk)
lx_setup_clone(rp->lxr_gs, (void *)rp->lxr_eip, cldstk);
return (0);
}
/*
* We have very restricted support.... only exactly these flags are
* supported
*/
if (((flags & SHARED_AS) != SHARED_AS)) {
lx_unsupported(gettext(
"clone(2) requires that all or none of CLONE_VM "
"CLONE_FS, CLONE_FILES, and CLONE_SIGHAND be set.\n"));
return (-ENOTSUP);
}
if (cldstk == NULL) {
lx_unsupported(gettext(
"clone(2) requires the caller to allocate the "
"child's stack.\n"));
return (-ENOTSUP);
}
/*
* If we want a signal-on-exit, ensure that the signal is valid.
*/
if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) {
lx_unsupported(gettext(
"clone(2) passed unsupported signal: %d"), sig);
return (-ENOTSUP);
}
/*
* To avoid malloc() here, we steal a part of the new thread's
* stack to store all the info that thread might need for
* initialization. We also make it 64-bit aligned for good
* measure.
*/
cs = (struct clone_state *)
((p2 - sizeof (struct clone_state)) & -((uintptr_t)8));
cs->c_flags = flags;
cs->c_sig = sig;
cs->c_stk = cldstk;
cs->c_ptidp = ptidp;
cs->c_ldtinfo = ldtinfo;
cs->c_ctidp = ctidp;
cs->c_clone_res = &clone_res;
cs->c_gs = rp->lxr_gs;
if (lx_sched_getaffinity(0, sizeof (cs->c_affmask),
(uintptr_t)&cs->c_affmask) == -1)
lx_err_fatal(gettext(
"Unable to get affinity mask for parent thread: %s"),
strerror(errno));
/*
* We want the new thread to return directly to the return site for
* the system call.
*/
cs->c_retaddr = (void *)rp->lxr_eip;
clone_res = 0;
(void) sigfillset(&sigmask);
/*
* Block all signals because the thread we create won't be able to
* properly handle them until it's fully set up.
*/
if (sigprocmask(SIG_BLOCK, &sigmask, &cs->c_sigmask) < 0) {
lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno));
return (-errno);
}
rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid);
/*
* Release any pending signals
*/
(void) sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL);
/*
* Wait for the child to be created and have its tid assigned.
*/
if (rval == 0) {
while (clone_res == 0)
;
rval = clone_res;
}
return (rval);
}