lx_brand.c revision 9acbbeaf2a1ffe5c14b244867d427714fab43c5c
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/thread.h>
#include <sys/systm.h>
#include <sys/syscall.h>
#include <sys/proc.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/model.h>
#include <sys/exec.h>
#include <sys/lx_impl.h>
#include <sys/machbrand.h>
#include <sys/lx_syscalls.h>
#include <sys/lx_pid.h>
#include <sys/lx_futex.h>
#include <sys/lx_brand.h>
#include <sys/termios.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/exec.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/machelf.h>
#include <sys/auxv.h>
#include <sys/priv.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/archsystm.h>
#include <sys/zone.h>
#include <sys/brand.h>
/*
* Debug switch, initialised to 0. It is not read anywhere in this file.
* NOTE(review): presumably consulted by lx_print() -- confirm.
*/
int lx_debug = 0;
/*
* Forward declarations of the brand operations that are defined later in
* this file and referenced from the lx_brops table.
*/
void lx_setbrand(proc_t *);
int lx_getattr(zone_t *, int, void *, size_t *);
int lx_setattr(zone_t *, int, void *, size_t);
int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
uintptr_t, uintptr_t, uintptr_t);
void lx_copy_procdata(proc_t *, proc_t *);
/*
* Routines declared extern. All except lx_proc_exit() (which is defined
* further down in this file) are not defined in this file.
*/
extern void lx_setrval(klwp_t *, int, int);
extern void lx_proc_exit(proc_t *, klwp_t *);
extern void lx_exec();
extern int lx_initlwp(klwp_t *);
extern void lx_forklwp(klwp_t *, klwp_t *);
extern void lx_exitlwp(klwp_t *);
extern void lx_freelwp(klwp_t *);
extern greg_t lx_fixsegreg(greg_t, model_t);
extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
/*
* System call tracing state. The entry/return function pointers are called
* from lx_brandsys() (B_SYSENTRY / B_SYSRETURN) while lx_systrace_enabled is
* non-zero; lx_systrace_enabled itself is toggled by
* lx_brand_systrace_enable() and lx_brand_systrace_disable().
* lx_systrace_brand_enabled is not referenced elsewhere in this file.
*/
int lx_systrace_brand_enabled;
lx_systrace_f *lx_systrace_entry_ptr;
lx_systrace_f *lx_systrace_return_ptr;
static int lx_systrace_enabled;
/* Forward declaration; needed because lx_brops references lx_elfexec. */
static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
struct intpdata *idata, int level, long *execsz, int setid,
caddr_t exec_file, struct cred *cred, int brand_action);
/*
* lx brand: table of brand operations.
*
* This is a positional initializer, so the order of the entries must match
* the member order of struct brand_ops, which is declared outside this file
* (presumably in <sys/brand.h> -- confirm before reordering or inserting
* entries).
*/
struct brand_ops lx_brops = {
lx_brandsys,
lx_setbrand,
lx_getattr,
lx_setattr,
lx_copy_procdata,
lx_proc_exit,
lx_exec,
lx_setrval,
lx_initlwp,
lx_forklwp,
lx_freelwp,
lx_exitlwp,
lx_elfexec
};
/*
* Machine-dependent brand operations. Positional initializer: only the
* second slot (lx_brand_int80_callback) and the sixth slot (lx_fixsegreg)
* are populated; the rest are NULL. The meaning of each slot is defined by
* struct brand_mach_ops, which is declared outside this file (presumably
* <sys/machbrand.h> -- confirm).
*/
struct brand_mach_ops lx_mops = {
NULL,
lx_brand_int80_callback,
NULL,
NULL,
NULL,
lx_fixsegreg,
};
/*
* The brand descriptor itself: interface version BRAND_VER_1, the name "lx",
* and pointers to the operation tables defined above. lx_brandsys() and
* lx_elfexec() compare p_brand / zone_brand against the address of this
* object, and _fini() passes it to brand_zone_count().
*/
struct brand lx_brand = {
BRAND_VER_1,
"lx",
&lx_brops,
&lx_mops
};
/*
* Module linkage element tying lx_brand to mod_brandops.
* NOTE(review): "%I%" in the description string looks like an SCCS keyword
* that is expanded at checkout time -- confirm.
*/
static struct modlbrand modlbrand = {
&mod_brandops, "lx brand %I%", &lx_brand
};
/*
* Module linkage handed to mod_install(), mod_info() and mod_remove() by
* _init(), _info() and _fini(). It carries the single modlbrand element
* followed by a NULL terminator.
*/
static struct modlinkage modlinkage = {
MODREV_1, (void *)&modlbrand, NULL
};
/*
 * Brand hook invoked when a branded process exits.
 *
 * p	the exiting process; it must still carry a brand and brand data.
 * lwp	the LWP handed to lx_exitlwp() for non-init processes.
 *
 * For every process other than the zone's init, the LWP's brand state is
 * torn down, the per-process brand data is freed and the process is switched
 * to the native brand. For init, none of that is done; instead the zone
 * may be rebooted (see below).
 */
void
lx_proc_exit(proc_t *p, klwp_t *lwp)
{
	zone_t *zone = p->p_zone;

	ASSERT(p->p_brand != NULL);
	ASSERT(p->p_brand_data != NULL);

	if (p->p_pid != zone->zone_proc_initpid) {
		/* Ordinary process: release all of its brand state. */
		lx_exitlwp(lwp);
		kmem_free(p->p_brand_data, sizeof (struct lx_proc_data));
		p->p_brand_data = NULL;
		p->p_brand = &native_brand;
		return;
	}

	/*
	 * If init is dying and we aren't explicitly shutting down the zone
	 * or the system, then Solaris is about to restart init.  The Linux
	 * init is not designed to handle a restart, which it interprets as
	 * a reboot.  To give it a sane environment in which to run, we
	 * reboot the zone.
	 */
	if (zone->zone_boot_err != 0 || !zone->zone_restart_init)
		return;

	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN ||
	    zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN)
		return;

	(void) zone_kadmin(A_REBOOT, 0, NULL, CRED());
}
/*
 * Brand hook that attaches lx brand state to process p.
 *
 * Allocates a zeroed lx_proc_data structure for the process and initialises
 * the brand data of the first thread on the process's thread list. The
 * process must not already have brand data, and the calling thread must not
 * already have lx LWP data.
 */
void
lx_setbrand(proc_t *p)
{
	kthread_t *first_thread = p->p_tlist;
	int rc;

	ASSERT(p->p_brand_data == NULL);
	ASSERT(ttolxlwp(curthread) == NULL);

	p->p_brand_data = kmem_zalloc(sizeof (struct lx_proc_data), KM_SLEEP);

	/*
	 * This routine can only be called for single-threaded processes.
	 * Since lx_initlwp() can only fail if we run out of PIDs for
	 * multithreaded processes, we know that this can never fail.
	 */
	rc = lx_initlwp(first_thread->t_lwp);
	ASSERT(rc == 0);
}
/*
 * Set a brand-specific zone attribute.
 *
 * zone		zone whose attribute is being set.
 * attr		attribute identifier; only LX_ATTR_RESTART_INIT is supported.
 * buf		user address of the new value (a boolean_t).
 * bufsize	size of the user buffer; must be exactly sizeof (boolean_t).
 *
 * Returns 0 on success, EINVAL for an unknown attribute or a value that is
 * neither B_TRUE nor B_FALSE, ERANGE for a wrongly sized buffer, and EFAULT
 * if the value cannot be copied in.
 */
/* ARGSUSED */
int
lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
{
	boolean_t val;

	if (attr != LX_ATTR_RESTART_INIT)
		return (EINVAL);

	/*
	 * copyin() below always reads sizeof (val) bytes, so a buffer that
	 * is smaller than that must be rejected as well as one that is
	 * larger; otherwise we would read past what the caller supplied.
	 */
	if (bufsize != sizeof (boolean_t))
		return (ERANGE);

	if (copyin(buf, &val, sizeof (val)) != 0)
		return (EFAULT);

	if (val != B_TRUE && val != B_FALSE)
		return (EINVAL);

	zone->zone_restart_init = val;
	return (0);
}
/*
 * Retrieve a brand-specific zone attribute.
 *
 * zone		zone whose attribute is being read.
 * attr		attribute identifier; only LX_ATTR_RESTART_INIT is supported.
 * buf		user address that receives the value (a boolean_t).
 * bufsize	in: size of the user buffer; out: number of bytes written.
 *
 * Returns 0 on success, ERANGE if the buffer is too small, EFAULT if the
 * value cannot be copied out, and EINVAL for an unknown attribute.
 */
/* ARGSUSED */
int
lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
{
	if (attr != LX_ATTR_RESTART_INIT) {
		/*
		 * Errors are reported as positive errno values everywhere
		 * else in this file (see lx_setattr()), so do the same here
		 * rather than returning a negated value.
		 */
		return (EINVAL);
	}

	if (*bufsize < sizeof (boolean_t))
		return (ERANGE);

	if (copyout(&zone->zone_restart_init, buf, sizeof (boolean_t)) != 0)
		return (EFAULT);

	*bufsize = sizeof (boolean_t);
	return (0);
}
/*
* Enable ptrace system call tracing for the given LWP. This is done by
* both setting the flag in that LWP's brand data (in the kernel) and setting
* the process-wide trace flag (in the brand library of the traced process).
*
* pid and lwpid identify the target process and LWP. A non-zero 'set'
* enables tracing; zero clears the LWP's flag only.
*
* Returns 0 on success, ESRCH if the process or LWP cannot be found or has
* no brand data, EPERM if the caller fails the priv_proc_cred_perm() check,
* and EIO if the trace flag could not be written into the target process.
*/
static int
lx_ptrace_syscall_set(pid_t pid, id_t lwpid, int set)
{
proc_t *p;
kthread_t *t;
klwp_t *lwp;
lx_proc_data_t *lpdp;
lx_lwp_data_t *lldp;
uintptr_t addr;
int ret, flag = 1;
/* Every exit path after a successful sprlock() must call sprunlock(). */
if ((p = sprlock(pid)) == NULL)
return (ESRCH);
if (priv_proc_cred_perm(curproc->p_cred, p, NULL, VWRITE) != 0) {
sprunlock(p);
return (EPERM);
}
if ((t = idtot(p, lwpid)) == NULL || (lwp = ttolwp(t)) == NULL) {
sprunlock(p);
return (ESRCH);
}
if ((lpdp = p->p_brand_data) == NULL ||
(lldp = lwp->lwp_brand) == NULL) {
sprunlock(p);
return (ESRCH);
}
if (set) {
/*
* Enable the ptrace flag for this LWP and this process. Note
* that we will turn off the LWP's ptrace flag, but we don't
* turn off the process's ptrace flag.
*/
lldp->br_ptrace = 1;
lpdp->l_ptrace = 1;
/* User address of the trace flag, as registered via B_REGISTER. */
addr = lpdp->l_traceflag;
/* p_lock is dropped around uwrite() and re-taken afterwards. */
mutex_exit(&p->p_lock);
/*
* This can fail only in some rare corner cases where the
* process is exiting or we're completely out of memory. In
* these cases, it's sufficient to return an error to the ptrace
* consumer and leave the process-wide flag set.
*/
ret = uwrite(p, &flag, sizeof (flag), addr);
mutex_enter(&p->p_lock);
/*
* If we couldn't set the trace flag, unset the LWP's ptrace
* flag as the ptrace consumer won't expect this LWP to stop.
*/
if (ret != 0)
lldp->br_ptrace = 0;
} else {
lldp->br_ptrace = 0;
ret = 0;
}
sprunlock(p);
/* Any uwrite() failure is reported to the caller as EIO. */
if (ret != 0)
ret = EIO;
return (ret);
}
static void
lx_ptrace_fire(void)
{
kthread_t *t = curthread;
klwp_t *lwp = ttolwp(t);
lx_lwp_data_t *lldp = lwp->lwp_brand;
/*
* The ptrace flag only applies until the next event is encountered
* for the given LWP. If it's set, turn off the flag and poke the
* controlling process by raising a signal.
*/
if (lldp->br_ptrace) {
lldp->br_ptrace = 0;
tsignal(t, SIGTRAP);
}
}
/*
* Turn on lx system call tracing: call lx_brand_int80_enable() and then set
* lx_systrace_enabled, which lx_brandsys() checks in its B_SYSENTRY and
* B_SYSRETURN cases. Tracing must not already be enabled.
*/
void
lx_brand_systrace_enable(void)
{
extern void lx_brand_int80_enable(void);
ASSERT(!lx_systrace_enabled);
lx_brand_int80_enable();
lx_systrace_enabled = 1;
}
/*
* Turn off lx system call tracing: call lx_brand_int80_disable() and then
* clear lx_systrace_enabled. Tracing must currently be enabled.
*/
void
lx_brand_systrace_disable(void)
{
extern void lx_brand_int80_disable(void);
ASSERT(lx_systrace_enabled);
lx_brand_int80_disable();
lx_systrace_enabled = 0;
}
/*
 * Brand-specific system call entry point for the lx brand.
 *
 * cmd selects the operation; arg1..arg6 are its arguments and *rval receives
 * the value to be returned from the system call on success.
 *
 * Returning 0 indicates success; the value returned by the system call is
 * the value stored in rval.  Returning a non-zero value indicates a failure;
 * the value returned is used to set errno, -1 is returned from the syscall
 * and the contents of rval are ignored.  To set errno and have the syscall
 * return a value other than -1 we can manually set errno and rval and
 * return 0.
 *
 * Because the return value IS the errno, every failure path below returns a
 * positive errno value and never -1 or the result of set_errno().
 *
 * Commands handled here:
 *	B_EXEC_BRAND		exec a new branded process (the only command
 *				allowed for a non-branded caller)
 *	B_REGISTER		record the user-space handler, trace handler
 *				and trace flag addresses for the process
 *	B_TTYMODES		copy the "ttymodes" property out to arg1
 *	B_ELFDATA		copy the saved lx ELF data out to arg1
 *	B_EXEC_NATIVE		exec a native process
 *	B_LPID_TO_SPAIR		translate a Linux pid to a Solaris pid/LWP id
 *	B_PTRACE_SYSCALL	set/clear ptrace syscall tracing on an LWP
 *	B_SYSENTRY/B_SYSRETURN	system call tracing hooks
 *	B_SET/GET_AFFINITY_MASK	forwarded to lx_sched_affinity()
 *	B_EMULATE_SYSCALL + n	emulate Linux system call n in the kernel
 * Any other command fails with EINVAL.
 */
int
lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
{
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	lx_proc_data_t *pd;
	int linux_call;
	struct termios *termios;
	uint_t termios_len;
	int error;
	lx_brand_registration_t reg;

	/*
	 * There is one operation that is supported for a non-branded
	 * process: B_EXEC_BRAND.  This is the equivalent of an
	 * exec call, but the new process that is created will be
	 * a branded process.
	 */
	if (cmd == B_EXEC_BRAND) {
		ASSERT(p->p_zone != NULL);
		ASSERT(p->p_zone->zone_brand == &lx_brand);
		return (exec_common(
		    (char *)arg1, (const char **)arg2, (const char **)arg3,
		    EBA_BRAND));
	}

	/* For all other operations this must be a branded process. */
	if (p->p_brand == NULL)
		return (ENOSYS);

	ASSERT(p->p_brand == &lx_brand);
	ASSERT(p->p_brand_data != NULL);

	switch (cmd) {
	case B_REGISTER:
		if (p->p_model == DATAMODEL_NATIVE) {
			if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
				lx_print("Failed to copyin brand registration "
				    "at 0x%p\n", (void *)arg1);
				return (EFAULT);
			}
#ifdef _LP64
		} else {
			/*
			 * Non-native data model on a 64-bit kernel: read the
			 * 32-bit layout and widen each field.
			 */
			lx_brand_registration32_t reg32;

			if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
				lx_print("Failed to copyin brand registration "
				    "at 0x%p\n", (void *)arg1);
				return (EFAULT);
			}

			reg.lxbr_version = (uint_t)reg32.lxbr_version;
			reg.lxbr_handler =
			    (void *)(uintptr_t)reg32.lxbr_handler;
			reg.lxbr_tracehandler =
			    (void *)(uintptr_t)reg32.lxbr_tracehandler;
			reg.lxbr_traceflag =
			    (void *)(uintptr_t)reg32.lxbr_traceflag;
#endif
		}

		if (reg.lxbr_version != LX_VERSION_1) {
			lx_print("Invalid brand library version (%u)\n",
			    reg.lxbr_version);
			return (EINVAL);
		}

		lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
		    (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
		pd = p->p_brand_data;
		pd->l_handler = (uintptr_t)reg.lxbr_handler;
		pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler;
		pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag;
		*rval = 0;
		return (0);

	case B_TTYMODES:
		/* This is necessary for emulating TCGETS ioctls. */
		if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
		    DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
		    &termios_len) != DDI_SUCCESS)
			return (EIO);

		ASSERT(termios_len == sizeof (*termios));

		/*
		 * Copy out the termios structure that the property lookup
		 * returned, not the kernel pointer that refers to it.
		 */
		if (copyout(termios, (void *)arg1, sizeof (*termios)) != 0) {
			ddi_prop_free(termios);
			return (EFAULT);
		}

		ddi_prop_free(termios);
		*rval = 0;
		return (0);

	case B_ELFDATA:
		pd = curproc->p_brand_data;
		if (copyout(&pd->l_elf_data, (void *)arg1,
		    sizeof (lx_elf_data_t)) != 0)
			return (EFAULT);
		*rval = 0;
		return (0);

	case B_EXEC_NATIVE:
		error = exec_common(
		    (char *)arg1, (const char **)arg2, (const char **)arg3,
		    EBA_NATIVE);
		if (error != 0)
			return (error);
		*rval = 0;
		return (0);

	case B_LPID_TO_SPAIR:
		/*
		 * Given a Linux pid as arg1, return the Solaris pid in arg2 and
		 * the Solaris LWP in arg3.  We also translate pid 1 (which is
		 * hardcoded in many applications) to the zone's init process.
		 */
		{
			pid_t s_pid;
			id_t s_tid;

			if ((pid_t)arg1 == 1) {
				s_pid = p->p_zone->zone_proc_initpid;

				/* handle the dead/missing init(1M) case */
				if (s_pid == -1)
					s_pid = 1;

				s_tid = 1;
			} else if (lx_lpid_to_spair((pid_t)arg1, &s_pid,
			    &s_tid) < 0)
				return (ESRCH);

			if (copyout(&s_pid, (void *)arg2,
			    sizeof (s_pid)) != 0 ||
			    copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0)
				return (EFAULT);

			*rval = 0;
			return (0);
		}

	case B_PTRACE_SYSCALL:
		/*
		 * The errno-style result of lx_ptrace_syscall_set() is handed
		 * back as the system call's return value, not as errno.
		 */
		*rval = lx_ptrace_syscall_set((pid_t)arg1, (id_t)arg2,
		    (int)arg3);
		return (0);

	case B_SYSENTRY:
		if (lx_systrace_enabled) {
			uint32_t args[6];

			ASSERT(lx_systrace_entry_ptr != NULL);

			/* arg2 is the user address of six 32-bit arguments. */
			if (copyin((void *)arg2, args, sizeof (args)) != 0)
				return (EFAULT);

			(*lx_systrace_entry_ptr)(arg1, args[0], args[1],
			    args[2], args[3], args[4], args[5]);
		}

		lx_ptrace_fire();

		pd = p->p_brand_data;

		/*
		 * If neither DTrace nor ptrace are interested in tracing
		 * this process any more, turn off the trace flag.
		 */
		if (!lx_systrace_enabled && !pd->l_ptrace)
			(void) suword32((void *)pd->l_traceflag, 0);

		*rval = 0;
		return (0);

	case B_SYSRETURN:
		if (lx_systrace_enabled) {
			ASSERT(lx_systrace_return_ptr != NULL);

			/*
			 * arg2 is deliberately passed twice.
			 * NOTE(review): presumably this mirrors the native
			 * syscall return probe, where the first two probe
			 * arguments both carry the return value -- confirm.
			 */
			(*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0);
		}

		lx_ptrace_fire();

		pd = p->p_brand_data;

		/*
		 * If neither DTrace nor ptrace are interested in tracing
		 * this process any more, turn off the trace flag.
		 */
		if (!lx_systrace_enabled && !pd->l_ptrace)
			(void) suword32((void *)pd->l_traceflag, 0);

		*rval = 0;
		return (0);

	case B_SET_AFFINITY_MASK:
	case B_GET_AFFINITY_MASK:
		/*
		 * Retrieve or store the CPU affinity mask for the
		 * requested linux pid.
		 *
		 * arg1 is a linux PID (0 means curthread).
		 * arg2 is the size of the given mask.
		 * arg3 is the address of the affinity mask.
		 */
		return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));

	default:
		/* Commands at or above B_EMULATE_SYSCALL name Linux syscalls. */
		linux_call = cmd - B_EMULATE_SYSCALL;
		if (linux_call >= 0 && linux_call < LX_NSYSCALLS) {
			*rval = lx_emulate_syscall(linux_call, arg1, arg2,
			    arg3, arg4, arg5, arg6);
			return (0);
		}
	}

	return (EINVAL);
}
/*
 * Give a child process its own copy of the parent's per-process brand data.
 *
 * A new lx_proc_data_t is allocated (sleeping if necessary), filled with a
 * structure copy of the parent's data and attached to the child. The parent
 * must already have brand data.
 */
void
lx_copy_procdata(proc_t *child, proc_t *parent)
{
	lx_proc_data_t *parent_data = parent->p_brand_data;
	lx_proc_data_t *child_data;

	ASSERT(parent_data != NULL);

	child_data = kmem_alloc(sizeof (lx_proc_data_t), KM_SLEEP);
	*child_data = *parent_data;

	child->p_brand_data = child_data;
}
#if defined(_ELF32_COMPAT)
/*
* Currently, only 32-bit branded ELF executables are supported.
* When _ELF32_COMPAT is defined, the generic names used below are redirected
* to the 32-bit variants of the ELF exec routines.
*/
#define elfexec elf32exec
#define mapexec_brand mapexec32_brand
#endif /* _ELF32_COMPAT */
/*
* ELF exec helpers used by lx_elfexec(); they are not defined in this file.
*/
extern int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file,
cred_t *cred, int brand_action);
extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *ehdr, Elf32_Addr *,
intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *);
/*
* Exec routine called by elfexec() to load 32-bit Linux binaries.
*
* The sequence is:
*   1. exec the lx brand library (LX_LIB_PATH) through elfexec();
*   2. map the Linux executable 'vp' with mapexec_brand();
*   3. if the executable requests an interpreter, map the Linux linker
*      (LX_LINKER) as well;
*   4. record the executable's program header address, entry point and
*      linker entry in the process's lx_elf_data_t, which lx_brandsys()
*      later copies out for the B_ELFDATA command;
*   5. patch the AT_ENTRY and AT_SUN_BRAND_PHDR entries in the kernel's copy
*      of the aux vector.
*
* vp is the Linux executable. The remaining arguments are passed through to
* elfexec() unchanged, except that 'level' is incremented by one.
*
* Returns 0 on success, the error from lookupname(), elfexec() or
* mapexec_brand() on failure, or EFAULT if the aux vector entry cannot be
* copied out.
*/
static int
lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
struct intpdata *idata, int level, long *execsz, int setid,
caddr_t exec_file, struct cred *cred, int brand_action)
{
int error;
vnode_t *nvp;
auxv32_t phdr_auxv = { AT_SUN_BRAND_PHDR, 0 };
Ehdr ehdr;
Elf32_Addr uphdr_vaddr;
intptr_t voffset;
int interp;
int i;
struct execenv env;
struct user *up = PTOU(ttoproc(curthread));
lx_elf_data_t *edp =
&((lx_proc_data_t *)ttoproc(curthread)->p_brand_data)->l_elf_data;
ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
ASSERT(ttoproc(curthread)->p_brand_data != NULL);
/*
* Set the brandname and library name for the new process so that
* elfexec() puts them onto the stack.
*/
args->brandname = LX_BRANDNAME;
args->emulator = LX_LIB_PATH;
/*
* We will exec the brand library, and map in the linux linker and the
* linux executable.
*/
if (error = lookupname(LX_LIB_PATH, UIO_SYSSPACE, FOLLOW, NULLVPP,
&nvp)) {
uprintf("%s: not found.", LX_LIB);
return (error);
}
/* The hold on nvp is released on both the failure and success paths. */
if (error = elfexec(nvp, uap, args, idata, level + 1, execsz, setid,
exec_file, cred, brand_action)) {
VN_RELE(nvp);
return (error);
}
VN_RELE(nvp);
bzero(&env, sizeof (env));
/*
* Map the Linux executable itself. ehdr, uphdr_vaddr, voffset and
* interp are filled in for the executable by this call.
*/
if (error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset,
exec_file, &interp, &env.ex_bssbase, &env.ex_brkbase,
&env.ex_brksize))
return (error);
/*
* Save off the important properties of the lx executable. The brand
* library will ask us for this data later, when it is ready to set
* things up for the lx executable.
*/
edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
voffset + uphdr_vaddr;
edp->ed_entry = voffset + ehdr.e_entry;
edp->ed_phent = ehdr.e_phentsize;
edp->ed_phnum = ehdr.e_phnum;
if (interp) {
/*
* ehdr still describes the executable here; it is overwritten
* with the linker's header by the mapexec_brand() call below.
*/
if (ehdr.e_type == ET_DYN) {
/*
* This is a shared object executable, so we need to
* pick a reasonable place to put the heap. Just don't
* use the first page.
*/
env.ex_brkbase = (caddr_t)PAGESIZE;
env.ex_bssbase = (caddr_t)PAGESIZE;
}
/*
* If the program needs an interpreter (most do), map it in and
* store relevant information about it in the aux vector, where
* the brand library can find it.
*/
if (error = lookupname(LX_LINKER, UIO_SYSSPACE, FOLLOW, NULLVPP,
&nvp)) {
uprintf("%s: not found.", LX_LINKER);
return (error);
}
if (error = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr,
&voffset, exec_file, &interp, NULL, NULL, NULL)) {
VN_RELE(nvp);
return (error);
}
VN_RELE(nvp);
/*
* Now that we know the base address of the brand's linker,
* record it, together with the linker's entry point, in the
* saved ELF data.
*/
edp->ed_base = voffset;
edp->ed_ldentry = voffset + ehdr.e_entry;
} else {
/*
* This program has no interpreter. The lx brand library will
* jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
* so in this case, put the entry point of the main executable
* there.
*/
if (ehdr.e_type == ET_EXEC) {
/*
* An executable with no interpreter, this must be a
* statically linked executable, which means we loaded
* it at the address specified in the elf header, in
* which case the e_entry field of the elf header is an
* absolute address.
*/
edp->ed_ldentry = ehdr.e_entry;
edp->ed_entry = ehdr.e_entry;
} else {
/*
* A shared object with no interpreter, we use the
* calculated address from above.
*/
edp->ed_ldentry = edp->ed_entry;
}
/*
* Delay setting the brkbase until the first call to brk();
* see elfexec() for details.
*/
env.ex_bssbase = (caddr_t)0;
env.ex_brkbase = (caddr_t)0;
env.ex_brksize = 0;
}
env.ex_vp = vp;
setexecenv(&env);
/*
* We don't need to copy this stuff out. It is only used by our
* tools to locate the lx linker's debug section. But we should at
* least try to keep /proc's view of the aux vector consistent with
* what's on the process stack.
*/
phdr_auxv.a_un.a_val = edp->ed_phdr;
/*
* NOTE(review): this compares against -1 whereas the rest of the file
* tests copyout() with "!= 0" -- confirm the two are equivalent here.
*/
if (copyout(&phdr_auxv, args->brand_auxp, sizeof (phdr_auxv)) == -1)
return (EFAULT);
/*
* /proc uses the AT_ENTRY aux vector entry to deduce
* the location of the executable in the address space. The user
* structure contains a copy of the aux vector that needs to have those
* entries patched with the values of the real lx executable (they
* currently contain the values from the lx brand library that was
* elfexec'd, above).
*
* For live processes, AT_BASE is used to locate the linker segment,
* which /proc and friends will later use to find Solaris symbols
* (such as rtld_db_preinit). However, for core files, /proc uses
* AT_ENTRY to find the right segment to label as the executable.
* So we set AT_ENTRY to be the entry point of the linux executable,
* but leave AT_BASE to be the address of the Solaris linker.
*/
for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
if (up->u_auxv[i].a_type == AT_ENTRY)
up->u_auxv[i].a_un.a_val = edp->ed_entry;
if (up->u_auxv[i].a_type == AT_SUN_BRAND_PHDR)
up->u_auxv[i].a_un.a_val = edp->ed_phdr;
}
return (0);
}
/*
 * Module load entry point.
 *
 * Initialises the pid/tid conversion tables and the futex state, then
 * installs the module. If installation fails, both pieces of state are torn
 * down again so that nothing set up here outlives the failed load.
 *
 * Returns 0 on success or the error from mod_install().
 */
int
_init(void)
{
	int err = 0;

	/* pid/tid conversion hash tables */
	lx_pid_init();

	/* for lx_futex() */
	lx_futex_init();

	err = mod_install(&modlinkage);
	if (err != 0) {
		cmn_err(CE_WARN, "Couldn't install lx brand module");

		/*
		 * Undo lx_pid_init() as well as lx_futex_init(); _fini()
		 * releases both in this same order.
		 */
		lx_pid_fini();

		/*
		 * This looks drastic, but it should never happen.  These
		 * two data structures should be completely free-able until
		 * they are used by Linux processes.  Since the brand
		 * wasn't loaded there should be no Linux processes, and
		 * thus no way for these data structures to be modified.
		 */
		if (lx_futex_fini())
			panic("lx brand module cannot be loaded or unloaded.");
	}
	return (err);
}
/*
 * Module information entry point: reports on this module by handing its
 * linkage structure and the caller's modinfo buffer to mod_info(), and
 * returns whatever mod_info() returns.
 */
int
_info(struct modinfo *modinfop)
{
	int result;

	result = mod_info(&modlinkage, modinfop);
	return (result);
}
/*
 * Module unload entry point.
 *
 * Refuses to unload (EBUSY) while any zone uses the lx brand. Otherwise the
 * pid tables and futex state are torn down and the module is removed. If
 * either the futex teardown or the removal fails, whatever was already torn
 * down is initialised again so the module is left in a usable state, and the
 * failing call's error is returned.
 */
int
_fini(void)
{
	int rc;

	/*
	 * If there are any zones using this brand, we can't allow it to be
	 * unloaded.
	 */
	if (brand_zone_count(&lx_brand) != 0)
		return (EBUSY);

	lx_pid_fini();

	rc = lx_futex_fini();
	if (rc != 0) {
		/* Only the pid tables were torn down; bring them back. */
		lx_pid_init();
		return (rc);
	}

	rc = mod_remove(&modlinkage);
	if (rc != 0) {
		/*
		 * If we can't unload the module, then we have to get it
		 * back into a sane state.
		 */
		lx_pid_init();
		lx_futex_init();
	}

	return (rc);
}