exec.c revision ec25b48f5e0576a68280c5e549673a266f0be346
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/* Copyright (c) 1988 AT&T */
/* All Rights Reserved */
#include <sys/sysmacros.h>
#include <sys/cred_impl.h>
#include <sys/pathname.h>
#include <sys/schedctl.h>
#include <sys/systeminfo.h>
#include <sys/lwpchan_impl.h>
static int hold_execsw(struct execsw *);
#if defined(_SYSCALL32_IMPL)
#endif
/*
* exec() - wrapper around exece providing NULL environment pointer
*
* NOTE(review): the declarator and the body of this function are not
* visible in this view; confirm the signature against the full source.
*/
int
{
}
/*
* exece() - system call wrapper around exec_common()
*
* NOTE(review): the declarator and the statements that set and use
* `error' are not visible in this view; confirm against the full source.
*/
int
{
int error; /* presumably the exec_common() result - TODO confirm */
}
/*
* Common exec path (presumably exec_common(), the routine exece() wraps).
*
* brand_action: EBA_NONE requests no explicit brand change; EBA_NATIVE
* asks for a branded process to be unbranded; any other value asks for an
* unbranded process to be branded. An explicit brand action is refused
* with ENOTSUP when the process is not in a branded zone or is already in
* the requested state.
*
* Returns 0 on success. The fail:/out: exit calls prexecend() under
* p_lock and returns the errno value held in `error'.
*
* NOTE(review): the start of the declarator and many statements are not
* visible in this view; the remaining parameters must be confirmed
* against the full source.
*/
int
int brand_action)
{
long execsz; /* temporary count of exec size */
int i; /* index over the RLIM_NLIMITS resource limits */
int error; /* errno value handed back on the failure paths */
int brandme = 0; /* non-zero: brand the process before the exec proceeds */
/*
* exec() is not supported for the /proc agent lwp.
*/
return (ENOTSUP);
return (error);
if (brand_action != EBA_NONE) {
/*
* Brand actions are not supported for processes that are not
* running in a branded zone.
*/
if (!ZONE_IS_BRANDED(p->p_zone))
return (ENOTSUP);
if (brand_action == EBA_NATIVE) {
/* Only branded processes can be unbranded */
if (!PROC_IS_BRANDED(p))
return (ENOTSUP);
} else {
/* Only unbranded processes can be branded */
if (PROC_IS_BRANDED(p))
return (ENOTSUP);
brandme = 1;
}
} else {
/*
* If this is a native zone, or if the process is already
* branded, then we don't need to do anything. If this is
* a native process in a branded zone, we need to brand the
* process as it exec()s the new binary.
*/
brandme = 1;
}
/*
* Inform /proc that an exec() has started.
* Hold signals that are ignored by default so that we will
* not be interrupted by a signal that will be ignored after
* successful completion of gexec().
*/
mutex_enter(&p->p_lock);
prexecstart();
mutex_exit(&p->p_lock);
/*
* Look up path name and remember last component for later.
* To help coreadm expand its %d token, we attempt to save
* the directory containing the executable in p_execdir. The
* first call to lookuppn() may fail and return EINVAL because
* dirvpp is non-NULL. In that case, we make a second call to
* lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
* but coreadm is allowed to expand %d to the empty string and
* there are other cases in which that failure may occur.
*/
goto out;
goto out;
goto out;
&vp)) != 0) {
goto out;
}
}
goto out;
}
/*
* We do not allow executing files in attribute directories.
* We test this by determining whether the resolved path
* contains a "/" when we're in an attribute directory;
* only if the pathname does not contain a "/" the resolved path
* points to a file in the current working (attribute) directory.
*/
goto out;
}
/* don't free resolvepn until we are done with args */
/*
* Specific exec handlers, or policies determined via
*/
/* If necessary, brand this process before we start the exec. */
if (brandme != 0)
brand_setbrand(p);
if (brandme != 0)
goto fail;
}
/*
* Free floating point registers (sun4u only)
*/
/*
* Free thread and process context ops.
*/
if (p->p_pctx)
freepctx(p, 1);
/*
* Remember file name for accounting; clear any cached DTrace predicate.
*/
/*
* Clear contract template state
*/
/*
* Save the directory in which we found the executable for expanding
* the %d token used in core file patterns.
*/
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
/*
* Reset stack state to the user stack, clear set of signals
* caught on the signal stack, and reset list of signals that
* restart system calls; the new program's environment should
* not be affected by detritus from the old program. Any
* pending held signals remain held, so don't clear t_hold.
*/
mutex_enter(&p->p_lock);
lwp->lwp_oldcontext = 0;
lwp->lwp_ustack = 0;
lwp->lwp_old_stk_ctl = 0;
/*
* Make saved resource limit == current resource limit.
*/
for (i = 0; i < RLIM_NLIMITS; i++) {
/*CONSTCOND*/
if (RLIM_SAVED(i)) {
(void) rctl_rlimit_get(rctlproc_legacy[i], p,
&up->u_saved_rlimit[i]);
}
}
/*
* If the action was to catch the signal, then the action
* must be reset to SIG_DFL.
*/
sigdefault(p);
/*
* Delete the dot4 sigqueues/signotifies.
*/
sigqfree(p);
mutex_exit(&p->p_lock);
/* Zero p_prof.pr_samples while holding p_pflock. */
mutex_enter(&p->p_pflock);
p->p_prof.pr_samples = 0;
mutex_exit(&p->p_pflock);
#if defined(__sparc)
utrap_free(p);
#endif /* __sparc */
/*
* Close all close-on-exec files.
*/
close_exec(P_FINFO(p));
/* Unbrand ourself if requested. */
if (brand_action == EBA_NATIVE)
/* Mark this as an executable vnode */
/*
* Allocate a new lwp directory and lwpid hash table if necessary.
*/
else
}
if (PROC_IS_BRANDED(p))
mutex_enter(&p->p_lock);
prbarrier(p);
/*
* Reset lwp id to the default value of 1.
* This is a single-threaded process now
* and lwp #1 is lwp_wait()able by default.
* The t_unpark flag should not be inherited.
*/
p->p_lwpdaemon = 0; /* but oh well ... */
p->p_lwpid = 1;
/*
* Install the newly-allocated lwp directory and lwpid hash table
* and insert the current thread into the new hash table.
*/
old_lwpdir = p->p_lwpdir;
old_lwpdir_sz = p->p_lwpdir_sz;
old_tidhash = p->p_tidhash;
old_tidhash_sz = p->p_tidhash_sz;
p->p_lwpdir_sz = 2;
p->p_tidhash_sz = 2;
lwp_hash_in(p, lep);
}
/*
* Restore the saved signal mask and
* inform /proc that the exec() has finished.
*/
prexecend();
mutex_exit(&p->p_lock);
if (old_lwpdir) {
}
return (0);
/*
* Failure exit: both labels share the code below, which tells /proc
* that the exec() is over and returns the errno value to the caller.
*/
fail:
out: /* error return */
mutex_enter(&p->p_lock);
prexecend();
mutex_exit(&p->p_lock);
return (error);
}
/*
* Perform generic exec duties and switchout to object-file specific
* handler.
*
* level: installing the new credentials is guarded by level == 0
* (NOTE(review): presumably the outermost, non-nested invocation -
* confirm against the callers).
* execsz: pointer to the caller's running count of the exec size.
* brand_action: brand request handed down by the caller.
*
* Returns 0 on success. Failures leave through the bad: label and return
* the errno value held in `error'.
*
* NOTE(review): the start of the declarator and many statements are not
* visible in this view; confirm details against the full source.
*/
int
int level,
long *execsz,
int brand_action)
{
int error = 0; /* errno value returned through bad: */
int suidflags = 0; /* saved SNOCD/SUGID state, see the comment below */
char magbuf[MAGIC_BYTES]; /* presumably receives the file's magic bytes */
int setid; /* tested against PRIV_SETUGID and PRIV_INCREASE below */
int privflags = 0; /* PRIV_* bits describing the credential change */
int setidfl;
/*
* If the SNOCD or SUGID flag is set, turn it off and remember the
* previous setting so we can restore it if we encounter an error.
*/
}
goto bad;
/* need to open vnode for stateful file systems like rfs */
goto bad;
/*
* Note: to support binary compatibility with SunOS a.out
* executables, we read in the first four bytes, as the
* magic number is in bytes 2-3.
*/
goto bad;
if (resid != 0)
goto bad;
goto bad;
if (level == 0 &&
/* If we can, drop the PA bit */
if ((privflags & PRIV_RESET) != 0)
if (privflags & PRIV_SETID) {
}
}
/*
* Implement the privilege updates:
*
* Restrict with L:
*
* I' = I & L
*
* E' = P' = (I' + F) & A
*
* But if running under ptrace, we cap I with P.
*/
if ((privflags & PRIV_RESET) != 0) {
if ((privflags & PRIV_INCREASE) != 0 &&
}
}
/* SunOS 4.x buy-back */
"!%s, uid %d: setuid execution not allowed, dev=%lx",
}
/*
* execsetid() told us whether or not we had to change the
* credentials of the process. In privflags, it told us
* whether we gained any privileges or executed a set-uid executable.
*/
/*
* should be marked as executable by default.
*/
if (noexec_user_stack)
/*
* Traditionally, the setid flags told the sub processes whether
* the file just executed was set-uid or set-gid; this caused
* some confusion as the 'setid' flag did not match the SUGID
* Now we flag those cases where the calling process cannot
* be trusted to influence the newly exec'ed process, either
* do in fact not match.
* This also makes the runtime linker agree with the on exec
* values of SNOCD and SUGID.
*/
setidfl = 0;
}
if (setid & PRIV_SETUGID)
if (setid & PRIV_INCREASE)
if (error != 0) {
goto bad;
}
if (level == 0) {
/*
* Free the old credentials, and set the new ones.
* Do this for both the process and the (single) thread.
*/
/*
* DTrace accesses t_cred in probe context. t_cred
* must always be either NULL, or point to a valid,
* allocated cred structure.
*/
}
/*
* On emerging from a successful exec(), the saved
* uid and gid equal the effective uid and gid.
*/
/*
* If the real and effective ids do not match, this
* is a setuid process that should not dump core.
* The group comparison is tricky; we prevent the code
* from flagging SNOCD when executing with an effective gid
* which is a supplementary group.
*/
(privflags & PRIV_INCREASE) != 0)
else
suidflags = 0;
if (suidflags) {
}
/*
* If process is traced via /proc, arrange to
* invalidate the associated /proc vnode.
*/
}
if (args->traceinval)
}
return (0);
/* Error exit: `error' and `suidflags' are examined before returning. */
bad:
if (error == 0)
if (suidflags) {
}
return (error);
}
extern char *execswnames[];
/*
* Claim an unused slot in the exec switch table.
*
* Scans execswnames[] for the first NULL entry among the nexectype slots,
* stores ename in it and returns the corresponding &execsw[i]. Returns
* NULL when no slot has a NULL name.
*
* NOTE(review): the declarator, the setup of `ename' and the body of the
* magic-copy loop are not visible in this view.
*/
struct execsw *
{
int i, j;
char *ename;
char *magicp;
for (i = 0; i < nexectype; i++) {
if (execswnames[i] == NULL) {
execswnames[i] = ename;
/*
* Set the magic number last so that we
* don't need to hold the execsw_lock in
* findexectype().
*/
for (j = 0; j < magic_size; j++)
return (&execsw[i]);
}
}
return (NULL);
}
/*
* Find the exec switch table entry with the corresponding magic string.
*
* Returns the matching entry (eswp), or NULL when nothing matches.
*
* NOTE(review): the table scan and the comparison against `magic' are
* not visible in this view.
*/
struct execsw *
findexecsw(char *magic)
{
return (eswp);
}
return (NULL);
}
/*
* Find the execsw[] index for the given exec header string by looking for the
* magic string at a specified offset and length for each kind of executable
* file format until one matches. If no execsw[] entry is found, try to
* autoload a module for this magic string.
*
* Returns the matching entry after hold_execsw() succeeds on it. Returns
* NULL both when no entry matches and when hold_execsw() fails for the
* entry that did match.
*
* NOTE(review): the loop header and the start of the comparison are not
* visible in this view.
*/
struct execsw *
findexec_by_hdr(char *header)
{
eswp->exec_maglen) == 0) {
if (hold_execsw(eswp) != 0)
return (NULL);
return (eswp);
}
}
return (NULL); /* couldn't find the type */
}
/*
* Find the execsw[] index for the given magic string. If no execsw[] entry
* is found, try to autoload a module for this magic string.
*
* Returns the matching entry after hold_execsw() succeeds on it. Returns
* NULL both when no entry matches and when hold_execsw() fails for the
* entry that did match.
*
* NOTE(review): the loop header and the comparison against `magic' are
* not visible in this view.
*/
struct execsw *
findexec_by_magic(char *magic)
{
if (hold_execsw(eswp) != 0)
return (NULL);
return (eswp);
}
}
return (NULL); /* couldn't find the type */
}
/*
* Presumably hold_execsw(), matching the prototype near the top of the
* file - TODO confirm.
*
* Loops while LOADED_EXEC(eswp) is false. Returns 0 once the loop is
* left normally and -1 from the failure path inside the loop; the
* callers above treat any non-zero result as "entry unusable".
*
* NOTE(review): the declarator and most of the loop body are not visible
* in this view.
*/
static int
{
char *name;
while (!LOADED_EXEC(eswp)) {
return (-1);
}
return (0);
}
/*
* Presumably execsetid(), which gexec() refers to - TODO confirm.
*
* Builds and returns a mask of PRIV_* flags starting from 0: PRIV_RESET
* and PRIV_SETID are the bits set in the visible code.
*
* NOTE(review): the declarator and the conditions guarding each flag are
* not visible in this view.
*/
static int
{
int privflags = 0; /* accumulated PRIV_* result bits */
/*
* Remember credentials.
*/
/* Will try to reset the PRIV_AWARE bit later. */
privflags |= PRIV_RESET;
/*
* Set-uid root execution only allowed if the limit set
* holds all unsafe privileges.
*/
}
}
}
/*
* Do we need to change our credential anyway?
* This is the case when E != I or P != I, as
* we need to do the assignments (with F empty and A full)
* Or when I is not a subset of L; in that case we need to
* enforce L.
*
* I' = L & I
*
* E' = P' = (I' + F) & A
* or
* E' = P' = I'
*/
privflags |= PRIV_RESET;
/* If MAC-aware flag(s) are on, need to update cred to remove. */
/*
* When we introduce the "forced" set then we will need
* to set PRIV_INCREASE here if I not a subset of P.
* If the "allowed" set is introduced we will need to do
* a similar thing; however, it seems more reasonable to
* have the allowed set reduce "L": script language interpreters
* would typically have an allowed set of "all".
*/
/*
* the presence of ptrace() compatibility.
*/
privflags |= PRIV_SETID;
}
return (privflags);
}
/*
* Permission check performed on the file being exec'ed.
*
* Checks the access mode and then deals with traced processes: a process
* with P_PR_PTRACE set in p_proc_flag takes the bad: exit, while a
* process traced via /proc has its /proc vnode invalidation arranged.
*
* Returns 0 on success or an errno value.
*
* NOTE(review): the declarator and most statements are not visible in
* this view; confirm the details against the full source.
*/
int
{
int error;
return (error);
/*
* Check the access mode.
* If VPROC, ask /proc if the file is an object file.
*/
if (error == 0)
return (error);
}
/*
* If process is under ptrace(2) compatibility,
* fail the exec(2).
*/
if (p->p_proc_flag & P_PR_PTRACE)
goto bad;
/*
* Process is traced via /proc.
* Arrange to invalidate the /proc vnode.
*/
}
return (0);
bad:
if (error == 0)
return (error);
}
/*
* Map a section of an executable file into the user's
* address space.
*
* The work is split in two parts: the `len' part handled under
* "if (len)", which is either mapped page-wise (`page' set, with an
* optional preread) or read in as one chunk, and the `zfodlen' part
* handled under "if (zfodlen)", where the rest of the last page is zeroed
* when `zfoddiff' is non-zero.
*
* Returns 0 on success; failures leave through bad: and return the errno
* value held in `error'.
*
* NOTE(review): the declarator and many statements are not visible in
* this view; confirm the details against the full source.
*/
int
{
int error = 0;
if (len) {
if (page) {
int preread; /* non-zero when the segment is prefaulted */
} else {
mflag |= MAP_INITDATA;
}
goto bad;
}
goto bad;
/*
* If the segment can fit, then we prefault
* the entire segment in. This is based on the
* model that says the best working set of a
* small program is all of its pages.
*/
preread =
/*
* If we aren't prefaulting the segment,
* increment "deficit", if necessary to ensure
* that pages will become available when this
* process starts executing.
*/
}
if (preread) {
"execmap preread:freemem %d size %lu",
}
} else {
goto bad;
}
goto bad;
/*
* Read in the segment in one big chunk.
*/
goto bad;
/*
* Now set protections.
*/
}
}
}
if (zfodlen) {
if (zfoddiff) {
/*
* Before we go to zero the remaining space on the last
* page, make sure we have write permission.
*/
&zprot);
}
no_fault();
goto bad;
}
no_fault();
}
struct segvn_crargs crargs =
goto bad;
}
if (szc > 0) {
/*
* ASSERT alignment because the mapelfexec()
* caller for the szc > 0 case extended zfod
* so its end is pgsz aligned.
*/
} else {
}
} else {
}
goto bad;
}
}
}
return (0);
bad:
return (error);
}
/*
* Acts on the process's existing p_exec value when it is non-NULL.
*
* NOTE(review): the declarator and every statement other than the
* p_exec test are not visible in this view; confirm against the full
* source.
*/
void
{
if (p->p_exec)
}
/*
* Returns 0 on success or an errno value. On the second failure path
* *fdp is set to -1 before the error is returned.
*
* NOTE(review): the declarator and the calls that produce `error' are
* not visible in this view; confirm against the full source.
*/
int
{
int error = 0;
return (error);
}
*fdp = -1;
return (error);
}
return (0);
}
/*
* NOTE(review): neither the declarator nor the body of this function is
* visible in this view; confirm against the full source.
*/
int
{
}
/*
* noexec stub function.
*
* Always returns ENOEXEC; the arguments are unused, as the ARGSUSED lint
* directive indicates.
*
* NOTE(review): the start and the end of the parameter list are not
* visible in this view.
*/
/*ARGSUSED*/
int
int level,
long *execsz,
int setid,
{
return (ENOEXEC);
}
/*
* Support routines for building a user stack.
*
* execve(path, argv, envp) must construct a new stack with the specified
* arguments and environment variables (see exec_args() for a description
* of the user stack layout). To do this, we copy the arguments and
* environment variables from the old user address space into the kernel,
* free the old as, create the new as, and copy our buffered information
* to the new stack. Our kernel buffer has the following structure:
*
* +-----------------------+ <--- stk_base + stk_size
* | string offsets |
* +-----------------------+ <--- stk_offp
* | |
* | STK_AVAIL() space |
* | |
* +-----------------------+ <--- stk_strp
* | strings |
* +-----------------------+ <--- stk_base
*
* When we add a string, we store the string's contents (including the null
* terminator) at stk_strp, and we store the offset of the string relative to
* stk_base at --stk_offp. As strings are added, stk_strp increases and
* stk_offp decreases. The amount of space remaining, STK_AVAIL(), is just
* the difference between these pointers. If we run out of space, we return
* an error and exec_args() starts all over again with a buffer twice as large.
* When we're all done, the kernel buffer looks like this:
*
* +-----------------------+ <--- stk_base + stk_size
* | argv[0] offset |
* +-----------------------+
* | ... |
* +-----------------------+
* | argv[argc-1] offset |
* +-----------------------+
* | envp[0] offset |
* +-----------------------+
* | ... |
* +-----------------------+
* | envp[envc-1] offset |
* +-----------------------+
* | AT_SUN_PLATFORM offset|
* +-----------------------+
* | AT_SUN_EXECNAME offset|
* +-----------------------+ <--- stk_offp
* | |
* | STK_AVAIL() space |
* | |
* +-----------------------+ <--- stk_strp
* | AT_SUN_EXECNAME string|
* +-----------------------+
* | AT_SUN_PLATFORM string|
* +-----------------------+
* | envp[envc-1] string |
* +-----------------------+
* | ... |
* +-----------------------+
* | envp[0] string |
* +-----------------------+
* | argv[argc-1] string |
* +-----------------------+
* | ... |
* +-----------------------+
* | argv[0] string |
* +-----------------------+ <--- stk_base
*/
/*
* Add a string to the stack.
*
* Strings flagged UIO_USERSPACE take the branch whose failure is returned
* unchanged in `error'; all other strings take the branch that can fail
* with E2BIG. E2BIG is also returned by an earlier check.
*
* Returns 0 on success.
*
* NOTE(review): the declarator, the space checks and the copy calls are
* not visible in this view; confirm against the full source.
*/
static int
{
int error;
return (E2BIG);
if (segflg == UIO_USERSPACE) {
if (error != 0)
return (error);
} else {
return (E2BIG);
}
return (0);
}
/*
* Returns the value left in `error' by whichever branch of the if/else
* ran.
*
* NOTE(review): the declarator and both branch bodies are not visible in
* this view; confirm against the full source.
*/
static int
{
int error;
} else {
}
return (error);
}
/*
* NOTE(review): only the skeleton of an if/else is visible in this view;
* the declarator, the condition and both branches must be confirmed
* against the full source.
*/
static int
{
else
}
/*
* Collect everything that goes on the new user stack into the kernel
* buffer, in this order: the interpreter's name and argument, the argv[]
* strings (skipped when argv[] is empty), the environ[] strings, and the
* AT_SUN_* aux-vector strings. Then compute the stack size and pad the
* string section to align it.
*
* Returns 0 on success, EFAULT from the argv[]/environ[] pointer paths,
* E2BIG from the final size check, or the error of a failed string
* addition.
*
* NOTE(review): the declarator and most statements are not visible in
* this view; confirm the details against the full source.
*/
static int
{
char *sp;
int argv_empty = 0; /* set to 1 when argv[] has no entries */
/*
* Copy interpreter's name and argument to argv[0] and argv[1].
*/
return (error);
return (error);
else
if (error)
return (error);
/*
* Check for an empty argv[].
*/
return (EFAULT);
argv_empty = 1;
}
if (argv_empty == 0) {
/*
* Add argv[] strings to the stack.
*/
for (;;) {
return (EFAULT);
break;
return (error);
}
}
/*
* Add environ[] strings to the stack.
*/
for (;;) {
return (EFAULT);
break;
return (error);
}
}
/*
* Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
* AT_SUN_EMULATOR strings to the stack.
*/
return (error);
return (error);
UIO_SYSSPACE)) != 0)
return (error);
UIO_SYSSPACE)) != 0)
return (error);
}
/*
* Compute the size of the stack. This includes all the pointers,
* the space reserved for the aux vector, and all the strings.
* The total number of pointers is args->na (which is argc + envc)
* plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
* after the last argument (i.e. argv[argc]); (3) the NULL after the
* last environment variable (i.e. envp[envc]); and (4) the NULL after
* all the strings, at the very top of the stack.
*/
/*
* Pad the string section with zeroes to align the stack size.
*/
return (E2BIG);
while (pad-- != 0)
return (0);
}
/*
* Write the buffered arguments out to the new user stack: argc, the
* argv[] pointers, the envp[] pointers and finally all the strings. Along
* the way argc, argv and envp are recorded for /proc and the arguments
* are copied to u_psargs. The aux vector is filled in last, once the user
* addresses of the AT_SUN_* strings are known.
*
* Returns 0 on success and -1 when one of the stack writes fails.
*
* NOTE(review): the declarator and most statements are not visible in
* this view; confirm the details against the full source.
*/
static int
{
int i; /* index into the u_psargs copy below */
/*
* Record argc for /proc.
*/
/*
* Put argc on the stack. Note that even though it's an int,
* it always consumes ptrsize bytes (for alignment).
*/
return (-1);
/*
* Add argc space (ptrsize) to usp and record argv for /proc.
*/
/*
* Put the argv[] pointers on the stack.
*/
return (-1);
/*
* Copy arguments to u_psargs.
*/
for (i = 0; i < pslen; i++)
while (i < PSARGSZ)
/*
* Add space for argv[]'s NULL terminator (ptrsize) to usp and
* record envp for /proc.
*/
/*
* Put the envp[] pointers on the stack.
*/
return (-1);
/*
* Add space for envp[]'s NULL terminator (ptrsize) to usp and
* remember where the stack ends, which is also where auxv begins.
*/
/*
* Put all the argv[], envp[], and auxv strings on the stack.
*/
return (-1);
/*
* Fill in the aux vector now that we know the user stack addresses
* for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
* AT_SUN_EMULATOR strings.
*/
ADDAUX(*a,
ADDAUX(*a,
} else {
ADDAUX(*a,
ADDAUX(*a,
ADDAUX(*a, AT_SUN_BRANDNAME,
ADDAUX(*a, AT_SUN_EMULATOR,
}
}
return (0);
}
/*
* Initialize a new user stack with the specified arguments and environment.
* The initial user stack layout is as follows:
*
* User Stack
* +---------------+ <--- curproc->p_usrstack
* | NULL |
* +---------------+
* | |
* | auxv strings |
* | |
* +---------------+
* | |
* | envp strings |
* | |
* +---------------+
* | |
* | argv strings |
* | |
* +---------------+ <--- ustrp
* | |
* | aux vector |
* | |
* +---------------+ <--- auxv
* | NULL |
* +---------------+
* | envp[envc-1] |
* +---------------+
* | ... |
* +---------------+
* | envp[0] |
* +---------------+ <--- envp[]
* | NULL |
* +---------------+
* | argv[argc-1] |
* +---------------+
* | ... |
* +---------------+
* | argv[0] |
* +---------------+ <--- argv[]
* | argc |
* +---------------+ <--- stack base
*/
int
{
/*
* Outline of the visible steps: pick pointer sizes and the user stack
* address for the data model, gather the arguments (E2BIG when they do
* not fit), tear down per-process state of the old image (doors,
* schedctl, DTrace, lwpchan cache, POSIX timers), replace the address
* space and finally write out the new stack.
*
* NOTE(review): the declarator and many statements are not visible in
* this view; confirm the details against the full source.
*/
int error;
char *usrstack; /* stored in p->p_usrstack once the new as exists */
extern int use_stk_lpg;
if (p->p_model == DATAMODEL_NATIVE) {
args->from_ptrsize = sizeof (long);
} else {
}
args->to_ptrsize = sizeof (long);
} else {
usrstack = (char *)USRSTACK32;
}
#if defined(__sparc)
/*
* Make sure user register windows are empty before
* attempting to make a new stack.
*/
(void) flush_user_windows_to_stack(NULL);
#endif
if (error == 0)
break;
return (error);
return (E2BIG);
}
return (E2BIG);
}
/*
* Leave only the current lwp and force the other lwps to exit.
* If another lwp beat us to the punch by calling exit(), bail out.
*/
return (error);
}
/*
* Revoke any doors created by the process.
*/
if (p->p_door_list)
door_exit();
/*
* Release schedctl data structures.
*/
if (p->p_pagep)
/*
* Clean up any DTrace helpers for the process.
*/
if (p->p_dtrace_helpers != NULL) {
(*dtrace_helpers_cleanup)();
}
mutex_enter(&p->p_lock);
/*
* Cleanup the DTrace provider associated with this process.
*/
if (p->p_dtrace_probes) {
}
mutex_exit(&p->p_lock);
/*
* discard the lwpchan cache.
*/
/*
* Delete the POSIX timers.
*/
timer_exit();
#ifdef C2_AUDIT
if (audit_active)
#endif
/*
* Ensure that we don't change resource associations while we
* change address spaces.
*/
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
/*
* Destroy the old address space and create a new one.
* From here on, any errors are fatal to the exec()ing process.
* On error we return -1, which means the caller must SIGKILL
* the process.
*/
relvm();
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
/* Reset the break and stack bookkeeping and record the new stack address. */
p->p_brksize = 0;
p->p_brkpageszc = 0;
p->p_stksize = 0;
p->p_stkpageszc = 0;
p->p_usrstack = usrstack;
/*
* Reset resource controls such that all controls are again active as
* well as appropriate to the potentially new address model for the
* process.
*/
e.rcep_t = RCENTITY_PROCESS;
rctl_set_reset(p->p_rctls, p, &e);
/* Too early to call map_pgsz for the heap */
if (use_stk_lpg) {
}
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
if (p->p_model == DATAMODEL_ILP32)
/*
* Finally, write out the contents of the new stack.
*/
return (error);
}