common/os/exec.c

	exec.c revision dc32d872cbeb56532bcea030255db9cd79bac7da
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*  Copyright (c) 1988 AT&T */
/*    All Rights Reserved   */
/*
 * Copyright 2014, Joyent, Inc.  All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/cred_impl.h>
#include <sys/policy.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/file.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/acct.h>
#include <sys/cpuvar.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/vm.h>
#include <sys/lgrp.h>
#include <sys/vtrace.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/kmem.h>
#include <sys/prsystm.h>
#include <sys/modctl.h>
#include <sys/vmparam.h>
#include <sys/door.h>
#include <sys/schedctl.h>
#include <sys/utrap.h>
#include <sys/systeminfo.h>
#include <sys/stack.h>
#include <sys/rctl.h>
#include <sys/dtrace.h>
#include <sys/lwpchan_impl.h>
#include <sys/pool.h>
#include <sys/sdt.h>
#include <sys/brand.h>
#include <sys/klpd.h>

#include <c2/audit.h>

#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>

/*
 * Flag bits reported by execsetid() and accumulated in gexec()'s
 * privflags; each bit is independent and they may be OR-ed together.
 */
#define PRIV_RESET      0x01    /* needs to reset privs */
#define PRIV_SETID      0x02    /* needs to change uids */
#define PRIV_SETUGID        0x04    /* is setuid/setgid/forced privs */
#define PRIV_INCREASE       0x08    /* child runs with more privs */
#define MAC_FLAGS       0x10    /* need to adjust MAC flags */
#define PRIV_FORCED     0x20    /* has forced privileges */

/*
 * Forward declarations of helpers private to this file.  execsetid() is
 * called from gexec() and yields the PRIV_* / MAC_FLAGS bits; hold_execsw()
 * is called by the findexec_by_*() lookups once a matching entry is found.
 */
static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *,
    priv_set_t *, cred_t *, const char *);
static int hold_execsw(struct execsw *);

/*
 * Hardware capability words for the aux vector.  They start out as zero
 * here; nothing in this part of the file assigns them.
 */
uint_t auxv_hwcap = 0;  /* auxv AT_SUN_HWCAP value; determined on the fly */
uint_t auxv_hwcap_2 = 0;    /* AT_SUN_HWCAP2 */
#if defined(_SYSCALL32_IMPL)
uint_t auxv_hwcap32 = 0;    /* 32-bit version of auxv_hwcap */
uint_t auxv_hwcap32_2 = 0;  /* 32-bit version of auxv_hwcap2 */
#endif

/*
 * Process flags that gexec() clears on entry to a top-level (level 0) exec,
 * restores if the exec fails, and sets again on success when the resulting
 * credential's real and effective ids differ or privileges increased.
 */
#define PSUIDFLAGS      (SNOCD|SUGID)

/*
 * exece() - exec system call entry point.
 *
 * Passes the caller's path, argument vector and environment vector to
 * exec_common() with no brand action (EBA_NONE).  A non-zero result from
 * exec_common() is handed to set_errno() and that call's value is returned;
 * otherwise 0 is returned.
 */
int
exece(const char *fname, const char **argp, const char **envp)
{
    int err = exec_common(fname, argp, envp, EBA_NONE);

    if (err != 0)
        return (set_errno(err));
    return (0);
}

/*
 * exec_common() - implementation shared by exece() and any other caller
 * that needs to request a brand action.
 *
 * fname, argp, envp
 *	The path, argument vector and environment vector as supplied by
 *	the caller.  fname is read with UIO_USERSPACE; all three are passed
 *	on to gexec() in a struct execa.
 * brand_action
 *	EBA_NONE for an ordinary exec, EBA_NATIVE to unbrand a branded
 *	process, any other value to brand an unbranded process.  Any value
 *	other than EBA_NONE is rejected with ENOTSUP outside a branded zone.
 *
 * Returns 0 on success or an errno value on failure.  Errors produced
 * directly here are ENOTSUP (agent lwp, or a brand action that does not
 * apply), ENOENT (lookup produced no vnode) and EACCES (file in an
 * attribute directory); anything else comes from the routines called.
 *
 * Every exit taken after prexecstart() goes through the "out" label (or
 * the success path), which puts back the signal hold mask saved at the
 * start and calls prexecend().
 */
int
exec_common(const char *fname, const char **argp, const char **envp,
    int brand_action)
{
    vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
    proc_t *p = ttoproc(curthread);
    klwp_t *lwp = ttolwp(curthread);
    struct user *up = PTOU(p);
    long execsz;        /* temporary count of exec size */
    int i;
    int error;
    char exec_file[MAXCOMLEN+1];
    struct pathname pn;
    struct pathname resolvepn;
    struct uarg args;
    struct execa ua;
    k_sigset_t savedmask;
    lwpdir_t *lwpdir = NULL;
    tidhash_t *tidhash;
    lwpdir_t *old_lwpdir = NULL;
    uint_t old_lwpdir_sz;
    tidhash_t *old_tidhash;
    uint_t old_tidhash_sz;
    ret_tidhash_t *ret_tidhash;
    lwpent_t *lep;
    boolean_t brandme = B_FALSE;

    /*
     * exec() is not supported for the /proc agent lwp.
     */
    if (curthread == p->p_agenttp)
        return (ENOTSUP);

    if (brand_action != EBA_NONE) {
        /*
         * Brand actions are not supported for processes that are not
         * running in a branded zone.
         */
        if (!ZONE_IS_BRANDED(p->p_zone))
            return (ENOTSUP);

        if (brand_action == EBA_NATIVE) {
            /* Only branded processes can be unbranded */
            if (!PROC_IS_BRANDED(p))
                return (ENOTSUP);
        } else {
            /* Only unbranded processes can be branded */
            if (PROC_IS_BRANDED(p))
                return (ENOTSUP);
            brandme = B_TRUE;
        }
    } else {
        /*
         * If this is a native zone, or if the process is already
         * branded, then we don't need to do anything.  If this is
         * a native process in a branded zone, we need to brand the
         * process as it exec()s the new binary.
         */
        if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
            brandme = B_TRUE;
    }

    /*
     * Inform /proc that an exec() has started.
     * Hold signals that are ignored by default so that we will
     * not be interrupted by a signal that will be ignored after
     * successful completion of gexec().
     */
    mutex_enter(&p->p_lock);
    prexecstart();
    schedctl_finish_sigblock(curthread);
    savedmask = curthread->t_hold;
    sigorset(&curthread->t_hold, &ignoredefault);
    mutex_exit(&p->p_lock);

    /*
     * Look up path name and remember last component for later.
     * To help coreadm expand its %d token, we attempt to save
     * the directory containing the executable in p_execdir. The
     * first call to lookuppn() may fail and return EINVAL because
     * dirvpp is non-NULL. In that case, we make a second call to
     * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
     * but coreadm is allowed to expand %d to the empty string and
     * there are other cases in which that failure may occur.
     */
    if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
        goto out;
    pn_alloc(&resolvepn);
    if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
        pn_free(&resolvepn);
        pn_free(&pn);
        if (error != EINVAL)
            goto out;

        /* Retry from scratch, this time without asking for the directory. */
        dir = NULL;
        if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
            goto out;
        pn_alloc(&resolvepn);
        if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
            &vp)) != 0) {
            pn_free(&resolvepn);
            pn_free(&pn);
            goto out;
        }
    }
    if (vp == NULL) {
        if (dir != NULL)
            VN_RELE(dir);
        error = ENOENT;
        pn_free(&resolvepn);
        pn_free(&pn);
        goto out;
    }

    if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
        if (dir != NULL)
            VN_RELE(dir);
        pn_free(&resolvepn);
        pn_free(&pn);
        VN_RELE(vp);
        goto out;
    }

    /*
     * We do not allow executing files in attribute directories.
     * We test this by determining whether the resolved path
     * contains a "/" when we're in an attribute directory;
     * only if the pathname does not contain a "/" the resolved path
     * points to a file in the current working (attribute) directory.
     */
    if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
        strchr(resolvepn.pn_path, '/') == NULL) {
        if (dir != NULL)
            VN_RELE(dir);
        error = EACCES;
        pn_free(&resolvepn);
        pn_free(&pn);
        VN_RELE(vp);
        goto out;
    }

    /*
     * exec_file was zeroed over MAXCOMLEN+1 bytes and at most MAXCOMLEN
     * bytes are copied in, so it is always NUL-terminated.
     */
    bzero(exec_file, MAXCOMLEN+1);
    (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
    bzero(&args, sizeof (args));
    args.pathname = resolvepn.pn_path;
    /* don't free resolvepn until we are done with args */
    pn_free(&pn);

    /*
     * If we're running in a profile shell, then call pfexecd.
     */
    if ((CR_FLAGS(p->p_cred) & PRIV_PFEXEC) != 0) {
        error = pfexec_call(p->p_cred, &resolvepn, &args.pfcred,
            &args.scrubenv);

        /* Returning errno in case we're not allowed to execute. */
        if (error > 0) {
            if (dir != NULL)
                VN_RELE(dir);
            pn_free(&resolvepn);
            VN_RELE(vp);
            goto out;
        }

        /* Don't change the credentials when using old ptrace. */
        if (args.pfcred != NULL &&
            (p->p_proc_flag & P_PR_PTRACE) != 0) {
            crfree(args.pfcred);
            args.pfcred = NULL;
            args.scrubenv = B_FALSE;
        }
    }

    /*
     * Specific exec handlers, or policies determined via
     * /etc/system may override the historical default.
     */
    args.stk_prot = PROT_ZFOD;
    args.dat_prot = PROT_ZFOD;

    CPU_STATS_ADD_K(sys, sysexec, 1);
    DTRACE_PROC1(exec, char *, args.pathname);

    ua.fname = fname;
    ua.argp = argp;
    ua.envp = envp;

    /* If necessary, brand this process before we start the exec. */
    if (brandme)
        brand_setbrand(p);

    /*
     * Top-level invocation: level 0 and no intpdata.  On failure undo
     * the branding done just above and drop everything the lookup gave us.
     */
    if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
        exec_file, p->p_cred, brand_action)) != 0) {
        if (brandme)
            brand_clearbrand(p, B_FALSE);
        VN_RELE(vp);
        if (dir != NULL)
            VN_RELE(dir);
        pn_free(&resolvepn);
        goto fail;
    }

    /*
     * Free floating point registers (sun4u only)
     */
    ASSERT(lwp != NULL);
    lwp_freeregs(lwp, 1);

    /*
     * Free thread and process context ops.
     */
    if (curthread->t_ctx)
        freectx(curthread, 1);
    if (p->p_pctx)
        freepctx(p, 1);

    /*
     * Remember file name for accounting; clear any cached DTrace predicate.
     */
    up->u_acflag &= ~AFORK;
    bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
    curthread->t_predcache = NULL;

    /*
     * Clear contract template state
     */
    lwp_ctmpl_clear(lwp);

    /*
     * Save the directory in which we found the executable for expanding
     * the %d token used in core file patterns.
     * p_execdir gets its own hold; the previous value is released only
     * after p_lock has been dropped.
     */
    mutex_enter(&p->p_lock);
    tmpvp = p->p_execdir;
    p->p_execdir = dir;
    if (p->p_execdir != NULL)
        VN_HOLD(p->p_execdir);
    mutex_exit(&p->p_lock);

    if (tmpvp != NULL)
        VN_RELE(tmpvp);

    /*
     * Reset stack state to the user stack, clear set of signals
     * caught on the signal stack, and reset list of signals that
     * restart system calls; the new program's environment should
     * not be affected by detritus from the old program.  Any
     * pending held signals remain held, so don't clear t_hold.
     */
    mutex_enter(&p->p_lock);
    lwp->lwp_oldcontext = 0;
    lwp->lwp_ustack = 0;
    lwp->lwp_old_stk_ctl = 0;
    sigemptyset(&up->u_signodefer);
    sigemptyset(&up->u_sigonstack);
    sigemptyset(&up->u_sigresethand);
    lwp->lwp_sigaltstack.ss_sp = 0;
    lwp->lwp_sigaltstack.ss_size = 0;
    lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;

    /*
     * Make saved resource limit == current resource limit.
     */
    for (i = 0; i < RLIM_NLIMITS; i++) {
        /*CONSTCOND*/
        if (RLIM_SAVED(i)) {
            (void) rctl_rlimit_get(rctlproc_legacy[i], p,
                &up->u_saved_rlimit[i]);
        }
    }

    /*
     * If the action was to catch the signal, then the action
     * must be reset to SIG_DFL.
     */
    sigdefault(p);
    p->p_flag &= ~(SNOWAIT|SJCTL);
    p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
    up->u_signal[SIGCLD - 1] = SIG_DFL;

    /*
     * Delete the dot4 sigqueues/signotifies.
     */
    sigqfree(p);

    mutex_exit(&p->p_lock);

    /* Clear the profiling state under its own lock (p_pflock). */
    mutex_enter(&p->p_pflock);
    p->p_prof.pr_base = NULL;
    p->p_prof.pr_size = 0;
    p->p_prof.pr_off = 0;
    p->p_prof.pr_scale = 0;
    p->p_prof.pr_samples = 0;
    mutex_exit(&p->p_pflock);

    ASSERT(curthread->t_schedctl == NULL);

#if defined(__sparc)
    if (p->p_utraps != NULL)
        utrap_free(p);
#endif  /* __sparc */

    /*
     * Close all close-on-exec files.
     */
    close_exec(P_FINFO(p));
    TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);

    /* Unbrand ourself if necessary. */
    if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
        brand_clearbrand(p, B_FALSE);

    setregs(&args);

    /* Mark this as an executable vnode */
    mutex_enter(&vp->v_lock);
    vp->v_flag |= VVMEXEC;
    mutex_exit(&vp->v_lock);

    /* Done with the lookup results; args.pathname is not used past here. */
    VN_RELE(vp);
    if (dir != NULL)
        VN_RELE(dir);
    pn_free(&resolvepn);

    /*
     * Allocate a new lwp directory and lwpid hash table if necessary.
     * The allocations happen here, before p_lock is taken; the new
     * tables are installed further down with p_lock held.
     */
    if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
        lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
        lwpdir->ld_next = lwpdir + 1;
        tidhash = kmem_zalloc(2 * sizeof (tidhash_t), KM_SLEEP);
        if (p->p_lwpdir != NULL)
            lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
        else
            lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
    }

    if (PROC_IS_BRANDED(p))
        BROP(p)->b_exec();

    mutex_enter(&p->p_lock);
    prbarrier(p);

    /*
     * Reset lwp id to the default value of 1.
     * This is a single-threaded process now
     * and lwp #1 is lwp_wait()able by default.
     * The t_unpark flag should not be inherited.
     */
    ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
    curthread->t_tid = 1;
    kpreempt_disable();
    ASSERT(curthread->t_lpl != NULL);
    p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
    kpreempt_enable();
    if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
        lgrp_update_trthr_migrations(1);
    }
    curthread->t_unpark = 0;
    curthread->t_proc_flag |= TP_TWAIT;
    curthread->t_proc_flag &= ~TP_DAEMON;   /* daemons shouldn't exec */
    p->p_lwpdaemon = 0;         /* but oh well ... */
    p->p_lwpid = 1;

    /*
     * Install the newly-allocated lwp directory and lwpid hash table
     * and insert the current thread into the new hash table.
     * The old tables are remembered so they can be freed once p_lock
     * has been dropped.
     */
    if (lwpdir != NULL) {
        old_lwpdir = p->p_lwpdir;
        old_lwpdir_sz = p->p_lwpdir_sz;
        old_tidhash = p->p_tidhash;
        old_tidhash_sz = p->p_tidhash_sz;
        p->p_lwpdir = p->p_lwpfree = lwpdir;
        p->p_lwpdir_sz = 2;
        lep->le_thread = curthread;
        lep->le_lwpid = curthread->t_tid;
        lep->le_start = curthread->t_start;
        lwp_hash_in(p, lep, tidhash, 2, 0);
        p->p_tidhash = tidhash;
        p->p_tidhash_sz = 2;
    }
    /* Detach the list of retired tidhash tables; it is freed below. */
    ret_tidhash = p->p_ret_tidhash;
    p->p_ret_tidhash = NULL;

    /*
     * Restore the saved signal mask and
     * inform /proc that the exec() has finished.
     */
    curthread->t_hold = savedmask;
    prexecend();
    mutex_exit(&p->p_lock);
    /* old_tidhash and the two sizes are only valid when old_lwpdir is set. */
    if (old_lwpdir) {
        kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
        kmem_free(old_tidhash, old_tidhash_sz * sizeof (tidhash_t));
    }
    while (ret_tidhash != NULL) {
        ret_tidhash_t *next = ret_tidhash->rth_next;
        kmem_free(ret_tidhash->rth_tidhash,
            ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
        kmem_free(ret_tidhash, sizeof (*ret_tidhash));
        ret_tidhash = next;
    }

    ASSERT(error == 0);
    DTRACE_PROC(exec__success);
    return (0);

    /*
     * "fail" is used only once gexec() has been attempted; it fires the
     * exec__failure probe and then falls through to the common cleanup.
     */
fail:
    DTRACE_PROC1(exec__failure, int, error);
out:        /* error return */
    mutex_enter(&p->p_lock);
    curthread->t_hold = savedmask;
    prexecend();
    mutex_exit(&p->p_lock);
    ASSERT(error != 0);
    return (error);
}


/*
 * Perform generic exec duties and switchout to object-file specific
 * handler.
 *
 * vpp		in/out: vnode of the file to execute.  The pointer is handed
 *		to VOP_OPEN() and *vpp is re-read afterwards.
 * uap		the caller's exec arguments; passed through to exec_func.
 * args		exec state shared with the caller.  This routine reads
 *		pfcred and pathname, fills in execswp and ex_vp, may clear
 *		PROT_EXEC in stk_prot and may set traceinval.
 * idatap	passed through to exec_func unchanged.
 * level	0 for the top-level exec.  Credential selection, set-id
 *		handling, closing of the previous executable and the /proc
 *		and ptrace notifications are done only at level 0.
 * execsz	passed through to exec_func.
 * exec_file	name used in the "setuid execution not allowed" messages and
 *		passed through to exec_func.
 * cred		credential to exec with.  When the credential must change it
 *		is replaced locally by a crdup() copy or by args->pfcred.
 * brand_action	passed through to exec_func.
 *
 * Returns 0 on success, otherwise an errno value.  A failure that did not
 * produce an error code of its own (short read of the header, unknown
 * magic number) is reported as ENOEXEC.
 *
 * On a failed level-0 exec the SNOCD/SUGID flags cleared on entry are put
 * back, and any credential that would have been installed -- including a
 * pfexec credential handed in through args->pfcred that was never adopted
 * -- is released, so the caller must not free args->pfcred itself.
 */
int
gexec(
    struct vnode **vpp,
    struct execa *uap,
    struct uarg *args,
    struct intpdata *idatap,
    int level,
    long *execsz,
    caddr_t exec_file,
    struct cred *cred,
    int brand_action)
{
    struct vnode *vp, *execvp = NULL;
    proc_t *pp = ttoproc(curthread);
    struct execsw *eswp;
    int error = 0;
    int suidflags = 0;
    ssize_t resid;
    uid_t uid, gid;
    struct vattr vattr;
    char magbuf[MAGIC_BYTES];
    int setid;
    cred_t *oldcred, *newcred = NULL;
    int privflags = 0;
    int setidfl;
    priv_set_t fset;

    /*
     * If the SNOCD or SUGID flag is set, turn it off and remember the
     * previous setting so we can restore it if we encounter an error.
     */
    if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
        mutex_enter(&pp->p_lock);
        suidflags = pp->p_flag & PSUIDFLAGS;
        pp->p_flag &= ~PSUIDFLAGS;
        mutex_exit(&pp->p_lock);
    }

    if ((error = execpermissions(*vpp, &vattr, args)) != 0)
        goto bad_noclose;

    /* need to open vnode for stateful file systems */
    if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
        goto bad_noclose;
    vp = *vpp;

    /*
     * Note: to support binary compatibility with SunOS a.out
     * executables, we read in the first four bytes, as the
     * magic number is in bytes 2-3.  (The read itself covers all of
     * magbuf; a short read is treated as "not executable".)
     */
    if ((error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
        (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0)
        goto bad;
    if (resid != 0)
        goto bad;

    if ((eswp = findexec_by_hdr(magbuf)) == NULL)
        goto bad;

    if (level == 0 &&
        (privflags = execsetid(vp, &vattr, &uid, &gid, &fset,
        args->pfcred == NULL ? cred : args->pfcred, args->pathname)) != 0) {

        /* Pfcred is a credential with a ref count of 1 */

        if (args->pfcred != NULL) {
            privflags |= PRIV_INCREASE|PRIV_RESET;
            newcred = cred = args->pfcred;
        } else {
            newcred = cred = crdup(cred);
        }

        /* If we can, drop the PA bit */
        if ((privflags & PRIV_RESET) != 0)
            priv_adjust_PA(cred);

        if (privflags & PRIV_SETID) {
            cred->cr_uid = uid;
            cred->cr_gid = gid;
            cred->cr_suid = uid;
            cred->cr_sgid = gid;
        }

        if (privflags & MAC_FLAGS) {
            if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
                CR_FLAGS(cred) &= ~NET_MAC_AWARE;
            CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
        }

        /*
         * Implement the privilege updates:
         *
         * Restrict with L:
         *
         *  I' = I & L
         *
         *  E' = P' = (I' + F) & A
         *
         * But if running under ptrace, we cap I and F with P.
         */
        if ((privflags & (PRIV_RESET|PRIV_FORCED)) != 0) {
            if ((privflags & PRIV_INCREASE) != 0 &&
                (pp->p_proc_flag & P_PR_PTRACE) != 0) {
                priv_intersect(&CR_OPPRIV(cred),
                    &CR_IPRIV(cred));
                priv_intersect(&CR_OPPRIV(cred), &fset);
            }
            priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
            CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
            if (privflags & PRIV_FORCED) {
                priv_set_PA(cred);
                priv_union(&fset, &CR_EPRIV(cred));
                priv_union(&fset, &CR_PPRIV(cred));
            }
            priv_adjust_PA(cred);
        }
    } else if (level == 0 && args->pfcred != NULL) {
        newcred = cred = args->pfcred;
        privflags |= PRIV_INCREASE;
        /* pfcred is not forced to adhere to these settings */
        priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
        CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
        priv_adjust_PA(cred);
    }

    /*
     * SunOS 4.x buy-back: the file has a set-id mode bit but lives on a
     * file system mounted VFS_NOSETUID.  Only a note is logged here; this
     * block does not fail the exec.
     */
    if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
        (vattr.va_mode & (VSUID|VSGID))) {
        char path[MAXNAMELEN];
        refstr_t *mntpt = NULL;
        int ret = -1;

        bzero(path, sizeof (path));
        zone_hold(pp->p_zone);

        ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
            sizeof (path), cred);

        /* fallback to mountpoint if a path can't be found */
        if (ret != 0 || path[0] == '\0')
            mntpt = vfs_getmntpoint(vp->v_vfsp);

        if (mntpt == NULL)
            zcmn_err(pp->p_zone->zone_id, CE_NOTE,
                "!uid %d: setuid execution not allowed, "
                "file=%s", cred->cr_uid, path);
        else
            zcmn_err(pp->p_zone->zone_id, CE_NOTE,
                "!uid %d: setuid execution not allowed, "
                "fs=%s, file=%s", cred->cr_uid,
                ZONE_PATH_TRANSLATE(refstr_value(mntpt),
                pp->p_zone), exec_file);

        if (!INGLOBALZONE(pp)) {
            /*
             * zone_rootpath always has trailing /, so skip the
             * first character of path.  mntpt == NULL implies
             * path is non-empty, so path + 1 is in bounds.
             */
            if (mntpt == NULL)
                cmn_err(CE_NOTE, "!zone: %s, uid: %d "
                    "setuid execution not allowed, file=%s%s",
                    pp->p_zone->zone_name, cred->cr_uid,
                    pp->p_zone->zone_rootpath, path + 1);
            else
                cmn_err(CE_NOTE, "!zone: %s, uid: %d "
                    "setuid execution not allowed, fs=%s, "
                    "file=%s", pp->p_zone->zone_name,
                    cred->cr_uid, refstr_value(mntpt),
                    exec_file);
        }

        if (mntpt != NULL)
            refstr_rele(mntpt);

        zone_rele(pp->p_zone);
    }

    /*
     * execsetid() told us whether or not we had to change the
     * credentials of the process.  In privflags, it told us
     * whether we gained any privileges or executed a set-uid executable.
     */
    setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE|PRIV_FORCED));

    /*
     * Use /etc/system variable to determine if the stack
     * should be marked as executable by default.
     */
    if (noexec_user_stack)
        args->stk_prot &= ~PROT_EXEC;

    args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
    args->ex_vp = vp;

    /*
     * Traditionally, the setid flags told the sub processes whether
     * the file just executed was set-uid or set-gid; this caused
     * some confusion as the 'setid' flag did not match the SUGID
     * process flag which is only set when the uids/gids do not match.
     * A script set-gid/set-uid to the real uid/gid would start with
     * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
     * Now we flag those cases where the calling process cannot
     * be trusted to influence the newly exec'ed process, either
     * because it runs with more privileges or when the uids/gids
     * do in fact not match.
     * This also makes the runtime linker agree with the on exec
     * values of SNOCD and SUGID.
     */
    setidfl = 0;
    if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
        !supgroupmember(cred->cr_gid, cred))) {
        setidfl |= EXECSETID_UGIDS;
    }
    if (setid & PRIV_SETUGID)
        setidfl |= EXECSETID_SETID;
    if (setid & PRIV_FORCED)
        setidfl |= EXECSETID_PRIVS;

    /*
     * Keep our own hold on the current executable across exec_func so
     * it can still be closed and released below.
     */
    execvp = pp->p_exec;
    if (execvp)
        VN_HOLD(execvp);

    error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
        setidfl, exec_file, cred, brand_action);
    /* Presumably pairs with the reader hold taken via findexec_by_hdr(). */
    rw_exit(eswp->exec_lock);
    if (error != 0) {
        if (execvp)
            VN_RELE(execvp);
        /*
         * If this process's p_exec has been set to the vp of
         * the executable by exec_func, we will return without
         * calling VOP_CLOSE because proc_exit will close it
         * on exit.
         */
        if (pp->p_exec == vp)
            goto bad_noclose;
        else
            goto bad;
    }

    if (level == 0) {
        uid_t oruid;

        if (execvp != NULL) {
            /*
             * Close the previous executable only if we are
             * at level 0.
             */
            (void) VOP_CLOSE(execvp, FREAD, 1, (offset_t)0,
                cred, NULL);
        }

        mutex_enter(&pp->p_crlock);

        oruid = pp->p_cred->cr_ruid;

        if (newcred != NULL) {
            /*
             * Free the old credentials, and set the new ones.
             * Do this for both the process and the (single) thread.
             */
            crfree(pp->p_cred);
            pp->p_cred = cred;  /* cred already held for proc */
            crhold(cred);       /* hold new cred for thread */
            /*
             * DTrace accesses t_cred in probe context.  t_cred
             * must always be either NULL, or point to a valid,
             * allocated cred structure.
             */
            oldcred = curthread->t_cred;
            curthread->t_cred = cred;
            crfree(oldcred);

            if (priv_basic_test >= 0 &&
                !PRIV_ISASSERT(&CR_IPRIV(newcred),
                priv_basic_test)) {
                pid_t pid = pp->p_pid;
                char *fn = PTOU(pp)->u_comm;

                cmn_err(CE_WARN, "%s[%d]: exec: basic_test "
                    "privilege removed from E/I", fn, pid);
            }
        }
        /*
         * On emerging from a successful exec(), the saved
         * uid and gid equal the effective uid and gid.
         */
        cred->cr_suid = cred->cr_uid;
        cred->cr_sgid = cred->cr_gid;

        /*
         * If the real and effective ids do not match, this
         * is a setuid process that should not dump core.
         * The group comparison is tricky; we prevent the code
         * from flagging SNOCD when executing with an effective gid
         * which is a supplementary group.
         */
        if (cred->cr_ruid != cred->cr_uid ||
            (cred->cr_rgid != cred->cr_gid &&
            !supgroupmember(cred->cr_gid, cred)) ||
            (privflags & PRIV_INCREASE) != 0)
            suidflags = PSUIDFLAGS;
        else
            suidflags = 0;

        mutex_exit(&pp->p_crlock);
        if (newcred != NULL && oruid != newcred->cr_ruid) {
            /* Note that the process remains in the same zone. */
            mutex_enter(&pidlock);
            upcount_dec(oruid, crgetzoneid(newcred));
            upcount_inc(newcred->cr_ruid, crgetzoneid(newcred));
            mutex_exit(&pidlock);
        }
        if (suidflags) {
            mutex_enter(&pp->p_lock);
            pp->p_flag |= suidflags;
            mutex_exit(&pp->p_lock);
        }
        if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
            /*
             * If process is traced via /proc, arrange to
             * invalidate the associated /proc vnode.
             */
            if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
                args->traceinval = 1;
        }
        if (pp->p_proc_flag & P_PR_PTRACE)
            psignal(pp, SIGTRAP);
        if (args->traceinval)
            prinvalidate(&pp->p_user);
    }
    if (execvp)
        VN_RELE(execvp);
    return (0);

bad:
    (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, cred, NULL);

bad_noclose:
    /*
     * Drop the credential this exec would have installed.  If we failed
     * before the credential-selection step, newcred is still NULL but a
     * level-0 caller may have handed us a pfexec credential (ref count
     * of 1) in args->pfcred; release that too so it is not leaked.
     * Nested levels share args with level 0 and must leave pfcred alone.
     */
    if (newcred != NULL)
        crfree(newcred);
    else if (level == 0 && args->pfcred != NULL)
        crfree(args->pfcred);
    if (level == 0)
        args->pfcred = NULL;    /* freed above, if it was ever set */
    if (error == 0)
        error = ENOEXEC;

    /* Put back the SNOCD/SUGID bits cleared on entry. */
    if (suidflags) {
        mutex_enter(&pp->p_lock);
        pp->p_flag |= suidflags;
        mutex_exit(&pp->p_lock);
    }
    return (error);
}

extern char *execswnames[];

/*
 * Claim the first unused execsw[] slot (one whose execswnames[] entry is
 * NULL) for a new executable type.
 *
 * name		NUL-terminated handler name; a private copy is stored in
 *		execswnames[].
 * magic	magic bytes identifying the format; magic_size bytes are
 *		copied into a private buffer stored in exec_magic.
 * magic_size	number of bytes to copy from magic.
 *
 * Returns a pointer to the claimed execsw[] entry, or NULL when every slot
 * is already in use.  The other fields of the entry are not touched here.
 */
struct execsw *
allocate_execsw(char *name, char *magic, size_t magic_size)
{
    struct execsw *eswp = NULL;
    size_t namelen;
    char *ename;
    char *magicp;
    int i;

    mutex_enter(&execsw_lock);
    for (i = 0; i < nexectype; i++) {
        if (execswnames[i] != NULL)
            continue;

        namelen = strlen(name) + 1;     /* include the NUL */
        ename = kmem_alloc(namelen, KM_SLEEP);
        bcopy(name, ename, namelen);
        execswnames[i] = ename;
        /*
         * Set the magic number last; the lookup routines in this
         * file scan execsw[] without taking execsw_lock.
         */
        magicp = kmem_alloc(magic_size, KM_SLEEP);
        bcopy(magic, magicp, magic_size);
        execsw[i].exec_magic = magicp;
        eswp = &execsw[i];
        break;
    }
    mutex_exit(&execsw_lock);
    return (eswp);
}

/*
 * Look up the exec switch entry whose magic string matches the start of
 * 'magic'.  Entries with a zero exec_maglen are skipped.  Unlike the
 * findexec_by_*() routines, no hold is taken on the entry returned.
 *
 * Returns the first matching entry, or NULL if there is none or if
 * 'magic' is NULL.
 */
struct execsw *
findexecsw(char *magic)
{
    struct execsw *entry;

    for (entry = execsw; entry < &execsw[nexectype]; entry++) {
        ASSERT(entry->exec_maglen <= MAGIC_BYTES);

        if (magic == NULL || entry->exec_maglen == 0)
            continue;
        if (bcmp(magic, entry->exec_magic, entry->exec_maglen) != 0)
            continue;

        return (entry);
    }
    return (NULL);
}

/*
 * Find the execsw[] entry for an executable given the start of its header.
 *
 * Each entry with a non-zero exec_maglen is tried in table order: its
 * exec_magic is compared against exec_maglen bytes of "header" starting at
 * offset exec_magoff.  The first entry that matches is passed to
 * hold_execsw(); if that succeeds the entry is returned, otherwise NULL is
 * returned without trying later entries.  NULL is also returned when
 * "header" is NULL or no entry matches.
 */
struct execsw *
findexec_by_hdr(char *header)
{
    struct execsw *eswp;

    for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
        ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
        if (header == NULL || eswp->exec_maglen == 0)
            continue;
        if (bcmp(header + eswp->exec_magoff, eswp->exec_magic,
            eswp->exec_maglen) != 0)
            continue;

        /* First match decides the outcome. */
        if (hold_execsw(eswp) == 0)
            return (eswp);
        return (NULL);
    }
    return (NULL);  /* couldn't find the type */
}

/*
 * Find the execsw[] entry for the given magic string and take a hold on it.
 *
 * The table is searched in order for the first entry with a non-zero
 * exec_maglen whose exec_magic equals the first exec_maglen bytes of
 * "magic".  That entry is handed to hold_execsw(); the entry is returned
 * if the hold succeeds and NULL if it fails.  NULL is also returned when
 * "magic" is NULL or no entry matches.
 */
struct execsw *
findexec_by_magic(char *magic)
{
    struct execsw *eswp = execsw;

    while (eswp < &execsw[nexectype]) {
        ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
        if (magic != NULL && eswp->exec_maglen != 0 &&
            bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
            /* First match decides the outcome. */
            return (hold_execsw(eswp) != 0 ? NULL : eswp);
        }
        eswp++;
    }
    return (NULL);  /* couldn't find the type */
}

/*
 * Take a reader hold on an exec switch entry, loading its module if needed.
 *
 * Loops until LOADED_EXEC(eswp) is true while eswp->exec_lock is held as
 * reader.  Each time the entry is found not loaded, the lock is dropped
 * and modload("exec", <name from execswnames[]>) is attempted before
 * re-checking.
 *
 * Returns 0 with eswp->exec_lock held as RW_READER, or -1 (lock not held)
 * if modload() fails.
 */
static int
hold_execsw(struct execsw *eswp)
{
    char *modname;

    for (;;) {
        rw_enter(eswp->exec_lock, RW_READER);
        if (LOADED_EXEC(eswp))
            return (0);
        rw_exit(eswp->exec_lock);

        modname = execswnames[eswp - execsw];
        ASSERT(modname);
        if (modload("exec", modname) == -1)
            return (-1);
    }
}

/*
 * Work out which credential adjustments exec'ing the file "vp" requires.
 *
 *   vp        vnode being executed; only its vfs flags are examined here
 *   vattrp    attributes of vp (va_mode, va_uid and va_gid are read)
 *   uidp/gidp out: the uid/gid to assume; written ONLY when PRIV_SETID is
 *             set in the return value
 *   fset      handed to get_forced_privs() for set-uid root files
 *             (presumably receives the forced privilege set -- confirm
 *             against get_forced_privs())
 *   cr        the credential the decision is based on
 *   pathname  handed to get_forced_privs()
 *
 * Returns a bitmask built from PRIV_RESET, PRIV_SETUGID, PRIV_FORCED,
 * PRIV_INCREASE, MAC_FLAGS and PRIV_SETID; 0 means nothing was flagged.
 */
static int
execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp,
    priv_set_t *fset, cred_t *cr, const char *pathname)
{
    proc_t *pp = ttoproc(curthread);
    uid_t uid, gid;
    int privflags = 0;

    /*
     * Remember credentials.
     */
    uid = cr->cr_uid;
    gid = cr->cr_gid;

    /* Will try to reset the PRIV_AWARE bit later. */
    if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
        privflags |= PRIV_RESET;

    /* Set-id bits are only considered if the filesystem allows them. */
    if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
        /*
         * If it's a set-uid root program we perform the
         * forced privilege look-aside. This has three possible
         * outcomes:
         *  no look aside information -> treat as before
         *  look aside in Limit set -> apply forced privs
         *  look aside not in Limit set -> ignore set-uid root
         *
         * Ordinary set-uid root execution only allowed if the limit
         * set holds all unsafe privileges.
         */
        if (vattrp->va_mode & VSUID) {
            if (vattrp->va_uid == 0) {
                int res = get_forced_privs(cr, pathname, fset);

                switch (res) {
                case -1:
                    /* Ordinary set-uid root, if L permits. */
                    if (priv_issubset(&priv_unsafe,
                        &CR_LPRIV(cr))) {
                        uid = vattrp->va_uid;
                        privflags |= PRIV_SETUGID;
                    }
                    break;
                case 0:
                    /* Forced privileges; uid is not changed. */
                    privflags |= PRIV_FORCED|PRIV_INCREASE;
                    break;
                default:
                    /* Any other result: set-uid bit is ignored. */
                    break;
                }
            } else {
                /* Set-uid to a non-root owner. */
                uid = vattrp->va_uid;
                privflags |= PRIV_SETUGID;
            }
        }
        if (vattrp->va_mode & VSGID) {
            gid = vattrp->va_gid;
            privflags |= PRIV_SETUGID;
        }
    }

    /*
     * Do we need to change our credential anyway?
     * This is the case when E != I or P != I, as
     * we need to do the assignments (with F empty and A full)
     * Or when I is not a subset of L; in that case we need to
     * enforce L.
     *
     *      I' = L & I
     *
     *      E' = P' = (I' + F) & A
     * or
     *      E' = P' = I'
     */
    if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
        !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
        !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
        privflags |= PRIV_RESET;

    /* Child has more privileges than parent */
    if (!priv_issubset(&CR_IPRIV(cr), &CR_PPRIV(cr)))
        privflags |= PRIV_INCREASE;

    /* If MAC-aware flag(s) are on, need to update cred to remove. */
    if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
        (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
        privflags |= MAC_FLAGS;
    /*
     * Set setuid/setgid protections if no ptrace() compatibility.
     * For privileged processes, honor setuid/setgid even in
     * the presence of ptrace() compatibility.
     *
     * The ids are only reported back (and PRIV_SETID set) when at
     * least one of the real or saved ids in cr differs from the
     * uid/gid chosen above.
     */
    if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
        PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
        (cr->cr_uid != uid ||
        cr->cr_gid != gid ||
        cr->cr_suid != uid ||
        cr->cr_sgid != gid)) {
        *uidp = uid;
        *gidp = gid;
        privflags |= PRIV_SETID;
    }
    return (privflags);
}

int
execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
{
    int error;
    proc_t *p = ttoproc(curthread);

    vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
    if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
        return (error);
    /*
     * Check the access mode.
     * If VPROC, ask /proc if the file is an object file.
     */
    if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
        !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
        (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
        (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
        if (error == 0)
            error = EACCES;
        return (error);
    }

    if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
        (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
        /*
         * If process is under ptrace(2) compatibility,
         * fail the exec(2).
         */
        if (p->p_proc_flag & P_PR_PTRACE)
            goto bad;
        /*
         * Process is traced via /proc.
         * Arrange to invalidate the /proc vnode.
         */
        args->traceinval = 1;
    }
    return (0);
bad:
    if (error == 0)
        error = ENOEXEC;
    return (error);
}

/*
 * Map a section of an executable file into the user's
 * address space.
 *
 *   vp       vnode of the executable
 *   addr     user virtual address at which the section's data begins
 *   len      number of file-backed bytes (0 means none)
 *   zfodlen  number of zero-filled bytes that follow the file-backed
 *            part (0 means none)
 *   offset   file offset of the section's data
 *   prot     protections the section should end up with
 *   page     non-zero: map the file with VOP_MAP(); zero: create
 *            anonymous memory and read the data into it with vn_rdwr()
 *   szc      page size code; only consulted when creating the trailing
 *            zero-fill mapping
 *
 * Returns 0 on success or an errno value (ENOMEM for an unacceptable
 * address range or a failed as_setprot(), EFAULT if zeroing the tail of
 * the last page faults, or whatever the mapping/read routines return).
 */
int
execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
    off_t offset, int prot, int page, uint_t szc)
{
    int error = 0;
    off_t oldoffset;
    caddr_t zfodbase, oldaddr;
    size_t end, oldlen;
    size_t zfoddiff;
    label_t ljb;
    proc_t *p = ttoproc(curthread);

    /*
     * Work with a PAGEMASK-masked address; the caller's exact values
     * are kept in oldaddr/oldlen/oldoffset for the vn_rdwr() below.
     */
    oldaddr = addr;
    addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
    if (len) {
        oldlen = len;
        /* Grow len by the amount addr moved down. */
        len += ((size_t)oldaddr - (size_t)addr);
        oldoffset = offset;
        offset = (off_t)((uintptr_t)offset & PAGEMASK);
        if (page) {
            spgcnt_t  prefltmem, availm, npages;
            int preread;
            uint_t mflag = MAP_PRIVATE | MAP_FIXED;

            /* Executable and not writable => text, else data. */
            if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
                mflag |= MAP_TEXT;
            } else {
                mflag |= MAP_INITDATA;
            }

            if (valid_usr_range(addr, len, prot, p->p_as,
                p->p_as->a_userlimit) != RANGE_OKAY) {
                error = ENOMEM;
                goto bad;
            }
            if (error = VOP_MAP(vp, (offset_t)offset,
                p->p_as, &addr, len, prot, PROT_ALL,
                mflag, CRED(), NULL))
                goto bad;

            /*
             * If the segment can fit, then we prefault
             * the entire segment in.  This is based on the
             * model that says the best working set of a
             * small program is all of its pages.
             */
            npages = (spgcnt_t)btopr(len);
            prefltmem = freemem - desfree;
            preread =
                (npages < prefltmem && len < PGTHRESH) ? 1 : 0;

            /*
             * If we aren't prefaulting the segment,
             * increment "deficit", if necessary to ensure
             * that pages will become available when this
             * process starts executing.
             */
            availm = freemem - lotsfree;
            if (preread == 0 && npages > availm &&
                deficit < lotsfree) {
                deficit += MIN((pgcnt_t)(npages - availm),
                    lotsfree - deficit);
            }

            if (preread) {
                TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
                    "execmap preread:freemem %d size %lu",
                    freemem, len);
                /* Best effort: the result is ignored. */
                (void) as_fault(p->p_as->a_hat, p->p_as,
                    (caddr_t)addr, len, F_INVAL, S_READ);
            }
        } else {
            if (valid_usr_range(addr, len, prot, p->p_as,
                p->p_as->a_userlimit) != RANGE_OKAY) {
                error = ENOMEM;
                goto bad;
            }

            /* Create the memory using zfod_argsp ... */
            if (error = as_map(p->p_as, addr, len,
                segvn_create, zfod_argsp))
                goto bad;
            /*
             * Read in the segment in one big chunk.
             */
            if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
                oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
                (rlim64_t)0, CRED(), (ssize_t *)0))
                goto bad;
            /*
             * Now set protections.
             */
            if (prot != PROT_ZFOD) {
                (void) as_setprot(p->p_as, (caddr_t)addr,
                    len, prot);
            }
        }
    }

    if (zfodlen) {
        struct as *as = curproc->p_as;
        struct seg *seg;
        uint_t zprot = 0;

        /*
         * "end" is the first byte past the file-backed data,
         * zfodbase is "end" rounded up to a page boundary and
         * zfoddiff is the gap between the two, i.e. the part of the
         * last file-backed page that must be zeroed by hand.
         */
        end = (size_t)addr + len;
        zfodbase = (caddr_t)roundup(end, PAGESIZE);
        zfoddiff = (uintptr_t)zfodbase - end;
        if (zfoddiff) {
            /*
             * Before we go to zero the remaining space on the last
             * page, make sure we have write permission.
             *
             * Normal illumos binaries don't even hit the case
             * where we have to change permission on the last page
             * since their protection is typically either
             *    PROT_USER | PROT_WRITE | PROT_READ
             * or
             *    PROT_ZFOD (same as PROT_ALL).
             *
             * We need to be careful how we zero-fill the last page
             * if the segment protection does not include
             * PROT_WRITE. Using as_setprot() can cause the VM
             * segment code to call segvn_vpage(), which must
             * allocate a page struct for each page in the segment.
             * If we have a very large segment, this may fail, so
             * we have to check for that, even though we ignore
             * other return values from as_setprot.
             */

            AS_LOCK_ENTER(as, RW_READER);
            seg = as_segat(curproc->p_as, (caddr_t)end);
            if (seg != NULL)
                SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
                    &zprot);
            AS_LOCK_EXIT(as);

            /* Temporarily add PROT_WRITE if it is missing. */
            if (seg != NULL && (zprot & PROT_WRITE) == 0) {
                if (as_setprot(as, (caddr_t)end, zfoddiff - 1,
                    zprot | PROT_WRITE) == ENOMEM) {
                    error = ENOMEM;
                    goto bad;
                }
            }

            /*
             * Fault-protected zeroing: if uzero() faults, control
             * resumes in this branch, where the original
             * protection is restored before failing with EFAULT.
             */
            if (on_fault(&ljb)) {
                no_fault();
                if (seg != NULL && (zprot & PROT_WRITE) == 0)
                    (void) as_setprot(as, (caddr_t)end,
                        zfoddiff - 1, zprot);
                error = EFAULT;
                goto bad;
            }
            uzero((void *)end, zfoddiff);
            no_fault();
            /* Put the original protection back. */
            if (seg != NULL && (zprot & PROT_WRITE) == 0)
                (void) as_setprot(as, (caddr_t)end,
                    zfoddiff - 1, zprot);
        }
        /* Anything beyond the last file page gets its own mapping. */
        if (zfodlen > zfoddiff) {
            struct segvn_crargs crargs =
                SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

            zfodlen -= zfoddiff;
            if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
                p->p_as->a_userlimit) != RANGE_OKAY) {
                error = ENOMEM;
                goto bad;
            }
            if (szc > 0) {
                /*
                 * ASSERT alignment because the mapelfexec()
                 * caller for the szc > 0 case extended zfod
                 * so its end is pgsz aligned.
                 */
                size_t pgsz = page_get_pagesize(szc);
                ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));

                if (IS_P2ALIGNED(zfodbase, pgsz)) {
                    crargs.szc = szc;
                } else {
                    crargs.szc = AS_MAP_HEAP;
                }
            } else {
                crargs.szc = AS_MAP_NO_LPOOB;
            }
            if (error = as_map(p->p_as, (caddr_t)zfodbase,
                zfodlen, segvn_create, &crargs))
                goto bad;
            if (prot != PROT_ZFOD) {
                (void) as_setprot(p->p_as, (caddr_t)zfodbase,
                    zfodlen, prot);
            }
        }
    }
    return (0);
bad:
    return (error);
}

/*
 * Install the exec environment described by "ep" in the current process.
 *
 * Copies the bss base and the break base/size into the proc structure,
 * replaces p_exec with ep->ex_vp (releasing the previous vnode, if any,
 * and holding the new one, if any), and disables the current lwp's
 * alternate signal stack.
 */
void
setexecenv(struct execenv *ep)
{
    proc_t *procp = ttoproc(curthread);
    klwp_t *lwpp = ttolwp(curthread);
    struct vnode *newvp;

    procp->p_bssbase = ep->ex_bssbase;
    procp->p_brkbase = ep->ex_brkbase;
    procp->p_brksize = ep->ex_brksize;

    /* Drop the reference on the outgoing executable ... */
    if (procp->p_exec != NULL)
        VN_RELE(procp->p_exec);

    /* ... and take one on the incoming executable. */
    newvp = ep->ex_vp;
    procp->p_exec = newvp;
    if (newvp != NULL)
        VN_HOLD(newvp);

    lwpp->lwp_sigaltstack.ss_sp = 0;
    lwpp->lwp_sigaltstack.ss_size = 0;
    lwpp->lwp_sigaltstack.ss_flags = SS_DISABLE;
}

int
execopen(struct vnode **vpp, int *fdp)
{
    struct vnode *vp = *vpp;
    file_t *fp;
    int error = 0;
    int filemode = FREAD;

    VN_HOLD(vp);        /* open reference */
    if (error = falloc(NULL, filemode, &fp, fdp)) {
        VN_RELE(vp);
        *fdp = -1;  /* just in case falloc changed value */
        return (error);
    }
    if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
        VN_RELE(vp);
        setf(*fdp, NULL);
        unfalloc(fp);
        *fdp = -1;
        return (error);
    }
    *vpp = vp;      /* vnode should not have changed */
    fp->f_vnode = vp;
    mutex_exit(&fp->f_tlock);
    setf(*fdp, fp);
    return (0);
}

/*
 * Close descriptor "fd" via closeandsetf(), installing NULL in its slot.
 * Returns whatever closeandsetf() returns.
 */
int
execclose(int fd)
{
    int error;

    error = closeandsetf(fd, NULL);
    return (error);
}


/*
 * noexec stub function.
 *
 * Has the exec handler signature but runs nothing: it logs a warning
 * naming uap->fname and fails with ENOEXEC.  All other arguments are
 * ignored.
 */
/*ARGSUSED*/
int
noexec(struct vnode *vp, struct execa *uap, struct uarg *args,
    struct intpdata *idatap, int level, long *execsz, int setid,
    caddr_t exec_file, struct cred *cred)
{
    cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
    return (ENOEXEC);
}

/*
 * Support routines for building a user stack.
 *
 * execve(path, argv, envp) must construct a new stack with the specified
 * arguments and environment variables (see exec_args() for a description
 * of the user stack layout).  To do this, we copy the arguments and
 * environment variables from the old user address space into the kernel,
 * free the old as, create the new as, and copy our buffered information
 * to the new stack.  Our kernel buffer has the following structure:
 *
 *  +-----------------------+ <--- stk_base + stk_size
 *  | string offsets    |
 *  +-----------------------+ <--- stk_offp
 *  |           |
 *  | STK_AVAIL() space |
 *  |           |
 *  +-----------------------+ <--- stk_strp
 *  | strings       |
 *  +-----------------------+ <--- stk_base
 *
 * When we add a string, we store the string's contents (including the null
 * terminator) at stk_strp, and we store the offset of the string relative to
 * stk_base at --stk_offp.  As strings are added, stk_strp increases and
 * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
 * the difference between these pointers.  If we run out of space, we return
 * an error and exec_args() starts all over again with a buffer twice as large.
 * When we're all done, the kernel buffer looks like this:
 *
 *  +-----------------------+ <--- stk_base + stk_size
 *  | argv[0] offset    |
 *  +-----------------------+
 *  | ...           |
 *  +-----------------------+
 *  | argv[argc-1] offset   |
 *  +-----------------------+
 *  | envp[0] offset    |
 *  +-----------------------+
 *  | ...           |
 *  +-----------------------+
 *  | envp[envc-1] offset   |
 *  +-----------------------+
 *  | AT_SUN_PLATFORM offset|
 *  +-----------------------+
 *  | AT_SUN_EXECNAME offset|
 *  +-----------------------+ <--- stk_offp
 *  |           |
 *  | STK_AVAIL() space |
 *  |           |
 *  +-----------------------+ <--- stk_strp
 *  | AT_SUN_EXECNAME string|
 *  +-----------------------+
 *  | AT_SUN_PLATFORM string|
 *  +-----------------------+
 *  | envp[envc-1] string   |
 *  +-----------------------+
 *  | ...           |
 *  +-----------------------+
 *  | envp[0] string    |
 *  +-----------------------+
 *  | argv[argc-1] string   |
 *  +-----------------------+
 *  | ...           |
 *  +-----------------------+
 *  | argv[0] string    |
 *  +-----------------------+ <--- stk_base
 */

#define STK_AVAIL(args)     ((char *)(args)->stk_offp - (args)->stk_strp)

/*
 * Add a string to the stack.
 *
 * Reserves one int slot below stk_offp, records there the offset (from
 * stk_base) at which the string will be stored, then copies the string,
 * including its terminating NUL, to stk_strp and advances stk_strp past
 * it.  "segflg" says whether "sp" is a user address (UIO_USERSPACE,
 * copied with copyinstr()) or a kernel address (anything else).
 *
 * Returns 0 on success, E2BIG if there is no room for the offset slot or
 * for a kernel string, or the copyinstr() error for a user string.  On
 * failure the offset slot may already have been consumed.
 */
static int
stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
{
    size_t nbytes;
    int err;

    if (STK_AVAIL(args) < sizeof (int))
        return (E2BIG);
    args->stk_offp--;
    *args->stk_offp = args->stk_strp - args->stk_base;

    if (segflg != UIO_USERSPACE) {
        nbytes = strlen(sp) + 1;
        if (nbytes > STK_AVAIL(args))
            return (E2BIG);
        bcopy(sp, args->stk_strp, nbytes);
    } else {
        err = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &nbytes);
        if (err != 0)
            return (err);
    }

    args->stk_strp += nbytes;
    return (0);
}

/*
 * Fetch one pointer-sized word from user address "src" and return it in
 * *dst as a pointer.
 *
 * The width of the fetch follows args->from_model: fulword() for
 * DATAMODEL_NATIVE, otherwise fuword32() with the 32-bit value widened to
 * a pointer.
 *
 * Returns the result of the fetch routine; callers in this file treat any
 * non-zero value as EFAULT.  The temporaries start out as zero so that
 * *dst is never assigned from an uninitialized local; if the fetch routine
 * leaves its output untouched on failure, *dst ends up NULL in that case.
 */
static int
stk_getptr(uarg_t *args, char *src, char **dst)
{
    int error;

    if (args->from_model == DATAMODEL_NATIVE) {
        ulong_t ptr = 0;

        error = fulword(src, &ptr);
        *dst = (caddr_t)ptr;
    } else {
        uint32_t ptr = 0;

        error = fuword32(src, &ptr);
        *dst = (caddr_t)(uintptr_t)ptr;
    }
    return (error);
}

/*
 * Store the pointer "value" at user address "addr".
 *
 * The width of the store follows args->to_model: sulword() for
 * DATAMODEL_NATIVE, otherwise suword32() with the value truncated to 32
 * bits.  Returns the result of the store routine.
 */
static int
stk_putptr(uarg_t *args, char *addr, char *value)
{
    if (args->to_model != DATAMODEL_NATIVE)
        return (suword32(addr, (uint32_t)(uintptr_t)value));

    return (sulword(addr, (ulong_t)value));
}

/*
 * Gather everything that will live on the new user stack into the kernel
 * staging buffer described by args (stk_base / stk_strp / stk_offp).
 *
 * Strings are added in this order:
 *   1. if "intp" names an interpreter: the interpreter name, its optional
 *      argument, and the script name (args->fname if set, else the user's
 *      uap->fname); the caller's original argv[0] is then skipped;
 *   2. the (remaining) argv[] strings;
 *   3. the envp[] strings, dropping entries that begin with "LD_" when
 *      args->scrubenv is set;
 *   4. if auxvpp and *auxvpp are non-NULL: the platform name,
 *      args->pathname, and the optional brand and emulator names.
 *
 * On success sets args->arglen, args->na, args->ne, args->usrstack_size
 * and args->nc and returns 0.  Returns EFAULT if an argv/envp pointer
 * cannot be fetched, the stk_add() error if a string cannot be added, or
 * E2BIG if the alignment padding does not fit.
 */
static int
stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
{
    char *sp;
    int argc, error;
    int argv_empty = 0;
    size_t ptrsize = args->from_ptrsize;
    size_t size, pad;
    char *argv = (char *)uap->argp;
    char *envp = (char *)uap->envp;

    /*
     * Copy interpreter's name and argument to argv[0] and argv[1].
     */
    if (intp != NULL && intp->intp_name != NULL) {
        if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
            return (error);
        if (intp->intp_arg != NULL &&
            (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
            return (error);
        if (args->fname != NULL)
            error = stk_add(args, args->fname, UIO_SYSSPACE);
        else
            error = stk_add(args, uap->fname, UIO_USERSPACE);
        if (error)
            return (error);

        /*
         * Check for an empty argv[].
         */
        if (stk_getptr(args, argv, &sp))
            return (EFAULT);
        if (sp == NULL)
            argv_empty = 1;

        argv += ptrsize;        /* ignore original argv[0] */
    }

    if (argv_empty == 0) {
        /*
         * Add argv[] strings to the stack.
         */
        for (;;) {
            if (stk_getptr(args, argv, &sp))
                return (EFAULT);
            if (sp == NULL)
                break;
            if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
                return (error);
            argv += ptrsize;
        }
    }

    /*
     * One offset slot has been consumed per string, so the distance
     * from the top of the buffer down to stk_offp is the string count.
     */
    argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
    args->arglen = args->stk_strp - args->stk_base;

    /*
     * Add environ[] strings to the stack.
     */
    if (envp != NULL) {
        for (;;) {
            char *tmp = args->stk_strp;
            if (stk_getptr(args, envp, &sp))
                return (EFAULT);
            if (sp == NULL)
                break;
            if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
                return (error);
            if (args->scrubenv && strncmp(tmp, "LD_", 3) == 0) {
                /*
                 * Undo the copied string: rewind stk_strp and
                 * give back the offset slot.  The slot holds an
                 * int, so clear it with 0 rather than NULL.
                 */
                args->stk_strp = tmp;
                *(args->stk_offp++) = 0;
            }
            envp += ptrsize;
        }
    }
    args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
    args->ne = args->na - argc;

    /*
     * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
     * AT_SUN_EMULATOR strings to the stack.
     */
    if (auxvpp != NULL && *auxvpp != NULL) {
        if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
            return (error);
        if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
            return (error);
        if (args->brandname != NULL &&
            (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
            return (error);
        if (args->emulator != NULL &&
            (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
            return (error);
    }

    /*
     * Compute the size of the stack.  This includes all the pointers,
     * the space reserved for the aux vector, and all the strings.
     * The total number of pointers is args->na (which is argc + envc)
     * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
     * after the last argument (i.e. argv[argc]); (3) the NULL after the
     * last environment variable (i.e. envp[envc]); and (4) the NULL after
     * all the strings, at the very top of the stack.
     */
    size = (args->na + 4) * args->to_ptrsize + args->auxsize +
        (args->stk_strp - args->stk_base);

    /*
     * Pad the string section with zeroes to align the stack size.
     */
    pad = P2NPHASE(size, args->stk_align);

    if (STK_AVAIL(args) < pad)
        return (E2BIG);

    args->usrstack_size = size + pad;

    while (pad-- != 0)
        *args->stk_strp++ = 0;

    /* Total bytes of string data, padding included. */
    args->nc = args->stk_strp - args->stk_base;

    return (0);
}

/*
 * Copy the kernel staging buffer (as filled in by stk_copyin()) out to the
 * new user stack whose top is "usrstack".
 *
 * Writes argc, the argv[] and envp[] pointer arrays and the string data to
 * user memory, records argc/argv/envp and the ps argument string in "up",
 * sets args->stackend, and, when auxvpp and *auxvpp are non-NULL, appends
 * the AT_SUN_* string entries to the aux vector through ADDAUX().
 *
 * Returns 0 on success, or -1 if any store to user memory fails.
 */
static int
stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
{
    size_t ptrsize = args->to_ptrsize;
    ssize_t pslen;
    char *kstrp = args->stk_base;
    /* User address of the strings: nc bytes plus one pointer below top. */
    char *ustrp = usrstack - args->nc - ptrsize;
    /* Lowest address of the new stack image. */
    char *usp = usrstack - args->usrstack_size;
    /*
     * Offsets were stored downward from the top of the kernel buffer,
     * so *--offp yields them in the order the strings were added.
     */
    int *offp = (int *)(args->stk_base + args->stk_size);
    int envc = args->ne;
    int argc = args->na - envc;
    int i;

    /*
     * Record argc for /proc.
     */
    up->u_argc = argc;

    /*
     * Put argc on the stack.  Note that even though it's an int,
     * it always consumes ptrsize bytes (for alignment).
     */
    if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
        return (-1);

    /*
     * Add argc space (ptrsize) to usp and record argv for /proc.
     */
    up->u_argv = (uintptr_t)(usp += ptrsize);

    /*
     * Put the argv[] pointers on the stack.
     */
    for (i = 0; i < argc; i++, usp += ptrsize)
        if (stk_putptr(args, usp, &ustrp[*--offp]))
            return (-1);

    /*
     * Copy arguments to u_psargs.  The NULs that separate the
     * argument strings become spaces, and the rest of the array is
     * zero-filled.
     */
    pslen = MIN(args->arglen, PSARGSZ) - 1;
    for (i = 0; i < pslen; i++)
        up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
    while (i < PSARGSZ)
        up->u_psargs[i++] = '\0';

    /*
     * Add space for argv[]'s NULL terminator (ptrsize) to usp and
     * record envp for /proc.
     */
    up->u_envp = (uintptr_t)(usp += ptrsize);

    /*
     * Put the envp[] pointers on the stack.
     */
    for (i = 0; i < envc; i++, usp += ptrsize)
        if (stk_putptr(args, usp, &ustrp[*--offp]))
            return (-1);

    /*
     * Add space for envp[]'s NULL terminator (ptrsize) to usp and
     * remember where the stack ends, which is also where auxv begins.
     */
    args->stackend = usp += ptrsize;

    /*
     * Put all the argv[], envp[], and auxv strings on the stack.
     */
    if (copyout(args->stk_base, ustrp, args->nc))
        return (-1);

    /*
     * Fill in the aux vector now that we know the user stack addresses
     * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
     * AT_SUN_EMULATOR strings.  The offsets are consumed in the same
     * order in which stk_copyin() added these strings.
     */
    if (auxvpp != NULL && *auxvpp != NULL) {
        if (args->to_model == DATAMODEL_NATIVE) {
            auxv_t **a = (auxv_t **)auxvpp;
            ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
            ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
            if (args->brandname != NULL)
                ADDAUX(*a,
                    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
            if (args->emulator != NULL)
                ADDAUX(*a,
                    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
        } else {
            auxv32_t **a = (auxv32_t **)auxvpp;
            ADDAUX(*a,
                AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
            ADDAUX(*a,
                AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
            if (args->brandname != NULL)
                ADDAUX(*a, AT_SUN_BRANDNAME,
                    (int)(uintptr_t)&ustrp[*--offp])
            if (args->emulator != NULL)
                ADDAUX(*a, AT_SUN_EMULATOR,
                    (int)(uintptr_t)&ustrp[*--offp])
        }
    }

    return (0);
}

/*
 * Initialize a new user stack with the specified arguments and environment.
 * The initial user stack layout is as follows:
 *
 *  User Stack
 *  +---------------+ <--- curproc->p_usrstack
 *  |       |
 *  | slew      |
 *  |       |
 *  +---------------+
 *  | NULL      |
 *  +---------------+
 *  |       |
 *  | auxv strings  |
 *  |       |
 *  +---------------+
 *  |       |
 *  | envp strings  |
 *  |       |
 *  +---------------+
 *  |       |
 *  | argv strings  |
 *  |       |
 *  +---------------+ <--- ustrp
 *  |       |
 *  | aux vector    |
 *  |       |
 *  +---------------+ <--- auxv
 *  | NULL      |
 *  +---------------+
 *  | envp[envc-1]  |
 *  +---------------+
 *  | ...       |
 *  +---------------+
 *  | envp[0]   |
 *  +---------------+ <--- envp[]
 *  | NULL      |
 *  +---------------+
 *  | argv[argc-1]  |
 *  +---------------+
 *  | ...       |
 *  +---------------+
 *  | argv[0]   |
 *  +---------------+ <--- argv[]
 *  | argc      |
 *  +---------------+ <--- stack base
 */
/*
 * Build the initial user stack for the image being exec'ed and switch the
 * process over to a fresh address space.
 *
 * The work is done in three phases:
 *   1. The stack contents are gathered into a kernel buffer by stk_copyin();
 *      the buffer is grown until everything fits or the ncargs limit for
 *      the target data model is reached.
 *   2. Per-process state that must not survive the exec (other lwps, doors,
 *      schedctl data, DTrace helpers and probes, the lwpchan cache, POSIX
 *      timers, the ITIMER_REALPROF timer) is torn down and the old address
 *      space is released with relvm().
 *   3. A new address space is allocated and stk_copyout() writes the
 *      buffered stack into it.
 *
 * uap and intp are not interpreted here; they are only handed to
 * stk_copyin().  auxvpp is likewise passed through to stk_copyin() and
 * stk_copyout().  args supplies the description of the target image
 * (to_model, addr32, stk_prot, dat_prot, execswp, ex_vp, pfcred) and
 * receives the data model and stack buffer bookkeeping filled in below.
 *
 * Returns 0 on success.  Failures up to and including exitlwps() return an
 * errno value while the old address space is still in place.  After relvm()
 * the result of stk_copyout() is returned and any error is fatal to the
 * process (see the comment above relvm()).  The kernel buffer is freed on
 * every return path.
 */
int
exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
{
    size_t size;
    int error;
    proc_t *p = ttoproc(curthread);
    user_t *up = PTOU(p);
    char *usrstack;
    rctl_entity_p_t e;
    struct as *as;
    extern int use_stk_lpg;
    size_t sp_slew;

    /*
     * Record the data model and pointer width of the process calling
     * exec(), i.e. the model the incoming arguments are expressed in.
     */
    args->from_model = p->p_model;
    if (p->p_model == DATAMODEL_NATIVE) {
        args->from_ptrsize = sizeof (long);
    } else {
        args->from_ptrsize = sizeof (int32_t);
    }

    /*
     * Pick the pointer width, argument size limit, stack alignment and
     * top-of-stack address that match the data model of the new image.
     * A native image flagged addr32 gets its own stack top.
     */
    if (args->to_model == DATAMODEL_NATIVE) {
        args->to_ptrsize = sizeof (long);
        args->ncargs = NCARGS;
        args->stk_align = STACK_ALIGN;
        if (args->addr32)
            usrstack = (char *)USRSTACK64_32;
        else
            usrstack = (char *)USRSTACK;
    } else {
        args->to_ptrsize = sizeof (int32_t);
        args->ncargs = NCARGS32;
        args->stk_align = STACK_ALIGN32;
        usrstack = (char *)USRSTACK32;
    }

    ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);

#if defined(__sparc)
    /*
     * Make sure user register windows are empty before
     * attempting to make a new stack.
     */
    (void) flush_user_windows_to_stack(NULL);
#endif

    /*
     * Gather the stack contents into a kernel buffer.  Start with a
     * single page and double the buffer each time stk_copyin() reports
     * that it did not fit (E2BIG or ENAMETOOLONG); any other error is
     * final.  stk_strp starts at the bottom of the buffer and stk_offp
     * at its end.  Once a buffer of at least ncargs bytes has proven
     * too small the exec fails with E2BIG, whichever of the two errors
     * stk_copyin() reported.
     */
    for (size = PAGESIZE; ; size *= 2) {
        args->stk_size = size;
        args->stk_base = kmem_alloc(size, KM_SLEEP);
        args->stk_strp = args->stk_base;
        args->stk_offp = (int *)(args->stk_base + size);
        error = stk_copyin(uap, args, intp, auxvpp);
        if (error == 0)
            break;
        kmem_free(args->stk_base, size);
        if (error != E2BIG && error != ENAMETOOLONG)
            return (error);
        if (size >= args->ncargs)
            return (E2BIG);
    }

    /*
     * From here on 'size' no longer describes the kernel buffer (that is
     * kept in args->stk_size) but the size of the user stack image.
     * NOTE(review): usrstack_size is not set in this function; presumably
     * stk_copyin() computes it -- confirm.
     */
    size = args->usrstack_size;

    ASSERT(error == 0);
    ASSERT(P2PHASE(size, args->stk_align) == 0);
    ASSERT((ssize_t)STK_AVAIL(args) >= 0);

    /*
     * The final stack image is subject to the same ncargs limit as the
     * kernel buffer.
     */
    if (size > args->ncargs) {
        kmem_free(args->stk_base, args->stk_size);
        return (E2BIG);
    }

    /*
     * Leave only the current lwp and force the other lwps to exit.
     * If another lwp beat us to the punch by calling exit(), bail out.
     */
    if ((error = exitlwps(0)) != 0) {
        kmem_free(args->stk_base, args->stk_size);
        return (error);
    }

    /*
     * Revoke any doors created by the process.
     */
    if (p->p_door_list)
        door_exit();

    /*
     * Release schedctl data structures.
     */
    if (p->p_pagep)
        schedctl_proc_cleanup();

    /*
     * Clean up any DTrace helpers for the process.
     */
    if (p->p_dtrace_helpers != NULL) {
        ASSERT(dtrace_helpers_cleanup != NULL);
        (*dtrace_helpers_cleanup)();
    }

    mutex_enter(&p->p_lock);
    /*
     * Cleanup the DTrace provider associated with this process.
     * This is done with p_lock held.
     */
    if (p->p_dtrace_probes) {
        ASSERT(dtrace_fasttrap_exec_ptr != NULL);
        dtrace_fasttrap_exec_ptr(p);
    }
    mutex_exit(&p->p_lock);

    /*
     * discard the lwpchan cache.
     */
    if (p->p_lcp != NULL)
        lwpchan_destroy_cache(1);

    /*
     * Delete the POSIX timers.
     */
    if (p->p_itimer != NULL)
        timer_exit();

    /*
     * Delete the ITIMER_REALPROF interval timer.
     * The other ITIMER_* interval timers are specified
     * to be inherited across exec().
     */
    delete_itimer_realprof();

    /*
     * Hand the gathered strings to the audit subsystem while the kernel
     * buffer is still available.  The two pointers delimit the first
     * arglen bytes of the buffer; the counts passed are (na - ne) and ne.
     */
    if (AU_AUDITING())
        audit_exec(args->stk_base, args->stk_base + args->arglen,
            args->na - args->ne, args->ne, args->pfcred);

    /*
     * Ensure that we don't change resource associations while we
     * change address spaces.
     */
    mutex_enter(&p->p_lock);
    pool_barrier_enter();
    mutex_exit(&p->p_lock);

    /*
     * Destroy the old address space and create a new one.
     * From here on, any errors are fatal to the exec()ing process.
     * On error we return -1, which means the caller must SIGKILL
     * the process.
     */
    relvm();

    mutex_enter(&p->p_lock);
    pool_barrier_exit();
    mutex_exit(&p->p_lock);

    up->u_execsw = args->execswp;

    /*
     * Reset the heap and stack bookkeeping and install the data model,
     * stack top and stack/data protections of the new image.
     */
    p->p_brkbase = NULL;
    p->p_brksize = 0;
    p->p_brkpageszc = 0;
    p->p_stksize = 0;
    p->p_stkpageszc = 0;
    p->p_model = args->to_model;
    p->p_usrstack = usrstack;
    p->p_stkprot = args->stk_prot;
    p->p_datprot = args->dat_prot;

    /*
     * Reset resource controls such that all controls are again active as
     * well as appropriate to the potentially new address model for the
     * process.
     */
    e.rcep_p.proc = p;
    e.rcep_t = RCENTITY_PROCESS;
    rctl_set_reset(p->p_rctls, p, &e);

    /* Too early to call map_pgsz for the heap */
    if (use_stk_lpg) {
        p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
    }

    mutex_enter(&p->p_lock);
    p->p_flag |= SAUTOLPG;  /* kernel controls page sizes */
    mutex_exit(&p->p_lock);

    /*
     * Some platforms may choose to randomize real stack start by adding a
     * small slew (not more than a few hundred bytes) to the top of the
     * stack. This helps avoid cache thrashing when identical processes
     * simultaneously share caches that don't provide enough associativity
     * (e.g. sun4v systems). In this case stack slewing makes the same hot
     * stack variables in different processes to live in different cache
     * sets increasing effective associativity.
     *
     * The slew must preserve the stack alignment.  It is added to the
     * size handed to exec_set_sp() here and subtracted from the stack
     * top handed to stk_copyout() below.
     */
    sp_slew = exec_get_spslew();
    ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
    exec_set_sp(size + sp_slew);

    /*
     * Allocate the new address space and attach it to the process.
     * 32-bit images, and native images flagged addr32, have their user
     * limit set to USERLIMIT32.
     */
    as = as_alloc();
    p->p_as = as;
    as->a_proc = p;
    if (p->p_model == DATAMODEL_ILP32 || args->addr32)
        as->a_userlimit = (caddr_t)USERLIMIT32;
    (void) hat_setup(as->a_hat, HAT_ALLOC);
    hat_join_srd(as->a_hat, args->ex_vp);

    /*
     * Finally, write out the contents of the new stack.
     * The kernel buffer is released whether or not the copy succeeded.
     */
    error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
    kmem_free(args->stk_base, args->stk_size);
    return (error);
}