sem.c revision a913396d8daab34d2fa497f49ae18d9f3d3a059f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Inter-Process Communication Semaphore Facility.
*
* See os/ipc.c for a description of common IPC functionality.
*
* Resource controls
* -----------------
*
* Control: zone.max-sem-ids (rc_zone_semmni)
 * Description: Maximum number of semaphore ids allowed in a zone.
*
* When semget() is used to allocate a semaphore set, one id is
* allocated. If the id allocation doesn't succeed, semget() fails
 * and errno is set to ENOSPC. Upon a successful semctl(..., IPC_RMID)
* the id is deallocated.
*
* Control: project.max-sem-ids (rc_project_semmni)
 * Description: Maximum number of semaphore ids allowed in a project.
*
* When semget() is used to allocate a semaphore set, one id is
* allocated. If the id allocation doesn't succeed, semget() fails
 * and errno is set to ENOSPC. Upon a successful semctl(..., IPC_RMID)
* the id is deallocated.
*
* Control: process.max-sem-nsems (rc_process_semmsl)
* Description: Maximum number of semaphores allowed per semaphore set.
*
* When semget() is used to allocate a semaphore set, the size of the
* set is compared with this limit. If the number of semaphores
* exceeds the limit, semget() fails and errno is set to EINVAL.
*
* Control: process.max-sem-ops (rc_process_semopm)
* Description: Maximum number of semaphore operations allowed per
* semop call.
*
* When semget() successfully allocates a semaphore set, the minimum
* enforced value of this limit is used to initialize the
* "system-imposed maximum" number of operations a semop() call for
* this set can perform.
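 *
 * These limits are administered with the standard resource control
 * tools rather than the obsolete seminfo_* tunables declared below.
 * Purely as an illustration (the value 512 is invented for the
 * example), the per-process cap on semaphores per set could be raised
 * for the current shell with something like:
 *
 *	prctl -n process.max-sem-nsems -r -v 512 -i process $$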
*
* Undo structures
* ---------------
*
* Removing the undo structure tunables involved a serious redesign of
* how they were implemented. There is now one undo structure for
* every process/semaphore array combination (lazily allocated, of
 * course), and each is sized to match the semaphore set it corresponds
 * to. To avoid scalability and performance problems, the undo
* structures are stored in two places: a per-process AVL tree sorted
* by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
* per-semaphore linked list (sem_undos, protected by the semaphore's
* ID lock). The former is used by semop, where a lookup is performed
* once and cached if SEM_UNDO is specified for any of the operations,
* and at process exit where the undoable operations are rolled back.
* The latter is used when removing the semaphore, so the undo
* structures can be removed from the appropriate processes' trees.
*
* The undo structure itself contains pointers to the ksemid and proc
* to which it corresponds, a list node, an AVL node, and an array of
* adjust-on-exit (AOE) values. When an undo structure is allocated it
* is immediately added to both the process's tree and the semaphore's
* list. Lastly, the reference count on the semaphore is increased.
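 *
 * As a concrete illustration of the AOE bookkeeping (the numbers are
 * invented for this example): a semop() of +3 with SEM_UNDO on
 * semaphore 2 records -3 in that process's un_aoe[2], since
 * sem_undo_add() subtracts the applied delta. If the process exits
 * without reversing the operation, semexit() adds that -3 back into
 * semval, undoing the increment.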
*
 * A lock ordering violation between p_lock and the ID lock can occur
 * when a process exits at the same time a semaphore set is being
 * removed. Avoiding it mandates the delicate dance that exists
 * between semexit and sem_rmid.
*
* sem_rmid, holding the ID lock, iterates through all undo structures
* and for each takes the appropriate process's p_lock and checks to
* see if p_semacct is NULL. If it is, it skips that undo structure
* and continues to the next. Otherwise, it removes the undo structure
* from both the AVL tree and the semaphore's list, and releases the
* hold that the undo structure had on the semaphore.
*
 * The other important half of this is semexit, which immediately
 * takes p_lock, obtains the AVL pointer, clears p_semacct, and drops
* p_lock. From this point on it is semexit's responsibility to clean
* up all undo structures found in the tree -- a coexecuting sem_rmid
* will see the NULL p_semacct and skip that undo structure. It walks
* the AVL tree (using avl_destroy_nodes) and for each undo structure
* takes the appropriate semaphore's ID lock (always legal since the
* undo structure has a hold on the semaphore), updates all semaphores
* with non-zero AOE values, and removes the structure from the
* semaphore's list. It then drops the structure's reference on the
* semaphore, drops the ID lock, and frees the undo structure.
*/
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cred.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/ipc.h>
#include <sys/ipc_impl.h>
#include <sys/sem.h>
#include <sys/sem_impl.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/cpuvar.h>
#include <sys/debug.h>
#include <sys/var.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/zone.h>
#include <c2/audit.h>
extern rctl_hndl_t rc_zone_semmni;
extern rctl_hndl_t rc_project_semmni;
extern rctl_hndl_t rc_process_semmsl;
extern rctl_hndl_t rc_process_semopm;
static ipc_service_t *sem_svc;
static zone_key_t sem_zone_key;
/*
* The following tunables are obsolete. Though for compatibility we
* still read and interpret seminfo_semmsl, seminfo_semopm and
* seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 * mechanism for administering the IPC Semaphore facility is through
* the resource controls described at the top of this file.
*/
int seminfo_semaem = 16384; /* (obsolete) */
int seminfo_semmap = 10; /* (obsolete) */
int seminfo_semmni = 10; /* (obsolete) */
int seminfo_semmns = 60; /* (obsolete) */
int seminfo_semmnu = 30; /* (obsolete) */
int seminfo_semmsl = 25; /* (obsolete) */
int seminfo_semopm = 10; /* (obsolete) */
int seminfo_semume = 10; /* (obsolete) */
int seminfo_semusz = 96; /* (obsolete) */
int seminfo_semvmx = 32767; /* (obsolete) */
#define SEM_MAXUCOPS 4096 /* max # of unchecked ops per semop call */
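/*
 * Size of a struct sem_undo carrying one adjust-on-exit (AOE) slot for
 * each of the n semaphores in a set; the structure itself already
 * accounts for the first slot.
 */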
#define SEM_UNDOSZ(n) (sizeof (struct sem_undo) + (n - 1) * sizeof (int))
static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
uintptr_t a2, uintptr_t a3);
static void sem_dtor(kipc_perm_t *);
static void sem_rmid(kipc_perm_t *);
static void sem_remove_zone(zoneid_t, void *);
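/*
 * System call entry point: semsys() takes an opcode plus up to four
 * opcode-specific arguments (see the dispatch in semsys() below).
 */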
static struct sysent ipcsem_sysent = {
5,
SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
semsys
};
/*
* Module linkage information for the kernel.
*/
static struct modlsys modlsys = {
&mod_syscallops, "System V semaphore facility", &ipcsem_sysent
};
#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
&mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
};
#endif
static struct modlinkage modlinkage = {
MODREV_1,
&modlsys,
#ifdef _SYSCALL32_IMPL
&modlsys32,
#endif
NULL
};
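/*
 * Module installation: create the "semids" IPC service and register
 * the zone shutdown callback before installing the system call; both
 * are torn down again if mod_install() fails.
 */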
int
_init(void)
{
int result;
sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
offsetof(ipc_rqty_t, ipcq_semmni));
zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
if ((result = mod_install(&modlinkage)) == 0)
return (0);
(void) zone_key_delete(sem_zone_key);
ipcs_destroy(sem_svc);
return (result);
}
int
_fini(void)
{
return (EBUSY);
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
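/*
 * sem_dtor - Destructor callback registered with the IPC framework in
 * _init().  Invoked when a semaphore set is finally freed; releases
 * the semaphore array and tears down the (by then empty) undo list.
 */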
static void
sem_dtor(kipc_perm_t *perm)
{
ksemid_t *sp = (ksemid_t *)perm;
kmem_free(sp->sem_base,
P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
list_destroy(&sp->sem_undos);
}
/*
* sem_undo_add - Create or update adjust on exit entry.
*/
static int
sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
{
int newval = undo->un_aoe[num] - val;
if (newval > USHRT_MAX || newval < -USHRT_MAX)
return (ERANGE);
undo->un_aoe[num] = newval;
return (0);
}
/*
* sem_undo_clear - clears all undo entries for specified semaphores
*
* Used when semaphores are reset by SETVAL or SETALL.
*/
static void
sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
{
struct sem_undo *undo;
int i;
ASSERT(low <= high);
ASSERT(high < sp->sem_nsems);
for (undo = list_head(&sp->sem_undos); undo;
undo = list_next(&sp->sem_undos, undo))
for (i = low; i <= high; i++)
undo->un_aoe[i] = 0;
}
/*
* sem_rollback - roll back work done so far if unable to complete operation
*/
static void
sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
{
struct sem *semp; /* semaphore ptr */
for (op += n - 1; n--; op--) {
if (op->sem_op == 0)
continue;
semp = &sp->sem_base[op->sem_num];
semp->semval -= op->sem_op;
if (op->sem_flg & SEM_UNDO) {
ASSERT(undo != NULL);
(void) sem_undo_add(-op->sem_op, op->sem_num, undo);
}
}
}
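/*
 * sem_rmid - IPC_RMID callback, invoked by the IPC framework with the
 * ID lock held.  Walks the set's undo list: each undo structure whose
 * owning process has not yet entered semexit (p_semacct != NULL) is
 * pulled out of that process's AVL tree, freed, and its hold on the
 * set released; the rest are left for semexit to clean up.  Finally,
 * all threads blocked on the individual semaphores are woken.  See the
 * "Undo structures" discussion at the top of this file.
 */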
static void
sem_rmid(kipc_perm_t *perm)
{
ksemid_t *sp = (ksemid_t *)perm;
struct sem *semp;
struct sem_undo *undo;
size_t size = SEM_UNDOSZ(sp->sem_nsems);
int i;
/*LINTED*/
while (undo = list_head(&sp->sem_undos)) {
list_remove(&sp->sem_undos, undo);
mutex_enter(&undo->un_proc->p_lock);
if (undo->un_proc->p_semacct == NULL) {
mutex_exit(&undo->un_proc->p_lock);
continue;
}
avl_remove(undo->un_proc->p_semacct, undo);
mutex_exit(&undo->un_proc->p_lock);
kmem_free(undo, size);
ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
}
for (i = 0; i < sp->sem_nsems; i++) {
semp = &sp->sem_base[i];
semp->semval = semp->sempid = 0;
if (semp->semncnt) {
cv_broadcast(&semp->semncnt_cv);
semp->semncnt = 0;
}
if (semp->semzcnt) {
cv_broadcast(&semp->semzcnt_cv);
semp->semzcnt = 0;
}
}
}
/*
* semctl - Semctl system call.
*/
static int
semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
{
ksemid_t *sp; /* ptr to semaphore header */
struct sem *p; /* ptr to semaphore */
unsigned int i; /* loop control */
ushort_t *vals, *vp;
size_t vsize = 0;
int error = 0;
int retval = 0;
struct cred *cr;
kmutex_t *lock;
model_t mdl = get_udatamodel();
STRUCT_DECL(semid_ds, sid);
struct semid_ds64 ds64;
STRUCT_INIT(sid, mdl);
cr = CRED();
/*
* Perform pre- or non-lookup actions (e.g. copyins, RMID).
*/
switch (cmd) {
case IPC_SET:
if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
return (set_errno(EFAULT));
break;
case IPC_SET64:
if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
return (set_errno(EFAULT));
break;
case SETALL:
if ((lock = ipc_lookup(sem_svc, semid,
(kipc_perm_t **)&sp)) == NULL)
return (set_errno(EINVAL));
vsize = sp->sem_nsems * sizeof (*vals);
mutex_exit(lock);
/* allocate space to hold all semaphore values */
vals = kmem_alloc(vsize, KM_SLEEP);
if (copyin((void *)arg, vals, vsize)) {
kmem_free(vals, vsize);
return (set_errno(EFAULT));
}
break;
case IPC_RMID:
if (error = ipc_rmid(sem_svc, semid, cr))
return (set_errno(error));
return (0);
}
if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
if (vsize != 0)
kmem_free(vals, vsize);
return (set_errno(EINVAL));
}
switch (cmd) {
/* Set ownership and permissions. */
case IPC_SET:
if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
&STRUCT_BUF(sid)->sem_perm, mdl)) {
mutex_exit(lock);
return (set_errno(error));
}
sp->sem_ctime = gethrestime_sec();
mutex_exit(lock);
return (0);
/* Get semaphore data structure. */
case IPC_STAT:
if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
mutex_exit(lock);
return (set_errno(error));
}
ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
STRUCT_FSETP(sid, sem_base, NULL); /* kernel addr */
STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
STRUCT_FSET(sid, sem_otime, sp->sem_otime);
STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
STRUCT_FSET(sid, sem_binary, sp->sem_binary);
mutex_exit(lock);
if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
return (set_errno(EFAULT));
return (0);
case IPC_SET64:
if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
&ds64.semx_perm)) {
mutex_exit(lock);
return (set_errno(error));
}
sp->sem_ctime = gethrestime_sec();
mutex_exit(lock);
return (0);
case IPC_STAT64:
ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
ds64.semx_nsems = sp->sem_nsems;
ds64.semx_otime = sp->sem_otime;
ds64.semx_ctime = sp->sem_ctime;
mutex_exit(lock);
if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
return (set_errno(EFAULT));
return (0);
/* Get # of processes sleeping for greater semval. */
case GETNCNT:
if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
mutex_exit(lock);
return (set_errno(error));
}
if (semnum >= sp->sem_nsems) {
mutex_exit(lock);
return (set_errno(EINVAL));
}
retval = sp->sem_base[semnum].semncnt;
mutex_exit(lock);
return (retval);
/* Get pid of last process to operate on semaphore. */
case GETPID:
if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
mutex_exit(lock);
return (set_errno(error));
}
if (semnum >= sp->sem_nsems) {
mutex_exit(lock);
return (set_errno(EINVAL));
}
retval = sp->sem_base[semnum].sempid;
mutex_exit(lock);
return (retval);
/* Get semval of one semaphore. */
case GETVAL:
if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
mutex_exit(lock);
return (set_errno(error));
}
if (semnum >= sp->sem_nsems) {
mutex_exit(lock);
return (set_errno(EINVAL));
}
retval = sp->sem_base[semnum].semval;
mutex_exit(lock);
return (retval);
/* Get all semvals in set. */
case GETALL:
if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
mutex_exit(lock);
return (set_errno(error));
}
/* allocate space to hold all semaphore values */
vsize = sp->sem_nsems * sizeof (*vals);
vals = vp = kmem_alloc(vsize, KM_SLEEP);
for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
bcopy(&p->semval, vp, sizeof (p->semval));
mutex_exit(lock);
if (copyout((void *)vals, (void *)arg, vsize)) {
kmem_free(vals, vsize);
return (set_errno(EFAULT));
}
kmem_free(vals, vsize);
return (0);
/* Get # of processes sleeping for semval to become zero. */
case GETZCNT:
if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
mutex_exit(lock);
return (set_errno(error));
}
if (semnum >= sp->sem_nsems) {
mutex_exit(lock);
return (set_errno(EINVAL));
}
retval = sp->sem_base[semnum].semzcnt;
mutex_exit(lock);
return (retval);
/* Set semval of one semaphore. */
case SETVAL:
if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
mutex_exit(lock);
return (set_errno(error));
}
if (semnum >= sp->sem_nsems) {
mutex_exit(lock);
return (set_errno(EINVAL));
}
if ((uint_t)arg > USHRT_MAX) {
mutex_exit(lock);
return (set_errno(ERANGE));
}
p = &sp->sem_base[semnum];
if ((p->semval = (ushort_t)arg) != 0) {
if (p->semncnt) {
cv_broadcast(&p->semncnt_cv);
}
} else if (p->semzcnt) {
cv_broadcast(&p->semzcnt_cv);
}
p->sempid = curproc->p_pid;
sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
mutex_exit(lock);
return (0);
/* Set semvals of all semaphores in set. */
case SETALL:
/* Check if semaphore set has been deleted and reallocated. */
if (sp->sem_nsems * sizeof (*vals) != vsize) {
error = set_errno(EINVAL);
goto seterr;
}
if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
error = set_errno(error);
goto seterr;
}
sem_undo_clear(sp, 0, sp->sem_nsems - 1);
for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
(p++)->sempid = curproc->p_pid) {
if ((p->semval = vals[i++]) != 0) {
if (p->semncnt) {
cv_broadcast(&p->semncnt_cv);
}
} else if (p->semzcnt) {
cv_broadcast(&p->semzcnt_cv);
}
}
seterr:
mutex_exit(lock);
kmem_free(vals, vsize);
return (error);
default:
mutex_exit(lock);
return (set_errno(EINVAL));
}
/* NOTREACHED */
}
/*
* semexit - Called by exit() to clean up on process exit.
*/
void
semexit(proc_t *pp)
{
avl_tree_t *tree;
struct sem_undo *undo;
void *cookie = NULL;
mutex_enter(&pp->p_lock);
tree = pp->p_semacct;
pp->p_semacct = NULL;
mutex_exit(&pp->p_lock);
while (undo = avl_destroy_nodes(tree, &cookie)) {
ksemid_t *sp = undo->un_sp;
size_t size = SEM_UNDOSZ(sp->sem_nsems);
int i;
(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
if (!IPC_FREE(&sp->sem_perm)) {
for (i = 0; i < sp->sem_nsems; i++) {
int adj = undo->un_aoe[i];
if (adj) {
struct sem *semp = &sp->sem_base[i];
int v = (int)semp->semval + adj;
if (v < 0 || v > USHRT_MAX)
continue;
semp->semval = (ushort_t)v;
if (v == 0 && semp->semzcnt)
cv_broadcast(&semp->semzcnt_cv);
if (adj > 0 && semp->semncnt)
cv_broadcast(&semp->semncnt_cv);
}
}
list_remove(&sp->sem_undos, undo);
}
ipc_rele(sem_svc, (kipc_perm_t *)sp);
kmem_free(undo, size);
}
avl_destroy(tree);
kmem_free(tree, sizeof (avl_tree_t));
}
/*
* Remove all semaphores associated with a given zone. Called by
* zone_shutdown when the zone is halted.
*/
/*ARGSUSED1*/
static void
sem_remove_zone(zoneid_t zoneid, void *arg)
{
ipc_remove_zone(sem_svc, zoneid);
}
/*
* semget - Semget system call.
*/
static int
semget(key_t key, int nsems, int semflg)
{
ksemid_t *sp;
kmutex_t *lock;
int id, error;
proc_t *pp = curproc;
top:
if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
return (set_errno(error));
if (!IPC_FREE(&sp->sem_perm)) {
/*
* A semaphore with the requested key exists.
*/
if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
mutex_exit(lock);
return (set_errno(EINVAL));
}
} else {
/*
* This is a new semaphore set. Finish initialization.
*/
if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
nsems, RCA_SAFE) & RCT_DENY)) {
mutex_exit(lock);
mutex_exit(&pp->p_lock);
ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
return (set_errno(EINVAL));
}
mutex_exit(lock);
mutex_exit(&pp->p_lock);
/*
* We round the allocation up to coherency granularity
* so that multiple semaphore allocations won't result
* in the false sharing of their sem structures.
*/
sp->sem_base =
kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
KM_SLEEP);
sp->sem_binary = (nsems == 1);
sp->sem_nsems = (ushort_t)nsems;
sp->sem_ctime = gethrestime_sec();
sp->sem_otime = 0;
list_create(&sp->sem_undos, sizeof (struct sem_undo),
offsetof(struct sem_undo, un_list));
if (error = ipc_commit_begin(sem_svc, key, semflg,
(kipc_perm_t *)sp)) {
if (error == EAGAIN)
goto top;
return (set_errno(error));
}
sp->sem_maxops =
rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
RCA_SAFE) & RCT_DENY) {
ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
return (set_errno(EINVAL));
}
lock = ipc_commit_end(sem_svc, &sp->sem_perm);
}
#ifdef C2_AUDIT
if (audit_active)
audit_ipcget(AT_IPC_SEM, (void *)sp);
#endif
id = sp->sem_perm.ipc_id;
mutex_exit(lock);
return (id);
}
/*
* semids system call.
*/
static int
semids(int *buf, uint_t nids, uint_t *pnids)
{
int error;
if (error = ipc_ids(sem_svc, buf, nids, pnids))
return (set_errno(error));
return (0);
}
/*
* Helper function for semop - copies in the provided timespec and
* computes the absolute future time after which we must return.
*/
static int
compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
timespec_t *timeout)
{
model_t datamodel = get_udatamodel();
if (datamodel == DATAMODEL_NATIVE) {
if (copyin(timeout, ts, sizeof (timespec_t)))
return (EFAULT);
} else {
timespec32_t ts32;
if (copyin(timeout, &ts32, sizeof (timespec32_t)))
return (EFAULT);
TIMESPEC32_TO_TIMESPEC(ts, &ts32)
}
if (itimerspecfix(ts))
return (EINVAL);
/*
* Convert the timespec value into absolute time.
*/
timespecadd(ts, now);
*tsp = ts;
return (0);
}
/*
* Undo structure comparator. We sort based on ksemid_t pointer.
*/
static int
sem_undo_compar(const void *x, const void *y)
{
struct sem_undo *undo1 = (struct sem_undo *)x;
struct sem_undo *undo2 = (struct sem_undo *)y;
if (undo1->un_sp < undo2->un_sp)
return (-1);
if (undo1->un_sp > undo2->un_sp)
return (1);
return (0);
}
/*
* Helper function for semop - creates an undo structure and adds it to
* the process's avl tree and the semaphore's list.
*/
static int
sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
struct sem_undo *template, struct sem_undo **un)
{
size_t size;
struct sem_undo *undo;
avl_tree_t *tree = NULL;
avl_index_t where;
mutex_exit(*lock);
size = SEM_UNDOSZ(sp->sem_nsems);
undo = kmem_zalloc(size, KM_SLEEP);
undo->un_proc = pp;
undo->un_sp = sp;
if (pp->p_semacct == NULL)
tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
*lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
if (IPC_FREE(&sp->sem_perm)) {
kmem_free(undo, size);
if (tree)
kmem_free(tree, sizeof (avl_tree_t));
return (EIDRM);
}
mutex_enter(&pp->p_lock);
if (tree) {
if (pp->p_semacct == NULL) {
avl_create(tree, sem_undo_compar,
sizeof (struct sem_undo),
offsetof(struct sem_undo, un_avl));
pp->p_semacct = tree;
} else {
kmem_free(tree, sizeof (avl_tree_t));
}
}
if (*un = avl_find(pp->p_semacct, template, &where)) {
mutex_exit(&pp->p_lock);
kmem_free(undo, size);
} else {
*un = undo;
avl_insert(pp->p_semacct, undo, where);
mutex_exit(&pp->p_lock);
list_insert_head(&sp->sem_undos, undo);
ipc_hold(sem_svc, (kipc_perm_t *)sp);
}
return (0);
}
/*
* semop - Semop system call.
*/
static int
semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
{
ksemid_t *sp = NULL;
kmutex_t *lock;
struct sembuf *op; /* ptr to operation */
int i; /* loop control */
struct sem *semp; /* ptr to semaphore */
int error = 0;
struct sembuf *uops; /* ptr to copy of user ops */
struct sembuf x_sem; /* avoid kmem_alloc's */
timespec_t now, ts, *tsp = NULL;
int cvres, needundo, mode;
struct sem_undo *undo;
proc_t *pp = curproc;
int held = 0;
CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
/*
* To avoid the cost of copying in 'timeout' in the common
* case, we could only grab the time here and defer the copyin
* and associated computations until we are about to block.
*
* The down side to this is that we would then have to spin
* some goto top nonsense to avoid the copyin behind the semid
* lock. As a common use of timed semaphores is as an explicit
* blocking mechanism, this could incur a greater penalty.
*
* If we eventually decide that this would be a wise route to
* take, the deferrable functionality is completely contained
* in 'compute_timeout', and the interface is defined such that
* we can legally not validate 'timeout' if it is unused.
*/
if (timeout != NULL) {
gethrestime(&now);
if (error = compute_timeout(&tsp, &ts, &now, timeout))
return (set_errno(error));
}
/*
* Allocate space to hold the vector of semaphore ops. If
* there is only 1 operation we use a preallocated buffer on
* the stack for speed.
*
* Since we don't want to allow the user to allocate an
* arbitrary amount of kernel memory, we need to check against
* the number of operations allowed by the semaphore. We only
* bother doing this if the number of operations is larger than
* SEM_MAXUCOPS.
*/
if (nsops == 1)
uops = &x_sem;
else if (nsops == 0)
return (0);
else if (nsops <= SEM_MAXUCOPS)
uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
if (nsops > SEM_MAXUCOPS) {
if ((lock = ipc_lookup(sem_svc, semid,
(kipc_perm_t **)&sp)) == NULL)
return (set_errno(EFAULT));
if (nsops > sp->sem_maxops) {
mutex_exit(lock);
return (set_errno(E2BIG));
}
held = 1;
ipc_hold(sem_svc, (kipc_perm_t *)sp);
mutex_exit(lock);
uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
if (copyin(sops, uops, nsops * sizeof (*op))) {
error = EFAULT;
(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
goto semoperr;
}
lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
if (IPC_FREE(&sp->sem_perm)) {
error = EIDRM;
goto semoperr;
}
} else {
/*
* This could be interleaved with the above code, but
* keeping them separate improves readability.
*/
if (copyin(sops, uops, nsops * sizeof (*op))) {
error = EFAULT;
goto semoperr_unlocked;
}
if ((lock = ipc_lookup(sem_svc, semid,
(kipc_perm_t **)&sp)) == NULL) {
error = EINVAL;
goto semoperr_unlocked;
}
if (nsops > sp->sem_maxops) {
error = E2BIG;
goto semoperr;
}
}
/*
* Scan all operations. Verify that sem #s are in range and
* this process is allowed the requested operations. If any
* operations are marked SEM_UNDO, find (or allocate) the undo
* structure for this process and semaphore.
*/
needundo = 0;
mode = 0;
for (i = 0, op = uops; i++ < nsops; op++) {
mode |= op->sem_op ? SEM_A : SEM_R;
if (op->sem_num >= sp->sem_nsems) {
error = EFBIG;
goto semoperr;
}
if ((op->sem_flg & SEM_UNDO) && op->sem_op)
needundo = 1;
}
if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
goto semoperr;
if (needundo) {
struct sem_undo template;
template.un_sp = sp;
mutex_enter(&pp->p_lock);
if (pp->p_semacct)
undo = avl_find(pp->p_semacct, &template, NULL);
else
undo = NULL;
mutex_exit(&pp->p_lock);
if (undo == NULL) {
if (error = sem_undo_alloc(pp, sp, &lock, &template,
&undo))
goto semoperr;
/* sem_undo_alloc unlocks the semaphore */
if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
goto semoperr;
}
}
check:
/*
* Loop waiting for the operations to be satisfied atomically.
* Actually, do the operations and undo them if a wait is needed
* or an error is detected.
*/
for (i = 0; i < nsops; i++) {
op = &uops[i];
semp = &sp->sem_base[op->sem_num];
/*
* Raise the semaphore (i.e. sema_v)
*/
if (op->sem_op > 0) {
if (op->sem_op + (int)semp->semval > USHRT_MAX ||
((op->sem_flg & SEM_UNDO) &&
(error = sem_undo_add(op->sem_op, op->sem_num,
undo)))) {
if (i)
sem_rollback(sp, uops, i, undo);
if (error == 0)
error = ERANGE;
goto semoperr;
}
semp->semval += op->sem_op;
/*
* If we are only incrementing the semaphore value
* by one on a binary semaphore, we can cv_signal.
*/
if (semp->semncnt) {
if (op->sem_op == 1 && sp->sem_binary)
cv_signal(&semp->semncnt_cv);
else
cv_broadcast(&semp->semncnt_cv);
}
if (semp->semzcnt && !semp->semval)
cv_broadcast(&semp->semzcnt_cv);
continue;
}
/*
* Lower the semaphore (i.e. sema_p)
*/
if (op->sem_op < 0) {
if (semp->semval >= (unsigned)(-op->sem_op)) {
if ((op->sem_flg & SEM_UNDO) &&
(error = sem_undo_add(op->sem_op,
op->sem_num, undo))) {
if (i)
sem_rollback(sp, uops, i, undo);
goto semoperr;
}
semp->semval += op->sem_op;
if (semp->semzcnt && !semp->semval)
cv_broadcast(&semp->semzcnt_cv);
continue;
}
if (i)
sem_rollback(sp, uops, i, undo);
if (op->sem_flg & IPC_NOWAIT) {
error = EAGAIN;
goto semoperr;
}
/*
* Mark the semaphore set as not a binary type
* if we are decrementing the value by more than 1.
*
* V operations will resort to cv_broadcast
* for this set because there are too many weird
* cases that have to be caught.
*/
if (op->sem_op < -1)
sp->sem_binary = 0;
if (!held) {
held = 1;
ipc_hold(sem_svc, (kipc_perm_t *)sp);
}
semp->semncnt++;
cvres = cv_waituntil_sig(&semp->semncnt_cv, lock, tsp);
lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
if (!IPC_FREE(&sp->sem_perm)) {
ASSERT(semp->semncnt != 0);
semp->semncnt--;
if (cvres > 0) /* normal wakeup */
goto check;
}
/* EINTR or EAGAIN overrides EIDRM */
if (cvres == 0)
error = EINTR;
else if (cvres < 0)
error = EAGAIN;
else
error = EIDRM;
goto semoperr;
}
/*
* Wait for zero value
*/
if (semp->semval) {
if (i)
sem_rollback(sp, uops, i, undo);
if (op->sem_flg & IPC_NOWAIT) {
error = EAGAIN;
goto semoperr;
}
if (!held) {
held = 1;
ipc_hold(sem_svc, (kipc_perm_t *)sp);
}
semp->semzcnt++;
cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock, tsp);
lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
/*
* Don't touch semp if the semaphores have been removed.
*/
if (!IPC_FREE(&sp->sem_perm)) {
ASSERT(semp->semzcnt != 0);
semp->semzcnt--;
if (cvres > 0) /* normal wakeup */
goto check;
}
/* EINTR or EAGAIN overrides EIDRM */
if (cvres == 0)
error = EINTR;
else if (cvres < 0)
error = EAGAIN;
else
error = EIDRM;
goto semoperr;
}
}
/* All operations succeeded. Update sempid for accessed semaphores. */
for (i = 0, op = uops; i++ < nsops;
sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
;
sp->sem_otime = gethrestime_sec();
if (held)
ipc_rele(sem_svc, (kipc_perm_t *)sp);
else
mutex_exit(lock);
/* Before leaving, deallocate the buffer that held the user semops */
if (nsops != 1)
kmem_free(uops, sizeof (*uops) * nsops);
return (0);
/*
* Error return labels
*/
semoperr:
if (held)
ipc_rele(sem_svc, (kipc_perm_t *)sp);
else
mutex_exit(lock);
semoperr_unlocked:
/* Before leaving, deallocate the buffer that held the user semops */
if (nsops != 1)
kmem_free(uops, sizeof (*uops) * nsops);
return (set_errno(error));
}
/*
* semsys - System entry point for semctl, semget, and semop system calls.
*/
static int
semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
{
int error;
switch (opcode) {
case SEMCTL:
error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
break;
case SEMGET:
error = semget((key_t)a1, (int)a2, (int)a3);
break;
case SEMOP:
error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
break;
case SEMIDS:
error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
break;
case SEMTIMEDOP:
error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
(timespec_t *)a4);
break;
default:
error = set_errno(EINVAL);
break;
}
return (error);
}