common/os/ipc.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*  Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*  All Rights Reserved                 */


/*
 * Common Inter-Process Communication routines.
 *
 * Overview
 * --------
 *
 * The System V inter-process communication (IPC) facilities provide
 * three services, message queues, semaphore arrays, and shared memory
 * segments, which are managed using filesystem-like namespaces.
 * Unlike a filesystem, these namespaces aren't mounted and accessible
 * via a path -- a special API is used to interact with the different
 * facilities (nothing precludes a VFS-based interface, but the
 * standards require the special APIs).  Furthermore, these special
 * APIs don't use file descriptors, nor do they have an equivalent.
 * This means that every operation which acts on an object needs to
 * perform the equivalent of a lookup, which in turn means that every
 * operation can fail if the specified object doesn't exist in the
 * facility's namespace.
 *
 * Objects
 * -------
 *
 * Each object in a namespace has a unique ID, which is assigned by the
 * system and is used to identify the object when performing operations
 * on it.  An object can also have a key, which is selected by the user
 * at allocation time and is used as a primitive rendezvous mechanism.
 * An object without a key is said to have a "private" key.
 *
 * To perform an operation on an object given its key, one must first
 * perform a lookup and obtain its ID.  The ID is then used to identify
 * the object when performing the operation.  If the object has a
 * private key, the ID must be known or obtained by other means.
 *
 * Each object in the namespace has a creator uid and gid, as well as
 * an owner uid and gid.  Both are initialized with the ruid and rgid
 * of the process which created the object.  The creator or current
 * owner has the ability to change the owner of the object.
 *
 * Each object in the namespace has a set of file-like permissions,
 * which, in conjunction with the creator and owner uid and gid,
 * control read and write access to the object (execute is ignored).
 *
 * Each object also has a creator project and zone, which are used to
 * account for its resource usage.
 *
 * Operations
 * ----------
 *
 * There are five operations which all three facilities have in
 * common: GET, SET, STAT, RMID, and IDS.
 *
 * GET, like open, is used to allocate a new object or obtain an
 * existing one (using its key).  It takes a key, a set of flags and
 * mode bits, and optionally facility-specific arguments.  If the key
 * is IPC_PRIVATE, a new object with the requested mode bits and
 * facility-specific attributes is created.  If the key isn't
 * IPC_PRIVATE, the GET will attempt to look up the specified key and
 * either return that or create a new key depending on the state of the
 * IPC_CREAT and IPC_EXCL flags, much like open.  If GET needs to
 * allocate an object, it can fail if there is insufficient space in
 * the namespace (the maximum number of ids for the facility has been
 * exceeded) or if the facility-specific initialization fails.  If GET
 * finds an object it can return, it can still fail if that object's
 * permissions or facility-specific attributes are less than those
 * requested.
 *
 * SET is used to adjust facility-specific parameters of an object, in
 * addition to the owner uid and gid, and mode bits.  It can fail if
 * the caller isn't the creator or owner.
 *
 * STAT is used to obtain information about an object, including the
 * general object attributes described above as well as facility-specific
 * information.  It can fail if the caller doesn't have read
 * permission.
 *
 * RMID removes an object from the namespace.  Subsequent operations
 * using the object's ID or key will fail (until another object is
 * created with the same key or ID).  Since an RMID may be performed
 * asynchronously with other operations, it is possible that other
 * threads and/or processes will have references to the object.  While
 * a facility may have actions which need to be performed at RMID time,
 * only when all references are dropped can the object be destroyed.
 * RMID will fail if the caller isn't the creator or owner.
 *
 * IDS obtains a list of all IDs in a facility's namespace.  There are
 * no facility-specific behaviors of IDS.
 *
 * Design
 * ------
 *
 * Because some IPC facilities provide services whose operations must
 * scale, a mechanism which allows fast, concurrent access to
 * individual objects is needed.  Of primary importance is object
 * lookup based on ID (SET, STAT, others).  Allocation (GET),
 * deallocation (RMID), ID enumeration (IDS), and key lookups (GET) are
 * lesser concerns, but should be implemented in such a way that ID
 * lookup isn't affected (at least not in the common case).
 *
 * Starting from the bottom up, each object is represented by a
 * structure, the first member of which must be a kipc_perm_t.  The
 * kipc_perm_t contains the information described above in "Objects", a
 * reference count (since the object may continue to exist after it has
 * been removed from the namespace), as well as some additional
 * metadata used to manage data structure membership.  These objects
 * are dynamically allocated.
 *
 * Above the objects is a power-of-two sized table of ID slots.  Each
 * slot contains a pointer to an object, a sequence number, and a
 * lock.  An object's ID is a function of its slot's index in the table
 * and its slot's sequence number.  Every time a slot is released (via
 * RMID) its sequence number is increased.  Strictly speaking, the
 * sequence number is unnecessary.  However, checking the sequence
 * number after a lookup provides a certain degree of robustness
 * against the use of stale IDs (useful since nothing else does).  When
 * the table fills up, it is resized (see Locking, below).
 *
 * Of an ID's 31 bits (an ID is, as defined by the standards, a signed
 * int) the top IPC_SEQ_BITS are used for the sequence number with the
 * remainder holding the index into the table.  The size of the table
 * is therefore bounded at 2 ^ (31 - IPC_SEQ_BITS) slots.
 *
 * Managing this table is the ipc_service structure.  It contains a
 * pointer to the dynamically allocated ID table, a namespace-global
 * lock, an id_space for managing the free space in the table, and
 * sundry other metadata necessary for the maintenance of the
 * namespace.  An AVL tree of all keyed objects in the table (sorted by
 * key) is used for key lookups.  An unordered doubly linked list of
 * all objects in the namespace (keyed or not) is maintained to
 * facilitate ID enumeration.
 *
 * To help visualize these relationships, here's a picture of a
 * namespace with a table of size 8 containing three objects
 * (IPC_SEQ_BITS = 28):
 *
 *
 * +-ipc_service_t--+
 * | table          *---\
 * | keys           *---+----------------------\
 * | all ids        *--\|                      |
 * |                |  ||                      |
 * +----------------+  ||                      |
 *                     ||                      |
 * /-------------------/|                      |
 * |    /---------------/                      |
 * |    |                                      |
 * |    v                                      |
 * |  +-0------+-1------+-2------+-3------+-4--+---+-5------+-6------+-7------+
 * |  | Seq=3  |        |        | Seq=1  |    :   |        |        | Seq=6  |
 * |  |        |        |        |        |    :   |        |        |        |
 * |  +-*------+--------+--------+-*------+----+---+--------+--------+-*------+
 * |    |                          |           |                       |
 * |    |                      /---/           |      /----------------/
 * |    |                      |               |      |
 * |    v                      v               |      v
 * |  +-kipc_perm_t-+        +-kipc_perm_t-+   |    +-kipc_perm_t-+
 * |  | id=0x30     |        | id=0x13     |   |    | id=0x67     |
 * |  | key=0xfeed  |        | key=0xbeef  |   |    | key=0xcafe  |
 * \->| [list]      |<------>| [list]      |<------>| [list]      |
 * /->| [avl left]  x   /--->| [avl left]  x   \--->| [avl left]  *---\
 * |  | [avl right] x   |    | [avl right] x        | [avl right] *---+-\
 * |  |             |   |    |             |        |             |   | |
 * |  +-------------+   |    +-------------+        +-------------+   | |
 * |                    \---------------------------------------------/ |
 * \--------------------------------------------------------------------/
 *
 * Locking
 * -------
 *
 * There are three locks (or sets of locks) which are used to ensure
 * correctness: the slot locks, the namespace lock, and p_lock (needed
 * when checking resource controls).  Their ordering is
 *
 *   namespace lock -> slot lock 0 -> ... -> slot lock t -> p_lock
 *
 * Generally speaking, the namespace lock is used to protect allocation
 * and removal from the namespace, ID enumeration, and resizing the ID
 * table.  Specifically:
 *
 * - write access to all fields of the ipc_service structure
 * - read access to all variable fields of ipc_service except
 *   ipcs_tabsz (table size) and ipcs_table (the table pointer)
 * - read/write access to ipc_avl, ipc_list in visible objects'
 *   kipc_perm structures (i.e. objects which have been removed from
 *   the namespace don't have this restriction)
 * - write access to ipct_seq and ipct_data in the table entries
 *
 * A slot lock by itself is meaningless (except when resizing).  Of
 * greater interest conceptually is the notion of an ID lock -- a
 * "virtual lock" which refers to whichever slot lock an object's ID
 * currently hashes to.
 *
 * An ID lock protects all objects with that ID.  Normally there will
 * only be one such object: the one pointed to by the locked slot.
 * However, if an object is removed from the namespace but retains
 * references (e.g. an attached shared memory segment which has been
 * RMIDed), it continues to use the lock associated with its original
 * ID.  While this can result in increased contention, operations which
 * require taking the ID lock of removed objects are infrequent.
 *
 * Specifically, an ID lock protects the contents of an object's
 * structure, including the contents of the embedded kipc_perm
 * structure (but excluding those fields protected by the namespace
 * lock).  It also protects the ipct_seq and ipct_data fields in its
 * slot (it is really a slot lock, after all).
 *
 * Recall that the table is resizable.  To avoid requiring every ID
 * lookup to take a global lock, a scheme much like that employed for
 * file descriptors (see the comment above UF_ENTER in user.h) is
 * used.  Note that the sequence number and data pointer are protected
 * by both the namespace lock and their slot lock.  When the table is
 * resized, the following operations take place:
 *
 *   1) A new table is allocated.
 *   2) The global lock is taken.
 *   3) All old slots are locked, in order.
 *   4) The first half of the new slots are locked.
 *   5) All table entries are copied to the new table, and cleared from
 *      the old table.
 *   6) The ipc_service structure is updated to point to the new table.
 *   7) The ipc_service structure is updated with the new table size.
 *   8) All slot locks (old and new) are dropped.
 *
 * Because the slot locks are embedded in the table, ID lookups and
 * other operations which require taking a slot lock need to verify
 * that the lock taken wasn't part of a stale table.  This is
 * accomplished by checking the table size before and after
 * dereferencing the table pointer and taking the lock: if the size
 * changes, the lock must be dropped and reacquired.  It is this
 * additional work which distinguishes an ID lock from a slot lock.
 *
 * Because we can't guarantee that threads aren't accessing the old
 * tables' locks, they are never deallocated.  To prevent spurious
 * reports of memory leaks, a pointer to the discarded table is stored
 * in the new one in step 5.  (Theoretically ipcs_destroy will delete
 * the discarded tables, but it is only ever called from a failed _init
 * invocation; i.e. when there aren't any.)
 *
 * Interfaces
 * ----------
 *
 * The following interfaces are provided by the ipc module for use by
 * the individual IPC facilities:
 *
 * ipcperm_access
 *
 *   Given an object and a cred structure, determines if the requested
 *   access type is allowed.
 *
 * ipcperm_set, ipcperm_stat,
 * ipcperm_set64, ipcperm_stat64
 *
 *   Performs the common portion of a STAT or SET operation.  All
 *   (except stat and stat64) can fail, so they should be called before
 *   any facility-specific non-reversible changes are made to an
 *   object.  Similarly, the set operations have side effects, so they
 *   should only be called once the possibility of a facility-specific
 *   failure is eliminated.
 *
 * ipcs_create
 *
 *   Creates an IPC namespace for use by an IPC facility.
 *
 * ipcs_destroy
 *
 *   Destroys an IPC namespace.
 *
 * ipcs_lock, ipcs_unlock
 *
 *   Takes the namespace lock.  Ideally such access wouldn't be
 *   necessary, but there may be facility-specific data protected by
 *   this lock (e.g. project-wide resource consumption).
 *
 * ipc_lock
 *
 *   Takes the lock associated with an ID.  Can't fail.
 *
 * ipc_relock
 *
 *   Like ipc_lock, but takes a pointer to a held lock.  Drops the lock
 *   unless it is the one that would have been returned by ipc_lock.
 *   Used after calls to cv_wait.
 *
 * ipc_lookup
 *
 *   Performs an ID lookup, returns with the ID lock held.  Fails if
 *   the ID doesn't exist in the namespace.
 *
 * ipc_hold
 *
 *   Takes a reference on an object.
 *
 * ipc_rele
 *
 *   Releases a reference on an object, and drops the object's lock.
 *   Calls the object's destructor if last reference is being
 *   released.
 *
 * ipc_rele_locked
 *
 *   Releases a reference on an object.  Doesn't drop lock, and may
 *   only be called when there is more than one reference to the
 *   object.
 *
 * ipc_get, ipc_commit_begin, ipc_commit_end, ipc_cleanup
 *
 *   Components of a GET operation.  ipc_get performs a key lookup,
 *   allocating an object if the key isn't found (returning with the
 *   namespace lock and p_lock held), and returning the existing object
 *   if it is (with the object lock held).  ipc_get doesn't modify the
 *   namespace.
 *
 *   ipc_commit_begin begins the process of inserting an object
 *   allocated by ipc_get into the namespace, and can fail.  If
 *   successful, it returns with the namespace lock and p_lock held.
 *   ipc_commit_end completes the process of inserting an object into
 *   the namespace and can't fail.  The facility can call ipc_cleanup
 *   at any time following a successful ipc_get and before
 *   ipc_commit_end or a failed ipc_commit_begin to fail the
 *   allocation.  Pseudocode for the suggested GET implementation:
 *
 *   top:
 *
 *     ipc_get
 *
 *     if failure
 *       return
 *
 *     if found {
 *
 *       if object meets criteria
 *         unlock object and return success
 *       else
 *         unlock object and return failure
 *
 *     } else {
 *
 *       perform resource control tests
 *       drop namespace lock, p_lock
 *       if failure
 *         ipc_cleanup
 *
 *       perform facility-specific initialization
 *       if failure {
 *         facility-specific cleanup
 *         ipc_cleanup
 *       }
 *
 *       ( At this point the object should be destructible using the
 *         destructor given to ipcs_create )
 *
 *       ipc_commit_begin
 *       if retry
 *         goto top
 *       else if failure
 *         return
 *
 *       perform facility-specific resource control tests/allocations
 *       if failure
 *         ipc_cleanup
 *
 *       ipc_commit_end
 *       perform any infallible post-creation actions, unlock, and return
 *
 *     }
 *
 * ipc_rmid
 *
 *   Performs the common portion of an RMID operation -- looks up an ID,
 *   removes it, and calls a facility-specific function to do
 *   RMID-time cleanup on the private portions of the object.
 *
 * ipc_ids
 *
 *   Performs the common portion of an IDS operation.
 *
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/ipc.h>
#include <sys/ipc_impl.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/list.h>
#include <sys/atomic.h>
#include <sys/zone.h>
#include <sys/task.h>
#include <sys/modctl.h>

#include <c2/audit.h>

/*
 * Module linkage.  This file is built as a "misc" module: its modlmisc
 * entry names mod_miscops as the operations vector and carries the
 * human-readable description string.
 */
static struct modlmisc modlmisc = {
    &mod_miscops,
    "common ipc code",
};

/*
 * The linkage structure handed to mod_install/mod_remove/mod_info by
 * _init/_fini/_info below.  It references the single modlmisc entry
 * above and is terminated with NULL.
 */
static struct modlinkage modlinkage = {
    MODREV_1, (void *)&modlmisc, NULL
};


/*
 * Module entry point: hands this module's linkage structure to
 * mod_install() and reports whatever it returns.
 */
int
_init(void)
{
    int rv;

    rv = mod_install(&modlinkage);
    return (rv);
}

/*
 * Module entry point: hands this module's linkage structure to
 * mod_remove() and reports whatever it returns.
 */
int
_fini(void)
{
    int rv;

    rv = mod_remove(&modlinkage);
    return (rv);
}

/*
 * Module entry point: passes this module's linkage structure and the
 * caller's modinfo buffer to mod_info() and reports its result.
 */
int
_info(struct modinfo *modinfop)
{
    int rv;

    rv = mod_info(&modlinkage, modinfop);
    return (rv);
}


/*
 * Check message, semaphore, or shared memory access permissions.
 *
 * This routine verifies the requested access permission for the current
 * process.  The zone ids are compared, and the appropriate bits are
 * checked corresponding to owner, group (including the list of
 * supplementary groups), or everyone.  Zero is returned on success.
 * On failure, the security policy is asked to check to override the
 * permissions check; the policy will either return 0 for access granted
 * or EACCES.
 *
 * Access to objects in other zones requires that the caller be in the
 * global zone and have the appropriate IPC_DAC_* privilege, regardless
 * of whether the uid or gid match those of the object.  Note that
 * cross-zone accesses will normally never get here since they'll
 * fail in ipc_lookup or ipc_get.
 *
 * The arguments must be set up as follows:
 *  p - Pointer to permission structure to verify
 *  mode - Desired access permissions
 */
int
ipcperm_access(kipc_perm_t *p, int mode, cred_t *cr)
{
    uid_t caller = crgetuid(cr);
    zoneid_t curzone = getzoneid();
    int shift;

    /*
     * Cross-zone access: refused outright unless the caller is in the
     * global zone, in which case the decision is left entirely to the
     * security policy (the mode bits are not consulted).
     */
    if (p->ipc_zoneid != curzone) {
        if (curzone != GLOBAL_ZONEID)
            return (EACCES);
        return (secpolicy_ipc_access(cr, p, mode));
    }

    /*
     * Pick which permission triplet applies.  The object's mode is
     * shifted left so that the chosen triplet lines up with the
     * owner-position bits requested in 'mode': 0 for owner/creator,
     * 3 for group, 6 for everyone else.
     */
    if (caller == p->ipc_uid || caller == p->ipc_cuid)
        shift = 0;
    else if (groupmember(p->ipc_gid, cr) || groupmember(p->ipc_cgid, cr))
        shift = 3;
    else
        shift = 6;

    /* Strip every requested bit that the object's mode grants. */
    mode &= ~(p->ipc_mode << shift);
    if (mode == 0)
        return (0);

    /* Some requested bits were not granted; ask policy to override. */
    return (secpolicy_ipc_access(cr, p, mode));
}

/*
 * There are two versions of the ipcperm_set/stat functions:
 *   ipcperm_???        - for use with IPC_SET/STAT
 *   ipcperm_???_64     - for use with IPC_SET64/STAT64
 *
 * These functions encapsulate the common portions (copying, permission
 * checks, and auditing) of the set/stat operations.  All, except for
 * stat and stat_64 which are void, return 0 on success or a non-zero
 * errno value on error.
 */

/*
 * Common portion of IPC_SET: applies the uid, gid and permission bits
 * from the caller-supplied ipc_perm (interpreted according to 'model')
 * to the object's kipc_perm.  The ID lock must be held.
 *
 * Returns 0 on success, EPERM if the owner policy check fails, or
 * EINVAL if the new uid or gid is not valid in the caller's zone.
 * Nothing is modified on failure.
 */
int
ipcperm_set(ipc_service_t *service, struct cred *cr,
    kipc_perm_t *kperm, struct ipc_perm *perm, model_t model)
{
    STRUCT_HANDLE(ipc_perm, src);
    zone_t *czone;
    uid_t new_uid;
    gid_t new_gid;
    mode_t new_mode;

    ASSERT(IPC_LOCKED(service, kperm));

    if (secpolicy_ipc_owner(cr, kperm) != 0)
        return (EPERM);

    /* Pull the requested values out of the model-dependent structure. */
    STRUCT_SET_HANDLE(src, model, perm);
    new_uid = STRUCT_FGET(src, uid);
    new_gid = STRUCT_FGET(src, gid);
    new_mode = STRUCT_FGET(src, mode);

    czone = crgetzone(cr);
    if (!VALID_UID(new_uid, czone))
        return (EINVAL);
    if (!VALID_GID(new_gid, czone))
        return (EINVAL);

    /* Only the low nine permission bits are replaced. */
    kperm->ipc_uid = new_uid;
    kperm->ipc_gid = new_gid;
    kperm->ipc_mode = (kperm->ipc_mode & ~0777) | (new_mode & 0777);

    if (AU_AUDITING())
        audit_ipcget(service->ipcs_atype, kperm);

    return (0);
}

/*
 * Common portion of IPC_STAT: fills in the caller's ipc_perm
 * (laid out according to 'model') from the object's kipc_perm.
 */
void
ipcperm_stat(struct ipc_perm *perm, kipc_perm_t *kperm, model_t model)
{
    STRUCT_HANDLE(ipc_perm, dst);

    STRUCT_SET_HANDLE(dst, model, perm);

    /* Creator identity. */
    STRUCT_FSET(dst, cuid, kperm->ipc_cuid);
    STRUCT_FSET(dst, cgid, kperm->ipc_cgid);

    /* Current owner identity. */
    STRUCT_FSET(dst, uid, kperm->ipc_uid);
    STRUCT_FSET(dst, gid, kperm->ipc_gid);

    /* Mode bits and key. */
    STRUCT_FSET(dst, mode, kperm->ipc_mode);
    STRUCT_FSET(dst, key, kperm->ipc_key);

    /* The seq field is always reported as zero. */
    STRUCT_FSET(dst, seq, 0);
}

/*
 * Common portion of IPC_SET64: applies the uid, gid and permission
 * bits from 'perm64' to the object's kipc_perm.  The ID lock must be
 * held.
 *
 * Returns 0 on success, EPERM if the owner policy check fails, or
 * EINVAL if the new uid or gid is not valid in the caller's zone.
 * Nothing is modified on failure.
 */
int
ipcperm_set64(ipc_service_t *service, struct cred *cr,
    kipc_perm_t *kperm, ipc_perm64_t *perm64)
{
    zone_t *czone;

    ASSERT(IPC_LOCKED(service, kperm));

    if (secpolicy_ipc_owner(cr, kperm) != 0)
        return (EPERM);

    czone = crgetzone(cr);
    if (!VALID_UID(perm64->ipcx_uid, czone))
        return (EINVAL);
    if (!VALID_GID(perm64->ipcx_gid, czone))
        return (EINVAL);

    kperm->ipc_uid = perm64->ipcx_uid;
    kperm->ipc_gid = perm64->ipcx_gid;

    /* Only the low nine permission bits are replaced. */
    kperm->ipc_mode = (kperm->ipc_mode & ~0777) |
        (perm64->ipcx_mode & 0777);

    if (AU_AUDITING())
        audit_ipcget(service->ipcs_atype, kperm);

    return (0);
}

/*
 * Common portion of IPC_STAT64: fills in 'perm64' from the object's
 * kipc_perm, including the project and zone ids.
 */
void
ipcperm_stat64(ipc_perm64_t *perm64, kipc_perm_t *kperm)
{
    /* Creator identity. */
    perm64->ipcx_cuid = kperm->ipc_cuid;
    perm64->ipcx_cgid = kperm->ipc_cgid;

    /* Current owner identity. */
    perm64->ipcx_uid = kperm->ipc_uid;
    perm64->ipcx_gid = kperm->ipc_gid;

    /* Mode bits and key. */
    perm64->ipcx_mode = kperm->ipc_mode;
    perm64->ipcx_key = kperm->ipc_key;

    /* Project and zone ids recorded on the object. */
    perm64->ipcx_projid = kperm->ipc_proj->kpj_id;
    perm64->ipcx_zoneid = kperm->ipc_zoneid;
}


/*
 * ipc key comparator.
 */
static int
ipc_key_compar(const void *a, const void *b)
{
    const kipc_perm_t *lhs = a;
    const kipc_perm_t *rhs = b;
    int lkey = lhs->ipc_key;
    int rkey = rhs->ipc_key;

    /* Private-keyed objects never belong in the key tree. */
    ASSERT(lkey != IPC_PRIVATE);
    ASSERT(rkey != IPC_PRIVATE);

    /*
     * The key is the primary sort field and the zone id the
     * secondary one, so the zone ids are only examined when the
     * keys are equal.
     */
    if (lkey != rkey)
        return (lkey < rkey ? -1 : 1);

    if (lhs->ipc_zoneid != rhs->ipc_zoneid)
        return (lhs->ipc_zoneid < rhs->ipc_zoneid ? -1 : 1);

    return (0);
}

/*
 * Create an ipc service.
 */
ipc_service_t *
ipcs_create(const char *name, rctl_hndl_t proj_rctl, rctl_hndl_t zone_rctl,
    size_t size, ipc_func_t *dtor, ipc_func_t *rmid, int audit_type,
    size_t rctl_offset)
{
    ipc_service_t *svc;
    ipc_slot_t *table;

    ASSERT(rctl_offset < sizeof (ipc_rqty_t));

    /*
     * The service structure is not zeroed on allocation, so every
     * field is assigned explicitly below.
     */
    svc = kmem_alloc(sizeof (ipc_service_t), KM_SLEEP);
    mutex_init(&svc->ipcs_lock, NULL, MUTEX_ADAPTIVE, NULL);

    /* Facility-supplied parameters. */
    svc->ipcs_ssize = size;
    svc->ipcs_dtor = dtor;
    svc->ipcs_rmid = rmid;
    svc->ipcs_proj_rctl = proj_rctl;
    svc->ipcs_zone_rctl = zone_rctl;
    svc->ipcs_atype = audit_type;
    svc->ipcs_rctlofs = rctl_offset;

    /* Start with an empty, minimum-sized (zeroed) ID table. */
    table = kmem_zalloc(IPC_IDS_MIN * sizeof (ipc_slot_t), KM_SLEEP);
    svc->ipcs_table = table;
    svc->ipcs_tabsz = IPC_IDS_MIN;
    svc->ipcs_count = 0;
    svc->ipcs_ids = id_space_create(name, 0, IPC_IDS_MIN);

    /* Key tree (ordered by ipc_key_compar) and list of all objects. */
    avl_create(&svc->ipcs_keys, ipc_key_compar, size, 0);
    list_create(&svc->ipcs_usedids, sizeof (kipc_perm_t),
        offsetof(kipc_perm_t, ipc_list));

    return (svc);
}

/*
 * Destroy an ipc service.
 */
void
ipcs_destroy(ipc_service_t *service)
{
    ipc_slot_t *slot, *next;

    mutex_enter(&service->ipcs_lock);

    /* The namespace must already be empty. */
    ASSERT(service->ipcs_count == 0);
    avl_destroy(&service->ipcs_keys);
    list_destroy(&service->ipcs_usedids);
    id_space_destroy(service->ipcs_ids);

    /*
     * Free the current table and every discarded table chained
     * behind it through slot 0's ipct_chain.  ipc_grow doubles the
     * table each time it chains the old one, so each table further
     * down the chain is half the size of the one before it.
     */
    for (slot = service->ipcs_table; slot != NULL; slot = next) {
        next = slot[0].ipct_chain;
        kmem_free(slot, service->ipcs_tabsz * sizeof (ipc_slot_t));
        service->ipcs_tabsz >>= 1;
    }

    /*
     * Release the namespace lock before destroying it, so that
     * mutex_destroy() is handed an unheld mutex.
     */
    mutex_exit(&service->ipcs_lock);
    mutex_destroy(&service->ipcs_lock);
    kmem_free(service, sizeof (ipc_service_t));
}

/*
 * Takes the service lock.
 */
void
ipcs_lock(ipc_service_t *service)
{
    kmutex_t *nslock = &service->ipcs_lock;

    /* Acquire the namespace-wide lock of this service. */
    mutex_enter(nslock);
}

/*
 * Releases the service lock.
 */
void
ipcs_unlock(ipc_service_t *service)
{
    kmutex_t *nslock = &service->ipcs_lock;

    /* Drop the namespace-wide lock of this service. */
    mutex_exit(nslock);
}


/*
 * Locks the specified ID.  Returns the ID's ID table index.
 */
static int
ipc_lock_internal(ipc_service_t *service, uint_t id)
{
    uint_t  tabsz;
    uint_t  index;
    kmutex_t *mutex;

    /*
     * The namespace lock is not taken here, so the table may be
     * resized by ipc_grow while we are acquiring the slot lock.  Loop
     * until the table size observed before locking is still the
     * current size once the lock is held.
     */
    for (;;) {
        /*
         * Sample the size before dereferencing the table pointer.
         * ipc_grow stores the new table pointer, issues
         * membar_producer(), and only then stores the new size;
         * the membar_consumer() here is the matching barrier on
         * the reader side.
         */
        tabsz = service->ipcs_tabsz;
        membar_consumer();

        /* Mask the id down to a slot index within a table of tabsz. */
        index = id & (tabsz - 1);
        mutex = &service->ipcs_table[index].ipct_lock;
        mutex_enter(mutex);

        /* Size unchanged: the lock we hold is the ID's current lock. */
        if (tabsz == service->ipcs_tabsz)
            break;

        /* The table was resized underneath us; drop and retry. */
        mutex_exit(mutex);
    }

    return (index);
}

/*
 * Locks the specified ID.  Returns a pointer to the ID's lock.
 */
kmutex_t *
ipc_lock(ipc_service_t *service, int id)
{
    kmutex_t *idlock;
    uint_t slot;

    /*
     * Sanity checks only: nothing below depends on them, but a
     * caller should never hand us an ID that violates them.
     */
    ASSERT(id >= 0);
    ASSERT(IPC_INDEX(id) < service->ipcs_tabsz);

    slot = ipc_lock_internal(service, id);
    idlock = &service->ipcs_table[slot].ipct_lock;

    return (idlock);
}

/*
 * Checks to see if the held lock provided is the current lock for the
 * specified id.  If so, we return it instead of dropping it and
 * returning the result of ipc_lock.  This is intended to speed up cv
 * wakeups where we are left holding a lock which could be stale, but
 * probably isn't.
 */
kmutex_t *
ipc_relock(ipc_service_t *service, int id, kmutex_t *lock)
{
    ASSERT(id >= 0);
    ASSERT(IPC_INDEX(id) < service->ipcs_tabsz);
    ASSERT(MUTEX_HELD(lock));

    /*
     * If the lock we hold is not the slot lock of the current table,
     * it is stale: drop it and take the ID lock afresh.  Otherwise
     * the held lock is returned as-is.
     */
    if (&service->ipcs_table[IPC_INDEX(id)].ipct_lock != lock) {
        mutex_exit(lock);
        lock = ipc_lock(service, id);
    }

    return (lock);
}

/*
 * Performs an ID lookup.  If the ID doesn't exist or has been removed,
 * or isn't visible to the caller (because of zones), NULL is returned.
 * Otherwise, a pointer to the ID's perm structure and held ID lock are
 * returned.
 */
kmutex_t *
ipc_lookup(ipc_service_t *service, int id, kipc_perm_t **perm)
{
    kipc_perm_t *obj;
    uint_t slot;

    /*
     * No range check on id is needed: an out-of-range id still maps
     * to some slot, but it can never equal the ipc_id of the object
     * stored there, so the comparison below rejects it.
     */
    slot = ipc_lock_internal(service, id);
    obj = service->ipcs_table[slot].ipct_data;

    if (obj != NULL && obj->ipc_id == (uint_t)id &&
        HASZONEACCESS(curproc, obj->ipc_zoneid)) {
        ASSERT(IPC_SEQ(id) == service->ipcs_table[slot].ipct_seq);

        *perm = obj;
        if (AU_AUDITING())
            audit_ipc(service->ipcs_atype, id, obj);

        /* Success: return with the ID lock still held. */
        return (&service->ipcs_table[slot].ipct_lock);
    }

    /* Empty slot, stale/mismatched id, or not visible to this zone. */
    mutex_exit(&service->ipcs_table[slot].ipct_lock);
    return (NULL);
}

/*
 * Increase the reference count on an ID.
 */
/*ARGSUSED*/
void
ipc_hold(ipc_service_t *s, kipc_perm_t *perm)
{
    /* The service argument is only consulted by the assertions. */
    ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
    ASSERT(IPC_LOCKED(s, perm));

    /* The reference count is only modified with the ID lock held. */
    perm->ipc_ref += 1;
}

/*
 * Decrease the reference count on an ID and drops the ID's lock.
 * Destroys the ID if the new reference count is zero.
 */
void
ipc_rele(ipc_service_t *s, kipc_perm_t *perm)
{
    kmutex_t *idlock;
    int remaining;

    ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
    ASSERT(IPC_LOCKED(s, perm));
    ASSERT(perm->ipc_ref > 0);

    /*
     * Drop our reference and remember the resulting count while the
     * ID lock is still held, then release the lock.
     */
    idlock = &s->ipcs_table[IPC_INDEX(perm->ipc_id)].ipct_lock;
    perm->ipc_ref--;
    remaining = perm->ipc_ref;
    mutex_exit(idlock);

    /* Other references remain; the object is no longer ours to touch. */
    if (remaining != 0)
        return;

    /*
     * That was the last reference.  Run the facility's destructor,
     * release the project and zone references, and free the object.
     */
    ASSERT(IPC_FREE(perm));     /* ipc_rmid clears IPC_ALLOC */
    s->ipcs_dtor(perm);
    project_rele(perm->ipc_proj);
    zone_rele_ref(&perm->ipc_zone_ref, ZONE_REF_IPC);
    kmem_free(perm, s->ipcs_ssize);
}

/*
 * Decrease the reference count on an ID, but don't drop the ID lock.
 * Used in cases where one thread needs to remove many references (on
 * behalf of other parties).
 */
void
ipc_rele_locked(ipc_service_t *s, kipc_perm_t *perm)
{
    /*
     * This must never drop the last reference: no destructor is run
     * here, so at least one reference has to remain afterwards.
     */
    ASSERT(perm->ipc_ref > 1);
    ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
    ASSERT(IPC_LOCKED(s, perm));

    /* The ID lock stays held on return. */
    perm->ipc_ref -= 1;
}


/*
 * Internal function to grow the service ID table.
 */
/*
 * Doubles the size of the service's ID table.  The caller must hold
 * the namespace lock and must not hold curproc's p_lock.
 *
 * Returns 0 on success, or ENOSPC if the table is already at
 * IPC_IDS_MAX or the new table could not be allocated.
 */
static int
ipc_grow(ipc_service_t *service)
{
    ipc_slot_t *new, *old;
    int i, oldsize, newsize;

    ASSERT(MUTEX_HELD(&service->ipcs_lock));
    ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));

    /* The table is not allowed to grow past IPC_IDS_MAX slots. */
    if (service->ipcs_tabsz == IPC_IDS_MAX)
        return (ENOSPC);

    oldsize = service->ipcs_tabsz;
    newsize = oldsize << 1;

    /* KM_NOSLEEP allocation: a NULL result is reported as ENOSPC. */
    new = kmem_zalloc(newsize * sizeof (ipc_slot_t), KM_NOSLEEP);
    if (new == NULL)
        return (ENOSPC);

    /*
     * Lock every old slot, in index order, together with the
     * corresponding slot in the first half of the new table, and move
     * the sequence number and object pointer across.  The old slot's
     * object pointer is cleared.  All of these locks stay held until
     * the new table and size have been published below.
     */
    old = service->ipcs_table;
    for (i = 0; i < oldsize; i++) {
        mutex_enter(&old[i].ipct_lock);
        mutex_enter(&new[i].ipct_lock);

        new[i].ipct_seq = old[i].ipct_seq;
        new[i].ipct_data = old[i].ipct_data;
        old[i].ipct_data = NULL;
    }

    /*
     * The old table is not freed here; it is chained from slot 0 of
     * the new table instead.  The new table pointer is stored before
     * the new size, with a producer barrier in between; this is the
     * ordering ipc_lock_internal relies on when it re-checks
     * ipcs_tabsz after taking a slot lock.
     */
    new[0].ipct_chain = old;
    service->ipcs_table = new;
    membar_producer();
    service->ipcs_tabsz = newsize;

    /* Release all the slot locks taken above, old and new. */
    for (i = 0; i < oldsize; i++) {
        mutex_exit(&old[i].ipct_lock);
        mutex_exit(&new[i].ipct_lock);
    }

    /* Extend the id space from the old table size to the new one. */
    id_space_extend(service->ipcs_ids, oldsize, service->ipcs_tabsz);

    return (0);
}


/*
 * Looks up 'key', qualified by the caller's zone id, in the service's
 * key tree.  The namespace lock must be held.
 *
 * Returns:
 *   0       key found and acceptable; *permp is the existing object
 *   0       key not found and IPC_CREAT given; *permp is NULL
 *   EEXIST  key found, but both IPC_CREAT and IPC_EXCL were given
 *   EACCES  key found, but 'flag' requests permission bits that the
 *           object's mode does not have
 *   ENOENT  key not found and IPC_CREAT not given
 *
 * *permp is not written when a non-zero value is returned.
 */
static int
ipc_keylookup(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp)
{
    kipc_perm_t *found;
    kipc_perm_t template;
    avl_index_t where;

    ASSERT(MUTEX_HELD(&service->ipcs_lock));

    /* ipc_key_compar only examines the key and the zone id. */
    template.ipc_key = key;
    template.ipc_zoneid = getzoneid();
    found = avl_find(&service->ipcs_keys, &template, &where);

    if (found == NULL) {
        if ((flag & IPC_CREAT) == 0)
            return (ENOENT);
        *permp = NULL;
        return (0);
    }

    ASSERT(!IPC_FREE(found));

    if ((flag & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
        return (EEXIST);

    if (((flag & 0777) & ~found->ipc_mode) != 0) {
        if (AU_AUDITING())
            audit_ipcget(NULL, (void *)found);
        return (EACCES);
    }

    *permp = found;
    return (0);
}

/*
 * Check whether pp may allocate one more ID from service, growing the
 * ID table when it is full.  Called with the service lock held.
 *
 * Returns 0 with pp->p_lock held when the allocation may proceed.
 * Returns ENOSPC when a resource control denies it, or the error from
 * ipc_grow() when the table cannot be grown; pp->p_lock is not held on
 * either failure.
 */
static int
ipc_alloc_test(ipc_service_t *service, proc_t *pp)
{
    int denied;
    int error;

    ASSERT(MUTEX_HELD(&service->ipcs_lock));

    /*
     * Growing the table before consulting the resource controls would
     * read more simply, but it would let a user (permanently) double
     * the table size even when the allocation ends up being denied.
     * So the rctls are tested first, and tested again after each grow.
     */
    for (;;) {
        mutex_enter(&pp->p_lock);

        /* The zone rctl is consulted only if the project rctl allows. */
        denied = (rctl_test(service->ipcs_proj_rctl,
            pp->p_task->tk_proj->kpj_rctls, pp, 1, RCA_SAFE) &
            RCT_DENY) != 0;
        if (!denied) {
            denied = (rctl_test(service->ipcs_zone_rctl,
                pp->p_zone->zone_rctls, pp, 1, RCA_SAFE) &
                RCT_DENY) != 0;
        }
        if (denied) {
            mutex_exit(&pp->p_lock);
            return (ENOSPC);
        }

        /* A free slot exists: succeed, leaving p_lock held. */
        if (service->ipcs_count != service->ipcs_tabsz)
            return (0);

        /* ipc_grow() requires p_lock to be dropped; then test again. */
        mutex_exit(&pp->p_lock);
        error = ipc_grow(service);
        if (error != 0)
            return (error);
    }
}

/*
 * Given a key, search for or create the associated identifier.
 *
 * New object: if the key is IPC_PRIVATE, or if IPC_CREAT is specified
 * and the key isn't found, a zeroed object of the service's object
 * size is allocated and its owner, creator, zone, mode, key and
 * reference count are filled in.  We return 0 with a pointer to the
 * object in permp and a pointer to the held service lock in lockp.
 * curproc's p_lock, taken by ipc_alloc_test(), is also still held on
 * return.  The object's IPC_ALLOC bit is clear and its ID is
 * IPC_ID_INVAL.
 *
 * Existing object: if the key is found and no error conditions arise,
 * we return 0 with a pointer to the existing object in permp and a
 * pointer to the held ID lock in lockp.  The object's IPC_ALLOC bit
 * is set.
 *
 * Otherwise, a non-zero errno value is returned and none of these
 * locks is held.
 */
int
ipc_get(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp,
    kmutex_t **lockp)
{
    kipc_perm_t *obj = NULL;
    proc_t *pp = curproc;
    int error, slot;
    cred_t *cr = CRED();

    if (key != IPC_PRIVATE) {
        /*
         * Look the key up and, if it exists, lock its ID before
         * letting go of the service lock.
         */
        mutex_enter(&service->ipcs_lock);
        error = ipc_keylookup(service, key, flag, &obj);
        if (obj != NULL)
            slot = ipc_lock_internal(service, obj->ipc_id);
        mutex_exit(&service->ipcs_lock);

        if (error != 0) {
            ASSERT(obj == NULL);
            return (error);
        }

        if (obj != NULL) {
            ASSERT(!IPC_FREE(obj));
            *permp = obj;
            *lockp = &service->ipcs_table[slot].ipct_lock;
            return (0);
        }

        /* No such key, but IPC_CREAT was given: create it below. */
    }

    /* Allocate before taking any lock, since KM_SLEEP may block. */
    obj = kmem_zalloc(service->ipcs_ssize, KM_SLEEP);

    mutex_enter(&service->ipcs_lock);
    error = ipc_alloc_test(service, pp);
    if (error != 0) {
        mutex_exit(&service->ipcs_lock);
        kmem_free(obj, service->ipcs_ssize);
        return (error);
    }

    /* The creating credential is both creator and initial owner. */
    obj->ipc_uid = crgetuid(cr);
    obj->ipc_cuid = obj->ipc_uid;
    obj->ipc_gid = crgetgid(cr);
    obj->ipc_cgid = obj->ipc_gid;
    obj->ipc_zoneid = getzoneid();
    obj->ipc_mode = flag & 0777;
    obj->ipc_key = key;
    obj->ipc_ref = 1;
    obj->ipc_id = IPC_ID_INVAL;

    *permp = obj;
    *lockp = &service->ipcs_lock;

    return (0);
}

/*
 * Attempts to add a newly created ID to the global namespace.
 *
 * Returns 0, with curproc's p_lock and the service lock held, when the
 * object may be committed.  Returns EAGAIN when another thread created
 * the same key in the meantime, so the caller may be able to obtain
 * the existing ID and return it to the user.  Any other non-zero value
 * is the error that creating the ID would cause.
 *
 * Since this should only be called after all initialization has been
 * completed, on failure we automatically invoke the destructor for
 * the object, release its zone reference and free its memory; newperm
 * must not be used after a non-zero return.
 */
int
ipc_commit_begin(ipc_service_t *service, key_t key, int flag,
    kipc_perm_t *newperm)
{
    proc_t *pp = curproc;
    kipc_perm_t *existing;
    int error;

    ASSERT(newperm->ipc_ref == 1);
    ASSERT(IPC_FREE(newperm));

    /*
     * Record the project and take the zone reference first, so that
     * later calls to ipc_cleanup() clean up the necessary state.  This
     * has to precede the possible call to ipcs_dtor() on the error
     * path below.
     */
    newperm->ipc_proj = pp->p_task->tk_proj;
    zone_init_ref(&newperm->ipc_zone_ref);
    zone_hold_ref(pp->p_zone, &newperm->ipc_zone_ref, ZONE_REF_IPC);

    mutex_enter(&service->ipcs_lock);

    /*
     * Make sure nobody raced with us and created the key.  A lookup
     * failure is returned as is; a successful lookup that finds an
     * object becomes EAGAIN.
     */
    if (key != IPC_PRIVATE) {
        error = ipc_keylookup(service, key, flag, &existing);
        if (error != 0)
            goto errout;
        if (existing != NULL) {
            error = EAGAIN;
            goto errout;
        }
    }

    /*
     * Make sure nobody raced with us and used the last of the
     * permissible ids, or the last free space in the id table.  On
     * success this leaves p_lock held.
     */
    error = ipc_alloc_test(service, pp);
    if (error != 0)
        goto errout;

    ASSERT(MUTEX_HELD(&service->ipcs_lock));
    ASSERT(MUTEX_HELD(&pp->p_lock));

    return (0);

errout:
    mutex_exit(&service->ipcs_lock);
    service->ipcs_dtor(newperm);
    zone_rele_ref(&newperm->ipc_zone_ref, ZONE_REF_IPC);
    kmem_free(newperm, service->ipcs_ssize);
    return (error);
}

/*
 * Commit the ID allocation transaction.  Called with p_lock and the
 * service lock held, both of which are dropped.  Returns the held ID
 * lock so the caller can extract the ID and perform ipcget auditing.
 *
 * On return the object has IPC_ALLOC set, has been assigned an ID, and
 * has been entered into the slot table, the key tree (unless its key
 * is IPC_PRIVATE) and the ipcs_usedids list.  The project and zone
 * usage counts for the service have each been incremented by one.
 * The returned lock is still held; dropping it is left to the caller.
 */
kmutex_t *
ipc_commit_end(ipc_service_t *service, kipc_perm_t *perm)
{
    ipc_slot_t *slot;
    avl_index_t where;
    int index;
    void *loc;

    ASSERT(MUTEX_HELD(&service->ipcs_lock));
    ASSERT(MUTEX_HELD(&curproc->p_lock));

    /* The project hold is taken before p_lock is dropped. */
    (void) project_hold(perm->ipc_proj);
    mutex_exit(&curproc->p_lock);

    /*
     * Pick out our slot.  The slot lock taken here is the one that is
     * returned, still held, to the caller.
     */
    service->ipcs_count++;
    index = id_alloc(service->ipcs_ids);
    ASSERT(index < service->ipcs_tabsz);
    slot = &service->ipcs_table[index];
    mutex_enter(&slot->ipct_lock);
    ASSERT(slot->ipct_data == NULL);

    /*
     * Update the perm structure.  The ID combines the slot's current
     * sequence number (shifted by IPC_SEQ_SHIFT) with the slot index.
     */
    perm->ipc_mode |= IPC_ALLOC;
    perm->ipc_id = (slot->ipct_seq << IPC_SEQ_SHIFT) | index;

    /*
     * Push into global visibility.
     */
    slot->ipct_data = perm;
    if (perm->ipc_key != IPC_PRIVATE) {
        /* avl_find() supplies the insertion point; the key must be new. */
        loc = avl_find(&service->ipcs_keys, perm, &where);
        ASSERT(loc == NULL);
        avl_insert(&service->ipcs_keys, perm, where);
    }
    list_insert_head(&service->ipcs_usedids, perm);

    /*
     * Update resource consumption.
     */
    IPC_PROJ_USAGE(perm, service) += 1;
    IPC_ZONE_USAGE(perm, service) += 1;

    mutex_exit(&service->ipcs_lock);
    return (&slot->ipct_lock);
}

/*
 * Clean up function, in case the allocation fails.
 *
 * If called between ipc_get and ipc_commit_begin, perm->ipc_proj is
 * still NULL and no zone reference has been taken, so we merely free
 * the perm structure.  If called after a successful ipc_commit_begin,
 * we first drop curproc's p_lock and the service lock and call the
 * ID's destructor, then release the zone reference and free the
 * structure.
 */
void
ipc_cleanup(ipc_service_t *service, kipc_perm_t *perm)
{
    ASSERT(IPC_FREE(perm));

    /* ipc_proj is only set once ipc_commit_begin has run. */
    if (perm->ipc_proj != NULL) {
        mutex_exit(&curproc->p_lock);
        mutex_exit(&service->ipcs_lock);
        service->ipcs_dtor(perm);
    }

    if (perm->ipc_zone_ref.zref_zone != NULL)
        zone_rele_ref(&perm->ipc_zone_ref, ZONE_REF_IPC);

    kmem_free(perm, service->ipcs_ssize);
}


/*
 * Common code to remove an IPC object.  This should be called after
 * all permissions checks have been performed, and with the service
 * and ID locked.
 *
 * The object is taken out of its table slot, out of the key tree (if
 * its key is not IPC_PRIVATE) and off the ipcs_usedids list; its
 * IPC_ALLOC bit is cleared, its slot index is returned to the ID
 * space, and the service, project and zone usage counts are
 * decremented.  Neither lock is dropped and the object's reference is
 * not released here; both are left to the caller.
 */
static void
ipc_remove(ipc_service_t *service, kipc_perm_t *perm)
{
    int id = perm->ipc_id;
    int index;

    ASSERT(MUTEX_HELD(&service->ipcs_lock));
    ASSERT(IPC_LOCKED(service, perm));

    index = IPC_INDEX(id);

    service->ipcs_table[index].ipct_data = NULL;

    if (perm->ipc_key != IPC_PRIVATE)
        avl_remove(&service->ipcs_keys, perm);
    list_remove(&service->ipcs_usedids, perm);
    perm->ipc_mode &= ~IPC_ALLOC;

    id_free(service->ipcs_ids, index);

    /*
     * Advance the slot's sequence number, wrapping to 0 after
     * IPC_SEQ_MASK, so the next object placed in this slot is given a
     * different ID.
     */
    if (service->ipcs_table[index].ipct_seq++ == IPC_SEQ_MASK)
        service->ipcs_table[index].ipct_seq = 0;
    service->ipcs_count--;
    ASSERT(IPC_PROJ_USAGE(perm, service) > 0);
    ASSERT(IPC_ZONE_USAGE(perm, service) > 0);
    IPC_PROJ_USAGE(perm, service) -= 1;
    IPC_ZONE_USAGE(perm, service) -= 1;
    /* With no IDs left in the service, both usage counts must be zero. */
    ASSERT(service->ipcs_count || ((IPC_PROJ_USAGE(perm, service) == 0) &&
        (IPC_ZONE_USAGE(perm, service) == 0)));
}


/*
 * Common code to perform an IPC_RMID.
 *
 * Returns 0 on success, EINVAL if id does not name an object in the
 * service, or EPERM if cr fails the secpolicy_ipc_owner() check for
 * the object.
 */
int
ipc_rmid(ipc_service_t *service, int id, cred_t *cr)
{
    kipc_perm_t *obj;
    kmutex_t *idlock;
    int error = 0;

    mutex_enter(&service->ipcs_lock);

    idlock = ipc_lookup(service, id, &obj);
    if (idlock == NULL) {
        error = EINVAL;
    } else {
        ASSERT(service->ipcs_count > 0);

        if (secpolicy_ipc_owner(cr, obj) != 0) {
            mutex_exit(idlock);
            error = EPERM;
        }
    }

    if (error != 0) {
        mutex_exit(&service->ipcs_lock);
        return (error);
    }

    /*
     * Past this point nothing can fail.  The object is removed from
     * the service while both locks are held; the service lock is then
     * dropped before the per-service hook runs.
     */
    ipc_remove(service, obj);
    mutex_exit(&service->ipcs_lock);

    /* perform any per-service removal actions */
    service->ipcs_rmid(obj);

    /*
     * NOTE(review): the ID lock is not dropped explicitly here;
     * presumably ipc_rele() releases it along with the reference --
     * confirm against ipc_rele().
     */
    ipc_rele(service, obj);

    return (0);
}

/*
 * Implementation for shmids, semids, and msgids.  buf is the address
 * of the user buffer, nids is its capacity in ids, and pnids is the
 * user address where we write the actual number of ids that [would]
 * have been copied out.  A NULL buf is treated as a capacity of zero.
 *
 * Returns 0 on success, or EFAULT if either user address cannot be
 * written.
 */
int
ipc_ids(ipc_service_t *service, int *buf, uint_t nids, uint_t *pnids)
{
    kipc_perm_t *obj;
    size_t staging_size = 0;
    int error = 0;
    int expected;
    int *staging;
    int found = 0;
    zoneid_t myzone = getzoneid();
    int in_global = INGLOBALZONE(curproc);

    if (buf == NULL)
        nids = 0;

    /*
     * Settle on a count of ids and allocate a staging buffer of that
     * size.  ipcs_count is always sane, so the first guess is read
     * without ipcs_lock.  If there are no ids at all, or we are in the
     * global zone and the caller's buffer is too small for all of
     * them, skip straight to reporting the count.  Otherwise allocate,
     * take the lock, and start over if the count grew in the
     * meantime.  The loop is left with ipcs_lock held.
     */
    expected = service->ipcs_count;
    for (;;) {
        if (expected == 0 || (in_global && expected > nids)) {
            found = expected;
            nids = 0;
            goto out;
        }

        staging_size = expected * sizeof (int);
        staging = kmem_alloc(staging_size, KM_SLEEP);

        mutex_enter(&service->ipcs_lock);
        if (service->ipcs_count <= expected)
            break;
        expected = service->ipcs_count;
        mutex_exit(&service->ipcs_lock);

        if (staging_size != 0) {
            kmem_free(staging, staging_size);
            staging_size = 0;
        }
    }

    /*
     * Collect the ids visible to the caller: every id when in the
     * global zone, otherwise only those belonging to the caller's zone.
     */
    obj = list_head(&service->ipcs_usedids);
    while (obj != NULL) {
        ASSERT(!IPC_FREE(obj));
        if (in_global || obj->ipc_zoneid == myzone)
            staging[found++] = obj->ipc_id;
        obj = list_next(&service->ipcs_usedids, obj);
    }
    mutex_exit(&service->ipcs_lock);

    /*
     * When the caller's buffer cannot hold every id, report the count
     * only and copy out none of them.
     */
    if (nids < found)
        nids = 0;

out:
    /* staging is only read when nids != 0, i.e. after it was filled. */
    if (suword32(pnids, (uint32_t)found) ||
        (nids != 0 && copyout(staging, buf, found * sizeof (int))))
        error = EFAULT;
    if (staging_size != 0)
        kmem_free(staging, staging_size);
    return (error);
}

/*
 * Destroy IPC objects from the given service that are associated with
 * the given zone.
 *
 * The service lock can't be held while objects are freed, so this
 * works in two passes: with the lock held, every matching object is
 * removed from the service and parked on a private list; once the
 * lock has been dropped, the private list is drained and each object
 * is released.
 */
void
ipc_remove_zone(ipc_service_t *service, zoneid_t zoneid)
{
    kipc_perm_t *obj, *following;
    kmutex_t *idlock;
    list_t doomed;

    list_create(&doomed, sizeof (kipc_perm_t),
        offsetof(kipc_perm_t, ipc_list));

    mutex_enter(&service->ipcs_lock);
    obj = list_head(&service->ipcs_usedids);
    while (obj != NULL) {
        /* Fetch the successor first; obj may leave the list below. */
        following = list_next(&service->ipcs_usedids, obj);

        if (obj->ipc_zoneid == zoneid) {
            /*
             * Remove the object from the service, then queue it
             * on the private list so the call to ipc_rele
             * (which actually frees the structure) can be
             * deferred.  This is needed because the destructor
             * may grab the service lock.
             */
            ASSERT(!IPC_FREE(obj));
            idlock = ipc_lock(service, obj->ipc_id);
            ipc_remove(service, obj);
            mutex_exit(idlock);
            list_insert_tail(&doomed, obj);
        }

        obj = following;
    }
    mutex_exit(&service->ipcs_lock);

    /*
     * The service lock is no longer held; drain the private list from
     * its head, releasing each removed object.
     */
    while ((obj = list_head(&doomed)) != NULL) {
        list_remove(&doomed, obj);

        (void) ipc_lock(service, obj->ipc_id);

        /* perform any per-service removal actions */
        service->ipcs_rmid(obj);

        /* release reference */
        ipc_rele(service, obj);
    }

    list_destroy(&doomed);
}