/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Common Inter-Process Communication routines.
*
* Overview
* --------
*
* The System V inter-process communication (IPC) facilities provide
* three services, message queues, semaphore arrays, and shared memory
* segments, which are managed using filesystem-like namespaces.
* Unlike a filesystem, these namespaces aren't mounted and accessible
* via a path -- a special API is used to interact with the different
* facilities (nothing precludes a VFS-based interface, but the
* standards require the special APIs). Furthermore, these special
* APIs don't use file descriptors, nor do they have an equivalent.
* This means that every operation which acts on an object needs to
* perform the equivalent of a lookup, which in turn means that every
* operation can fail if the specified object doesn't exist in the
* facility's namespace.
*
* Objects
* -------
*
* Each object in a namespace has a unique ID, which is assigned by the
* system and is used to identify the object when performing operations
* on it. An object can also have a key, which is selected by the user
* at allocation time and is used as a primitive rendezvous mechanism.
* An object without a key is said to have a "private" key.
*
* To perform an operation on an object given its key, one must first
* perform a lookup and obtain its ID. The ID is then used to identify
* the object when performing the operation. If the object has a
* private key, the ID must be known or obtained by other means.
*
* Each object in the namespace has a creator uid and gid, as well as
* an owner uid and gid. Both are initialized with the effective uid
* and gid (crgetuid() and crgetgid()) of the process which created
* the object. The creator or current owner has the ability to change
* the owner of the object.
*
* Each object in the namespace has a set of file-like permissions,
* which, in conjunction with the creator and owner uid and gid,
* control read and write access to the object (execute is ignored).
*
* Each object also has a creator project and zone, which are used to
* account for its resource usage.
*
* Operations
* ----------
*
* There are five operations which all three facilities have in
* common: GET, SET, STAT, RMID, and IDS.
*
* GET, like open, is used to allocate a new object or obtain an
* existing one (using its key). It takes a key, a set of flags and
* mode bits, and optionally facility-specific arguments. If the key
* is IPC_PRIVATE, a new object with the requested mode bits and
* facility-specific attributes is created. If the key isn't
* IPC_PRIVATE, the GET will attempt to look up the specified key and
* either return that or create a new key depending on the state of the
* IPC_CREAT and IPC_EXCL flags, much like open. If GET needs to
* allocate an object, it can fail if there is insufficient space in
* the namespace (the maximum number of ids for the facility has been
* exceeded) or if the facility-specific initialization fails. If GET
* finds an object it can return, it can still fail if that object's
* permissions or facility-specific attributes are less than those
* requested.
*
* SET is used to adjust facility-specific parameters of an object, in
* addition to the owner uid and gid, and mode bits. It can fail if
* the caller isn't the creator or owner.
*
* STAT is used to obtain information about an object, including the
* general object attributes described above as well as
* facility-specific information. It can fail if the caller doesn't
* have read permission.
*
* RMID removes an object from the namespace. Subsequent operations
* using the object's ID or key will fail (until another object is
* created with the same key or ID). Since an RMID may be performed
* asynchronously with other operations, it is possible that other
* threads and/or processes will have references to the object. While
* a facility may have actions which need to be performed at RMID time,
* only when all references are dropped can the object be destroyed.
* RMID will fail if the caller isn't the creator or owner.
*
* IDS obtains a list of all IDs in a facility's namespace. There are
* no facility-specific behaviors of IDS.
*
* Design
* ------
*
* Because some IPC facilities provide services whose operations must
* scale, a mechanism which allows fast, concurrent access to
* individual objects is needed. Of primary importance is object
* lookup based on ID (SET, STAT, others). Allocation (GET),
* deallocation (RMID), ID enumeration (IDS), and key lookups (GET) are
* lesser concerns, but should be implemented in such a way that ID
* lookup isn't affected (at least not in the common case).
*
* Starting from the bottom up, each object is represented by a
* structure, the first member of which must be a kipc_perm_t. The
* kipc_perm_t contains the information described above in "Objects", a
* reference count (since the object may continue to exist after it has
* been removed from the namespace), as well as some additional
* metadata used to manage data structure membership. These objects
* are dynamically allocated.
*
* Above the objects is a power-of-two sized table of ID slots. Each
* slot contains a pointer to an object, a sequence number, and a
* lock. An object's ID is a function of its slot's index in the table
* and its slot's sequence number. Every time a slot is released (via
* RMID) its sequence number is increased. Strictly speaking, the
* sequence number is unnecessary. However, checking the sequence
* number after a lookup provides a certain degree of robustness
* against the use of stale IDs (useful since nothing else does). When
* the table fills up, it is resized (see Locking, below).
*
* Of an ID's 31 bits (an ID is, as defined by the standards, a signed
* int) the top IPC_SEQ_BITS are used for the sequence number with the
* remainder holding the index into the table. The size of the table
* is therefore bounded at 2 ^ (31 - IPC_SEQ_BITS) slots.
*
* Managing this table is the ipc_service structure. It contains a
* pointer to the dynamically allocated ID table, a namespace-global
* lock, an id_space for managing the free space in the table, and
* sundry other metadata necessary for the maintenance of the
* namespace. An AVL tree of all keyed objects in the table (sorted by
* key) is used for key lookups. An unordered doubly linked list of
* all objects in the namespace (keyed or not) is maintained to
* facilitate ID enumeration.
*
* To help visualize these relationships, here's a picture of a
* namespace with a table of size 8 containing three objects
* (IPC_SEQ_BITS = 27):
*
*
* +-ipc_service_t--+
* | table *---\
* | keys *---+----------------------\
* | all ids *--\| |
* | | || |
* +----------------+ || |
* || |
* /-------------------/| |
* | /---------------/ |
* | | |
* | v |
* | +-0------+-1------+-2------+-3------+-4--+---+-5------+-6------+-7------+
* | | Seq=3 | | | Seq=1 | : | | | Seq=6 |
* | | | | | | : | | | |
* | +-*------+--------+--------+-*------+----+---+--------+--------+-*------+
* | | | | |
* | | /---/ | /----------------/
* | | | | |
* | v v | v
* | +-kipc_perm_t-+ +-kipc_perm_t-+ | +-kipc_perm_t-+
* | | id=0x30 | | id=0x13 | | | id=0x67 |
* | | key=0xfeed | | key=0xbeef | | | key=0xcafe |
* \->| [list] |<------>| [list] |<------>| [list] |
* /->| [avl left] x /--->| [avl left] x \--->| [avl left] *---\
* | | [avl right] x | | [avl right] x | [avl right] *---+-\
* | | | | | | | | | |
* | +-------------+ | +-------------+ +-------------+ | |
* | \---------------------------------------------/ |
* \--------------------------------------------------------------------/
*
* Locking
* -------
*
* There are three locks (or sets of locks) which are used to ensure
* correctness: the slot locks, the namespace lock, and p_lock (needed
* when checking resource controls). Their ordering is
*
* namespace lock -> slot lock 0 -> ... -> slot lock t -> p_lock
*
* Generally speaking, the namespace lock is used to protect allocation
* and removal from the namespace, ID enumeration, and resizing the ID
* table. Specifically:
*
* - write access to all fields of the ipc_service structure
* - read access to all variable fields of ipc_service except
* ipcs_tabsz (table size) and ipcs_table (the table pointer)
* - read/write access to ipc_avl, ipc_list in visible objects'
* kipc_perm structures (i.e. objects which have been removed from
* the namespace don't have this restriction)
* - write access to ipct_seq and ipct_data in the table entries
*
* A slot lock by itself is meaningless (except when resizing). Of
* greater interest conceptually is the notion of an ID lock -- a
* "virtual lock" which refers to whichever slot lock an object's ID
* currently hashes to.
*
* An ID lock protects all objects with that ID. Normally there will
* only be one such object: the one pointed to by the locked slot.
* However, if an object is removed from the namespace but retains
* references (e.g. an attached shared memory segment which has been
* RMIDed), it continues to use the lock associated with its original
* ID. While this can result in increased contention, operations which
* require taking the ID lock of removed objects are infrequent.
*
* Specifically, an ID lock protects the contents of an object's
* structure, including the contents of the embedded kipc_perm
* structure (but excluding those fields protected by the namespace
* lock). It also protects the ipct_seq and ipct_data fields in its
* slot (it is really a slot lock, after all).
*
* Recall that the table is resizable. To avoid requiring every ID
* lookup to take a global lock, a scheme much like that employed for
* file descriptors (see the comment above UF_ENTER in user.h) is
* used. Note that the sequence number and data pointer are protected
* by both the namespace lock and their slot lock. When the table is
* resized, the following operations take place:
*
* 1) A new table is allocated.
* 2) The global lock is taken.
* 3) All old slots are locked, in order.
* 4) The first half of the new slots are locked.
* 5) All table entries are copied to the new table, and cleared from
* the old table.
* 6) The ipc_service structure is updated to point to the new table.
* 7) The ipc_service structure is updated with the new table size.
* 8) All slot locks (old and new) are dropped.
*
* Because the slot locks are embedded in the table, ID lookups and
* other operations which require taking a slot lock need to verify
* that the lock taken wasn't part of a stale table. This is
* accomplished by checking the table size before and after
* dereferencing the table pointer and taking the lock: if the size
* changes, the lock must be dropped and reacquired. It is this
* additional work which distinguishes an ID lock from a slot lock.
*
* Because we can't guarantee that threads aren't accessing the old
* tables' locks, they are never deallocated. To prevent spurious
* reports of memory leaks, a pointer to the discarded table is stored
* in the new one in step 5. (Theoretically ipcs_destroy will delete
* the discarded tables, but it is only ever called from a failed _init
* invocation; i.e. when there aren't any.)
*
* Interfaces
* ----------
*
* The following interfaces are provided by the ipc module for use by
* the individual IPC facilities:
*
* ipcperm_access
*
* Given an object and a cred structure, determines if the requested
* access type is allowed.
*
* ipcperm_set, ipcperm_stat,
* ipcperm_set64, ipcperm_stat64
*
* Performs the common portion of an STAT or SET operation. All
* (except stat and stat64) can fail, so they should be called before
* any facility-specific non-reversible changes are made to an
* object. Similarly, the set operations have side effects, so they
* should only be called once the possibility of a facility-specific
* failure is eliminated.
*
* ipcs_create
*
* Creates an IPC namespace for use by an IPC facility.
*
* ipcs_destroy
*
* Destroys an IPC namespace.
*
* ipcs_lock, ipcs_unlock
*
* Takes the namespace lock. Ideally such access wouldn't be
* necessary, but there may be facility-specific data protected by
* this lock (e.g. project-wide resource consumption).
*
* ipc_lock
*
* Takes the lock associated with an ID. Can't fail.
*
* ipc_relock
*
* Like ipc_lock, but takes a pointer to a held lock. Drops the lock
* unless it is the one that would have been returned by ipc_lock.
* Used after calls to cv_wait.
*
* ipc_lookup
*
* Performs an ID lookup, returns with the ID lock held. Fails if
* the ID doesn't exist in the namespace.
*
* ipc_hold
*
* Takes a reference on an object.
*
* ipc_rele
*
* Releases a reference on an object, and drops the object's lock.
* Calls the object's destructor if last reference is being
* released.
*
* ipc_rele_locked
*
* Releases a reference on an object. Doesn't drop lock, and may
* only be called when there is more than one reference to the
* object.
*
* ipc_get, ipc_commit_begin, ipc_commit_end, ipc_cleanup
*
* Components of a GET operation. ipc_get performs a key lookup,
* allocating an object if the key isn't found (returning with the
* namespace lock and p_lock held), and returning the existing object
* if it is (with the object lock held). ipc_get doesn't modify the
* namespace.
*
* ipc_commit_begin begins the process of inserting an object
* allocated by ipc_get into the namespace, and can fail. If
* successful, it returns with the namespace lock and p_lock held.
* ipc_commit_end completes the process of inserting an object into
* the namespace and can't fail. The facility can call ipc_cleanup
* at any time following a successful ipc_get and before
* ipc_commit_end or a failed ipc_commit_begin to fail the
* allocation. Pseudocode for the suggested GET implementation:
*
* top:
*
* ipc_get
*
* if failure
* return
*
* if found {
*
* if object meets criteria
* unlock object and return success
* else
* unlock object and return failure
*
* } else {
*
* perform resource control tests
* drop namespace lock, p_lock
* if failure
* ipc_cleanup
*
* perform facility-specific initialization
* if failure {
* facility-specific cleanup
* ipc_cleanup
* }
*
* ( At this point the object should be destructible using the
* destructor given to ipcs_create )
*
* ipc_commit_begin
* if retry
* goto top
* else if failure
* return
*
* perform facility-specific resource control tests/allocations
* if failure
* ipc_cleanup
*
* ipc_commit_end
* perform any infallible post-creation actions, unlock, and return
*
* }
*
* ipc_rmid
*
* Performs the common portion of an RMID operation -- looks up an ID,
* removes it, and calls a facility-specific function to do
* RMID-time cleanup on the private portions of the object.
*
* ipc_ids
*
* Performs the common portion of an IDS operation.
*
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/ipc.h>
#include <sys/ipc_impl.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/list.h>
#include <sys/atomic.h>
#include <sys/zone.h>
#include <sys/task.h>
#include <sys/modctl.h>
#include <c2/audit.h>
/*
 * Module-specific linkage for this file: it is registered through
 * mod_miscops, and the string is the module's descriptive name
 * (presumably what module-listing tools display -- confirm).
 */
static struct modlmisc modlmisc = {
&mod_miscops,
"common ipc code",
};
/*
 * Top-level linkage structure passed to mod_install(), mod_remove() and
 * mod_info() by _init(), _fini() and _info().  It lists the single
 * modlmisc entry followed by a NULL terminator.
 */
static struct modlinkage modlinkage = {
MODREV_1, (void *)&modlmisc, NULL
};
/*
 * Module load entry point.  Hands this module's linkage to mod_install()
 * and passes its result straight back to the caller.
 */
int
_init(void)
{
	int rv;

	rv = mod_install(&modlinkage);
	return (rv);
}
/*
 * Module unload entry point.  Hands this module's linkage to mod_remove()
 * and passes its result straight back to the caller.
 */
int
_fini(void)
{
	int rv;

	rv = mod_remove(&modlinkage);
	return (rv);
}
/*
 * Module information entry point.  Forwards the request, together with
 * this module's linkage, to mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	int rv;

	rv = mod_info(&modlinkage, modinfop);
	return (rv);
}
/*
 * Check message, semaphore, or shared memory access permissions.
 *
 * Verifies that the requested access is permitted for the current
 * process.  The caller's zone is compared with the object's zone first:
 *
 *   - Same zone: the owner, group, or other permission bits are selected
 *     depending on whether the caller's uid matches the object's owner or
 *     creator uid, or whether the caller is a member (supplementary
 *     groups included) of the owner or creator gid.  If every requested
 *     bit is granted, 0 is returned.  Otherwise the bits that were not
 *     granted are handed to the security policy, which returns 0 to
 *     override the denial or EACCES.
 *
 *   - Different zone: a caller outside the global zone always gets
 *     EACCES.  A caller in the global zone is referred to the security
 *     policy with the full requested mode, regardless of whether the uid
 *     or gid match those of the object.  Cross-zone accesses normally
 *     never get here since they fail in ipc_lookup or ipc_get.
 *
 * Arguments:
 *	p	- permission structure to verify against
 *	mode	- desired access permissions
 *	cr	- credentials of the caller
 */
int
ipcperm_access(kipc_perm_t *p, int mode, cred_t *cr)
{
	uid_t caller = crgetuid(cr);
	zoneid_t curzone = getzoneid();
	int shift;

	if (p->ipc_zoneid != curzone) {
		if (curzone != GLOBAL_ZONEID)
			return (EACCES);
		return (secpolicy_ipc_access(cr, p, mode));
	}

	/* Pick the owner (0), group (3) or other (6) permission triplet. */
	if (caller == p->ipc_uid || caller == p->ipc_cuid)
		shift = 0;
	else if (groupmember(p->ipc_gid, cr) || groupmember(p->ipc_cgid, cr))
		shift = 3;
	else
		shift = 6;

	/* Strip every requested bit that the selected triplet grants. */
	mode &= ~(p->ipc_mode << shift);
	if (mode == 0)
		return (0);

	return (secpolicy_ipc_access(cr, p, mode));
}
/*
* There are two versions of the ipcperm_set/stat functions:
* ipcperm_??? - for use with IPC_SET/STAT
* ipcperm_???64 - for use with IPC_SET64/STAT64
*
* These functions encapsulate the common portions (copying, permission
* checks, and auditing) of the set/stat operations. All, except for
* stat and stat64 which are void, return 0 on success or a non-zero
* errno value on error.
*/
/*
 * Common portion of IPC_SET.
 *
 * Reads the new owner uid, owner gid and mode from the caller-supplied
 * ipc_perm (interpreted according to 'model') and applies them to kperm.
 * Only the low nine permission bits of the mode are replaced; all other
 * bits of ipc_mode are preserved.  The ID lock must be held.
 *
 * Returns EPERM if the security policy does not treat the caller as an
 * owner of the object, EINVAL if the new uid or gid is not valid in the
 * caller's zone, and 0 on success.  On success the change is audited
 * when auditing is enabled.
 */
int
ipcperm_set(ipc_service_t *service, struct cred *cr,
    kipc_perm_t *kperm, struct ipc_perm *perm, model_t model)
{
	STRUCT_HANDLE(ipc_perm, src);
	uid_t new_uid;
	gid_t new_gid;
	mode_t new_mode;
	zone_t *czone;

	ASSERT(IPC_LOCKED(service, kperm));

	STRUCT_SET_HANDLE(src, model, perm);
	new_uid = STRUCT_FGET(src, uid);
	new_gid = STRUCT_FGET(src, gid);
	new_mode = STRUCT_FGET(src, mode);

	if (secpolicy_ipc_owner(cr, kperm) != 0)
		return (EPERM);

	czone = crgetzone(cr);
	if (!VALID_UID(new_uid, czone))
		return (EINVAL);
	if (!VALID_GID(new_gid, czone))
		return (EINVAL);

	kperm->ipc_uid = new_uid;
	kperm->ipc_gid = new_gid;
	kperm->ipc_mode = (kperm->ipc_mode & ~0777) | (new_mode & 0777);

	if (AU_AUDITING())
		audit_ipcget(service->ipcs_atype, kperm);

	return (0);
}
/*
 * Common portion of IPC_STAT.
 *
 * Fills in the caller-supplied ipc_perm (laid out according to 'model')
 * from kperm: owner and creator ids, mode and key.  The seq field is
 * always reported as 0.  Cannot fail.
 */
void
ipcperm_stat(struct ipc_perm *perm, kipc_perm_t *kperm, model_t model)
{
	STRUCT_HANDLE(ipc_perm, dst);

	STRUCT_SET_HANDLE(dst, model, perm);

	/* Current owner. */
	STRUCT_FSET(dst, uid, kperm->ipc_uid);
	STRUCT_FSET(dst, gid, kperm->ipc_gid);

	/* Creator. */
	STRUCT_FSET(dst, cuid, kperm->ipc_cuid);
	STRUCT_FSET(dst, cgid, kperm->ipc_cgid);

	/* Remaining attributes. */
	STRUCT_FSET(dst, mode, kperm->ipc_mode);
	STRUCT_FSET(dst, seq, 0);
	STRUCT_FSET(dst, key, kperm->ipc_key);
}
/*
 * Common portion of IPC_SET64.
 *
 * Applies the owner uid, owner gid and low nine mode bits from perm64 to
 * kperm, preserving all other bits of ipc_mode.  The ID lock must be
 * held.
 *
 * Returns EPERM if the security policy does not treat the caller as an
 * owner of the object, EINVAL if the new uid or gid is not valid in the
 * caller's zone, and 0 on success.  On success the change is audited
 * when auditing is enabled.
 */
int
ipcperm_set64(ipc_service_t *service, struct cred *cr,
    kipc_perm_t *kperm, ipc_perm64_t *perm64)
{
	zone_t *czone;

	ASSERT(IPC_LOCKED(service, kperm));

	if (secpolicy_ipc_owner(cr, kperm) != 0)
		return (EPERM);

	czone = crgetzone(cr);
	if (!VALID_UID(perm64->ipcx_uid, czone))
		return (EINVAL);
	if (!VALID_GID(perm64->ipcx_gid, czone))
		return (EINVAL);

	kperm->ipc_uid = perm64->ipcx_uid;
	kperm->ipc_gid = perm64->ipcx_gid;
	kperm->ipc_mode = (kperm->ipc_mode & ~0777) |
	    (perm64->ipcx_mode & 0777);

	if (AU_AUDITING())
		audit_ipcget(service->ipcs_atype, kperm);

	return (0);
}
/*
 * Common portion of IPC_STAT64.
 *
 * Copies the owner and creator ids, mode, key, creator project id and
 * zone id from kperm into perm64.  Cannot fail.
 */
void
ipcperm_stat64(ipc_perm64_t *perm64, kipc_perm_t *kperm)
{
	/* Current owner. */
	perm64->ipcx_uid = kperm->ipc_uid;
	perm64->ipcx_gid = kperm->ipc_gid;

	/* Creator. */
	perm64->ipcx_cuid = kperm->ipc_cuid;
	perm64->ipcx_cgid = kperm->ipc_cgid;

	/* Permissions and rendezvous key. */
	perm64->ipcx_mode = kperm->ipc_mode;
	perm64->ipcx_key = kperm->ipc_key;

	/* Accounting: project and zone the object is charged to. */
	perm64->ipcx_projid = kperm->ipc_proj->kpj_id;
	perm64->ipcx_zoneid = kperm->ipc_zoneid;
}
/*
* ipc key comparator.
*/
static int
ipc_key_compar(const void *a, const void *b)
{
kipc_perm_t *aperm = (kipc_perm_t *)a;
kipc_perm_t *bperm = (kipc_perm_t *)b;
int ak = aperm->ipc_key;
int bk = bperm->ipc_key;
zoneid_t az;
zoneid_t bz;
ASSERT(ak != IPC_PRIVATE);
ASSERT(bk != IPC_PRIVATE);
/*
* Compare key first, then zoneid. This optimizes performance for
* systems with only one zone, since the zone checks will only be
* made when the keys match.
*/
if (ak < bk)
return (-1);
if (ak > bk)
return (1);
/* keys match */
az = aperm->ipc_zoneid;
bz = bperm->ipc_zoneid;
if (az < bz)
return (-1);
if (az > bz)
return (1);
return (0);
}
/*
 * Create an ipc service (namespace).
 *
 *	name		- name given to the service's id space
 *	proj_rctl	- project resource control tested on allocation
 *	zone_rctl	- zone resource control tested on allocation
 *	size		- size of the facility's object structure
 *	dtor		- called when the last reference to an object is
 *			  released
 *	rmid		- stored in the service as its RMID-time callback
 *	audit_type	- type value handed to the audit hooks
 *	rctl_offset	- stored in the service for locating its usage
 *			  counter; must be smaller than sizeof (ipc_rqty_t)
 *
 * The new service starts out empty with an ID table of IPC_IDS_MIN
 * slots.  All allocations sleep, so this always returns a valid service.
 */
ipc_service_t *
ipcs_create(const char *name, rctl_hndl_t proj_rctl, rctl_hndl_t zone_rctl,
    size_t size, ipc_func_t *dtor, ipc_func_t *rmid, int audit_type,
    size_t rctl_offset)
{
	ipc_service_t *svc;

	svc = kmem_alloc(sizeof (ipc_service_t), KM_SLEEP);

	/* Namespace lock and object accounting. */
	mutex_init(&svc->ipcs_lock, NULL, MUTEX_ADAPTIVE, NULL);
	svc->ipcs_count = 0;

	/* Key index and initial, zeroed, ID table. */
	avl_create(&svc->ipcs_keys, ipc_key_compar, size, 0);
	svc->ipcs_tabsz = IPC_IDS_MIN;
	svc->ipcs_table =
	    kmem_zalloc(IPC_IDS_MIN * sizeof (ipc_slot_t), KM_SLEEP);
	svc->ipcs_ssize = size;
	svc->ipcs_ids = id_space_create(name, 0, IPC_IDS_MIN);

	/* Facility callbacks, resource controls and audit type. */
	svc->ipcs_dtor = dtor;
	svc->ipcs_rmid = rmid;
	svc->ipcs_proj_rctl = proj_rctl;
	svc->ipcs_zone_rctl = zone_rctl;
	svc->ipcs_atype = audit_type;
	ASSERT(rctl_offset < sizeof (ipc_rqty_t));
	svc->ipcs_rctlofs = rctl_offset;

	/* List of every object in the namespace, used for ID enumeration. */
	list_create(&svc->ipcs_usedids, sizeof (kipc_perm_t),
	    offsetof(kipc_perm_t, ipc_list));

	return (svc);
}
/*
 * Destroy an ipc service.
 *
 * The service must not contain any objects.  Tears down the key tree,
 * the ID list and the id space, then frees the current ID table along
 * with every discarded table chained behind it, and finally the service
 * structure itself.
 */
void
ipcs_destroy(ipc_service_t *service)
{
	ipc_slot_t *table, *older;

	mutex_enter(&service->ipcs_lock);

	ASSERT(service->ipcs_count == 0);
	avl_destroy(&service->ipcs_keys);
	list_destroy(&service->ipcs_usedids);
	id_space_destroy(service->ipcs_ids);

	/*
	 * Each table records its predecessor in slot 0.  A predecessor is
	 * half the size of its successor, so the size is halved as we walk
	 * back along the chain.
	 */
	table = service->ipcs_table;
	while (table != NULL) {
		older = table[0].ipct_chain;
		kmem_free(table, service->ipcs_tabsz * sizeof (ipc_slot_t));
		service->ipcs_tabsz >>= 1;
		table = older;
	}

	mutex_destroy(&service->ipcs_lock);
	kmem_free(service, sizeof (ipc_service_t));
}
/*
 * Acquires the service (namespace) lock.
 */
void
ipcs_lock(ipc_service_t *service)
{
	kmutex_t *nslock = &service->ipcs_lock;

	mutex_enter(nslock);
}
/*
 * Drops the service (namespace) lock.
 */
void
ipcs_unlock(ipc_service_t *service)
{
	kmutex_t *nslock = &service->ipcs_lock;

	mutex_exit(nslock);
}
/*
 * Locks the specified ID.  Returns the ID's ID table index.
 *
 * The ID table can be replaced by ipc_grow() at any time, and the slot
 * locks live inside the table, so a lock taken here may belong to a
 * table that has since been discarded.  To detect that, the table size
 * is sampled before the table pointer is dereferenced and compared
 * again once the slot lock is held; if it changed, the lock is dropped
 * and the attempt is repeated against the current table.
 */
static int
ipc_lock_internal(ipc_service_t *service, uint_t id)
{
uint_t tabsz;
uint_t index;
kmutex_t *mutex;
for (;;) {
/* Sample the size first; the barrier orders it before the table read. */
tabsz = service->ipcs_tabsz;
membar_consumer();
/* Mask the ID down to a slot index within the sampled table size. */
index = id & (tabsz - 1);
mutex = &service->ipcs_table[index].ipct_lock;
mutex_enter(mutex);
/* Size unchanged: this lock is the ID's current lock, so keep it. */
if (tabsz == service->ipcs_tabsz)
break;
/* The table was resized underneath us; retry. */
mutex_exit(mutex);
}
return (index);
}
/*
 * Locks the specified ID and returns a pointer to the held ID lock.
 * Cannot fail.
 */
kmutex_t *
ipc_lock(ipc_service_t *service, int id)
{
	uint_t slotno;
	ipc_slot_t *slot;

	/*
	 * Nothing below depends on these conditions, but a caller
	 * violating them is certainly confused.
	 */
	ASSERT(id >= 0);
	ASSERT(IPC_INDEX(id) < service->ipcs_tabsz);

	/*
	 * The table pointer may only be read once the slot lock is held,
	 * so the index is computed in a statement of its own.
	 */
	slotno = ipc_lock_internal(service, id);
	slot = service->ipcs_table + slotno;

	return (&slot->ipct_lock);
}
/*
 * Revalidates a held lock against the specified ID.
 *
 * If 'lock' is still the lock the ID maps to, it is returned as is.
 * Otherwise it is dropped and the ID is locked afresh via ipc_lock().
 * This is meant to speed up cv wakeups, where the lock we are left
 * holding could be stale but probably isn't.
 */
kmutex_t *
ipc_relock(ipc_service_t *service, int id, kmutex_t *lock)
{
	kmutex_t *current;

	ASSERT(id >= 0);
	ASSERT(IPC_INDEX(id) < service->ipcs_tabsz);
	ASSERT(MUTEX_HELD(lock));

	current = &service->ipcs_table[IPC_INDEX(id)].ipct_lock;
	if (current != lock) {
		mutex_exit(lock);
		lock = ipc_lock(service, id);
	}

	return (lock);
}
/*
 * Performs an ID lookup.
 *
 * On success the object's perm structure is stored in *perm and the held
 * ID lock is returned; the lookup is audited when auditing is enabled.
 * If the ID doesn't exist, has been removed, or isn't visible to the
 * caller (because of zones), NULL is returned with no lock held and
 * *perm is left untouched.
 */
kmutex_t *
ipc_lookup(ipc_service_t *service, int id, kipc_perm_t **perm)
{
	ipc_slot_t *slot;
	kipc_perm_t *obj;
	uint_t slotno;

	/*
	 * The id is deliberately not range-checked (positive, fits in the
	 * table): an out-of-range id simply won't equal the id stored in
	 * whatever object its slot holds.
	 */
	slotno = ipc_lock_internal(service, id);
	slot = &service->ipcs_table[slotno];
	obj = slot->ipct_data;

	if (obj != NULL && obj->ipc_id == (uint_t)id &&
	    HASZONEACCESS(curproc, obj->ipc_zoneid)) {
		ASSERT(IPC_SEQ(id) == slot->ipct_seq);

		*perm = obj;
		if (AU_AUDITING())
			audit_ipc(service->ipcs_atype, id, obj);

		return (&slot->ipct_lock);
	}

	mutex_exit(&slot->ipct_lock);
	return (NULL);
}
/*
 * Takes an additional reference on an object.  The ID lock must be held.
 */
/*ARGSUSED*/
void
ipc_hold(ipc_service_t *s, kipc_perm_t *perm)
{
	ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
	ASSERT(IPC_LOCKED(s, perm));

	perm->ipc_ref += 1;
}
/*
 * Releases a reference on an object and drops the ID lock.
 *
 * If that was the last reference, the object is destroyed: the
 * facility's destructor is called, the project and zone references are
 * released, and the object's memory is freed.
 */
void
ipc_rele(ipc_service_t *s, kipc_perm_t *perm)
{
	int remaining;

	ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
	ASSERT(IPC_LOCKED(s, perm));
	ASSERT(perm->ipc_ref > 0);

	remaining = --perm->ipc_ref;
	mutex_exit(&s->ipcs_table[IPC_INDEX(perm->ipc_id)].ipct_lock);

	if (remaining != 0)
		return;

	/* Last reference gone; ipc_rmid has already cleared IPC_ALLOC. */
	ASSERT(IPC_FREE(perm));
	s->ipcs_dtor(perm);
	project_rele(perm->ipc_proj);
	zone_rele_ref(&perm->ipc_zone_ref, ZONE_REF_IPC);
	kmem_free(perm, s->ipcs_ssize);
}
/*
 * Releases a reference on an object while keeping the ID lock held.
 *
 * Intended for a thread that has to drop many references on behalf of
 * other parties.  It may never release the last reference, so the
 * caller must know that more than one reference exists.
 */
void
ipc_rele_locked(ipc_service_t *s, kipc_perm_t *perm)
{
	ASSERT(perm->ipc_ref > 1);
	ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
	ASSERT(IPC_LOCKED(s, perm));

	perm->ipc_ref -= 1;
}
/*
 * Internal function to grow the service ID table.
 *
 * Doubles the size of the ID table, moving every slot's sequence number
 * and object pointer into the new table.  Must be called with the
 * namespace lock held and without p_lock held.
 *
 * Returns 0 on success, or ENOSPC if the table already has IPC_IDS_MAX
 * slots or if memory for the new table cannot be allocated without
 * sleeping.
 */
static int
ipc_grow(ipc_service_t *service)
{
ipc_slot_t *new, *old;
int i, oldsize, newsize;
ASSERT(MUTEX_HELD(&service->ipcs_lock));
ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
if (service->ipcs_tabsz == IPC_IDS_MAX)
return (ENOSPC);
oldsize = service->ipcs_tabsz;
newsize = oldsize << 1;
/* Non-sleeping allocation; failure is reported as ENOSPC. */
new = kmem_zalloc(newsize * sizeof (ipc_slot_t), KM_NOSLEEP);
if (new == NULL)
return (ENOSPC);
old = service->ipcs_table;
/*
 * Lock every old slot and the matching slot in the first half of
 * the new table, in index order, and move the slot contents across.
 * All of these locks remain held until the switch-over is complete.
 */
for (i = 0; i < oldsize; i++) {
mutex_enter(&old[i].ipct_lock);
mutex_enter(&new[i].ipct_lock);
new[i].ipct_seq = old[i].ipct_seq;
new[i].ipct_data = old[i].ipct_data;
old[i].ipct_data = NULL;
}
/* Chain the discarded table off the new one; ipcs_destroy() frees it. */
new[0].ipct_chain = old;
/*
 * Publish the new table pointer before the new size.
 * ipc_lock_internal() reads them in the opposite order, so a thread
 * that observes the new size also observes the new table.
 */
service->ipcs_table = new;
membar_producer();
service->ipcs_tabsz = newsize;
for (i = 0; i < oldsize; i++) {
mutex_exit(&old[i].ipct_lock);
mutex_exit(&new[i].ipct_lock);
}
/* Presumably makes indices oldsize..newsize-1 allocatable -- confirm. */
id_space_extend(service->ipcs_ids, oldsize, service->ipcs_tabsz);
return (0);
}
/*
 * Looks up a key in the caller's zone.  The namespace lock must be held.
 *
 * Returns:
 *	0	with *permp set to the existing object if the key was found
 *		and is acceptable, or with *permp set to NULL if the key
 *		was not found and IPC_CREAT was requested;
 *	EEXIST	if the key was found but both IPC_CREAT and IPC_EXCL were
 *		requested;
 *	EACCES	if the key was found but 'flag' requests permission bits
 *		the object's mode doesn't have (audited when auditing is
 *		enabled);
 *	ENOENT	if the key was not found and IPC_CREAT was not requested.
 *
 * *permp is left untouched whenever an error is returned.
 */
static int
ipc_keylookup(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp)
{
	kipc_perm_t *found;
	kipc_perm_t probe;
	avl_index_t where;

	ASSERT(MUTEX_HELD(&service->ipcs_lock));

	/* Only the fields read by ipc_key_compar need to be filled in. */
	probe.ipc_key = key;
	probe.ipc_zoneid = getzoneid();

	found = avl_find(&service->ipcs_keys, &probe, &where);
	if (found == NULL) {
		if (!(flag & IPC_CREAT))
			return (ENOENT);
		*permp = NULL;
		return (0);
	}

	ASSERT(!IPC_FREE(found));

	if ((flag & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
		return (EEXIST);

	if ((flag & 0777) & ~found->ipc_mode) {
		if (AU_AUDITING())
			audit_ipcget(NULL, (void *)found);
		return (EACCES);
	}

	*permp = found;
	return (0);
}
/*
 * Checks whether one more object may be allocated in the namespace on
 * behalf of process pp.  The namespace lock must be held.
 *
 * Returns 0 with pp->p_lock held if the allocation is permitted and a
 * free slot is available in the ID table.  Returns ENOSPC (with p_lock
 * dropped) if either the project or the zone resource control denies
 * the allocation, or the error from ipc_grow() if the table is full and
 * cannot be grown.
 */
static int
ipc_alloc_test(ipc_service_t *service, proc_t *pp)
{
	int error;

	ASSERT(MUTEX_HELD(&service->ipcs_lock));

	/*
	 * Growing the table before consulting the rctls would make for a
	 * simpler code path, but it would let a user (permanently) double
	 * the id table size even when the allocation ends up being denied.
	 * So the rctls are tested first, and tested again after each
	 * successful grow since p_lock has to be dropped to grow.
	 */
	for (;;) {
		mutex_enter(&pp->p_lock);

		if ((rctl_test(service->ipcs_proj_rctl,
		    pp->p_task->tk_proj->kpj_rctls, pp, 1, RCA_SAFE) &
		    RCT_DENY) ||
		    (rctl_test(service->ipcs_zone_rctl,
		    pp->p_zone->zone_rctls, pp, 1, RCA_SAFE) & RCT_DENY)) {
			mutex_exit(&pp->p_lock);
			return (ENOSPC);
		}

		/* Room in the table: done, returning with p_lock held. */
		if (service->ipcs_count != service->ipcs_tabsz)
			return (0);

		mutex_exit(&pp->p_lock);
		error = ipc_grow(service);
		if (error != 0)
			return (error);
	}
}
/*
 * Given a key, search for or create the associated identifier.
 *
 * Existing object: if the key is found and no error condition arises,
 * 0 is returned, *permp points at the existing object and *lockp at its
 * held ID lock.  The object's ipc_mode has IPC_ALLOC set.
 *
 * New object: if the key is IPC_PRIVATE, or IPC_CREAT is specified and
 * the key isn't found, 0 is returned, *permp points at a freshly
 * allocated, zero-filled object structure and *lockp at the held service
 * lock.  p_lock is held as well (it is taken by ipc_alloc_test).  The
 * object's ipc_mode has IPC_ALLOC clear and its id is IPC_ID_INVAL; it
 * has not been entered into the namespace.
 *
 * Otherwise a non-zero errno value is returned and no locks are held.
 */
int
ipc_get(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp,
    kmutex_t **lockp)
{
	kipc_perm_t *found = NULL;
	kipc_perm_t *fresh;
	proc_t *pp = curproc;
	cred_t *cr = CRED();
	uid_t uid;
	gid_t gid;
	int error, index;

	if (key != IPC_PRIVATE) {
		/*
		 * The ID lock has to be taken before the service lock is
		 * dropped so the object can't be removed in between.
		 */
		mutex_enter(&service->ipcs_lock);
		error = ipc_keylookup(service, key, flag, &found);
		if (found != NULL)
			index = ipc_lock_internal(service, found->ipc_id);
		mutex_exit(&service->ipcs_lock);

		if (error != 0) {
			ASSERT(found == NULL);
			return (error);
		}

		if (found != NULL) {
			ASSERT(!IPC_FREE(found));
			*permp = found;
			*lockp = &service->ipcs_table[index].ipct_lock;
			return (0);
		}

		/* Key not found, but creation was requested. */
	}

	fresh = kmem_zalloc(service->ipcs_ssize, KM_SLEEP);

	mutex_enter(&service->ipcs_lock);
	error = ipc_alloc_test(service, pp);
	if (error != 0) {
		mutex_exit(&service->ipcs_lock);
		kmem_free(fresh, service->ipcs_ssize);
		return (error);
	}

	/* The caller becomes both creator and initial owner. */
	uid = crgetuid(cr);
	fresh->ipc_uid = uid;
	fresh->ipc_cuid = uid;
	gid = crgetgid(cr);
	fresh->ipc_gid = gid;
	fresh->ipc_cgid = gid;

	fresh->ipc_zoneid = getzoneid();
	fresh->ipc_mode = flag & 0777;
	fresh->ipc_key = key;
	fresh->ipc_ref = 1;
	fresh->ipc_id = IPC_ID_INVAL;

	*permp = fresh;
	*lockp = &service->ipcs_lock;

	return (0);
}
/*
 * Attempts to add a newly created ID to the global namespace.
 *
 * Returns 0 with the service lock and p_lock held if the object can be
 * committed.  Returns EAGAIN if somebody else created the key in the
 * meantime and the caller may be able to obtain that object by retrying
 * its GET.  Any other non-zero value is an error that creating the
 * object would cause.
 *
 * Because this is only supposed to be called once all initialization is
 * complete, every failure (EAGAIN included) automatically invokes the
 * object's destructor and frees the memory associated with it; no locks
 * are held on return in that case.
 */
int
ipc_commit_begin(ipc_service_t *service, key_t key, int flag,
    kipc_perm_t *newperm)
{
	kipc_perm_t *existing;
	proc_t *pp = curproc;
	int error = 0;

	ASSERT(newperm->ipc_ref == 1);
	ASSERT(IPC_FREE(newperm));

	/*
	 * Record the project and take the zone reference up front: later
	 * calls to ipc_cleanup() rely on them to clean up the necessary
	 * state, and they must be in place before the destructor can be
	 * called on the failure path below.
	 */
	newperm->ipc_proj = pp->p_task->tk_proj;
	zone_init_ref(&newperm->ipc_zone_ref);
	zone_hold_ref(pp->p_zone, &newperm->ipc_zone_ref, ZONE_REF_IPC);

	mutex_enter(&service->ipcs_lock);

	/*
	 * Has somebody raced with us and created the key?
	 */
	if (key != IPC_PRIVATE) {
		error = ipc_keylookup(service, key, flag, &existing);
		if (error == 0 && existing != NULL)
			error = EAGAIN;
	}

	/*
	 * Has somebody raced with us and used up the last permissible id
	 * or the last free slot in the id table?
	 */
	if (error == 0)
		error = ipc_alloc_test(service, pp);

	if (error == 0) {
		ASSERT(MUTEX_HELD(&service->ipcs_lock));
		ASSERT(MUTEX_HELD(&pp->p_lock));
		return (0);
	}

	/* Failure: tear the object down on the caller's behalf. */
	mutex_exit(&service->ipcs_lock);
	service->ipcs_dtor(newperm);
	zone_rele_ref(&newperm->ipc_zone_ref, ZONE_REF_IPC);
	kmem_free(newperm, service->ipcs_ssize);
	return (error);
}
/*
 * Commits the ID allocation transaction.
 *
 * Must be called with p_lock and the service lock held; both are
 * dropped before returning.  Cannot fail.  Returns the held ID lock of
 * the new object so the caller can extract the ID and perform ipcget
 * auditing.
 */
kmutex_t *
ipc_commit_end(ipc_service_t *service, kipc_perm_t *perm)
{
	ipc_slot_t *slot;
	avl_index_t where;
	void *dup;
	int slotno;

	ASSERT(MUTEX_HELD(&service->ipcs_lock));
	ASSERT(MUTEX_HELD(&curproc->p_lock));

	/* Take the project hold that ipc_rele() eventually releases. */
	(void) project_hold(perm->ipc_proj);
	mutex_exit(&curproc->p_lock);

	/*
	 * Claim a free slot in the ID table and lock it.
	 */
	service->ipcs_count++;
	slotno = id_alloc(service->ipcs_ids);
	ASSERT(slotno < service->ipcs_tabsz);
	slot = service->ipcs_table + slotno;
	mutex_enter(&slot->ipct_lock);
	ASSERT(slot->ipct_data == NULL);

	/*
	 * Mark the object allocated and give it its ID, which combines
	 * the slot's sequence number with the slot index.
	 */
	perm->ipc_mode |= IPC_ALLOC;
	perm->ipc_id = (slot->ipct_seq << IPC_SEQ_SHIFT) | slotno;

	/*
	 * Make the object globally visible: by ID, by key (unless the key
	 * is private), and in the list of all IDs.
	 */
	slot->ipct_data = perm;
	if (perm->ipc_key != IPC_PRIVATE) {
		dup = avl_find(&service->ipcs_keys, perm, &where);
		ASSERT(dup == NULL);
		avl_insert(&service->ipcs_keys, perm, where);
	}
	list_insert_head(&service->ipcs_usedids, perm);

	/*
	 * Charge the object to its project and zone.
	 */
	IPC_PROJ_USAGE(perm, service) += 1;
	IPC_ZONE_USAGE(perm, service) += 1;

	mutex_exit(&service->ipcs_lock);
	return (&slot->ipct_lock);
}
/*
 * Undo a partially completed allocation.
 *
 * If ipc_commit_begin has not yet run for this object, perm->ipc_proj
 * is still 0 and only the perm structure needs to be freed.  Once
 * ipc_commit_begin has run, p_lock and the service lock are also dropped
 * and the service's destructor is called on the object.  In either case
 * a zone reference, if one is recorded in the object, is released before
 * the memory is freed.
 */
void
ipc_cleanup(ipc_service_t *service, kipc_perm_t *perm)
{
	int began_commit;

	ASSERT(IPC_FREE(perm));

	/* ipc_commit_begin is what sets ipc_proj for a new object. */
	began_commit = (perm->ipc_proj != 0);
	if (began_commit) {
		mutex_exit(&curproc->p_lock);
		mutex_exit(&service->ipcs_lock);
		service->ipcs_dtor(perm);
	}

	/* Only release the zone reference if one was actually taken. */
	if (perm->ipc_zone_ref.zref_zone != NULL) {
		zone_rele_ref(&perm->ipc_zone_ref, ZONE_REF_IPC);
	}

	kmem_free(perm, service->ipcs_ssize);
}
/*
 * Common code to detach an IPC object from its service.  Callers must
 * have completed all permission checks and must hold both the service
 * lock and the ID's lock.
 *
 * The object is cleared from its table slot, removed from the key tree
 * (if it has a key), and removed from the ipcs_usedids list; callers
 * do not need to take it off that list themselves.  The table index is
 * returned to the ID space, the slot's sequence number is advanced, and
 * the project and zone usage counts are decremented.
 */
static void
ipc_remove(ipc_service_t *service, kipc_perm_t *perm)
{
	int ipcid = perm->ipc_id;
	ipc_slot_t *slot;
	int idx;

	ASSERT(MUTEX_HELD(&service->ipcs_lock));
	ASSERT(IPC_LOCKED(service, perm));

	idx = IPC_INDEX(ipcid);
	slot = &service->ipcs_table[idx];

	/* Withdraw the object from every place it can be looked up. */
	slot->ipct_data = NULL;
	if (perm->ipc_key != IPC_PRIVATE) {
		avl_remove(&service->ipcs_keys, perm);
	}
	list_remove(&service->ipcs_usedids, perm);
	perm->ipc_mode &= ~IPC_ALLOC;

	/* Give the table index back to the ID space. */
	id_free(service->ipcs_ids, idx);

	/*
	 * Advance the slot's sequence number, wrapping to zero once it
	 * has reached IPC_SEQ_MASK.
	 */
	if (slot->ipct_seq == IPC_SEQ_MASK) {
		slot->ipct_seq = 0;
	} else {
		slot->ipct_seq++;
	}

	service->ipcs_count--;

	/* Un-charge the object from its project and zone. */
	ASSERT(IPC_PROJ_USAGE(perm, service) > 0);
	ASSERT(IPC_ZONE_USAGE(perm, service) > 0);
	IPC_PROJ_USAGE(perm, service) -= 1;
	IPC_ZONE_USAGE(perm, service) -= 1;

	/*
	 * If the service now has no objects at all, this object's project
	 * and zone usage must both have dropped to zero.
	 */
	ASSERT(service->ipcs_count || ((IPC_PROJ_USAGE(perm, service) == 0) &&
	    (IPC_ZONE_USAGE(perm, service) == 0)));
}
/*
 * Common implementation of IPC_RMID.
 *
 * Returns 0 on success, EINVAL if the ID cannot be looked up, or EPERM
 * if secpolicy_ipc_owner() rejects the caller's credentials.
 */
int
ipc_rmid(ipc_service_t *service, int id, cred_t *cr)
{
	kipc_perm_t *perm;
	kmutex_t *idlock;
	int err = 0;

	mutex_enter(&service->ipcs_lock);

	idlock = ipc_lookup(service, id, &perm);
	if (idlock == NULL) {
		err = EINVAL;
	} else {
		ASSERT(service->ipcs_count > 0);
		if (secpolicy_ipc_owner(cr, perm) != 0) {
			/* Not permitted: let go of the ID lock we were given */
			mutex_exit(idlock);
			err = EPERM;
		}
	}

	if (err != 0) {
		mutex_exit(&service->ipcs_lock);
		return (err);
	}

	/*
	 * Past this point the removal cannot fail.  Detach the object
	 * while the service lock is held, then drop that lock before
	 * running the per-service removal hook.
	 */
	ipc_remove(service, perm);
	mutex_exit(&service->ipcs_lock);

	/* perform any per-service removal actions */
	service->ipcs_rmid(perm);

	/*
	 * Release the reference.
	 * NOTE(review): the ID lock is never released explicitly on this
	 * path, so ipc_rele() presumably drops it -- confirm.
	 */
	ipc_rele(service, perm);

	return (0);
}
/*
 * Common implementation of shmids, semids, and msgids.
 *
 *	buf	user address of the buffer to receive the ids (may be NULL,
 *		in which case no ids are copied out)
 *	nids	capacity of buf, in ids
 *	pnids	user address where the number of ids that were (or would
 *		have been) copied out is stored
 *
 * Returns 0 on success or EFAULT if either user address cannot be
 * written.
 */
int
ipc_ids(ipc_service_t *service, int *buf, uint_t nids, uint_t *pnids)
{
	kipc_perm_t *perm;
	zoneid_t myzone = getzoneid();
	int inglobal = INGLOBALZONE(curproc);
	size_t stagesz = 0;
	int *stage;
	int guess;
	int found = 0;
	int error = 0;

	if (buf == NULL)
		nids = 0;

	/*
	 * Size a staging buffer from ipcs_count.  The first read is done
	 * without ipcs_lock; the count is then re-checked with the lock
	 * held, and if it has grown past our guess we drop the lock, free
	 * the buffer, and try again with the larger value.  The loop is
	 * left with ipcs_lock held and a buffer of at least ipcs_count
	 * entries.
	 *
	 * Two cases skip the walk entirely and only report a count: there
	 * are no ids at all, or the caller is in the global zone and the
	 * count already exceeds the caller's buffer.
	 */
	guess = service->ipcs_count;
	for (;;) {
		if (guess == 0 || (inglobal && guess > nids)) {
			found = guess;
			nids = 0;
			goto out;
		}

		stagesz = guess * sizeof (int);
		stage = kmem_alloc(stagesz, KM_SLEEP);

		mutex_enter(&service->ipcs_lock);
		if (guess >= service->ipcs_count)
			break;

		guess = service->ipcs_count;
		mutex_exit(&service->ipcs_lock);

		if (stagesz != 0) {
			kmem_free(stage, stagesz);
			stagesz = 0;
		}
	}

	/*
	 * Collect every id visible to the caller: all of them from the
	 * global zone, otherwise only those belonging to the caller's zone.
	 */
	perm = list_head(&service->ipcs_usedids);
	while (perm != NULL) {
		ASSERT(!IPC_FREE(perm));
		if (inglobal || perm->ipc_zoneid == myzone)
			stage[found++] = perm->ipc_id;
		perm = list_next(&service->ipcs_usedids, perm);
	}
	mutex_exit(&service->ipcs_lock);

	/*
	 * A caller buffer that cannot hold every id gets none of them;
	 * only the count is reported.
	 */
	if (nids < found)
		nids = 0;

out:
	/* Always report the count; copy the ids only if they all fit. */
	if (suword32(pnids, (uint32_t)found) != 0)
		error = EFAULT;
	else if (nids != 0 && copyout(stage, buf, found * sizeof (int)) != 0)
		error = EFAULT;

	/* stagesz is non-zero only while a staging buffer is allocated. */
	if (stagesz != 0)
		kmem_free(stage, stagesz);
	return (error);
}
/*
 * Destroy every IPC object in the given service that belongs to the
 * given zone.
 *
 * Objects cannot be freed while the service lock is held, so the work
 * is done in two passes: with the lock held, matching objects are
 * detached from the service and moved onto a private list; after the
 * lock is dropped, that list is drained and each object is released.
 */
void
ipc_remove_zone(ipc_service_t *service, zoneid_t zoneid)
{
	kipc_perm_t *cur, *nxt;
	kmutex_t *idlock;
	list_t doomed;

	list_create(&doomed, sizeof (kipc_perm_t),
	    offsetof(kipc_perm_t, ipc_list));

	/*
	 * Pass 1: detach the zone's objects.  The successor is fetched
	 * before the current entry can be unlinked from ipcs_usedids.
	 */
	mutex_enter(&service->ipcs_lock);
	cur = list_head(&service->ipcs_usedids);
	while (cur != NULL) {
		nxt = list_next(&service->ipcs_usedids, cur);

		if (cur->ipc_zoneid == zoneid) {
			/*
			 * Detach the object from the service and park it
			 * on the private list.  The ipc_rele call (which
			 * actually frees the structure) is deferred
			 * because the destructor may grab the service
			 * lock.
			 */
			ASSERT(!IPC_FREE(cur));
			idlock = ipc_lock(service, cur->ipc_id);
			ipc_remove(service, cur);
			mutex_exit(idlock);
			list_insert_tail(&doomed, cur);
		}

		cur = nxt;
	}
	mutex_exit(&service->ipcs_lock);

	/*
	 * Pass 2: with the service lock dropped, drain the private list
	 * from the head, finishing off each removed object.
	 */
	while ((cur = list_head(&doomed)) != NULL) {
		list_remove(&doomed, cur);

		/* re-take the ID lock before the removal hook and release */
		(void) ipc_lock(service, cur->ipc_id);

		/* perform any per-service removal actions */
		service->ipcs_rmid(cur);

		/* release reference */
		ipc_rele(service, cur);
	}

	list_destroy(&doomed);
}