lx_futex.c revision a913396d8daab34d2fa497f49ae18d9f3d3a059f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <sys/mman.h>
#include <sys/timer.h>
#include <sys/condvar.h>
#include <sys/inttypes.h>
#include <sys/lx_futex.h>
/*
* Futexes are a Linux-specific implementation of inter-process mutexes.
* They are designed to use shared memory for simple, uncontested
* operations, and rely on the kernel to resolve any contention issues.
*
* Most of the information in this section comes from the paper "Futexes
* Are Tricky", by Ulrich Drepper. This paper is currently available at:
* http://people.redhat.com/~drepper/futex.pdf.
*
* A futex itself a 4-byte integer, which must be 4-byte aligned. The
* value of this integer is expected to be modified using user-level atomic
* operations. The futex(4) design itself does not impose any semantic
* constraints on the value stored in the futex; it is up to the
* application to define its own protocol.
*
* When the application decides that kernel intervention is required, it
* will use the futex(2) system call. There are 5 different operations
* that can be performed on a futex, using this system call. Since this
* interface has evolved over time, there are several different prototypes
* available to the user. Fortunately, there is only a single kernel-level
* interface:
*
* long sys_futex(void *futex1, int cmd, int val1,
* struct timespec *timeout, void *futex2, int val2)
*
* The kernel-level operations that may be performed on a futex are:
*
* FUTEX_WAIT
*
* Atomically verify that futex1 contains the value val1. If it
* doesn't, return EWOULDBLOCK. If it does contain the expected
* value, the thread will sleep until somebody performs a FUTEX_WAKE
* on the futex. The caller may also specify a timeout, indicating
* the maximum time the thread should sleep. If the timer expires,
* the call returns ETIMEDOUT. If the thread is awoken with a signal,
* the call returns EINTR. Otherwise, the call returns 0.
*
* FUTEX_WAKE
*
* Wake up val1 processes that are waiting on futex1. The call
* returns the number of blocked threads that were woken up.
*
* FUTEX_CMP_REQUEUE
*
* If the value stored in futex1 matches that passed in in val2, wake
* up val1 processes that are waiting on futex1. Otherwise, return
* EAGAIN.
*
* If there are more than val1 threads waiting on the futex, remove
* the remaining threads from this futex, and requeue them on futex2.
* The caller can limit the number of threads being requeued by
* encoding an integral numerical value in the position usually used
* for the timeout pointer.
*
* The call returns the number of blocked threads that were woken up
* or requeued.
*
* FUTEX_REQUEUE
*
* Identical to FUTEX_CMP_REQUEUE except that it does not use val2.
* This command has been declared broken and obsolete, but we still
* need to support it.
*
* FUTEX_FD
*
* Return a file descriptor, which can be used to refer to the futex.
* We don't support this operation.
*/
/*
* This structure is used to track all the threads currently waiting on a
* futex. There is one fwaiter_t for each blocked thread. We store all
* fwaiter_t's in a hash structure, indexed by the memid_t of the integer
* containing the futex's value.
*
* At the moment, all fwaiter_t's for a single futex are simply dumped into
* the hash bucket. If futex contention ever becomes a hot path, we can
* chain a single futex's waiters together.
*/
typedef struct fwaiter {
memid_t fw_memid; /* memid of the user-space futex */
kcondvar_t fw_cv; /* cond var */
struct fwaiter *fw_next; /* hash queue */
struct fwaiter *fw_prev; /* hash queue */
volatile int fw_woken;
} fwaiter_t;
#define MEMID_COPY(s, d) \
{ (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; }
#define MEMID_EQUAL(s, d) \
((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1])
/* Borrowed from the page freelist hash code. */
#define HASH_SHIFT_SZ 7
#define HASH_SIZE (1 << HASH_SHIFT_SZ)
#define HASH_FUNC(id) \
((((uintptr_t)((id)->val[1]) >> PAGESHIFT) + \
((uintptr_t)((id)->val[1]) >> (PAGESHIFT + HASH_SHIFT_SZ)) + \
((uintptr_t)((id)->val[0]) >> 3) + \
((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \
((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \
(HASH_SIZE - 1))
static fwaiter_t *futex_hash[HASH_SIZE];
static kmutex_t futex_hash_lock[HASH_SIZE];
static void
futex_hashin(fwaiter_t *fwp)
{
int index;
index = HASH_FUNC(&fwp->fw_memid);
ASSERT(MUTEX_HELD(&futex_hash_lock[index]));
fwp->fw_prev = NULL;
fwp->fw_next = futex_hash[index];
if (fwp->fw_next)
fwp->fw_next->fw_prev = fwp;
futex_hash[index] = fwp;
}
static void
futex_hashout(fwaiter_t *fwp)
{
int index;
index = HASH_FUNC(&fwp->fw_memid);
ASSERT(MUTEX_HELD(&futex_hash_lock[index]));
if (fwp->fw_prev)
fwp->fw_prev->fw_next = fwp->fw_next;
if (fwp->fw_next)
fwp->fw_next->fw_prev = fwp->fw_prev;
if (futex_hash[index] == fwp)
futex_hash[index] = fwp->fw_next;
fwp->fw_prev = NULL;
fwp->fw_next = NULL;
}
/*
* Go to sleep until somebody does a WAKE operation on this futex, we get a
* signal, or the timeout expires.
*/
static int
futex_wait(memid_t *memid, caddr_t addr, int val, timespec_t *timeout)
{
int err, ret;
int32_t curval;
fwaiter_t fw;
int index;
fw.fw_woken = 0;
MEMID_COPY(memid, &fw.fw_memid);
cv_init(&fw.fw_cv, NULL, CV_DEFAULT, NULL);
index = HASH_FUNC(&fw.fw_memid);
mutex_enter(&futex_hash_lock[index]);
if (fuword32(addr, (uint32_t *)&curval)) {
err = set_errno(EFAULT);
goto out;
}
if (curval != val) {
err = set_errno(EWOULDBLOCK);
goto out;
}
futex_hashin(&fw);
err = 0;
while ((fw.fw_woken == 0) && (err == 0)) {
ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash_lock[index],
timeout);
if (ret < 0)
err = set_errno(ETIMEDOUT);
else if (ret == 0)
err = set_errno(EINTR);
}
/*
* The futex is normally hashed out in wakeup. If we timed out or
* got a signal, we need to hash it out here instead.
*/
if (fw.fw_woken == 0)
futex_hashout(&fw);
out:
mutex_exit(&futex_hash_lock[index]);
return (err);
}
/*
* Wake up to wake_threads threads that are blocked on the futex at memid.
*/
static int
futex_wake(memid_t *memid, int wake_threads)
{
fwaiter_t *fwp, *next;
int index;
int ret = 0;
index = HASH_FUNC(memid);
mutex_enter(&futex_hash_lock[index]);
for (fwp = futex_hash[index]; fwp && ret < wake_threads; fwp = next) {
next = fwp->fw_next;
if (MEMID_EQUAL(&fwp->fw_memid, memid)) {
futex_hashout(fwp);
fwp->fw_woken = 1;
cv_signal(&fwp->fw_cv);
ret++;
}
}
mutex_exit(&futex_hash_lock[index]);
return (ret);
}
/*
* Wake up to wake_threads waiting on the futex at memid. If there are
* more than that many threads waiting, requeue the remaining threads on
* the futex at requeue_memid.
*/
static int
futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads,
ulong_t requeue_threads, caddr_t addr, int *cmpval)
{
fwaiter_t *fwp, *next;
int index1, index2;
int ret = 0;
int32_t curval;
kmutex_t *l1, *l2;
/*
* To ensure that we don't miss a wakeup if the value of cmpval
* changes, we need to grab locks on both the original and new hash
* buckets. To avoid deadlock, we always grab the lower-indexed
* lock first.
*/
index1 = HASH_FUNC(memid);
index2 = HASH_FUNC(requeue_memid);
if (index1 == index2) {
l1 = &futex_hash_lock[index1];
l2 = NULL;
} else if (index1 < index2) {
l1 = &futex_hash_lock[index1];
l2 = &futex_hash_lock[index2];
} else {
l1 = &futex_hash_lock[index2];
l2 = &futex_hash_lock[index1];
}
mutex_enter(l1);
if (l2 != NULL)
mutex_enter(l2);
if (cmpval != NULL) {
if (fuword32(addr, (uint32_t *)&curval)) {
ret = -EFAULT;
goto out;
}
if (curval != *cmpval) {
ret = -EAGAIN;
goto out;
}
}
for (fwp = futex_hash[index1]; fwp; fwp = next) {
next = fwp->fw_next;
if (!MEMID_EQUAL(&fwp->fw_memid, memid))
continue;
futex_hashout(fwp);
if (ret++ < wake_threads) {
fwp->fw_woken = 1;
cv_signal(&fwp->fw_cv);
} else {
MEMID_COPY(requeue_memid, &fwp->fw_memid);
futex_hashin(fwp);
if ((ret - wake_threads) >= requeue_threads)
break;
}
}
out:
if (l2 != NULL)
mutex_exit(l2);
mutex_exit(l1);
if (ret < 0)
return (set_errno(-ret));
return (ret);
}
/*
* Copy in the relative timeout provided by the application and convert it
* to an absolute timeout.
*/
static int
get_timeout(void *lx_timeout, timestruc_t *timeout)
{
timestruc_t now;
if (get_udatamodel() == DATAMODEL_NATIVE) {
if (copyin(lx_timeout, timeout, sizeof (timestruc_t)))
return (EFAULT);
}
#ifdef _SYSCALL32_IMPL
else {
timestruc32_t timeout32;
if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t)))
return (EFAULT);
timeout->tv_sec = (time_t)timeout32.tv_sec;
timeout->tv_nsec = timeout32.tv_nsec;
}
#endif
gethrestime(&now);
if (itimerspecfix(timeout))
return (EINVAL);
timespecadd(timeout, &now);
return (0);
}
long
lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout,
uintptr_t addr2, int val2)
{
struct as *as = curproc->p_as;
memid_t memid, requeue_memid;
timestruc_t timeout;
timestruc_t *tptr = NULL;
int requeue_threads;
int *requeue_cmp = NULL;
int rval = 0;
/* must be aligned on int boundary */
if (addr & 0x3)
return (set_errno(EINVAL));
/* Sanity check the futex command */
if (cmd < 0 || cmd > FUTEX_MAX_CMD)
return (set_errno(EINVAL));
/* Copy in the timeout structure from userspace. */
if (cmd == FUTEX_WAIT && lx_timeout != NULL) {
rval = get_timeout((timespec_t *)lx_timeout, &timeout);
if (rval != 0)
return (set_errno(rval));
tptr = &timeout;
}
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) {
if (cmd == FUTEX_CMP_REQUEUE)
requeue_cmp = &val2;
/*
* lx_timeout is nominally a pointer to a userspace
* address. For these two commands, it actually contains
* an integer which indicates the maximum number of threads
* to requeue. This is horrible, and I'm sorry.
*/
requeue_threads = (int)lx_timeout;
}
/*
* Translate the process-specific, user-space futex virtual
* address(es) to universal memid.
*/
rval = as_getmemid(as, (void *)addr, &memid);
if (rval != 0)
return (set_errno(rval));
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) {
rval = as_getmemid(as, (void *)addr2, &requeue_memid);
if (rval)
return (set_errno(rval));
}
switch (cmd) {
case FUTEX_WAIT:
rval = futex_wait(&memid, (void *)addr, val, tptr);
break;
case FUTEX_WAKE:
rval = futex_wake(&memid, val);
break;
case FUTEX_CMP_REQUEUE:
case FUTEX_REQUEUE:
rval = futex_requeue(&memid, &requeue_memid, val,
requeue_threads, (void *)addr2, requeue_cmp);
break;
}
return (rval);
}
void
lx_futex_init(void)
{
int i;
for (i = 0; i < HASH_SIZE; i++)
mutex_init(&futex_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
bzero(futex_hash, sizeof (futex_hash));
}
int
lx_futex_fini(void)
{
int i, err;
err = 0;
for (i = 0; (err == 0) && (i < HASH_SIZE); i++) {
mutex_enter(&futex_hash_lock[i]);
if (futex_hash[i] != NULL)
err = EBUSY;
mutex_exit(&futex_hash_lock[i]);
}
return (err);
}