/* devpoll.c revision bf75909a55b2efbe96ab2fe820866fc77fab58c9 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2016 Joyent, Inc.
*/
#include <sys/poll_impl.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/schedctl.h>
#define RESERVED 1
/* local data struct */
int devpoll_init; /* is /dev/poll initialized already */
/* device local functions */
int *rvalp);
static dev_info_t *dpdevi;
dpopen, /* open */
dpclose, /* close */
nodev, /* strategy */
nodev, /* print */
nodev, /* dump */
nodev, /* read */
dpwrite, /* write */
dpioctl, /* ioctl */
nodev, /* devmap */
nodev, /* mmap */
nodev, /* segmap */
dppoll, /* poll */
ddi_prop_op, /* prop_op */
(struct streamtab *)0, /* streamtab */
D_MP, /* flags */
CB_REV, /* cb_ops revision */
nodev, /* aread */
nodev /* awrite */
};
DEVO_REV, /* devo_rev */
0, /* refcnt */
dpinfo, /* info */
nulldev, /* identify */
nulldev, /* probe */
dpattach, /* attach */
dpdetach, /* detach */
nodev, /* reset */
&dp_cb_ops, /* driver operations */
nulldev, /* power */
ddi_quiesce_not_needed, /* quiesce */
};
&mod_driverops, /* type of module - a driver */
&dp_ops,
};
static struct modlinkage modlinkage = {
(void *)&modldrv,
};
static void pcachelink_mark_stale(pollcache_t *);
static void pcachelink_purge_stale(pollcache_t *);
static void pcachelink_purge_all(pollcache_t *);
/*
* Locking Design
*
* structure is per lwp. An implicit assumption is made there that some
* portion of pollcache will never be touched by other lwps. E.g., in
* poll(2) design, no lwp will ever need to grow bitmap of other lwp.
* locking.
*
* minor number) has its own lock. Since read (dpioctl) is a much more
* frequent operation than write, we want to allow multiple reads on same
* priority to write operation. Theoretically writes can starve reads as
* well. But in practical sense this is not important because (1) writes
* happens less often than reads, and (2) write operation defines the
* content of poll fd a cache set. If writes happens so often that they
* can starve reads, that means the cached set is very unstable. It may
* not make sense to read an unstable cache set anyway. Therefore, the
* writers starving readers case is not handled in this design.
*/
int
_init()
{
int error;
devpoll_init = 1;
devpoll_init = 0;
}
return (error);
}
int
_fini()
{
int error;
return (error);
}
return (0);
}
int
{
}
/*ARGSUSED*/
static int
{
== DDI_FAILURE) {
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static int
{
if (cmd != DDI_DETACH)
return (DDI_FAILURE);
return (DDI_SUCCESS);
}
/* ARGSUSED */
static int
{
int error;
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
error = DDI_SUCCESS;
break;
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
error = DDI_SUCCESS;
break;
default:
error = DDI_FAILURE;
}
return (error);
}
/*
* dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
* where it was stopped last time, instead of always starting from 0,
* (2) since user may not have cleaned up the cached fds when they are
* closed, some polldats in cache may refer to closed or reused fds. We
* need to check for those cases.
*
* NOTE: Upon closing an fd, automatic poll cache cleanup is done for
* stale entries!
*/
static int
{
short revent;
int error = 0;
/*
* No Need to search because no poll fd
* has been cached.
*/
return (error);
}
if (is_epoll) {
} else {
}
if (start == 0) {
/*
* started from every begining, no need to wrap around.
*/
} else {
}
fdcnt = 0;
revent = 0;
/*
* Examine the bit map in a circular fashion
* to avoid starvation. Always resume from
* last stop. Scan till end of the map. Then
* wrap around.
*/
if (fd >= 0) {
if (no_wrap) {
} else {
start = 0;
}
} else {
}
/*
* The fd is POLLREMOVed. This fd is
* logically no longer cached. So move
* on to the next one.
*/
continue;
}
/*
* The fd has been closed, but user has not
* done a POLLREMOVE on this fd yet. Instead
* of cleaning it here implicitly, we return
* POLLNVAL. This is consistent with poll(2)
* polling a closed fd. Hope this will remind
* user to do a POLLREMOVE.
*/
fdcnt++;
continue;
}
/*
* In the epoll compatibility case, we actually
* perform the implicit removal to remain
* closer to the epoll semantics.
*/
if (is_epoll) {
}
continue;
}
}
/*
* user is polling on a cached fd which was
* closed and then reused. Unfortunately
* there is no good way to inform user.
* If the file struct is also reused, we
* may not be able to detect the fd reuse
* at all. As long as this does not
* we will play along. Man page states if
* user does not clean up closed fds, polling
* results will be indeterministic.
*
* XXX - perhaps log the detection of fd
* reuse?
*/
}
/*
* XXX - pollrelock() logic needs to know which
* which pollcache lock to grab. It'd be a
* cleaner solution if we could pass pcp as
* an arguement in VOP_POLL interface instead
* of implicitly passing it using thread_t
* struct. On the other hand, changing VOP_POLL
* poll routine to change. May want to revisit
* the tradeoff later.
*/
if (error != 0) {
break;
}
/*
* layered devices (e.g. console driver)
* may change the vnode and thus the pollhead
* pointer out from underneath us.
*/
/*
* The bit should still be set.
*/
goto retry;
}
if (revent != 0) {
/*
* If any of the event bits are set for
* which poll and epoll representations
* differ, swizzle in the native epoll
* values.
*/
EPOLLRDHUP : 0) |
((revent & POLLWRBAND) ?
EPOLLWRBAND : 0);
} else {
}
/*
* We define POLLWRNORM to be POLLOUT,
* but epoll has separate definitions
* for them; if POLLOUT is set and the
* user has asked for EPOLLWRNORM, set
* that as well.
*/
}
} else {
pollstate_t *ps =
/*
* The devpoll handle itself is being
* polled. Notify the caller of any
* readable event(s), leaving as much
* state as possible untouched.
*/
/*
* If a call to pollunlock() fails
* during VOP_POLL, skip over the fd
* and continue polling.
*
* Otherwise, report that there is an
* event pending.
*/
!= 0) {
continue;
} else {
fdcnt++;
break;
}
}
/*
* If POLLET is set, clear the bit in the
* bitmap -- which effectively latches the
* edge on a pollwakeup() from the driver.
*/
/*
* If POLLONESHOT is set, perform the implicit
* POLLREMOVE.
*/
}
}
fdcnt++;
/*
* We clear a bit or cache a poll fd if
* the driver returns a poll head ptr,
* which is expected in the case of 0
* revents. Some buggy driver may return
* NULL php pointer with 0 revents. In
* this case, we just treat the driver as
* "noncachable" and not clearing the bit
* in bitmap.
*/
}
/*
* An event of interest may have
* arrived between the VOP_POLL() and
* the pollhead_insert(); check again.
*/
goto repoll;
}
}
} else {
/*
* No bit set in the range. Check for wrap around.
*/
if (!no_wrap) {
start = 0;
} else {
}
}
}
if (!done) {
}
return (error);
}
/*ARGSUSED*/
static int
{
break;
}
}
dp_entry_t **newtbl;
/*
* Used up every entry in the existing devpoll table.
* Grow the table by DEVPOLLSIZE.
*/
return (ENXIO);
}
dptblsize += DEVPOLLSIZE;
}
devpolltbl = newtbl;
}
/*
* allocate a pollcache skeleton here. Delay allocating bitmap
* structures until dpwrite() time, since we don't know the
* optimal size yet. We also delay setting the pid until either
* dpwrite() or attempt to poll on the instance, allowing parents
* epoll compatibility case, this check isn't performed to maintain
* semantic compatibility.)
*/
pcp = pcache_alloc();
return (0);
}
/*
* or change poll events for a watched fd.
*/
/*ARGSUSED*/
static int
{
int fd;
return (EACCES);
}
}
return (EINVAL);
}
/*
* Copy in the pollfd array. Walk through the array and add
* each polled fd to the cached set.
*/
/*
* not supposed to function as a seekable device. To prevent offset
* from growing and eventually exceed the maximum, reset the offset
* here for every call.
*/
uiop->uio_loffset = 0;
!= 0) {
return (error);
}
/*
* We are about to enter the core portion of dpwrite(). Make sure this
* write has exclusive access in this portion of the code, i.e., no
* other writers in this code.
*
* Waiting for all readers to drop their references to the dpe is
* unecessary since the pollcache itself is protected by pc_lock.
*/
dpep->dpe_writerwait++;
dpep->dpe_writerwait--;
return (EINTR);
}
}
dpep->dpe_writerwait--;
dpep->dpe_refcnt++;
/*
* The epoll compat mode was enabled while we were waiting to
* establish write access. It is not safe to continue since
* state was prepared for non-epoll operation.
*/
goto bypass;
}
/*
* pollstate_enter() deadlock and loop detection must be used.
*/
(void) pollstate_create();
}
/*
* epoll semantics demand that we return EBADF if our
* specified fd is invalid.
*/
if (is_epoll) {
break;
}
continue;
}
/*
* If we're in epoll compatibility mode, check
* that the fd is valid before allocating
* anything for it; epoll semantics demand that
* we return EBADF if our specified fd is
* invalid.
*/
if (is_epoll) {
break;
}
}
pdp = pcache_alloc_fd(0);
} else {
/*
* epoll semantics demand that we error out if
* a file descriptor is added twice, which we
* check (imperfectly) by checking if we both
* have the file descriptor cached and the
* file pointer that correponds to the file
* descriptor matches our cached value. If
* there is a pointer mismatch, the file
* descriptor was closed without being removed.
* The converse is clearly not true, however,
* so to narrow the window by which a spurious
* EEXIST may be returned, we also check if
* this fp has been added to an epoll control
* descriptor in the past; if it hasn't, we
* know that this is due to fp reuse -- it's
* not a true EEXIST case. (By performing this
* additional check, we limit the window of
* spurious EEXIST to situations where a single
* file descriptor is being used across two or
* more epoll control descriptors -- and even
* then, the file descriptor must be closed and
* reused in a relatively tight time span.)
*/
if (is_epoll) {
break;
}
/*
* We have decided that the cached
* information was stale: it either
* didn't match, or the fp had never
* actually been epoll()'d on before.
* We need to now clear our pd_events
* to assure that we don't mistakenly
* operate on cached event disposition.
*/
}
}
if (is_epoll) {
}
}
}
/*
* The fd is not valid. Since we can't pass
* this error back in the write() call, set
* the bit in bitmap to force DP_POLL ioctl
* to examine it.
*/
continue;
}
/*
* To (greatly) reduce EEXIST false positives, we
* denote that this fp has been epoll()'d. We do this
* regardless of epoll compatibility mode, as the flag
* is harmless if not in epoll compatibility mode.
*/
/*
* Don't do VOP_POLL for an already cached fd with
* same poll events.
*/
/*
* the events are already cached
*/
continue;
}
/*
* do VOP_POLL and cache this poll fd.
*/
/*
* XXX - pollrelock() logic needs to know which
* which pollcache lock to grab. It'd be a
* cleaner solution if we could pass pcp as
* an arguement in VOP_POLL interface instead
* of implicitly passing it using thread_t
* struct. On the other hand, changing VOP_POLL
* poll routine to change. May want to revisit
* the tradeoff later.
*/
/*
* We always set the bit when this fd is cached;
* this forces the first DP_POLL to poll this fd.
* Real performance gain comes from subsequent
* DP_POLL. We also attempt a pollhead_insert();
* if it's not possible, we'll do it in dpioctl().
*/
if (error != 0) {
break;
}
} else {
pdp);
}
}
}
} else {
if (is_epoll) {
/*
* As with the add case (above), epoll
* semantics demand that we error out
* in this case.
*/
break;
}
continue;
}
}
}
}
/*
* Wake any pollcache waiters so they can check the new descriptors.
*
* Any fds added to an recursive-capable pollcache could themselves be
* parent pollcaches are woken too, so that they can create any needed
* pollcache links.
*/
if (fds_added) {
}
dpep->dpe_refcnt--;
return (error);
}
#define DP_SIGMASK_RESTORE(ksetp) { \
mutex_enter(&p->p_lock); \
if (lwp->lwp_cursig == 0) { \
} \
mutex_exit(&p->p_lock); \
} \
}
/*ARGSUSED*/
static int
{
int error = 0;
/* do this now, before we sleep on DP_WRITER_PRESENT */
}
if (cmd == DP_EPOLLCOMPAT) {
if (dpep->dpe_refcnt != 0) {
/*
* We can't turn on epoll compatibility while there
* are outstanding operations.
*/
return (EBUSY);
}
/*
* epoll compatibility is a one-way street: there's no way
* to turn it off for a particular open.
*/
return (0);
}
return (EACCES);
}
}
/* Wait until all writers have cleared the handle before continuing */
(dpep->dpe_writerwait != 0)) {
return (EINTR);
}
}
dpep->dpe_refcnt++;
switch (cmd) {
case DP_POLL:
case DP_PPOLL:
{
int fdcnt = 0;
/*
* which otherwise uses the same structure as DP_POLL.
*/
} else {
}
/* Kernel-internal ioctl call */
error = 0;
} else {
dpsize);
}
if (error) {
return (EFAULT);
}
if (deadline > 0) {
/*
* Convert the deadline from relative milliseconds
* to absolute nanoseconds. They must wait for at
* least a tick.
*/
}
return (EFAULT);
}
mutex_enter(&p->p_lock);
/*
* Like ppoll() with a non-NULL sigset, we'll
* call cv_reltimedwait_sig() just to check for
* signals. This call will return immediately
* with either 0 (signalled) or -1 (no signal).
* There are some conditions whereby we can
* get 0 from cv_reltimedwait_sig() without
* a true signal (e.g., a directed stop), so
* we restore our signal mask in the unlikely
* event that lwp_cursig is 0.
*/
if (!cv_reltimedwait_sig(&t->t_delay_cv,
&p->p_lock, 0, TR_CLOCK_TICK)) {
if (lwp->lwp_cursig == 0) {
}
mutex_exit(&p->p_lock);
return (EINTR);
}
mutex_exit(&p->p_lock);
}
}
/*
* We are just using DP_POLL to sleep, so
* we don't any of the devpoll apparatus.
* Do not check for signals if we have a zero timeout.
*/
if (deadline == 0) {
return (0);
}
while ((error =
continue;
}
if (is_epoll) {
} else {
}
/*
* XXX It would be nice not to have to alloc each time, but it
* requires another per thread structure hook. This can be
* implemented later if data suggests that it's necessary.
*/
ps = pollstate_create();
/*
* If nfds is larger than twice the current maximum
* open file count, we'll silently clamp it. This
* only limits our exposure to allocating an
* inordinate amount of kernel memory; it doesn't
* otherwise affect the semantics. (We have this
* check at twice the maximum instead of merely the
* maximum because some applications pass an nfds that
* is only slightly larger than their limit.)
*/
mutex_enter(&p->p_lock);
}
mutex_exit(&p->p_lock);
}
}
for (;;) {
/*
* Mark all child pcachelinks as stale.
* Those which are still part of the tree will be
* marked as valid during the poll.
*/
break;
/* Purge still-stale child pcachelinks */
/*
* A pollwake has happened since we polled cache.
*/
continue;
/*
* Sleep until we are notified, signaled, or timed out.
*/
if (deadline == 0) {
/* immediate timeout; do not check signals */
break;
}
/*
* If we were awakened by a signal or timeout then
* break the loop, else poll again.
*/
if (error <= 0) {
break;
} else {
error = 0;
}
}
return (EFAULT);
}
}
break;
}
case DP_ISPOLLED:
{
if (error) {
return (EFAULT);
}
/*
* No Need to search because no poll fd
* has been cached.
*/
return (0);
}
break;
}
return (EFAULT);
}
*rvalp = 1;
}
break;
}
default:
return (EINVAL);
}
return (error);
}
/*
* Overview of Recursive Polling
*
* represented as readable data via the POLLIN flag. To limit surface area,
* been placed in epoll mode via the DP_EPOLLCOMPAT ioctl. Recursion depth is
* limited to 5 in order to be consistent with Linux epoll.
*
* Extending dppoll() for VOP_POLL:
*
* report when resources contained in the pollcache have relevant event state.
* At the highest level, it means calling dp_pcache_poll() so it indicates if
* fd events are present without consuming them or altering the pollcache
* bitmap. This ensures that a subsequent DP_POLL operation on the bitmap will
* yield the initiating event. Additionally, the VOP_POLL should return in
* such a way that dp_pcache_poll() does not clear the parent bitmap entry
* pollcaches will be checked during every poll which facilitates wake-up
* behavior detailed below.
*
* Pollcache Links and Wake Events:
*
* pollwakeup events by eschewing the traditional pollhead mechanism in favor
* of a different approach. For each pollcache at the root of a recursive
* linked list of pcachelink_t entries is walked, where those marked as valid
* incur a cv_broadcast to their parent pollcache. Most notably, these
* pcachelink_t cv wakeups are performed without acquiring pc_lock on the
* parent pollcache (which would require careful deadlock avoidance). This
* still allows the woken poll on the parent to discover the pertinent events
* due to the fact that bitmap entries for the child pollcache are always
* maintained by the dppoll() logic above.
*
* Depth Limiting and Loop Prevention:
*
* As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
* loop constraints are enforced via pollstate_enter(). The pollcache_t
* pointer is compared against any existing entries in ps_pc_stack and is added
* to the end if no match (and therefore loop) is found. Once poll operations
* for a given pollcache_t are complete, pollstate_exit() clears the pointer
* from the list. The pollstate_enter() and pollstate_exit() functions are
* responsible for acquiring and releasing pc_lock, respectively.
*
* Deadlock Safety:
*
* business of sequentially entering multiple pollcache locks. This tree
* topology cannot define a lock acquisition order in such a way that it is
* immune to deadlocks between threads. The pollstate_enter() and
* operations to safely lock pollcaches while failing gracefully in the face of
* deadlocking topologies. (See pollstate_contend() for more detail about how
* deadlocks are detected and resolved.)
*/
/*ARGSUSED*/
static int
{
/* Poll recursion is not yet supported for non-epoll handles */
return (0);
} else {
dpep->dpe_refcnt++;
}
if (res == PSE_SUCCESS) {
int fdcnt = 0;
if (rc == 0) {
}
} else {
switch (res) {
case PSE_FAIL_DEPTH:
break;
case PSE_FAIL_LOOP:
case PSE_FAIL_DEADLOCK:
break;
default:
/*
* If anything else has gone awry, such as being polled
* from an unexpected context, fall back to the
* recursion-intolerant response.
*/
rc = 0;
break;
}
}
return (rc);
}
/*
* devpoll close should do enough clean up before the pollcache is deleted,
* i.e., it should ensure no one still references the pollcache later.
* There is no "permission" check in here. Any process having the last
*/
/*ARGSUSED*/
static int
{
int i;
/*
* At this point, no other lwp can access this pollcache via the
* up without the pc_lock.
*/
for (i = 0; i < pcp->pc_hashsize; i++) {
}
}
}
/*
* pollwakeup() may still interact with this pollcache. Wait until
* it is done.
*/
/*
* Because of the locking rules for pcachelink manipulation,
* acquring pc_lock is required for this step.
*/
}
return (0);
}
static void
{
pl->pcl_refcnt--;
if (pl->pcl_refcnt == 0) {
} else {
}
}
/*
* Associate parent and child pollcaches via a pcachelink_t. If an existing
* link (stale or valid) between the two is found, it will be reused. If a
* suitable link is not found for reuse, a new one will be allocated.
*/
static void
{
/* Search for an existing link we can reuse. */
/* Clean any invalid links while walking the list */
/* Successfully found parent link */
return;
} else {
}
}
/* No existing link to the parent was found. Create a fresh one. */
pl->pcl_refcnt++;
pl->pcl_refcnt++;
}
/*
* Mark all child links in a pollcache as stale. Any invalid child links found
* during iteration are purged.
*/
static void
{
/*
* Remove any invalid links while we are going to the
* trouble of walking the list.
*/
} else {
}
}
}
/*
* Purge all stale (or invalid) child links from a pollcache.
*/
static void
{
case PCL_STALE:
/* FALLTHROUGH */
case PCL_INVALID:
break;
default:
}
}
}
/*
* Purge all child and parent links from a pollcache, regardless of status.
*/
static void
{
}
}
}