/* aio_subr.c revision d2749ac6e20fe35dbfa822f9b55c185325a2147e */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/types.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <vm/as.h>
#include <vm/page.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/aio_impl.h>
#include <sys/epm.h>
#include <sys/fs/snode.h>
#include <sys/siginfo.h>
#include <sys/cpuvar.h>
#include <sys/tnf_probe.h>
#include <sys/conf.h>
#include <sys/sdt.h>
/*
 * External (non-static) interfaces implemented in this file.
 */
int aphysio(int (*)(), int (*)(), dev_t, int, void (*)(), struct aio_req *);
void aio_done(struct buf *);
void aphysio_unlock(aio_req_t *);
void aio_cleanup(int);
void aio_cleanup_exit(void);

/*
 * private functions
 */
static void aio_sigev_send(proc_t *, sigqueue_t *);
static void aio_hash_delete(aio_t *, aio_req_t *);
static void aio_lio_free(aio_t *, aio_lio_t *);
static int aio_cleanup_cleanupq(aio_t *, aio_req_t *, int);
static int aio_cleanup_notifyq(aio_t *, aio_req_t *, int);
static void aio_cleanup_pollq(aio_t *, aio_req_t *, int);
static void aio_cleanup_portq(aio_t *, aio_req_t *, int);
/*
* async version of physio() that doesn't wait synchronously
* for the driver's strategy routine to complete.
*/
/*
 * Async counterpart of physio(): validate the request, lock the
 * caller's pages in memory, initialize the buf, and hand the request
 * to the driver's strategy routine without waiting for the transfer
 * to complete.  Completion is reported via biodone() -> aio_done().
 * Returns 0 or an errno value.
 */
int
aphysio(
	int (*strategy)(struct buf *),
	int (*cancel)(struct buf *),
	dev_t dev,
	int rw,
	void (*mincnt)(struct buf *),
	struct aio_req *aio)
{
	struct uio *uio = aio->aio_uio;
	aio_req_t *reqp = (aio_req_t *)aio->aio_private;
	struct buf *bp = &reqp->aio_req_buf;
	struct iovec *iov;
	struct as *as;
	char *a;
	int error;
	size_t c;
	struct page **pplist;
	struct dev_ops *ops = devopsp[getmajor(dev)];

	/* negative offsets are never valid */
	if (uio->uio_loffset < 0)
		return (EINVAL);
#ifdef	_ILP32
	/*
	 * For 32-bit kernels, check against SPEC_MAXOFFSET_T which represents
	 * the maximum size that can be supported by the IO subsystem.
	 * XXX this code assumes a D_64BIT driver.
	 */
	if (uio->uio_loffset > SPEC_MAXOFFSET_T)
		return (EINVAL);
#endif	/* _ILP32 */

	TNF_PROBE_5(aphysio_start, "kaio", /* CSTYLED */,
	    tnf_opaque, bp, bp,
	    tnf_device, device, dev,
	    tnf_offset, blkno, btodt(uio->uio_loffset),
	    tnf_size, size, uio->uio_iov->iov_len,
	    tnf_bioflags, rw, rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	iov = uio->uio_iov;
	/* buf semaphores must be initialized before biodone() touches them */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	bp->b_error = 0;
	bp->b_flags = B_BUSY | B_PHYS | B_ASYNC | rw;
	bp->b_edev = dev;
	bp->b_dev = cmpdev(dev);
	bp->b_lblkno = btodt(uio->uio_loffset);
	bp->b_offset = uio->uio_loffset;
	(void) ops->devo_getinfo(NULL, DDI_INFO_DEVT2DEVINFO,
	    (void *)bp->b_edev, (void **)&bp->b_dip);

	/*
	 * Clustering: Clustering can set the b_iodone, b_forw and
	 * b_proc fields to cluster-specific values.
	 */
	if (bp->b_iodone == NULL) {
		bp->b_iodone = (int (*)()) aio_done;
		/* b_forw points at an aio_req_t structure */
		bp->b_forw = (struct buf *)reqp;
		bp->b_proc = curproc;
	}

	a = bp->b_un.b_addr = iov->iov_base;
	c = bp->b_bcount = iov->iov_len;

	/* let the driver clamp the transfer size; partial I/O unsupported */
	(*mincnt)(bp);
	if (bp->b_bcount != iov->iov_len)
		return (ENOTSUP);

	as = bp->b_proc->p_as;

	/*
	 * Lock the user pages for the duration of the transfer: a device
	 * read writes into user memory (S_WRITE) and vice versa.
	 */
	error = as_pagelock(as, &pplist, a,
	    c, rw == B_READ? S_WRITE : S_READ);
	if (error != 0) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		return (error);
	}
	reqp->aio_req_flags |= AIO_PAGELOCKDONE;
	bp->b_shadow = pplist;
	if (pplist != NULL) {
		bp->b_flags |= B_SHADOW;
	}

	/* asynchronous cancellation is not supported */
	if (cancel != anocancel)
		cmn_err(CE_PANIC,
		    "aphysio: cancellation not supported, use anocancel");

	reqp->aio_req_cancel = cancel;

	DTRACE_IO1(start, struct buf *, bp);

	return ((*strategy)(bp));
}
/*
 * Placeholder cancel routine: in-flight kaio cannot be cancelled once
 * handed to the driver, so this always fails with ENXIO.  aphysio()
 * panics if any other cancel routine is supplied.
 */
/*ARGSUSED*/
int
anocancel(struct buf *bp)
{
	return (ENXIO);
}
/*
* Called from biodone().
* Notify process that a pending AIO has finished.
*/
/*
* Clustering: This function is made non-static as it is used
* by clustering s/w as contract private interface.
*/
void
aio_done(struct buf *bp)
{
	proc_t *p;
	struct as *as;
	aio_req_t *reqp;
	aio_lio_t *head = NULL;
	aio_t *aiop;
	sigqueue_t *sigev = NULL;
	sigqueue_t *lio_sigev = NULL;
	port_kevent_t *pkevp = NULL;
	port_kevent_t *lio_pkevp = NULL;
	int fd;
	int cleanupqflag;
	int pollqflag;
	int portevpend;
	void (*func)();
	int use_port = 0;
	int reqp_flags = 0;
	int send_signal = 0;

	p = bp->b_proc;
	as = p->p_as;
	/* aphysio() stashed the request pointer in b_forw */
	reqp = (aio_req_t *)bp->b_forw;
	fd = reqp->aio_req_fd;

	TNF_PROBE_5(aphysio_end, "kaio", /* CSTYLED */,
	    tnf_opaque, bp, bp,
	    tnf_device, device, bp->b_edev,
	    tnf_offset, blkno, btodt(reqp->aio_req_uio.uio_loffset),
	    tnf_size, size, reqp->aio_req_uio.uio_iov->iov_len,
	    tnf_bioflags, rw, (bp->b_flags & (B_READ|B_WRITE)));

	/*
	 * mapout earlier so that more kmem is available when aio is
	 * heavily used. bug #1262082
	 */
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);

	/* decrement fd's ref count by one, now that aio request is done. */
	areleasef(fd, P_FINFO(p));

	aiop = p->p_aio;
	ASSERT(aiop != NULL);

	/* lock order: aio_portq_mutex is always taken before aio_mutex */
	mutex_enter(&aiop->aio_portq_mutex);
	mutex_enter(&aiop->aio_mutex);
	ASSERT(aiop->aio_pending > 0);
	ASSERT(reqp->aio_req_flags & AIO_PENDING);
	aiop->aio_pending--;
	reqp->aio_req_flags &= ~AIO_PENDING;
	/* snapshot flags now: they are examined after the locks are dropped */
	reqp_flags = reqp->aio_req_flags;
	if ((pkevp = reqp->aio_req_portkev) != NULL) {
		/* Event port notification is desired for this transaction */
		if (reqp->aio_req_flags & AIO_CLOSE_PORT) {
			/*
			 * The port is being closed and it is waiting for
			 * pending asynchronous I/O transactions to complete.
			 */
			portevpend = --aiop->aio_portpendcnt;
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(pkevp);
			/* last pending request: wake aio_close_port() */
			if (portevpend == 0)
				cv_broadcast(&aiop->aio_portcv);
			return;
		}

		if (aiop->aio_flags & AIO_CLEANUP) {
			/*
			 * aio_cleanup_thread() is waiting for completion of
			 * transactions.
			 */
			mutex_enter(&as->a_contents);
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portcleanupq, reqp, 0);
			cv_signal(&aiop->aio_cleanupcv);
			mutex_exit(&as->a_contents);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			return;
		}

		/* normal port path: move to the port done queue */
		aio_deq(&aiop->aio_portpending, reqp);
		aio_enq(&aiop->aio_portq, reqp, 0);

		use_port = 1;
	} else {
		/*
		 * when the AIO_CLEANUP flag is enabled for this
		 * process, or when the AIO_POLL bit is set for
		 * this request, special handling is required.
		 * otherwise the request is put onto the doneq.
		 */
		cleanupqflag = (aiop->aio_flags & AIO_CLEANUP);
		pollqflag = (reqp->aio_req_flags & AIO_POLL);
		if (cleanupqflag | pollqflag) {

			if (cleanupqflag)
				mutex_enter(&as->a_contents);

			/*
			 * requests with their AIO_POLL bit set are put
			 * on the pollq, requests with sigevent structures
			 * or with listio heads are put on the notifyq, and
			 * the remaining requests don't require any special
			 * cleanup handling, so they're put onto the default
			 * cleanupq.
			 */
			if (pollqflag)
				aio_enq(&aiop->aio_pollq, reqp, AIO_POLLQ);
			else if (reqp->aio_req_sigqp || reqp->aio_req_lio)
				aio_enq(&aiop->aio_notifyq, reqp, AIO_NOTIFYQ);
			else
				aio_enq(&aiop->aio_cleanupq, reqp,
				    AIO_CLEANUPQ);

			if (cleanupqflag) {
				cv_signal(&aiop->aio_cleanupcv);
				mutex_exit(&as->a_contents);
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
			} else {
				ASSERT(pollqflag);
				/* block aio_cleanup_exit until we're done */
				aiop->aio_flags |= AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
				/*
				 * let the cleanup processing happen from an AST
				 * set an AST on all threads in this process
				 */
				mutex_enter(&p->p_lock);
				set_proc_ast(p);
				mutex_exit(&p->p_lock);
				mutex_enter(&aiop->aio_mutex);
				/* wakeup anybody waiting in aiowait() */
				cv_broadcast(&aiop->aio_waitcv);

				/* wakeup aio_cleanup_exit if needed */
				if (aiop->aio_flags & AIO_CLEANUP)
					cv_signal(&aiop->aio_cleanupcv);
				aiop->aio_flags &= ~AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
			}
			return;
		}

		/*
		 * save req's sigevent pointer, and check its
		 * value after releasing aio_mutex lock.
		 */
		sigev = reqp->aio_req_sigqp;
		reqp->aio_req_sigqp = NULL;

		/* put request on done queue. */
		aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
	} /* portkevent */

	/*
	 * when list IO notification is enabled, a notification or
	 * signal is sent only when all entries in the list are done.
	 */
	if ((head = reqp->aio_req_lio) != NULL) {
		ASSERT(head->lio_refcnt > 0);
		if (--head->lio_refcnt == 0) {
			/*
			 * save lio's sigevent pointer, and check
			 * its value after releasing aio_mutex lock.
			 */
			lio_sigev = head->lio_sigqp;
			head->lio_sigqp = NULL;
			cv_signal(&head->lio_notify);
			/* deliver the lio port event at most once */
			if (head->lio_port >= 0 &&
			    (lio_pkevp = head->lio_portkev) != NULL)
				head->lio_port = -1;
		}
	}

	/*
	 * if AIO_WAITN set then
	 * send signal only when we reached the
	 * required amount of IO's finished
	 * or when all IO's are done
	 */
	if (aiop->aio_flags & AIO_WAITN) {
		if (aiop->aio_waitncnt > 0)
			aiop->aio_waitncnt--;
		if (aiop->aio_pending == 0 ||
		    aiop->aio_waitncnt == 0)
			cv_broadcast(&aiop->aio_waitcv);
	} else {
		cv_broadcast(&aiop->aio_waitcv);
	}

	/*
	 * No need to set this flag for pollq, portq, lio requests.
	 * If this is an old Solaris aio request, and the process has
	 * a SIGIO signal handler enabled, then send a SIGIO signal.
	 */
	if (!sigev && !use_port && head == NULL &&
	    (reqp->aio_req_flags & AIO_SOLARIS) &&
	    (func = PTOU(p)->u_signal[SIGIO - 1]) != SIG_DFL &&
	    (func != SIG_IGN)) {
		send_signal = 1;
		reqp->aio_req_flags |= AIO_SIGNALLED;
	}

	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_portq_mutex);

	/*
	 * Could the cleanup thread be waiting for AIO with locked
	 * resources to finish?
	 * Ideally in that case cleanup thread should block on cleanupcv,
	 * but there is a window, where it could miss to see a new aio
	 * request that sneaked in.
	 */
	mutex_enter(&as->a_contents);
	if ((reqp_flags & AIO_PAGELOCKDONE) && AS_ISUNMAPWAIT(as))
		cv_broadcast(&as->a_cv);
	mutex_exit(&as->a_contents);

	/* deliver notifications only after all locks have been dropped */
	if (sigev)
		aio_sigev_send(p, sigev);
	else if (send_signal)
		psignal(p, SIGIO);

	if (pkevp)
		port_send_event(pkevp);
	if (lio_sigev)
		aio_sigev_send(p, lio_sigev);
	if (lio_pkevp)
		port_send_event(lio_pkevp);
}
/*
 * Queue the (non-NULL) signal event sigev to the specified process.
 * This function returns nothing; callers that need to know whether a
 * signal was queued keep their own count.
 */
static void
aio_sigev_send(proc_t *p, sigqueue_t *sigev)
{
	ASSERT(sigev != NULL);

	/* sigaddqa() takes ownership of the preallocated sigqueue_t */
	mutex_enter(&p->p_lock);
	sigaddqa(p, NULL, sigev);
	mutex_exit(&p->p_lock);
}
/*
* special case handling for zero length requests. the aio request
* short circuits the normal completion path since all that's required
* to complete this request is to copyout a zero to the aio request's
* return value.
*/
void
aio_zerolen(aio_req_t *reqp)
{
	/* fabricate an already-completed buf for this request */
	struct buf *bp = &reqp->aio_req_buf;

	reqp->aio_req_flags |= AIO_ZEROLEN;

	/* aio_done() recovers the request pointer from b_forw */
	bp->b_forw = (struct buf *)reqp;
	bp->b_proc = curproc;

	/* zero residual => a zero return value is copied out */
	bp->b_resid = 0;
	bp->b_flags = 0;

	aio_done(bp);
}
/*
* unlock pages previously locked by as_pagelock
*/
void
aphysio_unlock(aio_req_t *reqp)
{
	struct buf *bp;
	struct iovec *iov;
	int flags;

	/* guard: unlock at most once per request */
	if (reqp->aio_req_flags & AIO_PHYSIODONE)
		return;

	reqp->aio_req_flags |= AIO_PHYSIODONE;

	/* zero-length requests never locked any pages */
	if (reqp->aio_req_flags & AIO_ZEROLEN)
		return;

	bp = &reqp->aio_req_buf;
	iov = reqp->aio_req_uio.uio_iov;
	/* pages were locked S_WRITE for a read, S_READ for a write */
	flags = (((bp->b_flags & B_READ) == B_READ) ? S_WRITE : S_READ);
	if (reqp->aio_req_flags & AIO_PAGELOCKDONE) {
		as_pageunlock(bp->b_proc->p_as,
		    bp->b_flags & B_SHADOW ? bp->b_shadow : NULL,
		    iov->iov_base, iov->iov_len, flags);
		reqp->aio_req_flags &= ~AIO_PAGELOCKDONE;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	bp->b_flags |= B_DONE;
}
/*
* deletes a requests id from the hash table of outstanding io.
*/
/*
 * Remove reqp from the hash table of outstanding I/O requests.
 * The request is located by its user-level result address, which
 * is what AIO_HASH keys on.  A request that is not present is a
 * silent no-op.
 */
static void
aio_hash_delete(aio_t *aiop, struct aio_req_t *reqp)
{
	aio_result_t *resultp = reqp->aio_req_resultp;
	aio_req_t **link;

	/* walk the chain via the address of each next pointer */
	for (link = &aiop->aio_hash[AIO_HASH(resultp)]; *link != NULL;
	    link = &(*link)->aio_hash_next) {
		if ((*link)->aio_req_resultp == resultp) {
			/* unlink the first entry with a matching resultp */
			*link = (*link)->aio_hash_next;
			return;
		}
	}
}
/*
* Put a list head struct onto its free list.
*/
static void
aio_lio_free(aio_t *aiop, aio_lio_t *head)
{
	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	/* free any unsent signal event before recycling the head */
	if (head->lio_sigqp != NULL)
		kmem_free(head->lio_sigqp, sizeof (sigqueue_t));

	/* push the list head onto the per-process freelist */
	head->lio_next = aiop->aio_lio_free;
	aiop->aio_lio_free = head;
}
/*
* Put a reqp onto the freelist.
*/
void
aio_req_free(aio_t *aiop, aio_req_t *reqp)
{
	aio_lio_t *liop;

	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	/* release any undelivered port event */
	if (reqp->aio_req_portkev) {
		port_free_event(reqp->aio_req_portkev);
		reqp->aio_req_portkev = NULL;
	}

	/* drop our reference on the list-IO head; free it with the last */
	if ((liop = reqp->aio_req_lio) != NULL) {
		if (--liop->lio_nent == 0)
			aio_lio_free(aiop, liop);
		reqp->aio_req_lio = NULL;
	}

	/* free any unsent signal event */
	if (reqp->aio_req_sigqp != NULL) {
		kmem_free(reqp->aio_req_sigqp, sizeof (sigqueue_t));
		reqp->aio_req_sigqp = NULL;
	}

	reqp->aio_req_next = aiop->aio_free;
	reqp->aio_req_prev = NULL;
	aiop->aio_free = reqp;
	aiop->aio_outstanding--;
	/* wake anyone waiting for all outstanding requests to drain */
	if (aiop->aio_outstanding == 0)
		cv_broadcast(&aiop->aio_waitcv);
	aio_hash_delete(aiop, reqp);
}
/*
* Put a reqp onto the freelist.
*/
void
aio_req_free_port(aio_t *aiop, aio_req_t *reqp)
{
	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	/*
	 * Port variant of aio_req_free(): the caller has already
	 * disposed of the port event, lio head and sigevent, so only
	 * the freelist and hash bookkeeping remain.
	 */
	reqp->aio_req_next = aiop->aio_free;
	reqp->aio_req_prev = NULL;
	aiop->aio_free = reqp;
	aiop->aio_outstanding--;
	aio_hash_delete(aiop, reqp);
}
/*
* Verify the integrity of a queue.
*/
#if defined(DEBUG)
/*
 * DEBUG-only sanity check of a circular doubly-linked request queue:
 * verifies the prev/next links are consistent, that entry_present (if
 * non-NULL) occurs exactly once, and that entry_missing (if non-NULL)
 * does not occur at all.
 */
static void
aio_verify_queue(aio_req_t *head,
    aio_req_t *entry_present, aio_req_t *entry_missing)
{
	aio_req_t *reqp;
	int present_count = 0;
	int missing_count = 0;

	if ((reqp = head) != NULL) {
		do {
			ASSERT(reqp->aio_req_prev->aio_req_next == reqp);
			ASSERT(reqp->aio_req_next->aio_req_prev == reqp);
			if (reqp == entry_present)
				present_count++;
			if (reqp == entry_missing)
				missing_count++;
		} while ((reqp = reqp->aio_req_next) != head);
	}

	ASSERT(entry_present == NULL || present_count == 1);
	ASSERT(entry_missing == NULL || missing_count == 0);
}
#else
#define	aio_verify_queue(x, y, z)
#endif
/*
* Put a request onto the tail of a queue.
*/
/*
 * Append reqp to the tail of the circular doubly-linked queue at
 * *qhead and mark it with qflg_new (one of the AIO_*Q flags, or 0).
 */
void
aio_enq(aio_req_t **qhead, aio_req_t *reqp, int qflg_new)
{
	aio_req_t *first = *qhead;

	aio_verify_queue(first, NULL, reqp);

	if (first == NULL) {
		/* empty queue: reqp becomes a one-element ring */
		reqp->aio_req_next = reqp;
		reqp->aio_req_prev = reqp;
		*qhead = reqp;
	} else {
		/* insert between the current tail and the head */
		aio_req_t *last = first->aio_req_prev;

		reqp->aio_req_next = first;
		reqp->aio_req_prev = last;
		last->aio_req_next = reqp;
		first->aio_req_prev = reqp;
	}
	reqp->aio_req_flags |= qflg_new;
}
/*
* Remove a request from its queue.
*/
/*
 * Unlink reqp from the circular doubly-linked queue at *qhead and
 * clear its link fields.  The head pointer is advanced when the head
 * element itself is removed, and cleared when the queue empties.
 */
void
aio_deq(aio_req_t **qhead, aio_req_t *reqp)
{
	aio_verify_queue(*qhead, reqp, NULL);

	if (reqp->aio_req_next == reqp) {
		/* reqp was the only element */
		*qhead = NULL;
	} else {
		aio_req_t *nxt = reqp->aio_req_next;
		aio_req_t *prv = reqp->aio_req_prev;

		prv->aio_req_next = nxt;
		nxt->aio_req_prev = prv;
		if (*qhead == reqp)
			*qhead = nxt;
	}
	reqp->aio_req_next = NULL;
	reqp->aio_req_prev = NULL;
}
/*
* concatenate a specified queue with the cleanupq. the specified
* queue is put onto the tail of the cleanupq. all elements on the
* specified queue should have their aio_req_flags field cleared.
*/
/*ARGSUSED*/
void
aio_cleanupq_concat(aio_t *aiop, aio_req_t *q2, int qflg)
{
	aio_req_t *cleanupqhead, *q2tail;
	aio_req_t *reqp = q2;

	/* retag every element of q2 as belonging to the cleanupq */
	do {
		ASSERT(reqp->aio_req_flags & qflg);
		reqp->aio_req_flags &= ~qflg;
		reqp->aio_req_flags |= AIO_CLEANUPQ;
	} while ((reqp = reqp->aio_req_next) != q2);

	cleanupqhead = aiop->aio_cleanupq;
	if (cleanupqhead == NULL)
		aiop->aio_cleanupq = q2;
	else {
		/*
		 * Splice circular list q2 onto the tail of the cleanupq.
		 * NOTE: the statement order is significant — the old
		 * cleanupq tail (cleanupqhead->aio_req_prev) is read
		 * twice before it is overwritten on the last line.
		 */
		cleanupqhead->aio_req_prev->aio_req_next = q2;
		q2tail = q2->aio_req_prev;
		q2tail->aio_req_next = cleanupqhead;
		q2->aio_req_prev = cleanupqhead->aio_req_prev;
		cleanupqhead->aio_req_prev = q2tail;
	}
}
/*
* cleanup aio requests that are on the per-process poll queue.
*/
void
aio_cleanup(int flag)
{
	aio_t *aiop = curproc->p_aio;
	aio_req_t *pollqhead, *cleanupqhead, *notifyqhead;
	aio_req_t *cleanupport;
	aio_req_t *portq = NULL;
	void (*func)();
	int signalled = 0;
	int qflag = 0;		/* number of non-empty queues grabbed */
	int exitflg;

	ASSERT(aiop != NULL);

	if (flag == AIO_CLEANUP_EXIT)
		exitflg = AIO_CLEANUP_EXIT;
	else
		exitflg = 0;

	/*
	 * We need to get the aio_cleanupq_mutex because we are calling
	 * aio_cleanup_cleanupq()
	 */
	mutex_enter(&aiop->aio_cleanupq_mutex);
	/*
	 * take all the requests off the cleanupq, the notifyq,
	 * and the pollq.
	 */
	mutex_enter(&aiop->aio_mutex);
	if ((cleanupqhead = aiop->aio_cleanupq) != NULL) {
		aiop->aio_cleanupq = NULL;
		qflag++;
	}
	if ((notifyqhead = aiop->aio_notifyq) != NULL) {
		aiop->aio_notifyq = NULL;
		qflag++;
	}
	if ((pollqhead = aiop->aio_pollq) != NULL) {
		aiop->aio_pollq = NULL;
		qflag++;
	}
	if (flag) {
		/* thread-cleanup or exit also processes the port queues */
		if ((portq = aiop->aio_portq) != NULL)
			qflag++;

		if ((cleanupport = aiop->aio_portcleanupq) != NULL) {
			aiop->aio_portcleanupq = NULL;
			qflag++;
		}
	}
	mutex_exit(&aiop->aio_mutex);

	/*
	 * return immediately if cleanupq, pollq, and
	 * notifyq are all empty. someone else must have
	 * emptied them.
	 */
	if (!qflag) {
		mutex_exit(&aiop->aio_cleanupq_mutex);
		return;
	}

	/*
	 * do cleanup for the various queues.
	 */
	if (cleanupqhead)
		signalled = aio_cleanup_cleanupq(aiop, cleanupqhead, exitflg);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	if (notifyqhead)
		signalled = aio_cleanup_notifyq(aiop, notifyqhead, exitflg);
	if (pollqhead)
		aio_cleanup_pollq(aiop, pollqhead, exitflg);
	if (flag && (cleanupport || portq))
		aio_cleanup_portq(aiop, cleanupport, exitflg);

	if (exitflg)
		return;

	/*
	 * If we have an active aio_cleanup_thread it's possible for
	 * this routine to push something on to the done queue after
	 * an aiowait/aiosuspend thread has already decided to block.
	 * This being the case, we need a cv_broadcast here to wake
	 * these threads up. It is simpler and cleaner to do this
	 * broadcast here than in the individual cleanup routines.
	 */
	mutex_enter(&aiop->aio_mutex);
	/*
	 * If there has never been an old solaris aio request
	 * issued by this process, then do not send a SIGIO signal.
	 * (pretend one was already sent)
	 */
	if (!(aiop->aio_flags & AIO_SOLARIS_REQ))
		signalled = 1;
	cv_broadcast(&aiop->aio_waitcv);
	mutex_exit(&aiop->aio_mutex);

	/*
	 * Only if the process wasn't already signalled,
	 * determine if a SIGIO signal should be delivered.
	 */
	if (!signalled &&
	    (func = PTOU(curproc)->u_signal[SIGIO - 1]) != SIG_DFL &&
	    func != SIG_IGN)
		psignal(curproc, SIGIO);
}
/*
* Do cleanup for every element of the port cleanup queue.
*/
static void
aio_cleanup_portq(aio_t *aiop, aio_req_t *cleanupq, int exitflag)
{
	aio_req_t *reqp;
	aio_req_t *next;
	aio_req_t *headp;
	aio_lio_t *liop;

	/* first check the portq */
	if (exitflag || ((aiop->aio_flags & AIO_CLEANUP_PORT) == 0)) {
		mutex_enter(&aiop->aio_mutex);
		if (aiop->aio_flags & AIO_CLEANUP)
			aiop->aio_flags |= AIO_CLEANUP_PORT;
		mutex_exit(&aiop->aio_mutex);

		/*
		 * It is not allowed to hold locks during aphysio_unlock().
		 * The aio_done() interrupt function will try to acquire
		 * aio_mutex and aio_portq_mutex. Therefore we disconnect
		 * the portq list from the aiop for the duration of the
		 * aphysio_unlock() loop below.
		 */
		mutex_enter(&aiop->aio_portq_mutex);
		headp = aiop->aio_portq;
		aiop->aio_portq = NULL;
		mutex_exit(&aiop->aio_portq_mutex);
		if ((reqp = headp) != NULL) {
			do {
				next = reqp->aio_req_next;
				aphysio_unlock(reqp);
				if (exitflag) {
					/* process exit: discard request */
					mutex_enter(&aiop->aio_mutex);
					aio_req_free(aiop, reqp);
					mutex_exit(&aiop->aio_mutex);
				}
			} while ((reqp = next) != headp);
		}

		if (headp != NULL && exitflag == 0) {
			/* move unlocked requests back to the port queue */
			aio_req_t *newq;

			mutex_enter(&aiop->aio_portq_mutex);
			if ((newq = aiop->aio_portq) != NULL) {
				/*
				 * Requests completed while the queue was
				 * disconnected; splice both circular lists
				 * back together.
				 */
				aio_req_t *headprev = headp->aio_req_prev;
				aio_req_t *newqprev = newq->aio_req_prev;

				headp->aio_req_prev = newqprev;
				newq->aio_req_prev = headprev;
				headprev->aio_req_next = newq;
				newqprev->aio_req_next = headp;
			}
			aiop->aio_portq = headp;
			/* wake waiters in aio_req_remove_portq() */
			cv_broadcast(&aiop->aio_portcv);
			mutex_exit(&aiop->aio_portq_mutex);
		}
	}

	/* now check the port cleanup queue */
	if ((reqp = cleanupq) == NULL)
		return;
	do {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflag) {
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			/* requeue on the portq and deliver the port event */
			mutex_enter(&aiop->aio_portq_mutex);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(reqp->aio_req_portkev);
			if ((liop = reqp->aio_req_lio) != NULL) {
				int send_event = 0;

				mutex_enter(&aiop->aio_mutex);
				ASSERT(liop->lio_refcnt > 0);
				if (--liop->lio_refcnt == 0) {
					/* lio port event at most once */
					if (liop->lio_port >= 0 &&
					    liop->lio_portkev) {
						liop->lio_port = -1;
						send_event = 1;
					}
				}
				mutex_exit(&aiop->aio_mutex);
				if (send_event)
					port_send_event(liop->lio_portkev);
			}
		}
	} while ((reqp = next) != cleanupq);
}
/*
* Do cleanup for every element of the cleanupq.
*/
static int
aio_cleanup_cleanupq(aio_t *aiop, aio_req_t *qhead, int exitflg)
{
	aio_req_t *reqp, *next;
	int signalled = 0;

	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));

	/*
	 * Since aio_req_done() or aio_req_find() use the HASH list to find
	 * the required requests, they could potentially take away elements
	 * if they are already done (AIO_DONEQ is set).
	 * The aio_cleanupq_mutex protects the queue for the duration of the
	 * loop from aio_req_done() and aio_req_find().
	 */
	if ((reqp = qhead) == NULL)
		return (0);
	do {
		ASSERT(reqp->aio_req_flags & AIO_CLEANUPQ);
		ASSERT(reqp->aio_req_portkev == NULL);
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		mutex_enter(&aiop->aio_mutex);
		if (exitflg)
			aio_req_free(aiop, reqp);
		else
			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
		if (!exitflg) {
			/* count requests that were already SIGIO-signalled */
			if (reqp->aio_req_flags & AIO_SIGNALLED)
				signalled++;
			else
				reqp->aio_req_flags |= AIO_SIGNALLED;
		}
		mutex_exit(&aiop->aio_mutex);
	} while ((reqp = next) != qhead);
	return (signalled);
}
/*
* do cleanup for every element of the notify queue.
*/
/*
 * Do cleanup for every element of the notify queue: unlock the
 * request's pages and either free it (exitflg) or move it to the
 * doneq, delivering any per-request and list-IO completion signals.
 * Returns the number of signal events queued so the caller can
 * decide whether a SIGIO is still owed.
 */
static int
aio_cleanup_notifyq(aio_t *aiop, aio_req_t *qhead, int exitflg)
{
	aio_req_t *reqp, *next;
	aio_lio_t *liohead;
	sigqueue_t *sigev, *lio_sigev;
	int signalled = 0;

	if ((reqp = qhead) == NULL)
		return (0);
	do {
		ASSERT(reqp->aio_req_flags & AIO_NOTIFYQ);
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflg) {
			/* process is exiting; discard the request */
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			/*
			 * Reset lio_sigev on every iteration; a stale
			 * pointer left over from a previous element must
			 * not be queued a second time (sigaddqa() takes
			 * ownership of the sigqueue_t, so re-sending it
			 * would corrupt the signal queue).
			 */
			lio_sigev = NULL;
			mutex_enter(&aiop->aio_mutex);
			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
			/* detach the sigevent; deliver after dropping lock */
			sigev = reqp->aio_req_sigqp;
			reqp->aio_req_sigqp = NULL;
			if ((liohead = reqp->aio_req_lio) != NULL) {
				ASSERT(liohead->lio_refcnt > 0);
				if (--liohead->lio_refcnt == 0) {
					/* last lio member: notify waiters */
					cv_signal(&liohead->lio_notify);
					lio_sigev = liohead->lio_sigqp;
					liohead->lio_sigqp = NULL;
				}
			}
			mutex_exit(&aiop->aio_mutex);
			if (sigev) {
				signalled++;
				aio_sigev_send(reqp->aio_req_buf.b_proc,
				    sigev);
			}
			if (lio_sigev) {
				signalled++;
				aio_sigev_send(reqp->aio_req_buf.b_proc,
				    lio_sigev);
			}
		}
	} while ((reqp = next) != qhead);
	return (signalled);
}
/*
* Do cleanup for every element of the poll queue.
*/
static void
aio_cleanup_pollq(aio_t *aiop, aio_req_t *qhead, int exitflg)
{
	aio_req_t *reqp, *next;

	/*
	 * As no other threads should be accessing the queue at this point,
	 * it isn't necessary to hold aio_mutex while we traverse its elements.
	 */
	if ((reqp = qhead) == NULL)
		return;
	do {
		ASSERT(reqp->aio_req_flags & AIO_POLLQ);
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflg) {
			/* process is exiting; discard the request */
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			/* copy result to userland, then retire to the doneq */
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
			mutex_exit(&aiop->aio_mutex);
		}
	} while ((reqp = next) != qhead);
}
/*
* called by exit(). waits for all outstanding kaio to finish
* before the kaio resources are freed.
*/
void
aio_cleanup_exit(void)
{
	proc_t *p = curproc;
	aio_t *aiop = p->p_aio;
	aio_req_t *reqp, *next, *head;
	aio_lio_t *nxtlio, *liop;

	/*
	 * wait for all outstanding kaio to complete. process
	 * is now single-threaded; no other kaio requests can
	 * happen once aio_pending is zero.
	 */
	mutex_enter(&aiop->aio_mutex);
	aiop->aio_flags |= AIO_CLEANUP;
	while ((aiop->aio_pending != 0) || (aiop->aio_flags & AIO_DONE_ACTIVE))
		cv_wait(&aiop->aio_cleanupcv, &aiop->aio_mutex);
	mutex_exit(&aiop->aio_mutex);

	/* cleanup the cleanup-thread queues. */
	aio_cleanup(AIO_CLEANUP_EXIT);

	/*
	 * Although this process is now single-threaded, we
	 * still need to protect ourselves against a race with
	 * aio_cleanup_dr_delete_memory().
	 */
	mutex_enter(&p->p_lock);

	/*
	 * free up the done queue's resources.
	 */
	if ((head = aiop->aio_doneq) != NULL) {
		aiop->aio_doneq = NULL;
		reqp = head;
		do {
			next = reqp->aio_req_next;
			aphysio_unlock(reqp);
			kmem_free(reqp, sizeof (struct aio_req_t));
		} while ((reqp = next) != head);
	}
	/*
	 * release aio request freelist.
	 */
	for (reqp = aiop->aio_free; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		kmem_free(reqp, sizeof (struct aio_req_t));
	}

	/*
	 * release io list head freelist.
	 */
	for (liop = aiop->aio_lio_free; liop != NULL; liop = nxtlio) {
		nxtlio = liop->lio_next;
		kmem_free(liop, sizeof (aio_lio_t));
	}

	/* free the cached iocb array, if one was allocated */
	if (aiop->aio_iocb)
		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);

	mutex_destroy(&aiop->aio_mutex);
	mutex_destroy(&aiop->aio_portq_mutex);
	mutex_destroy(&aiop->aio_cleanupq_mutex);
	p->p_aio = NULL;
	mutex_exit(&p->p_lock);
	kmem_free(aiop, sizeof (struct aio));
}
/*
* copy out aio request's result to a user-level result_t buffer.
*/
void
aio_copyout_result(aio_req_t *reqp)
{
	struct buf *bp;
	struct iovec *iov;
	void *resultp;
	int error;
	size_t retval;

	/* copy out the result at most once per request */
	if (reqp->aio_req_flags & AIO_COPYOUTDONE)
		return;

	reqp->aio_req_flags |= AIO_COPYOUTDONE;

	iov = reqp->aio_req_uio.uio_iov;
	bp = &reqp->aio_req_buf;
	/* "resultp" points to user-level result_t buffer */
	resultp = (void *)reqp->aio_req_resultp;
	if (bp->b_flags & B_ERROR) {
		/* default to EIO when the driver set no specific error */
		if (bp->b_error)
			error = bp->b_error;
		else
			error = EIO;
		retval = (size_t)-1;
	} else {
		error = 0;
		/* bytes actually transferred */
		retval = iov->iov_len - bp->b_resid;
	}
#ifdef	_SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
	} else {
		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
		    (int)retval);
		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
	}
#else
	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
	(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
#endif
}
/*
 * Copy an I/O result out to the user-level result buffer for a
 * port-notified request.  Note that "errno" here is an ordinary local
 * variable (the kernel has no libc errno) holding the error code.
 */
void
aio_copyout_result_port(struct iovec *iov, struct buf *bp, void *resultp)
{
	int errno;
	size_t retval;

	if (bp->b_flags & B_ERROR) {
		/* default to EIO when the driver set no specific error */
		if (bp->b_error)
			errno = bp->b_error;
		else
			errno = EIO;
		retval = (size_t)-1;
	} else {
		errno = 0;
		/* bytes actually transferred */
		retval = iov->iov_len - bp->b_resid;
	}
#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
		(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
	} else {
		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
		    (int)retval);
		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, errno);
	}
#else
	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
	(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
#endif
}
/*
* This function is used to remove a request from the done queue.
*/
void
aio_req_remove_portq(aio_t *aiop, aio_req_t *reqp)
{
	ASSERT(MUTEX_HELD(&aiop->aio_portq_mutex));
	while (aiop->aio_portq == NULL) {
		/*
		 * aio_portq is set to NULL when aio_cleanup_portq()
		 * is working with the event queue.
		 * The aio_cleanup_thread() uses aio_cleanup_portq()
		 * to unlock all AIO buffers with completed transactions.
		 * Wait here until aio_cleanup_portq() restores the
		 * list of completed transactions in aio_portq.
		 */
		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
	}
	aio_deq(&aiop->aio_portq, reqp);
}
/*
 * Event-port close callback: flag all pending AIO transactions bound
 * to the closing port, wait for them to drain, then discard their
 * already-completed (but uncollected) events.
 */
/* ARGSUSED */
void
aio_close_port(void *arg, int port, pid_t pid, int lastclose)
{
	aio_t *aiop;
	aio_req_t *reqp;
	aio_req_t *next;
	aio_req_t *headp;
	int counter;

	if (arg == NULL)
		aiop = curproc->p_aio;
	else
		aiop = (aio_t *)arg;

	/*
	 * The PORT_SOURCE_AIO source is always associated with every new
	 * created port by default.
	 * If no asynchronous I/O transactions were associated with the port
	 * then the aiop pointer will still be set to NULL.
	 */
	if (aiop == NULL)
		return;

	/*
	 * Within a process event ports can be used to collect events other
	 * than PORT_SOURCE_AIO events. At the same time the process can submit
	 * asynchronous I/O transactions which are not associated with the
	 * current port.
	 * The current process oriented model of AIO uses a single queue for
	 * pending events. On close the pending queue (queue of asynchronous
	 * I/O transactions using event port notification) must be scanned
	 * to detect and handle pending I/Os using the current port.
	 */
	mutex_enter(&aiop->aio_portq_mutex);
	mutex_enter(&aiop->aio_mutex);
	counter = 0;
	if ((headp = aiop->aio_portpending) != NULL) {
		reqp = headp;
		do {
			/* mark requests bound to this port for drain */
			if (reqp->aio_req_portkev &&
			    reqp->aio_req_port == port) {
				reqp->aio_req_flags |= AIO_CLOSE_PORT;
				counter++;
			}
		} while ((reqp = reqp->aio_req_next) != headp);
	}
	if (counter == 0) {
		/* no AIOs pending */
		mutex_exit(&aiop->aio_mutex);
		mutex_exit(&aiop->aio_portq_mutex);
		return;
	}
	aiop->aio_portpendcnt += counter;
	mutex_exit(&aiop->aio_mutex);
	/* aio_done() decrements the count and broadcasts at zero */
	while (aiop->aio_portpendcnt)
		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);

	/*
	 * all pending AIOs are completed.
	 * check port doneq
	 */
	headp = NULL;
	if ((reqp = aiop->aio_portq) != NULL) {
		do {
			next = reqp->aio_req_next;
			if (reqp->aio_req_port == port) {
				/* dequeue request and discard event */
				aio_req_remove_portq(aiop, reqp);
				port_free_event(reqp->aio_req_portkev);
				/* put request in temporary queue */
				reqp->aio_req_next = headp;
				headp = reqp;
			}
		} while ((reqp = next) != aiop->aio_portq);
	}
	mutex_exit(&aiop->aio_portq_mutex);

	/* headp points to the list of requests to be discarded */
	for (reqp = headp; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free_port(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}

	/* the cleanup thread may be waiting for the drain to finish */
	if (aiop->aio_flags & AIO_CLEANUP)
		cv_broadcast(&aiop->aio_waitcv);
}
/*
 * aio_cleanup_dr_delete_memory is used by dr's delete_memory_thread
 * to kick start the aio_cleanup_thread for the given process to do the
 * necessary cleanup.
 * This is needed so that delete_memory_thread can obtain writer locks
 * on pages that need to be relocated during a dr memory delete operation,
 * otherwise a deadly embrace may occur.
 */
/*
 * Request an AIO cleanup pass on behalf of dr's delete_memory_thread.
 * Returns 1 if the process has kaio state and the cleanup thread was
 * poked, 0 otherwise.  Caller must hold the process's p_lock.
 */
int
aio_cleanup_dr_delete_memory(proc_t *procp)
{
	struct aio *aiop = procp->p_aio;
	struct as *as = procp->p_as;
	int woke = 0;

	ASSERT(MUTEX_HELD(&procp->p_lock));
	mutex_enter(&as->a_contents);
	if (aiop != NULL) {
		/* ask the cleanup thread to run, and wake it via a_cv */
		aiop->aio_rqclnup = 1;
		cv_broadcast(&as->a_cv);
		woke = 1;
	}
	mutex_exit(&as->a_contents);
	return (woke);
}