/*
 * common/os/aio.c
 * aio.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

/*
 * Kernel asynchronous I/O.
 * This is only for raw devices now (as of Nov. 1993).
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/fs/snode.h>
#include <sys/unistd.h>
#include <sys/cmn_err.h>
#include <vm/as.h>
#include <vm/faultcode.h>
#include <sys/sysmacros.h>
#include <sys/procfs.h>
#include <sys/kmem.h>
#include <sys/autoconf.h>
#include <sys/ddi_impldefs.h>
#include <sys/sunddi.h>
#include <sys/aio_impl.h>
#include <sys/debug.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/fs/pxfs_ki.h>
#include <sys/contract/process_impl.h>

/*
 * external entry point.
 *
 * These are the system call handlers installed through the sysent tables
 * further down.  kaioc() receives its six arguments directly and only
 * exists on LP64 kernels; kaio() receives an argument array plus an rval_t.
 */
#ifdef _LP64
static int64_t kaioc(long, long, long, long, long, long);
#endif
static int kaio(ulong_t *, rval_t *);


/*
 * Run modes handed by kaio()/kaioc() to the individual handlers; they select
 * which user-level aiocb layout the caller is using.
 */
#define AIO_64  0
#define AIO_32  1
#define AIO_LARGEFILE   2

/*
 * implementation specific functions (private)
 */
#ifdef _LP64
/* native list I/O handler; only called from kaioc() */
static int alio(int, int, aiocb_t **, int, struct sigevent *);
#endif
static int aionotify(void);
static int aioinit(void);
static int aiostart(void);
static void alio_cleanup(aio_t *, aiocb_t **, int, int);
/* check_vp() returns a pointer to the function that starts the I/O */
static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
    cred_t *);
static void lio_set_error(aio_req_t *);
static aio_t *aio_aiop_alloc();
static int aio_req_alloc(aio_req_t **, aio_result_t *);
static int aio_lio_alloc(aio_lio_t **);
static aio_req_t *aio_req_done(void *);
static aio_req_t *aio_req_remove(aio_req_t *);
static int aio_req_find(aio_result_t *, aio_req_t **);
static int aio_hash_insert(struct aio_req_t *, aio_t *);
static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
    aio_result_t *, int, vnode_t *);
static int aio_cleanup_thread(aio_t *);
static aio_lio_t *aio_list_get(aio_result_t *);
static void lio_set_uerror(void *, int);
extern void aio_zerolen(aio_req_t *);
/* handlers dispatched from kaio() and kaioc() */
static int aiowait(struct timeval *, int, long  *);
static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
/* helpers used by aiowaitn() */
static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
    aio_req_t *reqlist, aio_t *aiop, model_t model);
static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
static int aiosuspend(void *, int, struct  timespec *, int,
    long    *, int);
static int aliowait(int, void *, int, void *, int);
static int aioerror(void *, int);
static int aio_cancel(int, void *, long *, int);
static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
static int aiorw(int, void *, int, int);

/* list I/O handlers reached only through kaio() (AIOLIO64 and AIOLIO) */
static int alioLF(int, void *, int, void *);
static int aio_req_setupLF(aio_req_t **, aio_t *,
    aiocb64_32_t *, aio_result_t *, int, vnode_t *);
static int alio32(int, void *, int, void *);
static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);

#ifdef  _SYSCALL32_IMPL
/* converters from the 32-bit aiocb layouts to the native aiocb_t */
static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
void    aiocb_32ton(aiocb32_t *, aiocb_t *);
#endif /* _SYSCALL32_IMPL */

/*
 * implementation specific functions (external)
 */
void aio_req_free(aio_t *, aio_req_t *);

/*
 * Event Port framework
 */

void aio_req_free_port(aio_t *, aio_req_t *);
static int aio_port_callback(void *, int *, pid_t, int, void *);
/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>
#include <sys/syscall.h>

#ifdef _LP64

/*
 * System call table entry for LP64 kernels.  The leading 6 matches the six
 * long arguments kaioc() is declared with.
 */
static struct sysent kaio_sysent = {
    6,
    SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
    (int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
/*
 * System call table entry for the 32-bit compatibility module on an LP64
 * kernel.  kaio() reads argument words uap[0] through uap[6], hence 7.
 */
static struct sysent kaio_sysent32 = {
    7,
    SE_NOUNLOAD | SE_64RVAL,
    kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

/*
 * System call table entry for non-LP64 kernels; kaio() is the only handler
 * and reads argument words uap[0] through uap[6].
 */
static struct sysent kaio_sysent = {
    7,
    SE_NOUNLOAD | SE_32RVAL1,
    kaio
};

#endif  /* _LP64 */

/*
 * Module linkage information for the kernel.
 *
 * Each modlsys ties an ops vector and a descriptive name to one of the
 * sysent entries defined above.
 */

static struct modlsys modlsys = {
    &mod_syscallops,
    "kernel Async I/O",
    &kaio_sysent
};

#ifdef  _SYSCALL32_IMPL
/* linkage for the 32-bit compatibility entry, kaio_sysent32 */
static struct modlsys modlsys32 = {
    &mod_syscallops32,
    "kernel Async I/O for 32 bit compatibility",
    &kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */


/*
 * Top-level linkage handed to mod_install(), mod_remove() and mod_info():
 * a NULL-terminated list of the modlsys structures defined above.
 */
static struct modlinkage modlinkage = {
    MODREV_1,
    &modlsys,
#ifdef  _SYSCALL32_IMPL
    &modlsys32,
#endif
    NULL
};

/*
 * Module load entry point: install the linkage described by modlinkage.
 * Returns 0 on success, otherwise the value reported by mod_install().
 */
int
_init(void)
{
    return (mod_install(&modlinkage));
}

/*
 * Module unload entry point: hand the linkage to mod_remove() and pass its
 * result straight back to the caller.
 */
int
_fini(void)
{
    return (mod_remove(&modlinkage));
}

/*
 * Module information entry point: let mod_info() fill in *modinfop from
 * modlinkage and return whatever it returns.
 */
int
_info(struct modinfo *modinfop)
{
    int status;

    status = mod_info(&modlinkage, modinfop);
    return (status);
}

#ifdef  _LP64
static int64_t
kaioc(
    long    a0,
    long    a1,
    long    a2,
    long    a3,
    long    a4,
    long    a5)
{
    int error;
    long    rval = 0;

    switch ((int)a0 & ~AIO_POLL_BIT) {
    case AIOREAD:
        error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
            (offset_t)a4, (aio_result_t *)a5, FREAD);
        break;
    case AIOWRITE:
        error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
            (offset_t)a4, (aio_result_t *)a5, FWRITE);
        break;
    case AIOWAIT:
        error = aiowait((struct timeval *)a1, (int)a2, &rval);
        break;
    case AIOWAITN:
        error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
            (timespec_t *)a4);
        break;
    case AIONOTIFY:
        error = aionotify();
        break;
    case AIOINIT:
        error = aioinit();
        break;
    case AIOSTART:
        error = aiostart();
        break;
    case AIOLIO:
        error = alio((int)a0, (int)a1, (aiocb_t **)a2, (int)a3,
            (struct sigevent *)a4);
        break;
    case AIOLIOWAIT:
        error = aliowait((int)a1, (void *)a2, (int)a3,
            (struct sigevent *)a4, AIO_64);
        break;
    case AIOSUSPEND:
        error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
            (int)a4, &rval, AIO_64);
        break;
    case AIOERROR:
        error = aioerror((void *)a1, AIO_64);
        break;
    case AIOAREAD:
        error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
        break;
    case AIOAWRITE:
        error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
        break;
    case AIOCANCEL:
        error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
        break;

    /*
     * The large file related stuff is valid only for
     * 32 bit kernel and not for 64 bit kernel
     * On 64 bit kernel we convert large file calls
     * to regular 64bit calls.
     */

    default:
        error = EINVAL;
    }
    if (error)
        return ((int64_t)set_errno(error));
    return (rval);
}
#endif

/*
 * Array-argument system call handler (32-bit callers, or any caller on a
 * non-LP64 kernel).
 *
 * uap[0] is the operation code, optionally or'ed with AIO_POLL_BIT; the
 * remaining words depend on the operation.  Operations that produce a
 * result value (wait, suspend, cancel) store it in rvp->r_val1 and return
 * their error code; every other operation returns the handler's value
 * directly, leaving rvp zeroed.
 */
static int
kaio(
    ulong_t *uap,
    rval_t *rvp)
{
    long    result = 0;
    int err = 0;
    offset_t    off;

    rvp->r_vals = 0;

    /* rebuild the 64-bit file offset from argument words 4 and 5 */
#if defined(_LITTLE_ENDIAN)
    off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
#else
    off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
#endif

    switch (uap[0] & ~AIO_POLL_BIT) {
    /*
     * It must be the 32 bit system call on 64 bit kernel
     */
    case AIOREAD:
        return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
            (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
    case AIOWRITE:
        return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
            (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
    case AIOWAIT:
        err = aiowait((struct timeval *)uap[1], (int)uap[2],
            &result);
        break;
    case AIOWAITN:
        err = aiowaitn((void *)uap[1], (uint_t)uap[2],
            (uint_t *)uap[3], (timespec_t *)uap[4]);
        break;
    case AIONOTIFY:
        return (aionotify());
    case AIOINIT:
        return (aioinit());
    case AIOSTART:
        return (aiostart());

    /* requests using the 32-bit aiocb layout */
    case AIOLIO:
        return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
            (void *)uap[4]));
    case AIOLIOWAIT:
        return (aliowait((int)uap[1], (void *)uap[2],
            (int)uap[3], (struct sigevent *)uap[4], AIO_32));
    case AIOSUSPEND:
        err = aiosuspend((void *)uap[1], (int)uap[2],
            (timespec_t *)uap[3], (int)uap[4],
            &result, AIO_32);
        break;
    case AIOERROR:
        return (aioerror((void *)uap[1], AIO_32));
    case AIOAREAD:
        return (aiorw((int)uap[0], (void *)uap[1],
            FREAD, AIO_32));
    case AIOAWRITE:
        return (aiorw((int)uap[0], (void *)uap[1],
            FWRITE, AIO_32));
    case AIOCANCEL:
        err = aio_cancel((int)uap[1], (void *)uap[2], &result,
            AIO_32);
        break;

    /* requests using the large file aiocb layout */
    case AIOLIO64:
        return (alioLF((int)uap[1], (void *)uap[2],
            (int)uap[3], (void *)uap[4]));
    case AIOLIOWAIT64:
        return (aliowait((int)uap[1], (void *)uap[2],
            (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
    case AIOSUSPEND64:
        err = aiosuspend((void *)uap[1], (int)uap[2],
            (timespec_t *)uap[3], (int)uap[4], &result,
            AIO_LARGEFILE);
        break;
    case AIOERROR64:
        return (aioerror((void *)uap[1], AIO_LARGEFILE));
    case AIOAREAD64:
        return (aiorw((int)uap[0], (void *)uap[1], FREAD,
            AIO_LARGEFILE));
    case AIOAWRITE64:
        return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
            AIO_LARGEFILE));
    case AIOCANCEL64:
        err = aio_cancel((int)uap[1], (void *)uap[2],
            &result, AIO_LARGEFILE);
        break;
    default:
        return (EINVAL);
    }

    /* only the wait/suspend/cancel style operations get here */
    rvp->r_val1 = result;
    return (err);
}

/*
 * Bump the process's notify count and wake every LWP sleeping on
 * aio_waitcv (aiowait() and the other waiters in this file block on it).
 * A process that has no aio state yet is silently ignored.
 * Always returns 0.
 */
static int
aionotify(void)
{
    aio_t   *ap = curproc->p_aio;

    if (ap != NULL) {
        mutex_enter(&ap->aio_mutex);
        ap->aio_notifycnt++;
        cv_broadcast(&ap->aio_waitcv);
        mutex_exit(&ap->aio_mutex);
    }

    return (0);
}

/*
 * Translate the user-supplied timeval pointer of aiowait() into a relative
 * timestruc_t.
 *
 *   timout == NULL      wait indefinitely: *blocking = 1, *rqtp = NULL
 *   timout == -1        poll:              *blocking = 0, *rqtp = NULL
 *   timeval {0, 0}      poll:              *blocking = 0, *rqtp = NULL
 *   any other timeval   timed wait:        *blocking = 1, *rqtp = rqtime
 *
 * Returns 0 on success, EFAULT if the timeval cannot be copied in, or
 * EINVAL if it holds a negative or out-of-range field.
 */
static int
timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef  _SYSCALL32_IMPL
    struct timeval32 tv32;
#endif
    struct timeval tv;
    model_t datamodel = get_udatamodel();

    *rqtp = NULL;

    /* no timeout pointer at all: wait indefinitely */
    if (timout == NULL) {
        *blocking = 1;
        return (0);
    }

    /*
     * The "-1" user pointer means "don't wait".  It has to be compared
     * in the width of the calling application.
     */
    if (datamodel == DATAMODEL_NATIVE) {
        if ((intptr_t)timout == (intptr_t)-1) {
            *blocking = 0;
            return (0);
        }

        if (copyin(timout, &tv, sizeof (tv)) != 0)
            return (EFAULT);
    }
#ifdef  _SYSCALL32_IMPL
    else {
        /* a 32 bit -1 arrives without sign extension */
        if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
            *blocking = 0;
            return (0);
        }

        if (copyin(timout, &tv32, sizeof (tv32)) != 0)
            return (EFAULT);
        TIMEVAL32_TO_TIMEVAL(&tv, &tv32);
    }
#endif  /* _SYSCALL32_IMPL */

    if (tv.tv_sec != 0 || tv.tv_usec != 0) {
        /* a real timeout: validate it and convert usec to nsec */
        if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
            tv.tv_usec >= MICROSEC)
            return (EINVAL);

        rqtime->tv_sec = tv.tv_sec;
        rqtime->tv_nsec = tv.tv_usec * 1000;
        *rqtp = rqtime;
        *blocking = 1;
    } else {
        /* zero timeout: don't wait */
        *blocking = 0;
    }

    return (0);
}

/*
 * Translate the user-supplied timespec pointer of aiowaitn()/aiosuspend()
 * into a relative timestruc_t.
 *
 *   timout == NULL      wait indefinitely: *blocking = 1, *rqtp = NULL
 *   timespec {0, 0}     poll:              *blocking = 0, *rqtp = NULL
 *   any other timespec  timed wait:        *blocking = 1, *rqtp = rqtime
 *
 * The user value is copied into *rqtime before it is validated.
 * Returns 0 on success, EFAULT if the copyin fails, or EINVAL if a field
 * is negative or tv_nsec is not below NANOSEC.
 */
static int
timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef  _SYSCALL32_IMPL
    timespec32_t ts32;
#endif
    model_t datamodel = get_udatamodel();

    *rqtp = NULL;

    if (timout == NULL) {
        *blocking = 1;
        return (0);
    }

    if (datamodel == DATAMODEL_NATIVE) {
        if (copyin(timout, rqtime, sizeof (*rqtime)) != 0)
            return (EFAULT);
    }
#ifdef  _SYSCALL32_IMPL
    else {
        if (copyin(timout, &ts32, sizeof (ts32)) != 0)
            return (EFAULT);
        TIMESPEC32_TO_TIMESPEC(rqtime, &ts32);
    }
#endif  /* _SYSCALL32_IMPL */

    if (rqtime->tv_sec != 0 || rqtime->tv_nsec != 0) {
        if (rqtime->tv_sec < 0 || rqtime->tv_nsec < 0 ||
            rqtime->tv_nsec >= NANOSEC)
            return (EINVAL);

        *rqtp = rqtime;
        *blocking = 1;
    } else {
        /* zero timeout: don't wait */
        *blocking = 0;
    }

    return (0);
}

/*
 * Wait for one asynchronous request of the current process to complete.
 *
 *   timout        user timeval pointer; see timeval2reltime() for the
 *                 NULL / -1 / zero conventions
 *   dontblockflg  when non-zero, fail with EINVAL instead of waiting if
 *                 nothing is outstanding
 *   rval          receives the completed request's result pointer, 1 when
 *                 the wakeup came from aionotify(), or -1 on EINTR
 *
 * Returns 0 on success, EINVAL if the process has no aio state or nothing
 * is outstanding (with dontblockflg set), EINTR if interrupted by a signal,
 * ETIME if the timeout expired, or an error from timeval2reltime().
 */
/*ARGSUSED*/
static int
aiowait(
    struct timeval  *timout,
    int dontblockflg,
    long    *rval)
{
    int         error;
    aio_t       *aiop;
    aio_req_t   *reqp;
    clock_t     status;
    int     blocking;
    /*
     * Only meaningful when rqtp != NULL, but it is always passed by
     * value to cv_waituntil_sig(); give it a defined value.
     */
    int     timecheck = 0;
    timestruc_t rqtime;
    timestruc_t *rqtp;

    aiop = curproc->p_aio;
    if (aiop == NULL)
        return (EINVAL);

    /*
     * Establish the absolute future time for the timeout.
     */
    error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
    if (error)
        return (error);
    if (rqtp) {
        timestruc_t now;
        timecheck = timechanged;
        gethrestime(&now);
        timespecadd(rqtp, &now);
    }

    mutex_enter(&aiop->aio_mutex);
    for (;;) {
        /* process requests on poll queue */
        if (aiop->aio_pollq) {
            mutex_exit(&aiop->aio_mutex);
            aio_cleanup(0);
            mutex_enter(&aiop->aio_mutex);
        }
        if ((reqp = aio_req_remove(NULL)) != NULL) {
            *rval = (long)reqp->aio_req_resultp;
            break;
        }
        /* user-level done queue might not be empty */
        if (aiop->aio_notifycnt > 0) {
            aiop->aio_notifycnt--;
            *rval = 1;
            break;
        }
        /* don't block if no outstanding aio */
        if (aiop->aio_outstanding == 0 && dontblockflg) {
            error = EINVAL;
            break;
        }
        if (blocking) {
            status = cv_waituntil_sig(&aiop->aio_waitcv,
                &aiop->aio_mutex, rqtp, timecheck);

            if (status > 0)     /* check done queue again */
                continue;
            if (status == 0) {  /* interrupted by a signal */
                error = EINTR;
                *rval = -1;
            } else {        /* timer expired */
                error = ETIME;
            }
        }
        break;
    }
    mutex_exit(&aiop->aio_mutex);

    /*
     * Finish the request outside aio_mutex: unlock its memory and copy
     * the result out, then retake the mutex just to free the request.
     */
    if (reqp) {
        aphysio_unlock(reqp);
        aio_copyout_result(reqp);
        mutex_enter(&aiop->aio_mutex);
        aio_req_free(aiop, reqp);
        mutex_exit(&aiop->aio_mutex);
    }
    return (error);
}

/*
 * aiowaitn can be used to reap completed asynchronous requests submitted with
 * lio_listio, aio_read or aio_write.
 * This function only reaps asynchronous raw I/Os.
 *
 *   uiocb   user array that receives the iocb pointers of reaped requests
 *   nent    number of slots in uiocb; must be non-zero
 *   nwait   in: number of requests to wait for (0 means poll and take up
 *           to nent); out: number of requests actually reaped.  A value
 *           greater than nent is rejected.
 *   timout  user timespec pointer; see timespec2reltime()
 *
 * Returns 0 on success, EINVAL for missing aio state or invalid nent/nwait,
 * EAGAIN if nothing is outstanding or a non-blocking call finds another
 * aiowaitn active, EFAULT on copyin/copyout failure, ENOMEM if the iocb
 * array cannot be allocated, EINTR if interrupted, or ETIME on timeout.
 */

/*ARGSUSED*/
static int
aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
{
    int         error = 0;
    aio_t       *aiop;
    aio_req_t   *reqlist = NULL;
    caddr_t     iocblist = NULL;    /* array of iocb ptr's */
    uint_t      waitcnt, cnt = 0;   /* iocb cnt */
    size_t      iocbsz;         /* users iocb size */
    size_t      riocbsz;        /* returned iocb size */
    int     iocb_index = 0;
    model_t     model = get_udatamodel();
    int     blocking = 1;
    /*
     * Only meaningful when rqtp != NULL, but it is always passed by
     * value to cv_waituntil_sig(); give it a defined value.
     */
    int     timecheck = 0;
    timestruc_t rqtime;
    timestruc_t *rqtp;

    /* an empty result array can never hold a reaped request */
    aiop = curproc->p_aio;
    if (aiop == NULL || nent == 0)
        return (EINVAL);

    if (aiop->aio_outstanding == 0)
        return (EAGAIN);

    if (copyin(nwait, &waitcnt, sizeof (uint_t)))
        return (EFAULT);

    /* set *nwait to zero, if we must return prematurely */
    if (copyout(&cnt, nwait, sizeof (uint_t)))
        return (EFAULT);

    /*
     * The iocb array below is sized for nent entries.  Waiting for
     * more than nent requests would let the loop keep collecting after
     * the array is full (aio_reqlist_concat() called with no room
     * left), so reject it up front.
     */
    if (waitcnt > nent)
        return (EINVAL);

    if (waitcnt == 0) {
        blocking = 0;
        rqtp = NULL;
        waitcnt = nent;
    } else {
        error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
        if (error)
            return (error);
    }

    if (model == DATAMODEL_NATIVE)
        iocbsz = (sizeof (aiocb_t *) * nent);
#ifdef  _SYSCALL32_IMPL
    else
        iocbsz = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

    /*
     * Only one aio_waitn call is allowed at a time.
     * The active aio_waitn will collect all requests
     * out of the "done" list and if necessary it will wait
     * for some/all pending requests to fulfill the nwait
     * parameter.
     * A second or further aio_waitn calls will sleep here
     * until the active aio_waitn finishes and leaves the kernel
     * If the second call does not block (poll), then return
     * immediately with the error code : EAGAIN.
     * If the second call should block, then sleep here, but
     * do not touch the timeout. The timeout starts when this
     * aio_waitn-call becomes active.
     */

    mutex_enter(&aiop->aio_mutex);

    while (aiop->aio_flags & AIO_WAITN) {
        if (blocking == 0) {
            mutex_exit(&aiop->aio_mutex);
            return (EAGAIN);
        }

        /* block, no timeout */
        aiop->aio_flags |= AIO_WAITN_PENDING;
        if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
            mutex_exit(&aiop->aio_mutex);
            return (EINTR);
        }
    }

    /*
     * Establish the absolute future time for the timeout.
     */
    if (rqtp) {
        timestruc_t now;
        timecheck = timechanged;
        gethrestime(&now);
        timespecadd(rqtp, &now);
    }

    /* the cached iocb array is too small for this call: drop it */
    if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
        kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
        aiop->aio_iocb = NULL;
    }

    if (aiop->aio_iocb == NULL) {
        iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
        if (iocblist == NULL) {
            mutex_exit(&aiop->aio_mutex);
            return (ENOMEM);
        }
        aiop->aio_iocb = (aiocb_t **)iocblist;
        aiop->aio_iocbsz = iocbsz;
    } else {
        iocblist = (char *)aiop->aio_iocb;
    }

    aiop->aio_waitncnt = waitcnt;
    aiop->aio_flags |= AIO_WAITN;

    for (;;) {
        /* push requests on poll queue to done queue */
        if (aiop->aio_pollq) {
            mutex_exit(&aiop->aio_mutex);
            aio_cleanup(0);
            mutex_enter(&aiop->aio_mutex);
        }

        /* check for requests on done queue */
        if (aiop->aio_doneq) {
            cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
            aiop->aio_waitncnt = waitcnt - cnt;
        }

        /* user-level done queue might not be empty */
        if (aiop->aio_notifycnt > 0) {
            aiop->aio_notifycnt--;
            error = 0;
            break;
        }

        /*
         * if we are here second time as a result of timer
         * expiration, we reset error if there are enough
         * aiocb's to satisfy request.
         * We return also if all requests are already done
         * and we picked up the whole done queue.
         */

        if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
            aiop->aio_doneq == NULL)) {
            error = 0;
            break;
        }

        if ((cnt < waitcnt) && blocking) {
            int rval = cv_waituntil_sig(&aiop->aio_waitcv,
                &aiop->aio_mutex, rqtp, timecheck);
            if (rval > 0)
                continue;
            if (rval < 0) {
                /* timed out: take one last non-blocking look */
                error = ETIME;
                blocking = 0;
                continue;
            }
            error = EINTR;
        }
        break;
    }

    mutex_exit(&aiop->aio_mutex);

    if (cnt > 0) {

        iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
            aiop, model);

        if (model == DATAMODEL_NATIVE)
            riocbsz = (sizeof (aiocb_t *) * cnt);
#ifdef  _SYSCALL32_IMPL
        else
            riocbsz = (sizeof (caddr32_t) * cnt);
#endif  /* _SYSCALL32_IMPL */

        if (copyout(iocblist, uiocb, riocbsz) ||
            copyout(&cnt, nwait, sizeof (uint_t)))
            error = EFAULT;
    }

    /* don't keep an oversized iocb array cached between calls */
    if (aiop->aio_iocbsz > AIO_IOCB_MAX) {
        kmem_free(iocblist, aiop->aio_iocbsz);
        aiop->aio_iocb = NULL;
    }

    /* check if there is another thread waiting for execution */
    mutex_enter(&aiop->aio_mutex);
    aiop->aio_flags &= ~AIO_WAITN;
    if (aiop->aio_flags & AIO_WAITN_PENDING) {
        aiop->aio_flags &= ~AIO_WAITN_PENDING;
        cv_signal(&aiop->aio_waitncv);
    }
    mutex_exit(&aiop->aio_mutex);

    return (error);
}

/*
 * aio_unlock_requests
 * Walks the singly linked reqlist and, for every request on it:
 *  - records the request's user iocb pointer in iocblist, starting at slot
 *    iocb_index and using the pointer width of the caller's data model,
 *  - unlocks the memory range locked for the request,
 *  - copies the result out to the user, and
 *  - frees the request structure (under aio_mutex).
 * Returns the index of the first unused slot in iocblist.
 */

static int
aio_unlock_requests(
    caddr_t iocblist,
    int iocb_index,
    aio_req_t *reqlist,
    aio_t   *aiop,
    model_t model)
{
    aio_req_t   *cur, *following;

    if (model == DATAMODEL_NATIVE) {
        caddr_t *slots = (caddr_t *)iocblist;

        cur = reqlist;
        while (cur != NULL) {
            slots[iocb_index] = cur->aio_req_iocb.iocb;
            iocb_index++;
            /* fetch the link before the request is freed */
            following = cur->aio_req_next;
            aphysio_unlock(cur);
            aio_copyout_result(cur);
            mutex_enter(&aiop->aio_mutex);
            aio_req_free(aiop, cur);
            mutex_exit(&aiop->aio_mutex);
            cur = following;
        }
    }
#ifdef  _SYSCALL32_IMPL
    else {
        caddr32_t *slots32 = (caddr32_t *)iocblist;

        cur = reqlist;
        while (cur != NULL) {
            slots32[iocb_index] = cur->aio_req_iocb.iocb32;
            iocb_index++;
            /* fetch the link before the request is freed */
            following = cur->aio_req_next;
            aphysio_unlock(cur);
            aio_copyout_result(cur);
            mutex_enter(&aiop->aio_mutex);
            aio_req_free(aiop, cur);
            mutex_exit(&aiop->aio_mutex);
            cur = following;
        }
    }
#endif  /* _SYSCALL32_IMPL */
    return (iocb_index);
}

/*
 * aio_reqlist_concat
 * moves up to "max" elements from the head of the done queue to the front
 * of the reqlist queue and clears their AIO_DONEQ flag.
 * - reqlist queue is a simple linked list
 * - done queue is a double linked, circular list
 *
 * Returns the number of elements moved.  Nothing is moved (and 0 is
 * returned) when max is not positive or the done queue is empty.
 */

static int
aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
{
    aio_req_t *q2, *q2work, *list;
    int count = 0;

    /*
     * Without this guard a non-positive max skips the loop below,
     * leaves q2work == q2 and detaches the entire done queue while
     * reporting 0 moved elements; an empty done queue would be
     * dereferenced.
     */
    if (max <= 0 || aiop->aio_doneq == NULL)
        return (0);

    list = *reqlist;
    q2 = aiop->aio_doneq;
    q2work = q2;
    while (max-- > 0) {
        q2work->aio_req_flags &= ~AIO_DONEQ;
        q2work = q2work->aio_req_next;
        count++;
        if (q2work == q2)
            break;
    }

    if (q2work == q2) {
        /* all elements revised */
        q2->aio_req_prev->aio_req_next = list;
        list = q2;
        aiop->aio_doneq = NULL;
    } else {
        /*
         * max < elements in the doneq
         * detach only the required amount of elements
         * out of the doneq
         */
        q2work->aio_req_prev->aio_req_next = list;
        list = q2;

        /* q2work becomes the new head; close the circle around it */
        aiop->aio_doneq = q2work;
        q2work->aio_req_prev = q2->aio_req_prev;
        q2->aio_req_prev->aio_req_next = q2work;
    }
    *reqlist = list;
    return (count);
}

/*
 * Wait until at least one of the requests named in a user-supplied list of
 * aiocb pointers has completed.
 *
 *   aiocb     user address of an array of nent aiocb pointers (pointer
 *             width follows the caller's data model); NULL entries are
 *             skipped
 *   nent      number of entries in the array; must be positive
 *   timout    user timespec pointer; see timespec2reltime()
 *   flag      not used in this function
 *   rval      set to 1 when the wakeup came from aionotify()
 *   run_mode  AIO_64, AIO_32 or AIO_LARGEFILE; selects the aiocb layout
 *             used to locate each entry's aio_resultp
 *
 * Returns 0 when a listed request completed or a notify was consumed,
 * EINVAL for missing aio state or nent <= 0, EAGAIN when nothing is
 * outstanding or a non-blocking call found nothing, ENOMEM, EFAULT,
 * EINTR, ETIME, or an error from timespec2reltime().
 */
/*ARGSUSED*/
static int
aiosuspend(
    void    *aiocb,
    int nent,
    struct  timespec    *timout,
    int flag,
    long    *rval,
    int run_mode)
{
    int         error;
    aio_t       *aiop;
    aio_req_t   *reqp, *found, *next;
    caddr_t     cbplist = NULL;
    aiocb_t     *cbp, **ucbp;
#ifdef  _SYSCALL32_IMPL
    aiocb32_t   *cbp32;
    caddr32_t   *ucbp32;
#endif  /* _SYSCALL32_IMPL */
    aiocb64_32_t    *cbp64;
    int     rv;
    int     i;
    size_t      ssize;
    model_t     model = get_udatamodel();
    int     blocking;
    /*
     * NOTE(review): timecheck is only assigned when rqtp != NULL but is
     * always passed to cv_waituntil_sig() below.
     */
    int     timecheck;
    timestruc_t rqtime;
    timestruc_t *rqtp;

    aiop = curproc->p_aio;
    if (aiop == NULL || nent <= 0)
        return (EINVAL);

    /*
     * Establish the absolute future time for the timeout.
     */
    error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
    if (error)
        return (error);
    if (rqtp) {
        timestruc_t now;
        timecheck = timechanged;
        gethrestime(&now);
        timespecadd(rqtp, &now);
    }

    /*
     * If we are not blocking and there's no IO complete
     * skip aiocb copyin.
     */
    if (!blocking && (aiop->aio_pollq == NULL) &&
        (aiop->aio_doneq == NULL)) {
        return (EAGAIN);
    }

    /* size of the user's pointer array in the caller's data model */
    if (model == DATAMODEL_NATIVE)
        ssize = (sizeof (aiocb_t *) * nent);
#ifdef  _SYSCALL32_IMPL
    else
        ssize = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

    cbplist = kmem_alloc(ssize, KM_NOSLEEP);
    if (cbplist == NULL)
        return (ENOMEM);

    if (copyin(aiocb, cbplist, ssize)) {
        error = EFAULT;
        goto done;
    }

    found = NULL;
    /*
     * we need to get the aio_cleanupq_mutex since we call
     * aio_req_done().
     */
    mutex_enter(&aiop->aio_cleanupq_mutex);
    mutex_enter(&aiop->aio_mutex);
    for (;;) {
        /* push requests on poll queue to done queue */
        if (aiop->aio_pollq) {
            /* both mutexes are dropped around aio_cleanup() */
            mutex_exit(&aiop->aio_mutex);
            mutex_exit(&aiop->aio_cleanupq_mutex);
            aio_cleanup(0);
            mutex_enter(&aiop->aio_cleanupq_mutex);
            mutex_enter(&aiop->aio_mutex);
        }
        /* check for requests on done queue */
        if (aiop->aio_doneq) {
            /* restart at the beginning of the copied-in array */
            if (model == DATAMODEL_NATIVE)
                ucbp = (aiocb_t **)cbplist;
#ifdef  _SYSCALL32_IMPL
            else
                ucbp32 = (caddr32_t *)cbplist;
#endif  /* _SYSCALL32_IMPL */
            for (i = 0; i < nent; i++) {
                /*
                 * Only the address of each entry's aio_resultp
                 * member is computed and handed to
                 * aio_req_done(); the user aiocb itself is not
                 * read here.
                 */
                if (model == DATAMODEL_NATIVE) {
                    if ((cbp = *ucbp++) == NULL)
                        continue;
                    if (run_mode != AIO_LARGEFILE)
                        reqp = aio_req_done(
                            &cbp->aio_resultp);
                    else {
                        cbp64 = (aiocb64_32_t *)cbp;
                        reqp = aio_req_done(
                            &cbp64->aio_resultp);
                    }
                }
#ifdef  _SYSCALL32_IMPL
                else {
                    /*
                     * NOTE(review): reqp is left untouched
                     * here if run_mode is neither AIO_32
                     * nor AIO_LARGEFILE.
                     */
                    if (run_mode == AIO_32) {
                        if ((cbp32 =
                            (aiocb32_t *)(uintptr_t)
                            *ucbp32++) == NULL)
                            continue;
                        reqp = aio_req_done(
                            &cbp32->aio_resultp);
                    } else if (run_mode == AIO_LARGEFILE) {
                        if ((cbp64 =
                            (aiocb64_32_t *)(uintptr_t)
                            *ucbp32++) == NULL)
                            continue;
                            reqp = aio_req_done(
                            &cbp64->aio_resultp);
                    }

                }
#endif  /* _SYSCALL32_IMPL */
                /* chain completed requests onto "found" */
                if (reqp) {
                    reqp->aio_req_next = found;
                    found = reqp;
                }
                /* nothing left on the done queue to match */
                if (aiop->aio_doneq == NULL)
                    break;
            }
            if (found)
                break;
        }
        if (aiop->aio_notifycnt > 0) {
            /*
             * nothing on the kernel's queue. the user
             * has notified the kernel that it has items
             * on a user-level queue.
             */
            aiop->aio_notifycnt--;
            *rval = 1;
            error = 0;
            break;
        }
        /* don't block if nothing is outstanding */
        if (aiop->aio_outstanding == 0) {
            error = EAGAIN;
            break;
        }
        if (blocking) {
            /*
             * drop the aio_cleanupq_mutex as we are
             * going to block.
             */
            mutex_exit(&aiop->aio_cleanupq_mutex);
            rv = cv_waituntil_sig(&aiop->aio_waitcv,
                &aiop->aio_mutex, rqtp, timecheck);
            /*
             * we have to drop aio_mutex and
             * grab it in the right order.
             */
            mutex_exit(&aiop->aio_mutex);
            mutex_enter(&aiop->aio_cleanupq_mutex);
            mutex_enter(&aiop->aio_mutex);
            if (rv > 0) /* check done queue again */
                continue;
            if (rv == 0)    /* interrupted by a signal */
                error = EINTR;
            else        /* timer expired */
                error = ETIME;
        } else {
            error = EAGAIN;
        }
        break;
    }
    mutex_exit(&aiop->aio_mutex);
    mutex_exit(&aiop->aio_cleanupq_mutex);
    /*
     * Finish every matched request outside the mutexes; aio_mutex is
     * retaken only around aio_req_free().
     */
    for (reqp = found; reqp != NULL; reqp = next) {
        next = reqp->aio_req_next;
        aphysio_unlock(reqp);
        aio_copyout_result(reqp);
        mutex_enter(&aiop->aio_mutex);
        aio_req_free(aiop, reqp);
        mutex_exit(&aiop->aio_mutex);
    }
done:
    kmem_free(cbplist, ssize);
    return (error);
}

/*
 * Make sure the current process has an aio_t.  If p_aio is still unset,
 * one is obtained from aio_aiop_alloc() and installed, all under p_lock.
 * Returns 0 if the process has an aio_t afterwards, ENOMEM otherwise.
 */
static int
aioinit(void)
{
    proc_t *procp = curproc;
    aio_t *ap;

    mutex_enter(&procp->p_lock);
    ap = procp->p_aio;
    if (ap == NULL) {
        ap = aio_aiop_alloc();
        procp->p_aio = ap;
    }
    mutex_exit(&procp->p_lock);

    return (ap != NULL ? 0 : ENOMEM);
}

/*
 * start a special thread that will cleanup after aio requests
 * that are preventing a segment from being unmapped. as_unmap()
 * blocks until all physio to this segment is completed. this
 * doesn't happen until all the pages in this segment are not
 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
 * requests still outstanding. this special thread will make sure
 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
 *
 * this function will return an error if the process has only
 * one LWP. the assumption is that the caller is a separate LWP
 * that remains blocked in the kernel for the life of this process.
 *
 * Returns EDEADLK for a single-LWP process, EINVAL if the process has no
 * aio state, 0 if the cleanup thread was already started, and otherwise
 * whatever aio_cleanup_thread() returns.
 */
static int
aiostart(void)
{
    proc_t *procp = curproc;
    aio_t *ap;
    int already_started;

    if (procp->p_lwpcnt == 1)
        return (EDEADLK);

    mutex_enter(&procp->p_lock);
    ap = procp->p_aio;
    if (ap == NULL) {
        mutex_exit(&procp->p_lock);
        return (EINVAL);
    }
    /* only the first caller flips aio_ok and becomes the thread */
    already_started = ap->aio_ok;
    if (already_started == 0)
        ap->aio_ok = 1;
    mutex_exit(&procp->p_lock);

    if (already_started != 0)
        return (0);

    /* should return only to exit */
    return (aio_cleanup_thread(ap));
}

/*
 * Associate an aiocb with a port.
 * This function is used by aiorw() to associate a transaction with a port.
 * An event structure is allocated with port_alloc_event() and initialized
 * with the aiocb address, the user pointer from the port_notify_t
 * (portnfy_user) and aio_port_callback().  The event and the port are then
 * recorded in the request (aio_req_portkev, aio_req_port), which is how the
 * port association is identified later.
 *
 * Returns 0 on success, EAGAIN if port_alloc_event() failed with ENOMEM or
 * EAGAIN, and EINVAL for any other allocation failure.
 */

static int
aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp, aio_req_t *reqp)
{
    port_kevent_t   *kev = NULL;
    int     rc;

    rc = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
        PORT_SOURCE_AIO, &kev);
    if (rc != 0)
        return ((rc == ENOMEM || rc == EAGAIN) ? EAGAIN : EINVAL);

    port_init_event(kev, (uintptr_t)cbp, pntfy->portnfy_user,
        aio_port_callback, reqp);
    reqp->aio_req_portkev = kev;
    reqp->aio_req_port = pntfy->portnfy_port;

    return (0);
}

/*
 * Associate one aiocb of a list I/O request with an event port; used
 * by the lio_listio() path.
 *
 * Port notification can be requested in two places:
 *   - in the sigevent embedded in the aiocb itself (sigev points at
 *     the kernel copy of that sigevent). When its sigev_notify is
 *     SIGEV_PORT, the port_notify_t it references is copied in from
 *     user space, a fresh event is allocated on that port and the
 *     request's aio_req_port is set to it;
 *   - in the sigevent passed to lio_listio(), which applies to every
 *     aiocb in the list. In that case the already allocated list-wide
 *     event (pkevtp) is duplicated and the list-wide cookie (user) is
 *     used.
 *
 * Either way the resulting event is initialized with the user aiocb
 * address and aio_port_callback, and stored in aio_req_portkev.
 *
 * Returns 0 on success, EFAULT if the port_notify_t cannot be copied
 * in, EAGAIN if the event cannot be allocated (ENOMEM/EAGAIN) or
 * duplicated, EINVAL for any other allocation failure.
 */

static int
aio_req_assoc_port(struct sigevent *sigev, void *user, aiocb_t *cbp,
    aio_req_t *reqp, port_kevent_t *pkevtp)
{
    port_kevent_t   *kev = NULL;
    port_notify_t   own;
    int     rc;

    if (sigev->sigev_notify != SIGEV_PORT) {
        /* no private notification: clone the list-wide event */
        if (port_dup_event(pkevtp, &kev, PORT_ALLOC_DEFAULT) != 0)
            return (EAGAIN);
        port_init_event(kev, (uintptr_t)cbp, user, aio_port_callback,
            reqp);
        reqp->aio_req_portkev = kev;
        return (0);
    }

    /* the aiocb carries its own port notification */
    if (copyin((void *)sigev->sigev_value.sival_ptr, &own,
        sizeof (port_notify_t)) != 0)
        return (EFAULT);

    rc = port_alloc_event(own.portnfy_port, PORT_ALLOC_DEFAULT,
        PORT_SOURCE_AIO, &kev);
    if (rc != 0)
        return ((rc == ENOMEM || rc == EAGAIN) ? EAGAIN : EINVAL);

    /* these values override the list-wide port and cookie */
    port_init_event(kev, (uintptr_t)cbp, own.portnfy_user,
        aio_port_callback, reqp);
    reqp->aio_req_port = own.portnfy_port;
    reqp->aio_req_portkev = kev;
    return (0);
}

/*
 * Variant of the list I/O port association for aiocbs that carry a
 * struct sigevent32.
 *
 * If the aiocb's own sigevent requests SIGEV_PORT, its
 * port_notify32_t is copied in (the user address is taken from
 * sigev_value.sival_int), an event is allocated on that port and
 * aio_req_port is set to it. Otherwise the list-wide event pkevtp is
 * duplicated and the list-wide cookie (user) is used. The event is
 * stored in aio_req_portkev.
 *
 * Returns 0 on success, EFAULT if the copyin fails, EAGAIN if the
 * event cannot be allocated (ENOMEM/EAGAIN) or duplicated, EINVAL for
 * any other allocation failure.
 */

static int
aio_req_assoc_port32(struct sigevent32 *sigev, void *user, aiocb_t *cbp,
    aio_req_t *reqp, port_kevent_t *pkevtp)
{
    port_kevent_t   *kev = NULL;
    port_notify32_t own;
    int     rc;

    if (sigev->sigev_notify != SIGEV_PORT) {
        /* no private notification: clone the list-wide event */
        if (port_dup_event(pkevtp, &kev, PORT_ALLOC_DEFAULT) != 0)
            return (EAGAIN);
        port_init_event(kev, (uintptr_t)cbp, user, aio_port_callback,
            reqp);
        reqp->aio_req_portkev = kev;
        return (0);
    }

    if (copyin((void *)(uintptr_t)sigev->sigev_value.sival_int,
        &own, sizeof (port_notify32_t)) != 0)
        return (EFAULT);

    rc = port_alloc_event(own.portnfy_port,
        PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &kev);
    if (rc != 0)
        return ((rc == ENOMEM || rc == EAGAIN) ? EAGAIN : EINVAL);

    /* these values override the list-wide port and cookie */
    port_init_event(kev, (uintptr_t)cbp,
        (void *)(uintptr_t)own.portnfy_user,
        aio_port_callback, reqp);
    reqp->aio_req_port = own.portnfy_port;
    reqp->aio_req_portkev = kev;
    return (0);
}


#ifdef _LP64

/*
 * Asynchronous list IO. A chain of aiocb's are copied in
 * one at a time. If the aiocb is invalid, it is skipped.
 * For each aiocb, the appropriate driver entry point is
 * called. Optimize for the common case where the list
 * of requests is to the same file descriptor.
 *
 * One possible optimization is to define a new driver entry
 * point that supports a list of IO requests. Whether this
 * improves performance depends somewhat on the driver's
 * locking strategy. Processing a list could adversely impact
 * the driver's interrupt latency.
 */
/*ARGSUSED*/
static int
alio(
    int opcode,
    int mode_arg,
    aiocb_t **aiocb_arg,
    int nent,
    struct  sigevent *sigev)

{
    file_t      *fp;
    file_t      *prev_fp = NULL;
    int     prev_mode = -1;
    struct vnode    *vp;
    aio_lio_t   *head;
    aio_req_t   *reqp;
    aio_t       *aiop;
    caddr_t     cbplist;
    aiocb_t     *cbp, **ucbp;
    aiocb_t     cb;
    aiocb_t     *aiocb = &cb;
    struct sigevent sigevk;
    sigqueue_t  *sqp;
    int     (*aio_func)();
    int     mode;
    int     error = 0;
    int     aio_errors = 0;
    int     i;
    size_t      ssize;
    int     deadhead = 0;
    int     aio_notsupported = 0;
    int     aio_use_port = 0;
    port_kevent_t   *pkevtp = NULL;
    port_notify_t   pnotify;

    aiop = curproc->p_aio;
    if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
        return (EINVAL);

    ssize = (sizeof (aiocb_t *) * nent);
    cbplist = kmem_alloc(ssize, KM_SLEEP);
    ucbp = (aiocb_t **)cbplist;

    if (copyin(aiocb_arg, cbplist, sizeof (aiocb_t *) * nent)) {
        kmem_free(cbplist, ssize);
        return (EFAULT);
    }

    if (sigev) {
        if (copyin(sigev, &sigevk, sizeof (struct sigevent))) {
            kmem_free(cbplist, ssize);
            return (EFAULT);
        }
    }

    /*
     * a list head should be allocated if notification is
     * enabled for this list.
     */
    head = NULL;

    /* Event Ports  */

    if (sigev && sigevk.sigev_notify == SIGEV_PORT) {
        /* Use port for completion notification */
        if (copyin(sigevk.sigev_value.sival_ptr, &pnotify,
            sizeof (port_notify_t))) {
            kmem_free(cbplist, ssize);
            return (EFAULT);
        }
        /* use event ports for the list of aiocbs */
        aio_use_port = 1;
        error = port_alloc_event(pnotify.portnfy_port,
            PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp);
        if (error) {
            if ((error == ENOMEM) || (error == EAGAIN))
                error = EAGAIN;
            else
                error = EINVAL;
            kmem_free(cbplist, ssize);
            return (error);
        }
    } else if ((mode_arg == LIO_WAIT) || sigev) {
        mutex_enter(&aiop->aio_mutex);
        error = aio_lio_alloc(&head);
        mutex_exit(&aiop->aio_mutex);
        if (error)
            goto done;
        deadhead = 1;
        head->lio_nent = nent;
        head->lio_refcnt = nent;
        if (sigev && (sigevk.sigev_notify == SIGEV_SIGNAL) &&
            (sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG)) {
            sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
            if (sqp == NULL) {
                error = EAGAIN;
                goto done;
            }
            sqp->sq_func = NULL;
            sqp->sq_next = NULL;
            sqp->sq_info.si_code = SI_ASYNCIO;
            sqp->sq_info.si_pid = curproc->p_pid;
            sqp->sq_info.si_ctid = PRCTID(curproc);
            sqp->sq_info.si_zoneid = getzoneid();
            sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
            sqp->sq_info.si_signo = sigevk.sigev_signo;
            sqp->sq_info.si_value = sigevk.sigev_value;
            head->lio_sigqp = sqp;
        } else {
            head->lio_sigqp = NULL;
        }
    }

    for (i = 0; i < nent; i++, ucbp++) {

        cbp = *ucbp;
        /* skip entry if it can't be copied. */
        if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb_t))) {
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            continue;
        }

        /* skip if opcode for aiocb is LIO_NOP */

        mode = aiocb->aio_lio_opcode;
        if (mode == LIO_NOP) {
            cbp = NULL;
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            continue;
        }

        /* increment file descriptor's ref count. */
        if ((fp = getf(aiocb->aio_fildes)) == NULL) {
            lio_set_uerror(&cbp->aio_resultp, EBADF);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        vp = fp->f_vnode;

        /*
         * check the permission of the partition
         */
        mode = aiocb->aio_lio_opcode;
        if ((fp->f_flag & mode) == 0) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, EBADF);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        /*
         * common case where requests are to the same fd for the
         * same r/w operation.
         * for UFS, need to set EBADFD
         */
        if ((fp != prev_fp) || (mode != prev_mode)) {
            aio_func = check_vp(vp, mode);
            if (aio_func == NULL) {
                prev_fp = NULL;
                releasef(aiocb->aio_fildes);
                lio_set_uerror(&cbp->aio_resultp, EBADFD);
                aio_notsupported++;
                if (head) {
                    mutex_enter(&aiop->aio_mutex);
                    head->lio_nent--;
                    head->lio_refcnt--;
                    mutex_exit(&aiop->aio_mutex);
                }
                continue;
            } else {
                prev_fp = fp;
                prev_mode = mode;
            }
        }

        if (error = aio_req_setup(&reqp, aiop, aiocb,
            &cbp->aio_resultp, aio_use_port, vp)) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, error);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        reqp->aio_req_lio = head;
        deadhead = 0;

        /*
         * Set the errno field now before sending the request to
         * the driver to avoid a race condition
         */
        (void) suword32(&cbp->aio_resultp.aio_errno,
            EINPROGRESS);

        reqp->aio_req_iocb.iocb = (caddr_t)cbp;

        if (aio_use_port) {
            reqp->aio_req_port = pnotify.portnfy_port;
            error = aio_req_assoc_port(&aiocb->aio_sigevent,
                pnotify.portnfy_user, cbp, reqp, pkevtp);
        }

        /*
         * send the request to driver.
         * Clustering: If PXFS vnode, call PXFS function.
         */
        if (error == 0) {
            if (aiocb->aio_nbytes == 0) {
                clear_active_fd(aiocb->aio_fildes);
                aio_zerolen(reqp);
                continue;
            }
            error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
                CRED());
        }
        /*
         * the fd's ref count is not decremented until the IO has
         * completed unless there was an error.
         */
        if (error) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, error);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            if (error == ENOTSUP)
                aio_notsupported++;
            else
                aio_errors++;
            lio_set_error(reqp);
        } else {
            clear_active_fd(aiocb->aio_fildes);
        }
    }

    if (pkevtp)
        port_free_event(pkevtp);

    if (aio_notsupported) {
        error = ENOTSUP;
    } else if (aio_errors) {
        /*
         * return EIO if any request failed
         */
        error = EIO;
    }

    if (mode_arg == LIO_WAIT) {
        mutex_enter(&aiop->aio_mutex);
        while (head->lio_refcnt > 0) {
            if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
                mutex_exit(&aiop->aio_mutex);
                error = EINTR;
                goto done;
            }
        }
        mutex_exit(&aiop->aio_mutex);
        alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
    }

done:
    kmem_free(cbplist, ssize);
    if (deadhead) {
        if (head->lio_sigqp)
            kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
        kmem_free(head, sizeof (aio_lio_t));
    }
    return (error);
}

#endif /* _LP64 */

/*
 * Asynchronous list IO.
 * If list I/O is called with LIO_WAIT it can still return
 * before all the I/O's are completed if a signal is caught
 * or if the list include UFS I/O requests. If this happens,
 * libaio will call aliowait() to wait for the I/O's to
 * complete
 *
 * Arguments:
 *   mode      not referenced here.
 *   aiocb     user address of an array of nent aiocb pointers; the
 *             element size depends on the caller's data model.
 *   nent      number of entries, 1 .. _AIO_LISTIO_MAX.
 *   sigev     not referenced here.
 *   run_mode  AIO_64, AIO_32 or AIO_LARGEFILE; selects which aiocb
 *             layout is used to locate aio_resultp.
 *
 * The list head is found through the first non-NULL aiocb for which
 * aio_list_get() returns a head. The caller then sleeps until the
 * head's lio_refcnt drops to zero and finally runs alio_cleanup() on
 * the list.
 *
 * Returns EINVAL if the process has no aio_t, nent is out of range or
 * no list head is found; EFAULT if the pointer array cannot be copied
 * in; EINTR if the sleep is interrupted by a signal; 0 otherwise.
 */
/*ARGSUSED*/
static int
aliowait(
    int mode,
    void    *aiocb,
    int nent,
    void    *sigev,
    int run_mode)
{
    aio_lio_t   *head;
    aio_t       *aiop;
    caddr_t     cbplist;
    aiocb_t     *cbp, **ucbp;
#ifdef  _SYSCALL32_IMPL
    aiocb32_t   *cbp32;
    caddr32_t   *ucbp32;
    aiocb64_32_t    *cbp64;
#endif
    int     error = 0;
    int     i;
    size_t      ssize = 0;
    model_t     model = get_udatamodel();

    aiop = curproc->p_aio;
    if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
        return (EINVAL);

    if (model == DATAMODEL_NATIVE)
        ssize = (sizeof (aiocb_t *) * nent);
#ifdef  _SYSCALL32_IMPL
    else
        ssize = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

    if (ssize == 0)
        return (EINVAL);

    cbplist = kmem_alloc(ssize, KM_SLEEP);

    if (model == DATAMODEL_NATIVE)
        ucbp = (aiocb_t **)cbplist;
#ifdef  _SYSCALL32_IMPL
    else
        ucbp32 = (caddr32_t *)cbplist;
#endif  /* _SYSCALL32_IMPL */

    if (copyin(aiocb, cbplist, ssize)) {
        error = EFAULT;
        goto done;
    }

    /*
     * To find the list head, we go through the
     * list of aiocb structs, find the request
     * its for, then get the list head that reqp
     * points to
     */
    head = NULL;

    for (i = 0; i < nent; i++) {
        if (model == DATAMODEL_NATIVE) {
            /*
             * Since we are only checking for a NULL pointer
             * Following should work on both native data sizes
             * as well as for largefile aiocb.
             */
            if ((cbp = *ucbp++) == NULL)
                continue;
            /*
             * The braces matter: without them the else below
             * pairs with the inner lookup test instead of the
             * run_mode test.
             */
            if (run_mode != AIO_LARGEFILE) {
                head = aio_list_get(&cbp->aio_resultp);
                if (head != NULL)
                    break;
            } else {
                /*
                 * This is a case when largefile call is
                 * made on 32 bit kernel.
                 * Treat each pointer as pointer to
                 * aiocb64_32
                 */
                head = aio_list_get((aio_result_t *)
                    &(((aiocb64_32_t *)cbp)->aio_resultp));
                if (head != NULL)
                    break;
            }
        }
#ifdef  _SYSCALL32_IMPL
        else {
            if (run_mode == AIO_LARGEFILE) {
                if ((cbp64 = (aiocb64_32_t *)
                    (uintptr_t)*ucbp32++) == NULL)
                    continue;
                head = aio_list_get((aio_result_t *)
                    &cbp64->aio_resultp);
                if (head != NULL)
                    break;
            } else if (run_mode == AIO_32) {
                if ((cbp32 = (aiocb32_t *)
                    (uintptr_t)*ucbp32++) == NULL)
                    continue;
                head = aio_list_get((aio_result_t *)
                    &cbp32->aio_resultp);
                if (head != NULL)
                    break;
            }
        }
#endif  /* _SYSCALL32_IMPL */
    }

    if (head == NULL) {
        error = EINVAL;
        goto done;
    }

    mutex_enter(&aiop->aio_mutex);
    while (head->lio_refcnt > 0) {
        if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
            mutex_exit(&aiop->aio_mutex);
            error = EINTR;
            goto done;
        }
    }
    mutex_exit(&aiop->aio_mutex);
    alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
done:
    kmem_free(cbplist, ssize);
    return (error);
}

/*
 * Find the list head of the request whose result area is resultp.
 *
 * The calling process's request hash is searched for an entry whose
 * aio_req_resultp equals resultp; that entry's aio_req_lio is
 * returned (it may itself be NULL). NULL is also returned when the
 * process has no aio_t, resultp is NULL, or no entry matches.
 */
aio_lio_t *
aio_list_get(aio_result_t *resultp)
{
    aio_t       *state = curproc->p_aio;
    aio_req_t   *req;
    long        slot;

    if (state == NULL || resultp == NULL)
        return (NULL);

    slot = AIO_HASH(resultp);
    req = state->aio_hash[slot];
    while (req != NULL) {
        if (req->aio_req_resultp == resultp)
            return (req->aio_req_lio);
        req = req->aio_hash_next;
    }
    return (NULL);
}


static void
lio_set_uerror(void *resultp, int error)
{
    /*
     * the resultp field is a pointer to where the
     * error should be written out to the user's
     * aiocb.
     *
     */
    if (get_udatamodel() == DATAMODEL_NATIVE) {
        (void) sulword(&((aio_result_t *)resultp)->aio_return,
            (ssize_t)-1);
        (void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
    }
#ifdef  _SYSCALL32_IMPL
    else {
        (void) suword32(&((aio_result32_t *)resultp)->aio_return,
            (uint_t)-1);
        (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
    }
#endif  /* _SYSCALL32_IMPL */
}

/*
 * do cleanup completion for all requests in list. memory for
 * each request is also freed.
 *
 * Arguments:
 *   aiop      the process's aio state.
 *   cbp       kernel copy of the user's array of aiocb pointers. For
 *             a non-native caller it is really an array of caddr32_t.
 *   nent      number of entries in the array.
 *   run_mode  AIO_64, AIO_32 or AIO_LARGEFILE; selects the aiocb
 *             layout used to locate each entry's aio_resultp.
 *
 * NULL entries are skipped, and so are entries for which run_mode
 * does not identify a layout.
 */
static void
alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
{
    int i;
    aio_req_t *reqp;
    aio_result_t *resultp;
    aiocb64_32_t    *aiocb_64;
    model_t model = get_udatamodel();

    for (i = 0; i < nent; i++) {
        /* reset per entry so a stale address is never reused */
        resultp = NULL;

        if (model == DATAMODEL_NATIVE) {
            if (cbp[i] == NULL)
                continue;
            if (run_mode == AIO_LARGEFILE) {
                aiocb_64 = (aiocb64_32_t *)cbp[i];
                resultp = (aio_result_t *)&aiocb_64->
                    aio_resultp;
            } else
                resultp = &cbp[i]->aio_resultp;
        }
#ifdef  _SYSCALL32_IMPL
        else {
            aiocb32_t   *aiocb_32;
            caddr32_t   *cbp32;

            cbp32 = (caddr32_t *)cbp;
            /* caddr32_t is an integer type: compare against 0 */
            if (cbp32[i] == 0)
                continue;
            if (run_mode == AIO_32) {
                aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
                resultp = (aio_result_t *)&aiocb_32->
                    aio_resultp;
            } else if (run_mode == AIO_LARGEFILE) {
                aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
                resultp = (aio_result_t *)&aiocb_64->
                    aio_resultp;
            }
        }
#endif  /* _SYSCALL32_IMPL */
        /*
         * No layout matched run_mode. Do not hand an unset
         * result address to aio_req_done().
         */
        if (resultp == NULL)
            continue;

        /*
         * we need to get the aio_cleanupq_mutex since we call
         * aio_req_done().
         */
        mutex_enter(&aiop->aio_cleanupq_mutex);
        mutex_enter(&aiop->aio_mutex);
        reqp = aio_req_done(resultp);
        mutex_exit(&aiop->aio_mutex);
        mutex_exit(&aiop->aio_cleanupq_mutex);
        if (reqp != NULL) {
            aphysio_unlock(reqp);
            aio_copyout_result(reqp);
            mutex_enter(&aiop->aio_mutex);
            aio_req_free(aiop, reqp);
            mutex_exit(&aiop->aio_mutex);
        }
    }
}

/*
 * write out the results for an aio request that is
 * done.
 *
 * Arguments:
 *   cb        user address of the aiocb.
 *   run_mode  AIO_64, AIO_32 or AIO_LARGEFILE; selects the aiocb
 *             layout used to locate aio_resultp.
 *
 * Returns, based on the value produced by aio_req_find():
 *   0            the request was found (value 0); its result has been
 *                copied out and the request freed;
 *   EINPROGRESS  value 1;
 *   EINVAL       value 2, or the process has no aio_t, cb is NULL, or
 *                run_mode does not identify an aiocb layout for the
 *                caller's data model;
 *   0            any other value.
 */
static int
aioerror(void *cb, int run_mode)
{
    aio_result_t *resultp = NULL;
    aio_t *aiop;
    aio_req_t *reqp;
    int retval;

    aiop = curproc->p_aio;
    if (aiop == NULL || cb == NULL)
        return (EINVAL);

    if (get_udatamodel() == DATAMODEL_NATIVE) {
        if (run_mode == AIO_LARGEFILE)
            resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
                aio_resultp;
        else
            resultp = &((aiocb_t *)cb)->aio_resultp;
    }
#ifdef  _SYSCALL32_IMPL
    else {
        if (run_mode == AIO_LARGEFILE)
            resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
                aio_resultp;
        else if (run_mode == AIO_32)
            resultp = (aio_result_t *)&((aiocb32_t *)cb)->
                aio_resultp;
    }
#endif  /* _SYSCALL32_IMPL */

    /* no layout matched: never look up an unset result address */
    if (resultp == NULL)
        return (EINVAL);

    /*
     * we need to get the aio_cleanupq_mutex since we call
     * aio_req_find().
     */
    mutex_enter(&aiop->aio_cleanupq_mutex);
    mutex_enter(&aiop->aio_mutex);
    retval = aio_req_find(resultp, &reqp);
    mutex_exit(&aiop->aio_mutex);
    mutex_exit(&aiop->aio_cleanupq_mutex);
    if (retval == 0) {
        aphysio_unlock(reqp);
        aio_copyout_result(reqp);
        mutex_enter(&aiop->aio_mutex);
        aio_req_free(aiop, reqp);
        mutex_exit(&aiop->aio_mutex);
        return (0);
    } else if (retval == 1)
        return (EINPROGRESS);
    else if (retval == 2)
        return (EINVAL);
    return (0);
}

/*
 *  aio_cancel - if no requests outstanding,
 *          return AIO_ALLDONE
 *          else
 *          return AIO_NOTCANCELED
 *
 * No request is actually cancelled here; the function only reports
 * state through *rval.
 *
 * Arguments:
 *   fildes    file descriptor; must be valid.
 *   cb        user address of one aiocb, or NULL to ask about every
 *             request issued on fildes.
 *   rval      receives AIO_ALLDONE or AIO_NOTCANCELED.
 *   run_mode  AIO_64, AIO_32 or AIO_LARGEFILE; selects the aiocb
 *             layout used to locate aio_resultp when cb is given.
 *
 * With cb set, AIO_NOTCANCELED is reported when the matching request
 * still has AIO_PENDING set, AIO_ALLDONE when it does not or when no
 * request matches. With cb NULL, AIO_NOTCANCELED is reported as soon
 * as one request on fildes has AIO_PENDING set.
 *
 * Returns 0 with *rval set; EBADF if fildes is not valid; EINVAL if
 * the process has no aio_t or run_mode does not identify an aiocb
 * layout for the caller's data model.
 */
static int
aio_cancel(
    int fildes,
    void    *cb,
    long    *rval,
    int run_mode)
{
    aio_t *aiop;
    void *resultp = NULL;
    int index;
    aio_req_t **bucket;
    aio_req_t *ent;


    /*
     * Verify valid file descriptor
     */
    if ((getf(fildes)) == NULL) {
        return (EBADF);
    }
    releasef(fildes);

    aiop = curproc->p_aio;
    if (aiop == NULL)
        return (EINVAL);

    if (aiop->aio_outstanding == 0) {
        *rval = AIO_ALLDONE;
        return (0);
    }

    mutex_enter(&aiop->aio_mutex);
    if (cb != NULL) {
        if (get_udatamodel() == DATAMODEL_NATIVE) {
            if (run_mode == AIO_LARGEFILE)
                resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
                    ->aio_resultp;
            else
                resultp = &((aiocb_t *)cb)->aio_resultp;
        }
#ifdef  _SYSCALL32_IMPL
        else {
            if (run_mode == AIO_LARGEFILE)
                resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
                    ->aio_resultp;
            else if (run_mode == AIO_32)
                resultp = (aio_result_t *)&((aiocb32_t *)cb)
                    ->aio_resultp;
        }
#endif  /* _SYSCALL32_IMPL */
        /* no layout matched: never hash an unset result address */
        if (resultp == NULL) {
            mutex_exit(&aiop->aio_mutex);
            return (EINVAL);
        }
        index = AIO_HASH(resultp);
        bucket = &aiop->aio_hash[index];
        for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
            if (ent->aio_req_resultp == resultp) {
                if ((ent->aio_req_flags & AIO_PENDING) == 0) {
                    mutex_exit(&aiop->aio_mutex);
                    *rval = AIO_ALLDONE;
                    return (0);
                }
                mutex_exit(&aiop->aio_mutex);
                *rval = AIO_NOTCANCELED;
                return (0);
            }
        }
        mutex_exit(&aiop->aio_mutex);
        *rval = AIO_ALLDONE;
        return (0);
    }

    /* no aiocb given: scan every bucket for requests on fildes */
    for (index = 0; index < AIO_HASHSZ; index++) {
        bucket = &aiop->aio_hash[index];
        for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
            if (ent->aio_req_fd == fildes) {
                if ((ent->aio_req_flags & AIO_PENDING) != 0) {
                    mutex_exit(&aiop->aio_mutex);
                    *rval = AIO_NOTCANCELED;
                    return (0);
                }
            }
        }
    }
    mutex_exit(&aiop->aio_mutex);
    *rval = AIO_ALLDONE;
    return (0);
}

/*
 * solaris version of asynchronous read and write
 *
 * Arguments:
 *   opcode   only the AIO_POLL_BIT is examined; when set, the request
 *            is flagged AIO_POLL.
 *   fdes     file descriptor to operate on.
 *   bufp     user buffer.
 *   bufsize  transfer length; 0 completes the request immediately via
 *            aio_zerolen().
 *   offset   file offset.
 *   resultp  user address of the result area.
 *   mode     access bit(s) that must be present in the file's f_flag;
 *            also passed to check_vp().
 *
 * Returns 0 once the request has been handed to the driver (or
 * completed as zero length), EINVAL if the process has no aio_t,
 * EBADF for a bad descriptor or missing access mode, EBADFD if
 * check_vp() finds no entry point, or the error from request setup or
 * from the driver.
 */
static int
arw(
    int opcode,
    int fdes,
    char    *bufp,
    int bufsize,
    offset_t    offset,
    aio_result_t    *resultp,
    int     mode)
{
    file_t      *filep;
    struct vnode    *vnp;
    aio_req_t   *req;
    aio_t       *state = curproc->p_aio;
    int     (*strategy)();
    int     rc;
#ifdef _LP64
    aiocb_t     cb;
#else
    aiocb64_32_t    cb64;
#endif

    if (state == NULL)
        return (EINVAL);

    filep = getf(fdes);
    if (filep == NULL)
        return (EBADF);

    /*
     * check the permission of the partition
     */
    if ((filep->f_flag & mode) == 0) {
        rc = EBADF;
        goto release;
    }

    vnp = filep->f_vnode;
    strategy = check_vp(vnp, mode);
    if (strategy == NULL) {
        rc = EBADFD;
        goto release;
    }

    /* build a minimal kernel aiocb describing the transfer */
#ifdef _LP64
    cb.aio_fildes = fdes;
    cb.aio_buf = bufp;
    cb.aio_nbytes = bufsize;
    cb.aio_offset = offset;
    cb.aio_sigevent.sigev_notify = 0;
    rc = aio_req_setup(&req, state, &cb, resultp, 0, vnp);
#else
    cb64.aio_fildes = fdes;
    cb64.aio_buf = (caddr32_t)bufp;
    cb64.aio_nbytes = bufsize;
    cb64.aio_offset = offset;
    cb64.aio_sigevent.sigev_notify = 0;
    rc = aio_req_setupLF(&req, state, &cb64, resultp, 0, vnp);
#endif
    if (rc != 0)
        goto release;

    /*
     * enable polling on this request if the opcode has
     * the AIO poll bit set
     */
    if ((opcode & AIO_POLL_BIT) != 0)
        req->aio_req_flags |= AIO_POLL;

    if (bufsize == 0) {
        clear_active_fd(fdes);
        aio_zerolen(req);
        return (0);
    }

    /*
     * send the request to driver.
     * Clustering: If PXFS vnode, call PXFS function.
     */
    rc = (*strategy)(vnp, (aio_req_t *)&req->aio_req, CRED());
    if (rc == 0) {
        /*
         * the fd is stored in the aio_req_t by aio_req_setup(),
         * and is released by the aio_cleanup_thread() when the IO
         * has completed.
         */
        clear_active_fd(fdes);
        return (0);
    }

    /* the driver refused the request: undo the setup */
    releasef(fdes);
    mutex_enter(&state->aio_mutex);
    aio_req_free(state, req);
    state->aio_pending--;
    if ((state->aio_flags & AIO_REQ_BLOCK) != 0)
        cv_signal(&state->aio_cleanupcv);
    mutex_exit(&state->aio_mutex);
    return (rc);

release:
    releasef(fdes);
    return (rc);
}

/*
 * Unlink a request from the port pending queue (aio_portpending), a
 * doubly linked list threaded through aio_req_next / aio_req_prev.
 * The caller must hold aio_mutex. The request's own link fields are
 * left untouched.
 */

void
aio_deq_port_pending(aio_t *aiop, aio_req_t *reqp)
{
    aio_req_t *before;
    aio_req_t *after;

    ASSERT(MUTEX_HELD(&aiop->aio_mutex));
    before = reqp->aio_req_prev;
    after = reqp->aio_req_next;

    if (before != NULL)
        before->aio_req_next = after;
    else
        aiop->aio_portpending = after;  /* reqp was first */

    if (after != NULL)
        after->aio_req_prev = before;
}

/*
 * posix version of asynchronous read and write
 *
 * Arguments:
 *   opcode     only the AIO_POLL_BIT is examined; when set, the
 *              request is flagged AIO_POLL.
 *   aiocb_arg  user address of the aiocb.
 *   mode       access bit(s) that must be present in the file's
 *              f_flag; also passed to check_vp().
 *   run_mode   AIO_64, AIO_32 or AIO_LARGEFILE; together with the
 *              caller's data model it selects the aiocb layout.
 *
 * When the aiocb's sigevent requests SIGEV_PORT, the referenced
 * port_notify structure is copied in and the request is associated
 * with that event port.
 *
 * Returns 0 once the request has been handed to the driver (or
 * completed as zero length); EINVAL if the process has no aio_t or
 * run_mode does not identify a layout for a non-native caller; EFAULT
 * if the aiocb or port_notify structure cannot be copied in; EBADF
 * for a bad descriptor or missing access mode; EBADFD if check_vp()
 * finds no entry point; otherwise the error from request setup, port
 * association or the driver.
 */
static  int
aiorw(
    int     opcode,
    void        *aiocb_arg,
    int     mode,
    int     run_mode)
{
#ifdef _SYSCALL32_IMPL
    aiocb32_t   aiocb32;
    struct  sigevent32 *sigev32;
    port_notify32_t pntfy32;
#endif
    aiocb64_32_t    aiocb64;
    aiocb_t     aiocb;
    file_t      *fp;
    int     error, fd;
    size_t      bufsize;
    struct vnode    *vp;
    aio_req_t   *reqp;
    aio_t       *aiop;
    int     (*aio_func)();
    aio_result_t    *resultp;
    struct  sigevent *sigev;
    model_t     model;
    int     aio_use_port = 0;
    port_notify_t   pntfy;

    model = get_udatamodel();
    aiop = curproc->p_aio;
    if (aiop == NULL)
        return (EINVAL);

    if (model == DATAMODEL_NATIVE) {
        if (run_mode != AIO_LARGEFILE) {
            if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
                return (EFAULT);
            bufsize = aiocb.aio_nbytes;
            resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
            if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
                return (EBADF);
            }
            sigev = &aiocb.aio_sigevent;
        } else {
            /*
             * We come here only when we make largefile
             * call on 32 bit kernel using 32 bit library.
             */
            if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
                return (EFAULT);
            bufsize = aiocb64.aio_nbytes;
            resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
                ->aio_resultp);
            if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) {
                return (EBADF);
            }
            sigev = (struct sigevent *)&aiocb64.aio_sigevent;
        }

        if (sigev->sigev_notify == SIGEV_PORT) {
            if (copyin((void *)sigev->sigev_value.sival_ptr,
                &pntfy, sizeof (port_notify_t))) {
                releasef(fd);
                return (EFAULT);
            }
            aio_use_port = 1;
        }
    }
#ifdef  _SYSCALL32_IMPL
    else {
        if (run_mode == AIO_32) {
            /* 32 bit system call is being made on 64 bit kernel */
            if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
                return (EFAULT);

            bufsize = aiocb32.aio_nbytes;
            aiocb_32ton(&aiocb32, &aiocb);
            resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
                aio_resultp);
            if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
                return (EBADF);
            }
            sigev32 = &aiocb32.aio_sigevent;
        } else if (run_mode == AIO_LARGEFILE) {
            /*
             * We come here only when we make largefile
             * call on 64 bit kernel using 32 bit library.
             */
            if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
                return (EFAULT);
            bufsize = aiocb64.aio_nbytes;
            aiocb_LFton(&aiocb64, &aiocb);
            resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
                ->aio_resultp);
            if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
                return (EBADF);
            sigev32 = &aiocb64.aio_sigevent;
        } else {
            /*
             * No aiocb layout for this run_mode. Nothing has
             * been copied in or looked up yet, so fp, fd and
             * sigev32 are unset and must not be used.
             */
            return (EINVAL);
        }

        if (sigev32->sigev_notify == SIGEV_PORT) {
            if (copyin(
                (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
                &pntfy32, sizeof (port_notify32_t))) {
                releasef(fd);
                return (EFAULT);
            }
            pntfy.portnfy_port = pntfy32.portnfy_port;
            pntfy.portnfy_user =
                (void *)(uintptr_t)pntfy32.portnfy_user;
            aio_use_port = 1;
        }
    }
#endif  /* _SYSCALL32_IMPL */

    /*
     * check the permission of the partition
     */

    if ((fp->f_flag & mode) == 0) {
        releasef(fd);
        return (EBADF);
    }

    vp = fp->f_vnode;
    aio_func = check_vp(vp, mode);
    if (aio_func == NULL) {
        releasef(fd);
        return (EBADFD);
    }
    if ((model == DATAMODEL_NATIVE) && (run_mode == AIO_LARGEFILE))
        error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp,
            aio_use_port, vp);
    else
        error = aio_req_setup(&reqp, aiop, &aiocb, resultp,
            aio_use_port, vp);

    if (error) {
        releasef(fd);
        return (error);
    }
    /*
     * enable polling on this request if the opcode has
     * the AIO poll bit set
     */
    if (opcode & AIO_POLL_BIT)
        reqp->aio_req_flags |= AIO_POLL;

    if (model == DATAMODEL_NATIVE)
        reqp->aio_req_iocb.iocb = aiocb_arg;
#ifdef  _SYSCALL32_IMPL
    else
        reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
#endif

    if (aio_use_port)
        error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp);

    /*
     * send the request to driver.
     * Clustering: If PXFS vnode, call PXFS function.
     */
    if (error == 0) {
        if (bufsize == 0) {
            clear_active_fd(fd);
            aio_zerolen(reqp);
            return (0);
        }
        error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
    }

    /*
     * the fd is stored in the aio_req_t by aio_req_setup(), and
     * is released by the aio_cleanup_thread() when the IO has
     * completed.
     */
    if (error) {
        releasef(fd);
        mutex_enter(&aiop->aio_mutex);
        /*
         * Only unlink from the port pending queue when the request
         * was set up for a port. For a request that is not on the
         * queue, aio_deq_port_pending() would overwrite
         * aio_portpending with this request's next pointer.
         * NOTE(review): assumes aio_req_setup()/aio_req_setupLF()
         * queue the request only when passed a non-zero port
         * argument, as arw()'s error path implies -- confirm.
         */
        if (aio_use_port)
            aio_deq_port_pending(aiop, reqp);
        aio_req_free(aiop, reqp);
        aiop->aio_pending--;
        if (aiop->aio_flags & AIO_REQ_BLOCK)
            cv_signal(&aiop->aio_cleanupcv);
        mutex_exit(&aiop->aio_mutex);
        return (error);
    }
    clear_active_fd(fd);
    return (0);
}


/*
 * Tear down a list I/O request that could not be started.
 *
 * The request is taken off the port-pending list, the process's pending
 * count is dropped, and the request is freed immediately because it will
 * never reach the done queue.  Nothing is done if the current process has
 * no aio state.
 */
static void
lio_set_error(aio_req_t *reqp)
{
    aio_t *ap = curproc->p_aio;

    if (ap != NULL) {
        mutex_enter(&ap->aio_mutex);
        aio_deq_port_pending(ap, reqp);
        ap->aio_pending -= 1;
        /*
         * The request failed; mark it AIO_PHYSIODONE to avoid
         * physio cleanup.
         */
        reqp->aio_req_flags = reqp->aio_req_flags | AIO_PHYSIODONE;
        /*
         * Free the request now.  Per the original note,
         * aio_outstanding is decremented in aio_req_free().
         */
        aio_req_free(ap, reqp);
        /* wake whoever is waiting on aio_cleanupcv while blocked */
        if ((ap->aio_flags & AIO_REQ_BLOCK) != 0)
            cv_signal(&ap->aio_cleanupcv);
        mutex_exit(&ap->aio_mutex);
    }
}

/*
 * Reap a completed request.
 *
 * With a non-NULL resultp, look the request up in the hash table and, if
 * it is flagged AIO_DONEQ, unlink and return it; return NULL when it is
 * still in progress or when resultp matches nothing.  With a NULL resultp,
 * return whatever aio_req_remove(NULL) yields (the head of the done queue,
 * or NULL if that queue is empty).
 *
 * Caller must hold both aio_cleanupq_mutex and aio_mutex.
 */
static aio_req_t *
aio_req_done(void *resultp)
{
    aio_t *aiop = curproc->p_aio;
    aio_req_t *cur;
    long slot;

    ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
    ASSERT(MUTEX_HELD(&aiop->aio_mutex));

    if (resultp == NULL)
        return (aio_req_remove(NULL));

    /* walk the hash chain until the matching result pointer is found */
    slot = AIO_HASH(resultp);
    cur = aiop->aio_hash[slot];
    while (cur != NULL &&
        cur->aio_req_resultp != (aio_result_t *)resultp)
        cur = cur->aio_hash_next;

    /* no match (invalid resultp), or matched but not yet done */
    if (cur == NULL || (cur->aio_req_flags & AIO_DONEQ) == 0)
        return (NULL);

    return (aio_req_remove(cur));
}

/*
 * Classify the request associated with a user-level resultp pointer.
 *
 * Returns:
 *   0 - the request is done; it has been removed from its queue and
 *       stored in *reqp (the only case in which *reqp is written).
 *   1 - the request exists but is still in progress.
 *   2 - no request with this resultp exists (invalid pointer).
 *
 * Caller must hold both aio_cleanupq_mutex and aio_mutex.
 */
static int
aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
{
    aio_t *aiop = curproc->p_aio;
    aio_req_t *cur;
    long slot;

    ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
    ASSERT(MUTEX_HELD(&aiop->aio_mutex));

    slot = AIO_HASH(resultp);
    cur = aiop->aio_hash[slot];
    while (cur != NULL && cur->aio_req_resultp != resultp)
        cur = cur->aio_hash_next;

    if (cur == NULL) {
        /* no match, resultp is invalid */
        return (2);
    }
    if ((cur->aio_req_flags & AIO_DONEQ) == 0) {
        /* found, but not on a done/cleanup queue yet */
        return (1);
    }
    *reqp = aio_req_remove(cur);
    return (0);
}

/*
 * Unlink a request from the circular, doubly linked done queue.
 *
 * reqp != NULL: unlink that specific request.  It may sit on either
 *               aio_doneq or aio_cleanupq; whichever queue head points
 *               at it is advanced (or cleared when it is the only entry).
 * reqp == NULL: unlink and return the head of aio_doneq, or return NULL
 *               when aio_doneq is empty.
 *
 * The AIO_DONEQ flag is cleared on the request that is returned.
 * Caller must hold aio_mutex.
 */
static aio_req_t *
aio_req_remove(aio_req_t *reqp)
{
    aio_t *aiop = curproc->p_aio;
    aio_req_t *succ;
    aio_req_t *pred;

    ASSERT(MUTEX_HELD(&aiop->aio_mutex));

    if (reqp == NULL) {
        /* caller wants "any" request: take the head of the doneq */
        aio_req_t *first = aiop->aio_doneq;

        if (first == NULL)
            return (NULL);

        ASSERT(first->aio_req_flags & AIO_DONEQ);
        succ = first->aio_req_next;
        pred = first->aio_req_prev;
        if (succ == first) {
            /* only one request on queue */
            aiop->aio_doneq = NULL;
        } else {
            pred->aio_req_next = succ;
            succ->aio_req_prev = pred;
            aiop->aio_doneq = succ;
        }
        first->aio_req_flags &= ~AIO_DONEQ;
        return (first);
    }

    ASSERT(reqp->aio_req_flags & AIO_DONEQ);
    succ = reqp->aio_req_next;
    pred = reqp->aio_req_prev;
    if (succ != reqp) {
        succ->aio_req_prev = pred;
        pred->aio_req_next = succ;
        /*
         * The request can be either on the aio_doneq or the
         * aio_cleanupq; move whichever head referenced it.
         */
        if (aiop->aio_doneq == reqp)
            aiop->aio_doneq = succ;

        if (aiop->aio_cleanupq == reqp)
            aiop->aio_cleanupq = succ;
    } else if (aiop->aio_doneq == reqp) {
        /* sole entry of the doneq */
        aiop->aio_doneq = NULL;
    } else {
        /* sole entry, so it must be the cleanupq's */
        ASSERT(reqp == aiop->aio_cleanupq);
        aiop->aio_cleanupq = NULL;
    }
    reqp->aio_req_flags &= ~AIO_DONEQ;
    return (reqp);
}

/*
 * Build an aio request from a native aiocb.
 *
 *   reqpp   - out: the newly set up request (written only on success)
 *   aiop    - aio state of the process
 *   arg     - kernel copy of the caller's aiocb
 *   resultp - user address of the result structure, used as the hash key
 *   port    - non-zero to queue the request on the port-pending list
 *   vp      - vnode the I/O is directed at
 *
 * Returns 0 on success; EAGAIN if the signal queue entry cannot be
 * allocated; EIO if AIO_REQ_BLOCK is set on the process; otherwise the
 * error returned by aio_req_alloc().
 */
static int
aio_req_setup(
    aio_req_t   **reqpp,
    aio_t       *aiop,
    aiocb_t     *arg,
    aio_result_t    *resultp,
    int     port,
    vnode_t     *vp)
{
    struct sigevent *sigev = &arg->aio_sigevent;
    sigqueue_t  *sqp = NULL;
    aio_req_t   *reqp;
    struct uio  *uio;
    int     error;

    /*
     * Preallocate the siginfo only for signal notification with a
     * signal number in range.
     */
    if (sigev->sigev_notify == SIGEV_SIGNAL &&
        sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
        sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
        if (sqp == NULL)
            return (EAGAIN);
        sqp->sq_func = NULL;
        sqp->sq_next = NULL;
        sqp->sq_info.si_signo = sigev->sigev_signo;
        sqp->sq_info.si_code = SI_ASYNCIO;
        sqp->sq_info.si_value = sigev->sigev_value;
        sqp->sq_info.si_pid = curproc->p_pid;
        sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
        sqp->sq_info.si_ctid = PRCTID(curproc);
        sqp->sq_info.si_zoneid = getzoneid();
    }

    mutex_enter(&aiop->aio_mutex);

    /*
     * Refuse new requests while AIO_REQ_BLOCK is set; otherwise get an
     * aio_req from the free list or from dynamic memory.
     */
    if (aiop->aio_flags & AIO_REQ_BLOCK)
        error = EIO;
    else
        error = aio_req_alloc(&reqp, resultp);

    if (error != 0) {
        mutex_exit(&aiop->aio_mutex);
        if (sqp != NULL)
            kmem_free(sqp, sizeof (sigqueue_t));
        return (error);
    }

    aiop->aio_pending++;
    aiop->aio_outstanding++;
    reqp->aio_req_flags = AIO_PENDING;
    if (port)
        aio_enq_port_pending(aiop, reqp);
    mutex_exit(&aiop->aio_mutex);

    /*
     * initialize aio request: one iovec describing the user buffer.
     */
    reqp->aio_req_fd = arg->aio_fildes;
    reqp->aio_req_sigqp = sqp;
    reqp->aio_req_iocb.iocb = NULL;
    reqp->aio_req_buf.b_file = vp;

    uio = reqp->aio_req.aio_uio;
    uio->uio_iovcnt = 1;
    uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
    uio->uio_iov->iov_len = arg->aio_nbytes;
    uio->uio_loffset = arg->aio_offset;

    *reqpp = reqp;
    return (0);
}

/*
 * Allocate and initialize a zeroed p_aio structure.
 *
 * The allocation does not sleep (KM_NOSLEEP), so NULL is returned when
 * memory is unavailable.  Caller must hold curproc->p_lock.
 */
static aio_t *
aio_aiop_alloc(void)
{
    aio_t   *newp;

    ASSERT(MUTEX_HELD(&curproc->p_lock));

    newp = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
    if (newp == NULL)
        return (NULL);

    mutex_init(&newp->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
    mutex_init(&newp->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT, NULL);
    mutex_init(&newp->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
    return (newp);
}

/*
 * Obtain an aio_req structure and register it in the process's hash
 * table under resultp.
 *
 * A request is recycled from the per-process free list when one is
 * available; otherwise a new one is allocated without sleeping.
 *
 * Returns 0 with *nreqp set on success; EAGAIN when memory is tight or
 * the allocation fails; EINVAL when a request with the same resultp is
 * already hashed (the request is then placed on the free list).
 *
 * Caller must hold aio_mutex.
 */
static int
aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
{
    aio_t *aiop = curproc->p_aio;
    aio_req_t *reqp;

    ASSERT(MUTEX_HELD(&aiop->aio_mutex));

    reqp = aiop->aio_free;
    if (reqp == NULL) {
        /*
         * Check whether memory is getting tight.
         * This is a temporary mechanism to avoid memory
         * exhaustion by a single process until we come up
         * with a per process solution such as setrlimit().
         */
        if (freemem < desfree)
            return (EAGAIN);

        reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
        if (reqp == NULL)
            return (EAGAIN);

        /* wire the embedded uio/iovec and the back pointer */
        reqp->aio_req.aio_private = reqp;
        reqp->aio_req.aio_uio = &(reqp->aio_req_uio);
        reqp->aio_req.aio_uio->uio_iov = &(reqp->aio_req_iov);
    } else {
        /* pop the recycled request off the free list */
        aiop->aio_free = reqp->aio_req_next;
        reqp->aio_req_flags = 0;
        /*
         * Clustering:This field has to be specifically
         * set to null so that the right thing can be
         * done in aphysio()
         */
        reqp->aio_req_buf.b_iodone = NULL;
    }

    reqp->aio_req_resultp = resultp;
    reqp->aio_req_buf.b_offset = -1;

    if (aio_hash_insert(reqp, aiop) != 0) {
        /* hash insert refused the request: push it on the free list */
        reqp->aio_req_next = aiop->aio_free;
        aiop->aio_free = reqp;
        return (EINVAL);
    }

    *nreqp = reqp;
    return (0);
}

/*
 * Obtain an aio_lio_t list head.
 *
 * An entry from the per-process free list is reused as-is when one is
 * available; otherwise a zeroed entry is allocated without sleeping.
 *
 * Returns 0 with *head set on success, or EAGAIN when memory is tight or
 * the allocation fails.  Caller must hold aio_mutex.
 */
static int
aio_lio_alloc(aio_lio_t **head)
{
    aio_t *aiop = curproc->p_aio;
    aio_lio_t *liop;

    ASSERT(MUTEX_HELD(&aiop->aio_mutex));

    liop = aiop->aio_lio_free;
    if (liop == NULL) {
        /*
         * Check whether memory is getting tight.
         * This is a temporary mechanism to avoid memory
         * exhaustion by a single process until we come up
         * with a per process solution such as setrlimit().
         */
        if (freemem < desfree)
            return (EAGAIN);

        liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
        if (liop == NULL)
            return (EAGAIN);
    } else {
        /* pop the recycled entry off the free list */
        aiop->aio_lio_free = liop->lio_next;
    }

    *head = liop;
    return (0);
}

/*
 * this is a special per-process thread that is only activated if
 * the process is unmapping a segment with outstanding aio. normally,
 * the process will have completed the aio before unmapping the
 * segment. If the process does unmap a segment with outstanding aio,
 * this special thread will guarantee that the locked pages due to
 * aphysio() are released, thereby permitting the segment to be
 * unmapped.
 *
 * aiop is the aio state the thread services; the address space used is
 * that of curproc. The thread loops until it has been poked while the
 * process has SEXITLWPS or SKILLED set and it has already waited once
 * for all pending requests to drain. The return value is always 0.
 */

static int
aio_cleanup_thread(aio_t *aiop)
{
    proc_t *p = curproc;
    struct as *as = p->p_as;
    int poked = 0;
    kcondvar_t *cvp;
    int exit_flag = 0;

    /* hold every signal except those in cantmask */
    sigfillset(&curthread->t_hold);
    sigdiffset(&curthread->t_hold, &cantmask);
    for (;;) {
        /*
         * if a segment is being unmapped, and the current
         * process's done queue is not empty, then every request
         * on the doneq with locked resources should be forced
         * to release their locks. By moving the doneq request
         * to the cleanupq, aio_cleanup() will process the cleanupq,
         * and place requests back onto the doneq. All requests
         * processed by aio_cleanup() will have their physical
         * resources unlocked.
         */
        mutex_enter(&aiop->aio_mutex);
        if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
            aiop->aio_flags |= AIO_CLEANUP;
            mutex_enter(&as->a_contents);
            if (AS_ISUNMAPWAIT(as) && aiop->aio_doneq) {
                aio_req_t *doneqhead = aiop->aio_doneq;
                mutex_exit(&as->a_contents);
                aiop->aio_doneq = NULL;
                aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
            } else {
                mutex_exit(&as->a_contents);
            }
        }
        mutex_exit(&aiop->aio_mutex);
        aio_cleanup(AIO_CLEANUP_THREAD);
        /*
         * thread should block on the cleanupcv while
         * AIO_CLEANUP is set.
         */
        cvp = &aiop->aio_cleanupcv;
        mutex_enter(&aiop->aio_mutex);

        /* more work was queued in the meantime: go around again */
        if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
            aiop->aio_notifyq != NULL ||
            aiop->aio_portcleanupq != NULL) {
            mutex_exit(&aiop->aio_mutex);
            continue;
        }
        /* as->a_contents is taken while aio_mutex is still held */
        mutex_enter(&as->a_contents);

        /*
         * AIO_CLEANUP determines when the cleanup thread
         * should be active. This flag is only set when
         * the cleanup thread is awakened by as_unmap().
         * The flag is cleared when the blocking as_unmap()
         * that originally awakened us is allowed to
         * complete. as_unmap() blocks when trying to
         * unmap a segment that has SOFTLOCKed pages. when
         * the segment's pages are all SOFTUNLOCKed,
         * as->a_flags & AS_UNMAPWAIT should be zero. The flag
         * shouldn't be cleared right away if the cleanup thread
         * was interrupted because the process is doing forkall().
         * This happens when cv_wait_sig() returns zero,
         * because it was awakened by a pokelwps(). If the
         * process is not exiting, it must be doing forkall().
         */
        if ((poked == 0) &&
            ((AS_ISUNMAPWAIT(as) == 0) || (aiop->aio_pending == 0))) {
            aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
            cvp = &as->a_cv;
        }
        mutex_exit(&aiop->aio_mutex);
        if (poked) {
            /*
             * If the process is exiting/killed, don't return
             * immediately without waiting for pending I/O's
             * and releasing the page locks.
             */
            if (p->p_flag & (SEXITLWPS|SKILLED)) {
                /*
                 * If exit_flag is set, then it is
                 * safe to exit because we have released
                 * page locks of completed I/O's.
                 * Note that as->a_contents is still held
                 * when the loop is left here.
                 */
                if (exit_flag)
                    break;

                mutex_exit(&as->a_contents);

                /*
                 * Wait for all the pending aio to complete.
                 * AIO_REQ_BLOCK is set first, under aio_mutex.
                 */
                mutex_enter(&aiop->aio_mutex);
                aiop->aio_flags |= AIO_REQ_BLOCK;
                while (aiop->aio_pending != 0)
                    cv_wait(&aiop->aio_cleanupcv,
                        &aiop->aio_mutex);
                mutex_exit(&aiop->aio_mutex);
                exit_flag = 1;
                continue;
            } else if (p->p_flag &
                (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
                /*
                 * hold LWP until it
                 * is continued.
                 */
                mutex_exit(&as->a_contents);
                mutex_enter(&p->p_lock);
                stop(PR_SUSPENDED, SUSPEND_NORMAL);
                mutex_exit(&p->p_lock);
                poked = 0;
                continue;
            }
            /*
             * poked, but none of the flags above is set: fall
             * through to the common mutex_exit() below with
             * poked left as it is.
             */
        } else {
            /*
             * When started this thread will sleep on as->a_cv.
             * as_unmap will awake this thread if the
             * segment has SOFTLOCKed pages (poked = 0).
             * 1. pokelwps() awakes this thread =>
             *    break the loop to check SEXITLWPS, SHOLDFORK, etc
             * 2. as_unmap awakes this thread =>
             *    to break the loop it is necessary that
             *    - AS_UNMAPWAIT is set (as_unmap is waiting for
             *  memory to be unlocked)
             *    - some transactions are still pending
             *    - AIO_CLEANUP is not set
             *  (if AIO_CLEANUP is set we have to wait for
             *  pending requests. aio_done will send a signal
             *  for every request which completes to continue
             *  unmapping the corresponding address range)
             */
            while (poked == 0) {
                if ((AS_ISUNMAPWAIT(as) != 0) &&
                    (aiop->aio_pending != 0) &&
                    ((aiop->aio_flags & AIO_CLEANUP) == 0))
                    break;
                /* a zero return from cv_wait_sig() sets poked */
                poked = !cv_wait_sig(cvp, &as->a_contents);
                if (AS_ISUNMAPWAIT(as) == 0)
                    cv_signal(cvp);
                if (aiop->aio_outstanding != 0)
                    break;
            }
        }
        mutex_exit(&as->a_contents);
    }
    /*
     * The label below is not the target of any goto in this function;
     * control arrives here only through the break above, with
     * as->a_contents held.
     */
exit:
    mutex_exit(&as->a_contents);
    ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
    aston(curthread);   /* make thread do post_syscall */
    return (0);
}

/*
 * Record an outstanding request in the process's hash table, keyed by the
 * user-level result pointer stored in the request.
 *
 * The request is appended to the tail of its bucket's chain.  Returns 0
 * on success, or DUPLICATE when a request with the same result pointer is
 * already present (in which case nothing is modified).
 */
static int
aio_hash_insert(
    aio_req_t *aio_reqp,
    aio_t *aiop)
{
    aio_result_t *key = aio_reqp->aio_req_resultp;
    aio_req_t **linkp;
    long slot;

    slot = AIO_HASH(key);
    /* advance along the chain's link pointers until the NULL tail */
    for (linkp = &aiop->aio_hash[slot]; *linkp != NULL;
        linkp = &(*linkp)->aio_hash_next) {
        if ((*linkp)->aio_req_resultp == key)
            return (DUPLICATE);
    }

    *linkp = aio_reqp;
    aio_reqp->aio_hash_next = NULL;
    return (0);
}

/*
 * Decide whether kernel async I/O can be used on a vnode and, if so,
 * return the routine that starts the transfer.
 *
 *   vp   - vnode of the open file
 *   mode - FREAD selects the read routine; anything else selects write
 *
 * Returns a function taking (vnode_t *, struct aio_req *, cred_t *), or
 * NULL when kaio is not available for this vnode:
 *   - the vnode is not a character device, or it is a STREAMS device
 *   - the driver's devo_rev is below 3 or its cb_rev is below 1
 *   - the driver has no strategy routine
 *   - the driver's async entry point for the requested direction is nodev
 *
 * For PXFS vnodes the clpxfs_aio_* routines are returned instead of the
 * driver wrappers.  On the non-PXFS success path the snode is marked
 * SACC.
 */
static int
(*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
    cred_t *)
{
    struct snode *sp;
    dev_t       dev;
    struct cb_ops   *cb;
    major_t     major;
    int     (*aio_func)();

    dev = vp->v_rdev;
    major = getmajor(dev);

    /*
     * return NULL for requests to files and STREAMs so
     * that libaio takes care of them.
     */
    if (vp->v_type == VCHR) {
        /* no stream device for kaio */
        if (STREAMSTAB(major)) {
            return (NULL);
        }
    } else {
        return (NULL);
    }

    /*
     * Check old drivers which do not have async I/O entry points.
     */
    if (devopsp[major]->devo_rev < 3)
        return (NULL);

    cb = devopsp[major]->devo_cb_ops;

    if (cb->cb_rev < 1)
        return (NULL);

    /*
     * Check whether this device is a block device.
     * Kaio is not supported for devices like tty.
     */
    if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
        return (NULL);

    /*
     * Clustering: If vnode is a PXFS vnode, then the device may be remote.
     * We cannot call the driver directly. Instead return the
     * PXFS functions.
     */

    if (IS_PXFSVP(vp)) {
        if (mode & FREAD)
            return (clpxfs_aio_read);
        else
            return (clpxfs_aio_write);
    }
    if (mode & FREAD)
        aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
    else
        aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;

    /*
     * aio_func is either NULL (the driver entry point is nodev) or one
     * of the driver_aio_* wrappers; it can never equal nodev itself.
     * Test for NULL so that an unsupported device is rejected before
     * the snode is marked as accessed.
     */
    if (aio_func == NULL)
        return (NULL);

    sp = VTOS(vp);
    smark(sp, SACC);
    return (aio_func);
}

/*
 * Clustering: check_vp() hands back a routine with one prototype that
 * serves both the PXFS and the plain driver case.  This is the adapter
 * for plain drivers: it looks up the device behind the vnode and
 * forwards the request to the driver's cb_awrite entry point, returning
 * whatever that entry point returns.
 */

static int
driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
{
    struct cb_ops   *ops;
    dev_t       rdev;

    /* only non-PXFS, non-STREAMS character devices get here */
    ASSERT(vp->v_type == VCHR);
    ASSERT(!IS_PXFSVP(vp));

    rdev = VTOS(vp)->s_dev;
    ASSERT(STREAMSTAB(getmajor(rdev)) == NULL);

    ops = devopsp[getmajor(rdev)]->devo_cb_ops;
    ASSERT(ops->cb_awrite != nodev);

    return ((*ops->cb_awrite)(rdev, aio, cred_p));
}

/*
 * Clustering: check_vp() hands back a routine with one prototype that
 * serves both the PXFS and the plain driver case.  This is the adapter
 * for plain drivers: it looks up the device behind the vnode and
 * forwards the request to the driver's cb_aread entry point, returning
 * whatever that entry point returns.
 */

static int
driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
{
    struct cb_ops   *ops;
    dev_t       rdev;

    /* only non-PXFS, non-STREAMS character devices get here */
    ASSERT(vp->v_type == VCHR);
    ASSERT(!IS_PXFSVP(vp));

    rdev = VTOS(vp)->s_dev;
    ASSERT(!STREAMSTAB(getmajor(rdev)));

    ops = devopsp[getmajor(rdev)]->devo_cb_ops;
    ASSERT(ops->cb_aread != nodev);

    return ((*ops->cb_aread)(rdev, aio, cred_p));
}

/*
 * This routine is called when a largefile call is made by a 32bit
 * process on a ILP32 or LP64 kernel. All 64bit processes are large
 * file by definition and will call alio() instead.
 */
static int
alioLF(
    int     mode_arg,
    void        *aiocb_arg,
    int     nent,
    void        *sigev)
{
    file_t      *fp;
    file_t      *prev_fp = NULL;
    int     prev_mode = -1;
    struct vnode    *vp;
    aio_lio_t   *head;
    aio_req_t   *reqp;
    aio_t       *aiop;
    caddr_t     cbplist;
    aiocb64_32_t    *cbp;
    caddr32_t   *ucbp;
    aiocb64_32_t    cb64;
    aiocb64_32_t    *aiocb = &cb64;
#ifdef _LP64
    aiocb_t     aiocb_n;
#endif
    struct sigevent32   sigevk;
    sigqueue_t  *sqp;
    int     (*aio_func)();
    int     mode;
    int     error = 0, aio_errors = 0;
    int     i;
    size_t      ssize;
    int     deadhead = 0;
    int     aio_notsupported = 0;
    int     aio_use_port = 0;
    port_kevent_t   *pkevtp = NULL;
    port_notify32_t pnotify;

    aiop = curproc->p_aio;
    if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
        return (EINVAL);

    ASSERT(get_udatamodel() == DATAMODEL_ILP32);

    ssize = (sizeof (caddr32_t) * nent);
    cbplist = kmem_alloc(ssize, KM_SLEEP);
    ucbp = (caddr32_t *)cbplist;

    if (copyin(aiocb_arg, cbplist, ssize)) {
        kmem_free(cbplist, ssize);
        return (EFAULT);
    }

    if (sigev) {
        if (copyin(sigev, &sigevk, sizeof (sigevk))) {
            kmem_free(cbplist, ssize);
            return (EFAULT);
        }
    }

    /*
     * a list head should be allocated if notification is
     * enabled for this list.
     */
    head = NULL;

    /* Event Ports  */

    if (sigev && sigevk.sigev_notify == SIGEV_PORT) {
        /* Use PORT for completion notification */
        if (copyin((void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
            &pnotify, sizeof (port_notify32_t))) {
            kmem_free(cbplist, ssize);
            return (EFAULT);
        }
        /* use event ports for the list of aiocbs */
        aio_use_port = 1;
        error = port_alloc_event(pnotify.portnfy_port,
            PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp);
        if (error) {
            if (error == ENOMEM)
                error = EAGAIN;
            kmem_free(cbplist, ssize);
            return (error);
        }
    } else if ((mode_arg == LIO_WAIT) || sigev) {
        mutex_enter(&aiop->aio_mutex);
        error = aio_lio_alloc(&head);
        mutex_exit(&aiop->aio_mutex);
        if (error)
            goto done;
        deadhead = 1;
        head->lio_nent = nent;
        head->lio_refcnt = nent;
        if (sigev && (sigevk.sigev_notify == SIGEV_SIGNAL) &&
            (sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG)) {
            sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
            if (sqp == NULL) {
                error = EAGAIN;
                goto done;
            }
            sqp->sq_func = NULL;
            sqp->sq_next = NULL;
            sqp->sq_info.si_code = SI_ASYNCIO;
            sqp->sq_info.si_pid = curproc->p_pid;
            sqp->sq_info.si_ctid = PRCTID(curproc);
            sqp->sq_info.si_zoneid = getzoneid();
            sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
            sqp->sq_info.si_signo = sigevk.sigev_signo;
            sqp->sq_info.si_value.sival_int =
                sigevk.sigev_value.sival_int;
            head->lio_sigqp = sqp;
        } else {
            head->lio_sigqp = NULL;
        }
    }

    for (i = 0; i < nent; i++, ucbp++) {

        cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
        /* skip entry if it can't be copied. */
        if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb64_32_t))) {
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            continue;
        }

        /* skip if opcode for aiocb is LIO_NOP */

        mode = aiocb->aio_lio_opcode;
        if (mode == LIO_NOP) {
            cbp = NULL;
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            continue;
        }

        /* increment file descriptor's ref count. */
        if ((fp = getf(aiocb->aio_fildes)) == NULL) {
            lio_set_uerror(&cbp->aio_resultp, EBADF);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        vp = fp->f_vnode;

        /*
         * check the permission of the partition
         */
        mode = aiocb->aio_lio_opcode;
        if ((fp->f_flag & mode) == 0) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, EBADF);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        /*
         * common case where requests are to the same fd
         * for the same r/w operation
         * for UFS, need to set EBADFD
         */
        if ((fp != prev_fp) || (mode != prev_mode)) {
            aio_func = check_vp(vp, mode);
            if (aio_func == NULL) {
                prev_fp = NULL;
                releasef(aiocb->aio_fildes);
                lio_set_uerror(&cbp->aio_resultp, EBADFD);
                aio_notsupported++;
                if (head) {
                    mutex_enter(&aiop->aio_mutex);
                    head->lio_nent--;
                    head->lio_refcnt--;
                    mutex_exit(&aiop->aio_mutex);
                }
                continue;
            } else {
                prev_fp = fp;
                prev_mode = mode;
            }
        }
#ifdef  _LP64
        aiocb_LFton(aiocb, &aiocb_n);
        error = aio_req_setup(&reqp, aiop, &aiocb_n,
            (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp);
#else
        error = aio_req_setupLF(&reqp, aiop, aiocb,
            (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp);
#endif  /* _LP64 */
        if (error) {
            releasef(aiocb->aio_fildes);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        reqp->aio_req_lio = head;
        deadhead = 0;

        /*
         * Set the errno field now before sending the request to
         * the driver to avoid a race condition
         */
        (void) suword32(&cbp->aio_resultp.aio_errno,
            EINPROGRESS);

        reqp->aio_req_iocb.iocb32 = *ucbp;

        if (aio_use_port) {
            reqp->aio_req_port = pnotify.portnfy_port;
            error = aio_req_assoc_port32(&aiocb->aio_sigevent,
                (void *)(uintptr_t)pnotify.portnfy_user,
                (aiocb_t *)(uintptr_t)*ucbp, reqp, pkevtp);
        }

        /*
         * send the request to driver.
         * Clustering: If PXFS vnode, call PXFS function.
         */
        if (error == 0) {
            if (aiocb->aio_nbytes == 0) {
                clear_active_fd(aiocb->aio_fildes);
                aio_zerolen(reqp);
                continue;
            }
            error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
                CRED());
        }

        /*
         * the fd's ref count is not decremented until the IO has
         * completed unless there was an error.
         */
        if (error) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, error);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            if (error == ENOTSUP)
                aio_notsupported++;
            else
                aio_errors++;
            lio_set_error(reqp);
        } else {
            clear_active_fd(aiocb->aio_fildes);
        }
    }

    if (pkevtp)
        port_free_event(pkevtp);

    if (aio_notsupported) {
        error = ENOTSUP;
    } else if (aio_errors) {
        /*
         * return EIO if any request failed
         */
        error = EIO;
    }

    if (mode_arg == LIO_WAIT) {
        mutex_enter(&aiop->aio_mutex);
        while (head->lio_refcnt > 0) {
            if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
                mutex_exit(&aiop->aio_mutex);
                error = EINTR;
                goto done;
            }
        }
        mutex_exit(&aiop->aio_mutex);
        alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
    }

done:
    kmem_free(cbplist, ssize);
    if (deadhead) {
        if (head->lio_sigqp)
            kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
        kmem_free(head, sizeof (aio_lio_t));
    }
    return (error);
}

#ifdef  _SYSCALL32_IMPL
/*
 * Convert a 32-bit largefile control block (aiocb64_32_t) into the
 * native aiocb_t form, field by field.  32-bit user pointers are widened
 * through uintptr_t.  Only element 0 of aio__pad is copied.
 */
static void
aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
{
    struct sigevent32 *from = &src->aio_sigevent;
    struct sigevent *to = &dest->aio_sigevent;

    /* notification settings */
    to->sigev_notify = from->sigev_notify;
    to->sigev_signo = from->sigev_signo;
    /*
     * See comment in sigqueue32() on handling of 32-bit
     * sigvals in a 64-bit kernel.
     */
    to->sigev_value.sival_int = (int)from->sigev_value.sival_int;
    to->sigev_notify_function = (void (*)(union sigval))
        (uintptr_t)from->sigev_notify_function;
    to->sigev_notify_attributes = (pthread_attr_t *)
        (uintptr_t)from->sigev_notify_attributes;
    to->__sigev_pad2 = from->__sigev_pad2;

    /* transfer description */
    dest->aio_fildes = src->aio_fildes;
    dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
    dest->aio_nbytes = (size_t)src->aio_nbytes;
    dest->aio_offset = (off_t)src->aio_offset;
    dest->aio_reqprio = src->aio_reqprio;

    /* remaining bookkeeping fields */
    dest->aio_lio_opcode = src->aio_lio_opcode;
    dest->aio_state = src->aio_state;
    dest->aio__pad[0] = src->aio__pad[0];
}
#endif

/*
 * This function is used only for largefile calls made by
 * 32 bit applications on 32 bit kernel.
 *
 * Build an aio request from a largefile aiocb64_32_t.
 *
 *   reqpp   - out: the newly set up request (written only on success)
 *   aiop    - aio state of the process
 *   arg     - kernel copy of the caller's control block
 *   resultp - user address of the result structure, used as the hash key
 *   port    - non-zero to queue the request on the port-pending list
 *   vp      - vnode the I/O is directed at
 *
 * Returns 0 on success; EAGAIN if the signal queue entry cannot be
 * allocated; EIO if AIO_REQ_BLOCK is set on the process; otherwise the
 * error returned by aio_req_alloc().
 */
static int
aio_req_setupLF(
    aio_req_t   **reqpp,
    aio_t       *aiop,
    aiocb64_32_t    *arg,
    aio_result_t    *resultp,
    int     port,
    vnode_t     *vp)
{
    struct  sigevent *sigev = (struct sigevent *)&arg->aio_sigevent;
    sigqueue_t  *sqp = NULL;
    aio_req_t   *reqp;
    struct  uio *uio;
    int         error;

    /*
     * Preallocate the siginfo only for signal notification with a
     * signal number in range.
     */
    if (sigev->sigev_notify == SIGEV_SIGNAL &&
        sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
        sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
        if (sqp == NULL)
            return (EAGAIN);
        sqp->sq_func = NULL;
        sqp->sq_next = NULL;
        sqp->sq_info.si_signo = sigev->sigev_signo;
        sqp->sq_info.si_code = SI_ASYNCIO;
        sqp->sq_info.si_value = sigev->sigev_value;
        sqp->sq_info.si_pid = curproc->p_pid;
        sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
        sqp->sq_info.si_ctid = PRCTID(curproc);
        sqp->sq_info.si_zoneid = getzoneid();
    }

    mutex_enter(&aiop->aio_mutex);

    /*
     * Refuse new requests while AIO_REQ_BLOCK is set; otherwise get an
     * aio_req from the free list or from dynamic memory.
     */
    if (aiop->aio_flags & AIO_REQ_BLOCK)
        error = EIO;
    else
        error = aio_req_alloc(&reqp, resultp);

    if (error != 0) {
        mutex_exit(&aiop->aio_mutex);
        if (sqp != NULL)
            kmem_free(sqp, sizeof (sigqueue_t));
        return (error);
    }

    aiop->aio_pending++;
    aiop->aio_outstanding++;
    reqp->aio_req_flags = AIO_PENDING;
    if (port)
        aio_enq_port_pending(aiop, reqp);
    mutex_exit(&aiop->aio_mutex);

    /*
     * initialize aio request: one iovec describing the user buffer.
     */
    reqp->aio_req_fd = arg->aio_fildes;
    reqp->aio_req_sigqp = sqp;
    reqp->aio_req_iocb.iocb = NULL;
    reqp->aio_req_buf.b_file = vp;

    uio = reqp->aio_req.aio_uio;
    uio->uio_iovcnt = 1;
    uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
    uio->uio_iov->iov_len = arg->aio_nbytes;
    uio->uio_loffset = arg->aio_offset;

    *reqpp = reqp;
    return (0);
}

/*
 * This routine is called when a non largefile call is made by a 32bit
 * process on a ILP32 or LP64 kernel.
 */
static int
alio32(
    int     mode_arg,
    void        *aiocb_arg,
    int     nent,
    void        *sigev_arg)
{
    file_t      *fp;
    file_t      *prev_fp = NULL;
    int     prev_mode = -1;
    struct vnode    *vp;
    aio_lio_t   *head;
    aio_req_t   *reqp;
    aio_t       *aiop;
    aiocb_t     cb;
    aiocb_t     *aiocb = &cb;
    caddr_t     cbplist;
#ifdef  _LP64
    aiocb32_t   *cbp;
    caddr32_t   *ucbp;
    aiocb32_t   cb32;
    aiocb32_t   *aiocb32 = &cb32;
    struct sigevent32   sigev;
#else
    aiocb_t     *cbp, **ucbp;
    struct sigevent sigev;
#endif
    sigqueue_t  *sqp;
    int     (*aio_func)();
    int     mode;
    int     error = 0, aio_errors = 0;
    int     i;
    size_t      ssize;
    int     deadhead = 0;
    int     aio_notsupported = 0;
    int     aio_use_port = 0;
    port_kevent_t   *pkevtp = NULL;
#ifdef  _LP64
    port_notify32_t pnotify;
#else
    port_notify_t   pnotify;
#endif
    aiop = curproc->p_aio;
    if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
        return (EINVAL);

#ifdef  _LP64
    ssize = (sizeof (caddr32_t) * nent);
#else
    ssize = (sizeof (aiocb_t *) * nent);
#endif
    cbplist = kmem_alloc(ssize, KM_SLEEP);
    ucbp = (void *)cbplist;

    if (copyin(aiocb_arg, cbplist, ssize)) {
        kmem_free(cbplist, ssize);
        return (EFAULT);
    }

    if (sigev_arg) {
        if (copyin(sigev_arg, &sigev, sizeof (struct sigevent32))) {
            kmem_free(cbplist, ssize);
            return (EFAULT);
        }
    }

    /*
     * a list head should be allocated if notification is
     * enabled for this list.
     */
    head = NULL;

    /* Event Ports  */

    if (sigev_arg && sigev.sigev_notify == SIGEV_PORT) {
        /* Use PORT for completion notification */
        if (copyin((void *)(uintptr_t)sigev.sigev_value.sival_ptr,
            &pnotify, sizeof (port_notify32_t))) {
            kmem_free(cbplist, ssize);
            return (EFAULT);
        }
        /* use event ports for the list of aiocbs */
        aio_use_port = 1;
        error = port_alloc_event(pnotify.portnfy_port,
            PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp);
        if (error) {
            if ((error == ENOMEM) || (error == EAGAIN))
                error = EAGAIN;
            else
                error = EINVAL;
            kmem_free(cbplist, ssize);
            return (error);
        }
    } else if ((mode_arg == LIO_WAIT) || sigev_arg) {
        mutex_enter(&aiop->aio_mutex);
        error = aio_lio_alloc(&head);
        mutex_exit(&aiop->aio_mutex);
        if (error)
            goto done;
        deadhead = 1;
        head->lio_nent = nent;
        head->lio_refcnt = nent;
        if (sigev_arg && (sigev.sigev_notify == SIGEV_SIGNAL) &&
            (sigev.sigev_signo > 0 && sigev.sigev_signo < NSIG)) {
            sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
            if (sqp == NULL) {
                error = EAGAIN;
                goto done;
            }
            sqp->sq_func = NULL;
            sqp->sq_next = NULL;
            sqp->sq_info.si_code = SI_ASYNCIO;
            sqp->sq_info.si_pid = curproc->p_pid;
            sqp->sq_info.si_ctid = PRCTID(curproc);
            sqp->sq_info.si_zoneid = getzoneid();
            sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
            sqp->sq_info.si_signo = sigev.sigev_signo;
            sqp->sq_info.si_value.sival_int =
                sigev.sigev_value.sival_int;
            head->lio_sigqp = sqp;
        } else {
            head->lio_sigqp = NULL;
        }
    }

    for (i = 0; i < nent; i++, ucbp++) {

        /* skip entry if it can't be copied. */
#ifdef  _LP64
        cbp = (aiocb32_t *)(uintptr_t)*ucbp;
        if (cbp == NULL || copyin(cbp, aiocb32, sizeof (aiocb32_t))) {
#else
        cbp = (aiocb_t *)*ucbp;
        if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb_t))) {
#endif
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            continue;
        }
#ifdef  _LP64
        /*
         * copy 32 bit structure into 64 bit structure
         */
        aiocb_32ton(aiocb32, aiocb);
#endif /* _LP64 */

        /* skip if opcode for aiocb is LIO_NOP */

        mode = aiocb->aio_lio_opcode;
        if (mode == LIO_NOP) {
            cbp = NULL;
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            continue;
        }

        /* increment file descriptor's ref count. */
        if ((fp = getf(aiocb->aio_fildes)) == NULL) {
            lio_set_uerror(&cbp->aio_resultp, EBADF);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        vp = fp->f_vnode;

        /*
         * check the permission of the partition
         */
        mode = aiocb->aio_lio_opcode;
        if ((fp->f_flag & mode) == 0) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, EBADF);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        /*
         * common case where requests are to the same fd
         * for the same r/w operation
         * for UFS, need to set EBADFD
         */
        if ((fp != prev_fp) || (mode != prev_mode)) {
            aio_func = check_vp(vp, mode);
            if (aio_func == NULL) {
                prev_fp = NULL;
                releasef(aiocb->aio_fildes);
                lio_set_uerror(&cbp->aio_resultp,
                    EBADFD);
                aio_notsupported++;
                if (head) {
                    mutex_enter(&aiop->aio_mutex);
                    head->lio_nent--;
                    head->lio_refcnt--;
                    mutex_exit(&aiop->aio_mutex);
                }
                continue;
            } else {
                prev_fp = fp;
                prev_mode = mode;
            }
        }
        if (error = aio_req_setup(&reqp, aiop, aiocb,
            (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp)) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, error);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            aio_errors++;
            continue;
        }

        reqp->aio_req_lio = head;
        deadhead = 0;

        /*
         * Set the errno field now before sending the request to
         * the driver to avoid a race condition
         */
        (void) suword32(&cbp->aio_resultp.aio_errno,
            EINPROGRESS);

        reqp->aio_req_iocb.iocb32 = ((caddr32_t *)cbplist)[i];

        if (aio_use_port) {
            reqp->aio_req_port = pnotify.portnfy_port;
#ifdef _LP64
            error = aio_req_assoc_port32(&aiocb32->aio_sigevent,
                (void *)(uintptr_t)pnotify.portnfy_user,
                (aiocb_t *)(uintptr_t)(((caddr32_t *)cbplist)[i]),
                reqp, pkevtp);
#else
            error = aio_req_assoc_port(&aiocb->aio_sigevent,
                pnotify.portnfy_user,
                (aiocb_t *)(((caddr32_t *)cbplist)[i]),
                reqp, pkevtp);
#endif
        }

        /*
         * send the request to driver.
         * Clustering: If PXFS vnode, call PXFS function.
         */
        if (error == 0) {
            if (aiocb->aio_nbytes == 0) {
                clear_active_fd(aiocb->aio_fildes);
                aio_zerolen(reqp);
                continue;
            }
            error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
                CRED());
        }

        /*
         * the fd's ref count is not decremented until the IO has
         * completed unless there was an error.
         */
        if (error) {
            releasef(aiocb->aio_fildes);
            lio_set_uerror(&cbp->aio_resultp, error);
            if (head) {
                mutex_enter(&aiop->aio_mutex);
                head->lio_nent--;
                head->lio_refcnt--;
                mutex_exit(&aiop->aio_mutex);
            }
            if (error == ENOTSUP)
                aio_notsupported++;
            else
                aio_errors++;
            lio_set_error(reqp);
        } else {
            clear_active_fd(aiocb->aio_fildes);
        }
    }

    if (pkevtp)
        port_free_event(pkevtp);

    if (aio_notsupported) {
        error = ENOTSUP;
    } else if (aio_errors) {
        /*
         * return EIO if any request failed
         */
        error = EIO;
    }

    if (mode_arg == LIO_WAIT) {
        mutex_enter(&aiop->aio_mutex);
        while (head->lio_refcnt > 0) {
            if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
                mutex_exit(&aiop->aio_mutex);
                error = EINTR;
                goto done;
            }
        }
        mutex_exit(&aiop->aio_mutex);
        alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
    }

done:
    kmem_free(cbplist, ssize);
    if (deadhead) {
        if (head->lio_sigqp)
            kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
        kmem_free(head, sizeof (aio_lio_t));
    }
    return (error);
}


#ifdef  _SYSCALL32_IMPL
/*
 * Widen a 32-bit aiocb (src) into the native aiocb (dest).
 * Every field is copied except aio_resultp, which this routine does not
 * touch.  Pointer-valued fields are widened through uintptr_t.
 */
void
aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
{
    struct sigevent32   *sev32 = &src->aio_sigevent;
    struct sigevent     *sev = &dest->aio_sigevent;

    /* description of the transfer itself */
    dest->aio_fildes = src->aio_fildes;
    dest->aio_lio_opcode = src->aio_lio_opcode;
    dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
    dest->aio_nbytes = (size_t)src->aio_nbytes;
    dest->aio_offset = (off_t)src->aio_offset;
    dest->aio_reqprio = src->aio_reqprio;

    /* bookkeeping and padding words */
    dest->aio_state = src->aio_state;
    dest->aio__pad[0] = src->aio__pad[0];

    /* completion notification */
    sev->sigev_notify = sev32->sigev_notify;
    sev->sigev_signo = sev32->sigev_signo;

    /*
     * See comment in sigqueue32() on handling of 32-bit
     * sigvals in a 64-bit kernel.
     */
    sev->sigev_value.sival_int = (int)sev32->sigev_value.sival_int;

    sev->sigev_notify_function = (void (*)(union sigval))
        (uintptr_t)sev32->sigev_notify_function;
    sev->sigev_notify_attributes = (pthread_attr_t *)
        (uintptr_t)sev32->sigev_notify_attributes;
    sev->__sigev_pad2 = sev32->__sigev_pad2;
}
#endif /* _SYSCALL32_IMPL */

/*
 * aio_port_callback() is called just before the event is retrieved from the
 * port. The task of this callback function is to finish the work of the
 * transaction for the application, which means:
 * - copy out the transaction data to the application
 *  (this thread is running in the right process context)
 * - keep track of the transaction (update of counters).
 * - free allocated buffers
 * The aiocb pointer is the object element of the port_kevent_t structure.
 *
 * flag :
 *  PORT_CALLBACK_DEFAULT : do copyout and free resources
 *  PORT_CALLBACK_CLOSE   : don't do copyout, free resources
 */

/*ARGSUSED*/
int
aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
{
    aio_t           *aiop = curproc->p_aio;
    aio_req_t       *reqp = arg;
    struct iovec    *iov = NULL;
    struct buf      *bp = NULL;
    void            *resultp = NULL;
    int             copyout_pending;

    /*
     * The result can only be delivered in the context of the process
     * the event belongs to; refuse anything else.
     */
    if (curproc->p_pid != pid)
        return (EACCES);

    /* detach the request from the port queue */
    mutex_enter(&aiop->aio_portq_mutex);
    reqp->aio_req_portkev = NULL;
    aio_req_remove_portq(aiop, reqp);
    mutex_exit(&aiop->aio_portq_mutex);

    /* unlock used pages */
    aphysio_unlock(reqp);

    mutex_enter(&aiop->aio_mutex);
    copyout_pending = ((reqp->aio_req_flags & AIO_COPYOUTDONE) == 0);
    if (copyout_pending) {
        /*
         * Capture what the copyout needs before the request goes
         * back to the free list.
         * NOTE(review): iov and bp still point into *reqp, which is
         * on the free list by the time they are used below - confirm
         * that it cannot be reused before the copyout finishes.
         */
        iov = reqp->aio_req_uio.uio_iov;
        bp = &reqp->aio_req_buf;
        resultp = (void *)reqp->aio_req_resultp;
    }
    /* request struct back to free list */
    aio_req_free_port(aiop, reqp);
    mutex_exit(&aiop->aio_mutex);

    /* copy out only if not already done and the caller asked for it */
    if (copyout_pending && flag == PORT_CALLBACK_DEFAULT)
        aio_copyout_result_port(iov, bp, resultp);
    return (0);
}