/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Inter-Domain Network
*
* Shared Memory Region (SMR) supporting code.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/machparam.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/systm.h>
#include <sys/machlock.h>
#include <sys/membar.h>
#include <sys/mman.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_sfmmu.h>
#include <sys/vm_machparam.h>
#include <sys/x_call.h>
#include <sys/idn.h>
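/*
* DIOCHECK: debug-only sanity check that warns if the given domain's
* outstanding I/O buffer count (dio) has gone negative. Compiles
* away to nothing on non-DEBUG builds.
*/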
#ifdef DEBUG
#define DIOCHECK(domid) \
{ \
int _dio; \
if ((_dio = idn_domain[domid].dio) < 0) { \
cmn_err(CE_WARN, \
">>>>> file %s, line %d: domain %d, dio = %d", \
__FILE__, __LINE__, (domid), _dio); \
} \
}
#else
#define DIOCHECK(domid)
#endif /* DEBUG */
static int smr_slab_alloc_local(int domid, smr_slab_t **spp);
static int smr_slab_alloc_remote(int domid, smr_slab_t **spp);
static void smr_slab_free_local(int domid, smr_slab_t *sp);
static void smr_slab_free_remote(int domid, smr_slab_t *sp);
static int smr_slabwaiter_register(int domid);
static int smr_slabwaiter_unregister(int domid, smr_slab_t **spp);
static int smr_slaballoc_wait(int domid, smr_slab_t **spp);
static smr_slab_t *smr_slab_reserve(int domid);
static void smr_slab_unreserve(int domid, smr_slab_t *sp);
static void smr_slab_reap_global();
/*
* Can only be called by the master. Allocate a slab from the
* local pool representing the SMR, on behalf of the given
* domain. Slab is either being requested for use by the
* local domain (i.e. domid == idn.localid), or it's being
* allocated to give to a remote domain which requested one.
* In the case of allocating on behalf of a remote domain, the
* smr_slab_t structure is used simply to manage ownership.
*
* Returns: smr_slaballoc_wait
* (EINVAL, ETIMEDOUT)
* smr_slabwaiter_unregister
* (0, EINVAL, EBUSY, ENOMEM)
* ENOLCK
*/
static int
smr_slab_alloc_local(int domid, smr_slab_t **spp)
{
int serrno = 0;
int nwait;
smr_slab_t *sp;
idn_domain_t *dp;
/*
* Only the master can make local allocations.
*/
ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);
ASSERT(idn.localid == IDN_GET_MASTERID());
*spp = NULL;
dp = &idn_domain[domid];
ASSERT(DSLAB_READ_HELD(domid));
ASSERT(dp->dslab_state == DSLAB_STATE_LOCAL);
/*
* Register myself with the waiting list.
*/
nwait = smr_slabwaiter_register(domid);
if (nwait > 1) {
/*
* XXX - old comment?
* Need to drop the read lock _after_ registering
* ourselves with the potential wait list for this allocation.
* Although this allocation is not a remote one, we could
* still have multiple threads on the master trying to
* satisfy (allocate) requests on behalf of a remote domain.
*/
/*
* Somebody is already in the process of satisfying
* the allocation request for this respective
* domain. All we need to do is wait and let
* it happen.
*/
serrno = smr_slaballoc_wait(domid, spp);
return (serrno);
}
/*
* I'm the original slab requester for this domain. It's local
* so go ahead and do the job.
*/
if ((sp = smr_slab_reserve(domid)) == NULL)
serrno = ENOMEM;
/*
* Allocation may have failed. In either case we've
* got to do the put to at least wake potential waiters up.
*/
if (!serrno) {
if (DSLAB_LOCK_TRYUPGRADE(domid) == 0) {
DSLAB_UNLOCK(domid);
DSLAB_LOCK_EXCL(domid);
}
}
(void) smr_slaballoc_put(domid, sp, 0, serrno);
/*
* If serrno is ENOLCK here, then we must have failed
* on the upgrade above, so lock already dropped.
*/
if (serrno != ENOLCK) {
/*
* Need to drop since reaping may be recursive?
*/
DSLAB_UNLOCK(domid);
}
/*
* Since we were the original requester but never went
* to sleep, we need to directly unregister ourselves
* from the waiting list.
*/
serrno = smr_slabwaiter_unregister(domid, spp);
/*
* Now that we've satisfied the request, let's check if any
* reaping is necessary. Only the master does this and only
* when allocating slabs, an infrequent event :-o
*/
smr_slab_reap_global();
ASSERT((serrno == 0) ? (*spp != NULL) : (*spp == NULL));
DSLAB_LOCK_SHARED(domid);
return (serrno);
}
/*
* Can only be called by a slave on behalf of itself. Need to
* make a request to the master to allocate a slab of SMR buffers
* for the local domain.
*
* Returns: smr_slaballoc_wait
* (0, EINVAL, EBUSY, ENOMEM)
* ENOLCK
* ECANCELED
*/
static int
smr_slab_alloc_remote(int domid, smr_slab_t **spp)
{
int nwait;
int serrno = 0;
int bailout = 0;
int masterid;
idn_domain_t *dp, *mdp = NULL;
procname_t proc = "smr_slab_alloc_remote";
/*
* Only slaves make remote allocations.
*/
ASSERT(idn.localid != IDN_GET_MASTERID());
ASSERT(domid == idn.localid);
ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);
*spp = NULL;
dp = &idn_domain[domid];
ASSERT(DSLAB_READ_HELD(domid));
ASSERT(dp->dslab_state == DSLAB_STATE_REMOTE);
/*
* Register myself with the slaballoc waiting list.
* Note that we only allow one outstanding allocation
* request for the given domain. Other callers which
* detect a slab is needed simply get stuck on the
* waiting list waiting for the original caller to
* get the job done.
* The waiter_register routine will allocate the necessary
* slab structure which will ultimately be inserted in
* the domain's slab list via smr_slaballoc_put().
*/
nwait = smr_slabwaiter_register(domid);
/*
* Make sure we have a connection with the master
* before we wait around for nothing and send a
* command off to nowhere.
* First do a quick (no lock) check for global okayness.
*/
if ((idn.state != IDNGS_ONLINE) ||
((masterid = IDN_GET_MASTERID()) == IDN_NIL_DOMID)) {
bailout = 1;
serrno = ECANCELED;
}
/*
* We need to drop our read lock _before_ acquiring the
* slaballoc waiter lock. This is necessary because the
* thread that receives the slab alloc response and fills
* in the slab structure will need to grab the domain write
* lock while holding onto the slaballoc waiter lock.
* Potentially could deadlock if we didn't drop our domain
* lock before. Plus, we've registered.
*
* 4093209 - Note also that we do this _after_ the check for
* idn.masterid where we grab the READER global
* lock. This is to prevent somebody from
* changing our state after we drop the drwlock.
* A deadlock can occur when shutting down a
* domain while we're still holding the drwlock.
*/
if (!bailout) {
mdp = &idn_domain[masterid];
/*
* Global state is okay. Let's double check the
* state of our actual target domain.
*/
if (mdp->dstate != IDNDS_CONNECTED) {
bailout = 1;
serrno = ECANCELED;
} else if (IDN_DLOCK_TRY_SHARED(masterid)) {
if (mdp->dstate != IDNDS_CONNECTED) {
bailout = 1;
serrno = ECANCELED;
IDN_DUNLOCK(masterid);
} else if (nwait != 1) {
IDN_DUNLOCK(masterid);
}
/*
* Note that we keep the drwlock (read) for
* the target (master) domain if it appears
* we're the lucky one to send the command.
* We hold onto the lock until we've actually
* sent the command out.
* We don't reach this place unless it
* appears everything is kosher with
* the target (master) domain.
*/
} else {
bailout = 1;
serrno = ENOLCK;
}
}
if (bailout) {
ASSERT(serrno);
/*
* Gotta bail. Abort operation. Error result
* will be picked up when we attempt to wait.
*/
PR_SMR("%s: BAILING OUT on behalf domain %d "
"(err=%d, gs=%s, ms=%s)\n",
proc, domid, serrno, idngs_str[idn.state],
(masterid == IDN_NIL_DOMID)
? "unknown" : idnds_str[idn_domain[masterid].dstate]);
(void) smr_slabwaiter_abort(domid, serrno);
} else if (nwait == 1) {
/*
* We are the original requester. Initiate the
* actual request to the master.
*/
idn_send_cmd(masterid, IDNCMD_SLABALLOC, IDN_SLAB_SIZE, 0, 0);
ASSERT(mdp);
IDN_DUNLOCK(masterid);
}
/*
* Wait here for response. Once awakened func returns
* with slab structure possibly filled with gifts!
*/
serrno = smr_slaballoc_wait(domid, spp);
return (serrno);
}
/*
* Allocate a slab from the Master on behalf
* of the given domain. Note that the master uses
* this function to allocate slabs on behalf of
* remote domains also.
* Entered with drwlock held.
* Leaves with drwlock dropped.
* Returns: EDQUOT
* EINVAL
* ENOLCK
* smr_slab_alloc_local
* smr_slab_alloc_remote
* (0, EINVAL, EBUSY, ENOMEM)
*/
int
smr_slab_alloc(int domid, smr_slab_t **spp)
{
int serrno = 0;
idn_domain_t *dp;
procname_t proc = "smr_slab_alloc";
dp = &idn_domain[domid];
ASSERT(DSLAB_READ_HELD(domid));
ASSERT(dp->dslab_state != DSLAB_STATE_UNKNOWN);
*spp = NULL;
switch (dp->dslab_state) {
case DSLAB_STATE_UNKNOWN:
cmn_err(CE_WARN,
"IDN: 300: no slab allocations without a master");
serrno = EINVAL;
break;
case DSLAB_STATE_LOCAL:
/*
* If I'm the master, then get a slab
* from the local SMR pool, but only
* if the number of allocated slabs has
* not been exceeded.
*/
if (((int)dp->dnslabs < IDN_SLAB_MAXPERDOMAIN) ||
!IDN_SLAB_MAXPERDOMAIN)
serrno = smr_slab_alloc_local(domid, spp);
else
serrno = EDQUOT;
break;
case DSLAB_STATE_REMOTE:
/*
* Have to make a remote request.
* In order to prevent overwhelming the master
* with a bunch of requests that it won't be able
* to handle, we first check whether we're still
* under quota. Note that the limit is known
* a priori based on the SMR/NWR size and
* IDN_SLAB_MINTOTAL. Domains must have the same
* SMR/NWR size; however, they can have different
* IDN_SLAB_MINTOTAL values. Thus a domain can
* throttle itself however it wishes.
*/
if (((int)dp->dnslabs < IDN_SLAB_MAXPERDOMAIN) ||
!IDN_SLAB_MAXPERDOMAIN)
serrno = smr_slab_alloc_remote(domid, spp);
else
serrno = EDQUOT;
break;
default:
cmn_err(CE_WARN,
"IDN: 301: (ALLOC) unknown slab state (%d) "
"for domain %d", dp->dslab_state, domid);
serrno = EINVAL;
break;
}
if (*spp == NULL) {
PR_SMR("%s: failed to allocate %s slab [serrno = %d]\n",
proc, (idn.localid == IDN_GET_MASTERID()) ?
"local" : "remote", serrno);
}
if (serrno) {
IDN_GKSTAT_GLOBAL_EVENT(gk_slabfail, gk_slabfail_last);
}
return (serrno);
}
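/*
* Free a single slab back to the local SMR pool. Only used on the
* master. If there happen to be waiters for a slab for this domain,
* the slab is handed off to them via smr_slaballoc_put(); otherwise
* it is returned to the global pool.
*/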
static void
smr_slab_free_local(int domid, smr_slab_t *sp)
{
int rv;
/*
* Do a slaballoc_put just in case there may have
* been waiters for slabs for this respective domain
* before we unreserve this slab.
*/
rv = smr_slaballoc_put(domid, sp, 0, 0);
if (rv == -1) {
/*
* Put failed. Must not have been any waiters.
* Go ahead and unreserve the space.
*/
smr_slab_unreserve(domid, sp);
}
}
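/*
* Free a single slab that was allocated from the master on our
* (slave's) behalf. If there are no waiters for the slab, the local
* bookkeeping structures are freed and a SLABFREE command is sent
* to the master identifying the SMR range being returned.
*/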
static void
smr_slab_free_remote(int domid, smr_slab_t *sp)
{
smr_offset_t slab_offset;
int slab_size;
int rv;
int masterid;
ASSERT(domid == idn.localid);
ASSERT(idn.localid != IDN_GET_MASTERID());
ASSERT(DSLAB_WRITE_HELD(domid));
ASSERT(idn_domain[domid].dslab_state == DSLAB_STATE_REMOTE);
masterid = IDN_GET_MASTERID();
ASSERT(masterid != IDN_NIL_DOMID);
slab_offset = IDN_ADDR2OFFSET(sp->sl_start);
slab_size = (int)(sp->sl_end - sp->sl_start);
/*
* Do a slaballoc_put just in case there may have
* been waiters for slabs for this domain before
* returning back to the master.
*/
rv = smr_slaballoc_put(domid, sp, 0, 0);
if ((rv == -1) && (masterid != IDN_NIL_DOMID)) {
/*
* Put failed. No waiters so free the local data
* structure ship the SMR range off to the master.
*/
smr_free_buflist(sp);
FREESTRUCT(sp, smr_slab_t, 1);
IDN_DLOCK_SHARED(masterid);
idn_send_cmd(masterid, IDNCMD_SLABFREE, slab_offset, slab_size,
0);
IDN_DUNLOCK(masterid);
}
}
/*
* Free up the list of slabs passed
*/
void
smr_slab_free(int domid, smr_slab_t *sp)
{
smr_slab_t *nsp = NULL;
ASSERT(DSLAB_WRITE_HELD(domid));
if (sp == NULL)
return;
ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);
switch (idn_domain[domid].dslab_state) {
case DSLAB_STATE_UNKNOWN:
cmn_err(CE_WARN, "IDN: 302: no slab free without a master");
break;
case DSLAB_STATE_LOCAL:
/*
* If I'm the master then put the slabs
* back to the local SMR pool.
*/
for (; sp; sp = nsp) {
nsp = sp->sl_next;
smr_slab_free_local(domid, sp);
}
break;
case DSLAB_STATE_REMOTE:
/*
* If the domid is my own then I'm freeing
* a slab back to the Master.
*/
for (; sp; sp = nsp) {
nsp = sp->sl_next;
smr_slab_free_remote(domid, sp);
}
break;
default:
cmn_err(CE_WARN,
"IDN: 301: (FREE) unknown slab state (%d) for domain %d",
idn_domain[domid].dslab_state, domid);
break;
}
}
/*
* Free up the list of slab data structures ONLY.
* This is called during a fatal shutdown of the master
* where we need to garbage collect the locally allocated
* data structures used to manage slabs allocated to the
* local domain. Should never be called by a master since
* the master can do a regular smr_slab_free.
*/
void
smr_slab_garbage_collection(smr_slab_t *sp)
{
smr_slab_t *nsp;
ASSERT(idn_domain[idn.localid].dvote.v.master == 0);
if (sp == NULL)
return;
/*
* Since this is only ever called by a slave,
* the slab structure size always contains a buflist.
*/
for (; sp; sp = nsp) {
nsp = sp->sl_next;
smr_free_buflist(sp);
FREESTRUCT(sp, smr_slab_t, 1);
}
}
/*
* Allocate a SMR buffer on behalf of the local domain
* which is ultimately targeted for the given domain.
*
* IMPORTANT: This routine is going to drop the domain rwlock (drwlock)
* for the domain on whose behalf the request is being
* made. This routine canNOT block on trying to
* reacquire the drwlock. If it does block, then somebody
* must have the write lock on the domain, which most likely
* means the domain is going south anyway, so just bail on
* this buffer. Higher levels will retry if needed.
*
* XXX - Support larger than IDN_SMR_BUFSIZE allocations?
*
* Returns: A negative return value indicates lock lost on domid.
* EINVAL, ENOLINK, ENOLCK(internal)
* smr_slaballoc_wait
* (EINVAL, ETIMEDOUT)
* smr_slabwaiter_unregister
* (0, EINVAL, EBUSY, ENOMEM)
*/
int
smr_buf_alloc(int domid, uint_t len, caddr_t *bufpp)
{
register idn_domain_t *dp, *ldp;
smr_slab_t *sp;
caddr_t bufp = NULL;
int serrno;
procname_t proc = "smr_buf_alloc";
dp = &idn_domain[domid];
/*
* Local domain can only allocate on behalf of
* itself if this is a privileged call and the
* caller is the master.
*/
ASSERT((domid != idn.localid) && (domid != IDN_NIL_DOMID));
*bufpp = NULL;
if (len > IDN_DATA_SIZE) {
cmn_err(CE_WARN,
"IDN: 303: buffer len %d > IDN_DATA_SIZE (%lu)",
len, IDN_DATA_SIZE);
IDN_GKSTAT_GLOBAL_EVENT(gk_buffail, gk_buffail_last);
return (EINVAL);
}
/*
* Need to go to my local slab list to find
* a buffer.
*/
ldp = &idn_domain[idn.localid];
/*
* Now we loop trying to locate a buffer out of our
* slabs. We continue this until either we find a
* buffer or we're unable to allocate a slab. Note
* that new slabs are allocated to the front.
*/
DSLAB_LOCK_SHARED(idn.localid);
sp = ldp->dslab;
do {
int spl, all_empty;
if (sp == NULL) {
if ((serrno = smr_slab_alloc(idn.localid, &sp)) != 0) {
PR_SMR("%s:%d: failed to allocate "
"slab [serrno = %d]",
proc, domid, serrno);
DSLAB_UNLOCK(idn.localid);
IDN_GKSTAT_GLOBAL_EVENT(gk_buffail,
gk_buffail_last);
return (serrno);
}
/*
* Of course, the world may have changed while
* we dropped the lock. Better make sure we're
* still established.
*/
if (dp->dstate != IDNDS_CONNECTED) {
PR_SMR("%s:%d: state changed during slab "
"alloc (dstate = %s)\n",
proc, domid, idnds_str[dp->dstate]);
DSLAB_UNLOCK(idn.localid);
IDN_GKSTAT_GLOBAL_EVENT(gk_buffail,
gk_buffail_last);
return (ENOLINK);
}
/*
* We were able to allocate a slab. Should
* be at the front of the list, spin again.
*/
sp = ldp->dslab;
}
/*
* If we have reached here then we have a slab!
* Hopefully there are free bufs there :-o
*/
spl = splhi();
all_empty = 1;
for (; sp && !bufp; sp = sp->sl_next) {
smr_slabbuf_t *bp;
if (sp->sl_free == NULL)
continue;
if (!lock_try(&sp->sl_lock)) {
all_empty = 0;
continue;
}
if ((bp = sp->sl_free) == NULL) {
lock_clear(&sp->sl_lock);
continue;
}
sp->sl_free = bp->sb_next;
bp->sb_next = sp->sl_inuse;
sp->sl_inuse = bp;
/*
* Found a free buffer.
*/
bp->sb_domid = domid;
bufp = bp->sb_bufp;
lock_clear(&sp->sl_lock);
}
splx(spl);
if (!all_empty && !bufp) {
/*
* If we still haven't found a buffer, but
* there's still possibly a buffer available,
* then try again. Only if we're absolutely
* sure all slabs are empty do we attempt
* to allocate a new one.
*/
sp = ldp->dslab;
}
} while (bufp == NULL);
*bufpp = bufp;
ATOMIC_INC(dp->dio);
DSLAB_UNLOCK(idn.localid);
return (0);
}
/*
* Free a buffer allocated to the local domain back to
* its respective slab. Slabs are freed via the slab-reap command.
* XXX - Support larger than IDN_SMR_BUFSIZE allocations?
*/
int
smr_buf_free(int domid, caddr_t bufp, uint_t len)
{
register smr_slab_t *sp;
smr_slabbuf_t *bp, **bpp;
idn_domain_t *ldp;
int buffreed;
int lockheld = (len == (uint_t)-1);
/*
* We should never be freeing a buffer on
* behalf of ourselves as we are never the
* target for allocated SMR buffers.
*/
ASSERT(domid != idn.localid);
sp = NULL;
buffreed = 0;
ldp = &idn_domain[idn.localid];
DSLAB_LOCK_SHARED(idn.localid);
if (((uintptr_t)bufp & (IDN_SMR_BUFSIZE-1)) &&
(IDN_ADDR2OFFSET(bufp) % IDN_SMR_BUFSIZE)) {
cmn_err(CE_WARN,
"IDN: 304: buffer (0x%p) from domain %d not on a "
"%d boundary", (void *)bufp, domid, IDN_SMR_BUFSIZE);
goto bfdone;
}
if (!lockheld && (len > IDN_DATA_SIZE)) {
cmn_err(CE_WARN,
"IDN: 305: buffer length (%d) from domain %d greater "
"than IDN_DATA_SIZE (%lu)",
len, domid, IDN_DATA_SIZE);
goto bfdone;
}
for (sp = ldp->dslab; sp; sp = sp->sl_next)
if ((bufp >= sp->sl_start) && (bufp < sp->sl_end))
break;
if (sp) {
int spl;
spl = splhi();
while (!lock_try(&sp->sl_lock))
;
bpp = &sp->sl_inuse;
for (bp = *bpp; bp; bp = *bpp) {
if (bp->sb_bufp == bufp)
break;
bpp = &bp->sb_next;
}
if (bp) {
ASSERT(bp->sb_domid == domid);
buffreed++;
bp->sb_domid = IDN_NIL_DOMID;
*bpp = bp->sb_next;
bp->sb_next = sp->sl_free;
sp->sl_free = bp;
}
lock_clear(&sp->sl_lock);
splx(spl);
}
bfdone:
if (buffreed) {
ATOMIC_DEC(idn_domain[domid].dio);
DIOCHECK(domid);
} else {
cmn_err(CE_WARN,
"IDN: 306: unknown buffer (0x%p) from domain %d",
(void *)bufp, domid);
ATOMIC_INC(idn_domain[domid].dioerr);
}
DSLAB_UNLOCK(idn.localid);
return (sp ? 0 : -1);
}
/*
* Alternative interface to smr_buf_free, but with local drwlock
* held.
*/
/* ARGSUSED2 */
int
smr_buf_free_locked(int domid, caddr_t bufp, uint_t len)
{
return (smr_buf_free(domid, bufp, (uint_t)-1));
}
/*
* Free any and all buffers associated with the given domain.
* Assumption is that domain is dead and buffers are not in use.
* Returns: Number of buffers freed.
* -1 if error.
*/
int
smr_buf_free_all(int domid)
{
register smr_slab_t *sp;
register smr_slabbuf_t *bp, **bpp;
idn_domain_t *ldp;
int nbufsfreed = 0;
procname_t proc = "smr_buf_free_all";
/*
* We should never be freeing buffers on
* behalf of ourselves.
*/
ASSERT(domid != idn.localid);
if (!VALID_DOMAINID(domid)) {
cmn_err(CE_WARN, "IDN: 307: domain ID (%d) invalid", domid);
return (-1);
}
ldp = &idn_domain[idn.localid];
/*
* We grab the writer lock so that we don't have any
* competition during a "free-all" call.
* No need to grab individual slab locks when holding
* dslab(writer).
*/
DSLAB_LOCK_EXCL(idn.localid);
for (sp = ldp->dslab; sp; sp = sp->sl_next) {
bpp = &sp->sl_inuse;
for (bp = *bpp; bp; bp = *bpp) {
if (bp->sb_domid == domid) {
bp->sb_domid = IDN_NIL_DOMID;
*bpp = bp->sb_next;
bp->sb_next = sp->sl_free;
sp->sl_free = bp;
nbufsfreed++;
} else {
bpp = &bp->sb_next;
}
}
}
if (nbufsfreed > 0) {
ATOMIC_SUB(idn_domain[domid].dio, nbufsfreed);
idn_domain[domid].dioerr = 0;
DIOCHECK(domid);
}
DSLAB_UNLOCK(idn.localid);
PR_SMR("%s: freed %d buffers for domain %d\n", proc, nbufsfreed, domid);
return (nbufsfreed);
}
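/*
* Reclaim up to nbufs SMR buffers that were handed out on behalf of
* the given (remote) domain, returning them to the free lists of the
* local slabs. Only one reclaim per domain is allowed to be in
* progress at a time. Returns the number of buffers actually
* reclaimed.
*/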
int
smr_buf_reclaim(int domid, int nbufs)
{
int num_reclaimed = 0;
idn_domain_t *ldp, *dp;
procname_t proc = "smr_buf_reclaim";
ldp = &idn_domain[idn.localid];
dp = &idn_domain[domid];
ASSERT(domid != idn.localid);
if (ATOMIC_CAS(&dp->dreclaim_inprogress, 0, 1)) {
/*
* Reclaim is already in progress, don't
* bother.
*/
PR_DATA("%s: reclaim already in progress\n", proc);
return (0);
}
PR_SMR("%s: requested %d buffers from domain %d\n", proc, nbufs, domid);
if (dp->dio && nbufs) {
register smr_slab_t *sp;
int spl;
DSLAB_LOCK_SHARED(idn.localid);
spl = splhi();
for (sp = ldp->dslab; sp && nbufs; sp = sp->sl_next) {
register smr_slabbuf_t *bp, **bpp;
if (sp->sl_inuse == NULL)
continue;
if (!lock_try(&sp->sl_lock))
continue;
if (sp->sl_inuse == NULL) {
lock_clear(&sp->sl_lock);
continue;
}
bpp = &sp->sl_inuse;
for (bp = *bpp; bp && nbufs; bp = *bpp) {
if (bp->sb_domid == domid) {
/*
* Buffer no longer in use,
* reclaim it.
*/
bp->sb_domid = IDN_NIL_DOMID;
*bpp = bp->sb_next;
bp->sb_next = sp->sl_free;
sp->sl_free = bp;
num_reclaimed++;
nbufs--;
} else {
bpp = &bp->sb_next;
}
}
lock_clear(&sp->sl_lock);
}
splx(spl);
if (num_reclaimed > 0) {
ATOMIC_SUB(dp->dio, num_reclaimed);
DIOCHECK(domid);
}
DSLAB_UNLOCK(idn.localid);
}
PR_SMR("%s: reclaimed %d buffers from domain %d\n",
proc, num_reclaimed, domid);
return (num_reclaimed);
}
/*
* Returns 1 If any buffers are locked for the given slab.
* 0 If all buffers are free for the given slab.
*
* The caller is assumed to have the slab protected so that no
* new allocations are attempted from it. Also, this is only
* valid to be called with respect to slabs that were allocated
* on behalf of the local domain, i.e. the master is not expected
* to call this function with (slave) slab "representatives".
*/
int
smr_slab_busy(smr_slab_t *sp)
{
return ((sp && sp->sl_inuse) ? 1 : 0);
}
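/*
* Allocate and initialize the slab waiting areas (one per possible
* domain) used to synchronize threads waiting for slab allocations
* to complete. No-op if the waiter area already exists.
*/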
int
smr_slabwaiter_init()
{
register int i;
struct slabwaiter *wp;
if (idn.slabwaiter != NULL)
return (0);
/*
* Initialize the slab waiting area for MAX_DOMAINS.
*/
idn.slabwaiter = GETSTRUCT(struct slabwaiter, MAX_DOMAINS);
wp = idn.slabwaiter;
for (i = 0; i < MAX_DOMAINS; wp++, i++) {
wp->w_closed = 0;
mutex_init(&wp->w_mutex, NULL, MUTEX_DEFAULT, NULL);
cv_init(&wp->w_cv, NULL, CV_DEFAULT, NULL);
}
return (0);
}
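/*
* Tear down the slab waiting areas. Assumes no waiters remain and
* no slabs are still pending delivery.
*/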
void
smr_slabwaiter_deinit()
{
register int i;
struct slabwaiter *wp;
if ((wp = idn.slabwaiter) == NULL)
return;
for (i = 0; i < MAX_DOMAINS; wp++, i++) {
ASSERT(wp->w_nwaiters == 0);
ASSERT(wp->w_sp == NULL);
cv_destroy(&wp->w_cv);
mutex_destroy(&wp->w_mutex);
}
FREESTRUCT(idn.slabwaiter, struct slabwaiter, MAX_DOMAINS);
idn.slabwaiter = NULL;
}
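/*
* Re-open the waiting areas for the given set of domains so that
* new slab allocation requests can be registered and waited on.
*/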
void
smr_slabwaiter_open(domainset_t domset)
{
int d;
struct slabwaiter *wp;
if ((domset == 0) || !idn.slabwaiter)
return;
wp = idn.slabwaiter;
for (d = 0; d < MAX_DOMAINS; wp++, d++) {
if (!DOMAIN_IN_SET(domset, d))
continue;
mutex_enter(&wp->w_mutex);
wp->w_closed = 0;
mutex_exit(&wp->w_mutex);
}
}
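/*
* Close the waiting areas for the given set of domains and wake any
* sleepers. Waiters that find the area closed will fail their
* allocation with ECANCELED when they unregister.
*/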
void
smr_slabwaiter_close(domainset_t domset)
{
int d;
struct slabwaiter *wp;
if ((domset == 0) || !idn.slabwaiter)
return;
wp = idn.slabwaiter;
for (d = 0; d < MAX_DOMAINS; wp++, d++) {
if (!DOMAIN_IN_SET(domset, d))
continue;
mutex_enter(&wp->w_mutex);
wp->w_closed = 1;
cv_broadcast(&wp->w_cv);
mutex_exit(&wp->w_mutex);
}
}
/*
* Register the caller with the waiting list for the
* given domain.
*
* Protocol:
* 1st Local requester: register -> alloc ->
* put(wakeup|xdc) -> unregister
* Nth Local requester: register -> wait
* 1st Remote requester: register -> xdc -> wait
* Nth Remote requester: register -> wait
*
* Remote Responder: local alloc -> put(xdc)
* Local Handler: xdc -> put(wakeup)
*
* E.g. A standard slave allocation request:
* slave master
* ----- ------
* idn_slab_alloc(remote)
* - register
* - xdc -> idn_handler
* - wait ...
* idn_slab_alloc(local)
* - register
* - alloc
* - put
* . wakeup [local]
* - unregister
* idn_handler <- - xdc
* - put DONE
* . wakeup [local]
* |
* V
* - wait
* . unregister
* DONE
*/
static int
smr_slabwaiter_register(int domid)
{
struct slabwaiter *wp;
int nwait;
procname_t proc = "smr_slabwaiter_register";
ASSERT(domid != IDN_NIL_DOMID);
ASSERT(DSLAB_READ_HELD(domid));
wp = &idn.slabwaiter[domid];
ASSERT(MUTEX_NOT_HELD(&wp->w_mutex));
mutex_enter(&wp->w_mutex);
nwait = ++(wp->w_nwaiters);
ASSERT(nwait > 0);
PR_SMR("%s: domain = %d, (new)nwaiters = %d\n", proc, domid, nwait);
if (nwait > 1) {
/*
* There are already waiters for slab allocations
* with respect to this domain.
*/
PR_SMR("%s: existing waiters for slabs for domain %d\n",
proc, domid);
mutex_exit(&wp->w_mutex);
return (nwait);
}
PR_SMR("%s: initial waiter for slabs for domain %d\n", proc, domid);
/*
* We are the first requester of a slab allocation for this
* respective domain. Need to prep waiting area for
* subsequent arrival of a slab.
*/
wp->w_sp = NULL;
wp->w_done = 0;
wp->w_serrno = 0;
mutex_exit(&wp->w_mutex);
return (nwait);
}
/*
* It is assumed that the caller had previously registered,
* possibly without ever waiting for a wakeup. The slaballoc
* (waiter) mutex is acquired here, so the caller must not be
* holding it.
*
* Returns: 0
* EINVAL
* EBUSY
* w_serrno (smr_slaballoc_put)
* (0, ENOLCK, ENOMEM, EDQUOT, EBUSY, ECANCELED)
*/
static int
smr_slabwaiter_unregister(int domid, smr_slab_t **spp)
{
struct slabwaiter *wp;
int serrno = 0;
procname_t proc = "smr_slabwaiter_unregister";
ASSERT(domid != IDN_NIL_DOMID);
wp = &idn.slabwaiter[domid];
mutex_enter(&wp->w_mutex);
PR_SMR("%s: domain = %d, nwaiters = %d\n", proc, domid, wp->w_nwaiters);
if (wp->w_nwaiters <= 0) {
/*
* Hmmm...nobody is registered!
*/
PR_SMR("%s: NO WAITERS (domid = %d)\n", proc, domid);
mutex_exit(&wp->w_mutex);
return (EINVAL);
}
(wp->w_nwaiters)--;
/*
* Is our present under the tree?
*/
if (!wp->w_done) {
/*
* Bummer...no presents. Let the caller know
* via a null slab pointer.
* Note that we don't clean up immediately since
* message might still come in for other waiters.
* Thus, late sleepers may still get a chance.
*/
PR_SMR("%s: bummer no slab allocated for domain %d\n",
proc, domid);
ASSERT(wp->w_sp == NULL);
(*spp) = NULL;
serrno = wp->w_closed ? ECANCELED : EBUSY;
} else {
(*spp) = wp->w_sp;
serrno = wp->w_serrno;
#ifdef DEBUG
if (serrno == 0) {
register smr_slab_t *sp;
ASSERT(wp->w_sp);
PR_SMR("%s: allocation succeeded (domain %d)\n",
proc, domid);
DSLAB_LOCK_SHARED(domid);
for (sp = idn_domain[domid].dslab; sp; sp = sp->sl_next)
if (sp == wp->w_sp)
break;
if (sp == NULL)
cmn_err(CE_WARN,
"%s:%d: slab ptr = NULL",
proc, domid);
DSLAB_UNLOCK(domid);
} else {
PR_SMR("%s: allocation failed (domain %d) "
"[serrno = %d]\n", proc, domid, serrno);
}
#endif /* DEBUG */
}
if (wp->w_nwaiters == 0) {
/*
* Last one turns out the lights.
*/
PR_SMR("%s: domain %d last waiter, turning out lights\n",
proc, domid);
wp->w_sp = NULL;
wp->w_done = 0;
wp->w_serrno = 0;
}
mutex_exit(&wp->w_mutex);
return (serrno);
}
/*
* Called to abort any slaballoc requests on behalf of the
* given domain.
*/
int
smr_slabwaiter_abort(int domid, int serrno)
{
ASSERT(serrno != 0);
return (smr_slaballoc_put(domid, NULL, 0, serrno));
}
/*
* Put ourselves into a timedwait waiting for slab to be
* allocated.
* Returns with slaballoc mutex dropped.
*
* Returns: EINVAL
* ETIMEDOUT
* smr_slabwaiter_unregister
* (0, EINVAL, EBUSY, ENOMEM)
*/
static int
smr_slaballoc_wait(int domid, smr_slab_t **spp)
{
struct slabwaiter *wp;
int serrno = 0, serrno_unreg;
procname_t proc = "smr_slaballoc_wait";
wp = &idn.slabwaiter[domid];
ASSERT(MUTEX_NOT_HELD(&wp->w_mutex));
mutex_enter(&wp->w_mutex);
PR_SMR("%s: domain = %d, nwaiters = %d, wsp = 0x%p\n",
proc, domid, wp->w_nwaiters, (void *)wp->w_sp);
if (wp->w_nwaiters <= 0) {
/*
* Hmmm...no waiters registered.
*/
PR_SMR("%s: domain %d, no waiters!\n", proc, domid);
mutex_exit(&wp->w_mutex);
return (EINVAL);
}
ASSERT(DSLAB_READ_HELD(domid));
DSLAB_UNLOCK(domid);
if (!wp->w_done && !wp->w_closed) {
int rv;
/*
* Only wait if data hasn't arrived yet.
*/
PR_SMR("%s: domain %d, going to sleep...\n", proc, domid);
rv = cv_reltimedwait_sig(&wp->w_cv, &wp->w_mutex,
IDN_SLABALLOC_WAITTIME, TR_CLOCK_TICK);
if (rv == -1)
serrno = ETIMEDOUT;
PR_SMR("%s: domain %d, awakened (reason = %s)\n",
proc, domid, (rv == -1) ? "TIMEOUT" : "SIGNALED");
}
/*
* We've awakened, or the request was already filled!
* Unregister ourselves.
*/
mutex_exit(&wp->w_mutex);
/*
* Any gifts will be entered into spp.
*/
serrno_unreg = smr_slabwaiter_unregister(domid, spp);
/*
* Leave with reader lock on dslab_lock.
*/
DSLAB_LOCK_SHARED(domid);
if ((serrno_unreg == EBUSY) && (serrno == ETIMEDOUT))
return (serrno);
else
return (serrno_unreg);
}
/*
* A SMR slab was allocated on behalf of the given domain.
* Wake up anybody that may have been waiting for the allocation.
* Note that if the domain is a remote one, i.e. master is allocating
* on behalf of a slave, it's up to the caller to transmit the
* allocation response to that domain.
* The force flag indicates that we want to install the slab for
* the given user regardless of whether there are waiters or not.
* This is used primarily in situations where a slave may have timed
* out before the response actually arrived. In this situation we
* don't want to send the slab back to the master after we went through
* the trouble of allocating one. Master is _not_ allowed to do this
* for remote domains.
*
* Returns: -1 Non-registered waiter or waiting area garbaged.
* 0 Successfully performed operation.
*/
int
smr_slaballoc_put(int domid, smr_slab_t *sp, int forceflag, int serrno)
{
idn_domain_t *dp;
struct slabwaiter *wp;
procname_t proc = "smr_slaballoc_put";
dp = &idn_domain[domid];
ASSERT(!serrno ? DSLAB_WRITE_HELD(domid) : 1);
if (domid == IDN_NIL_DOMID)
return (-1);
ASSERT(serrno ? (sp == NULL) : (sp != NULL));
wp = &idn.slabwaiter[domid];
mutex_enter(&wp->w_mutex);
PR_SMR("%s: domain = %d, bufp = 0x%p, ebufp = 0x%p, "
"(f = %d, se = %d)\n", proc, domid,
(sp ? (void *)sp->sl_start : 0),
(sp ? (void *)sp->sl_end : 0), forceflag, serrno);
if (wp->w_nwaiters <= 0) {
/*
* There are no waiters!! Must have timed out
* and left. Oh well...
*/
PR_SMR("%s: no slaballoc waiters found for domain %d\n",
proc, domid);
if (!forceflag || serrno || !sp) {
/*
* No waiters and caller doesn't want to force it.
*/
mutex_exit(&wp->w_mutex);
return (-1);
}
PR_SMR("%s: forcing slab onto domain %d\n", proc, domid);
ASSERT(domid == idn.localid);
ASSERT(wp->w_sp == NULL);
wp->w_done = 0;
/*
* Now we fall through and let it be added in the
* regular manner.
*/
}
if (wp->w_done) {
/*
* There's at least one waiter so there has
* to be a slab structure waiting for us.
* If everything is going smoothly, there should only
* be one guy coming through the path of inserting
* an error or good slab. However, if a disconnect was
* detected, you may get several guys coming through
* trying to let everybody know.
*/
ASSERT(wp->w_serrno ?
(wp->w_sp == NULL) : (wp->w_sp != NULL));
cv_broadcast(&wp->w_cv);
mutex_exit(&wp->w_mutex);
return (-1);
}
if (serrno != 0) {
/*
* Bummer...allocation failed. This call is simply
* to wake up the sleepers and let them know.
*/
PR_SMR("%s: slaballoc failed for domain %d\n", proc, domid);
wp->w_serrno = serrno;
wp->w_done = 1;
cv_broadcast(&wp->w_cv);
mutex_exit(&wp->w_mutex);
return (0);
}
PR_SMR("%s: putting slab into struct (domid=%d, localid=%d)\n",
proc, domid, idn.localid);
/*
* Prep the slab structure.
*/
if (domid == idn.localid) {
/*
* Allocation was indeed for me.
* The slab may or may not be locked when
* we reach this point. Normally it will be
* locked if we're being called on behalf of
* a free, and unlocked if on behalf of a
* new allocation request.
*/
lock_clear(&sp->sl_lock);
smr_alloc_buflist(sp);
#ifdef DEBUG
} else {
uint_t rv;
/*
* Slab was not allocated on my behalf. Must be
* a master request on behalf of some other domain.
* Prep appropriately. Slab should have been locked
* by smr_slab_reserve.
*/
rv = lock_try(&sp->sl_lock);
ASSERT(!rv);
ASSERT(sp->sl_domid == (short)domid);
#endif /* DEBUG */
}
/*
* Slab is ready to go. Insert it into the domain's
* slab list so once we wake everybody up they'll find it.
* You better have write lock if you're putting treasures
* there.
*/
ASSERT(DSLAB_WRITE_HELD(domid));
sp->sl_next = dp->dslab;
dp->dslab = sp;
dp->dnslabs++;
/*
* It's possible to fall through here without waiters.
* This is a case where forceflag was set.
*/
if (wp->w_nwaiters > 0) {
wp->w_sp = sp;
wp->w_serrno = serrno;
wp->w_done = 1;
cv_broadcast(&wp->w_cv);
} else {
ASSERT(forceflag);
wp->w_sp = NULL;
wp->w_serrno = 0;
wp->w_done = 0;
}
mutex_exit(&wp->w_mutex);
return (0);
}
/*
* Get the slab representing [bufp,ebufp] from the respective
* domain's pool if all the buffers are free. Remove it from
* the domain's list and return it.
* If bufp == NULL, then return however many free ones you
* can find.
* The list of slabs is returned locked (sl_lock).
* XXX - Need minimum limit to make sure we don't free up _all_
* of our slabs! However, during a shutdown we will need a
* method to free them all up regardless of locking.
*/
smr_slab_t *
smr_slaballoc_get(int domid, caddr_t bufp, caddr_t ebufp)
{
idn_domain_t *dp;
smr_slab_t *retsp, *sp, **psp;
int foundit, islocal = 0;
int nslabs;
procname_t proc = "smr_slaballoc_get";
PR_SMR("%s: getting slab for domain %d [bufp=0x%p, ebufp=0x%p]\n",
proc, domid, (void *)bufp, (void *)ebufp);
dp = &idn_domain[domid];
ASSERT(DSLAB_WRITE_HELD(domid));
if ((sp = dp->dslab) == NULL) {
PR_SMR("%s: oops, no slabs for domain %d\n", proc, domid);
return (NULL);
}
/*
* If domid is myself then I'm trying to get a slab out
* of my local pool. Otherwise, I'm the master and
* I'm trying to get the slab representative from the
* global pool.
*/
if (domid == idn.localid)
islocal = 1;
if (bufp != NULL) {
nslabs = -1;
} else {
nslabs = *(int *)ebufp;
if (nslabs == 0) {
PR_SMR("%s: requested nslabs (%d) <= 0\n",
proc, nslabs);
return (NULL);
} else if (nslabs < 0) {
/*
* Caller wants them all!
*/
nslabs = (int)dp->dnslabs;
}
}
retsp = NULL;
foundit = 0;
for (psp = &dp->dslab; sp; sp = *psp) {
int isbusy;
if (bufp && (sp->sl_start != bufp)) {
psp = &sp->sl_next;
continue;
}
if (bufp && (ebufp > sp->sl_end)) {
PR_SMR("%s: bufp/ebufp (0x%p/0x%p) "
"expected (0x%p/0x%p)\n", proc, (void *)bufp,
(void *)ebufp, (void *)sp->sl_start,
(void *)sp->sl_end);
ASSERT(0);
}
/*
* We found the desired slab. Make sure
* it's free.
*/
foundit++;
isbusy = 0;
if (islocal) {
int spl;
/*
* Try to lock the slab. If we can't get
* the lock, or if any of its buffers are
* still in use, the slab is busy; skip it.
*/
spl = splhi();
if (!lock_try(&sp->sl_lock)) {
isbusy = 1;
foundit--;
} else if (sp->sl_inuse) {
lock_clear(&sp->sl_lock);
isbusy = 1;
foundit--;
}
splx(spl);
} else {
/*
* If not local, then I'm the master getting
* a slab from one of the slaves. In this case,
* their slab structs will always be locked.
*/
ASSERT(!lock_try(&sp->sl_lock));
}
if (!isbusy) {
/*
* Delete the entry from the list and slap
* it onto our return list.
*/
*psp = sp->sl_next;
sp->sl_next = retsp;
retsp = sp;
} else {
psp = &sp->sl_next;
}
/*
* If bufp == NULL (alternate interface) and we haven't
* found the desired number of slabs yet, keep looking.
*/
if (bufp || (foundit == nslabs))
break;
}
dp->dnslabs -= (short)foundit;
if (foundit) {
PR_SMR("%s: found %d free slabs (domid = %d)\n", proc, foundit,
domid);
} else {
PR_SMR("%s: no free slabs found (domid = %d)\n", proc, domid);
}
/*
* If this is the alternate interface, need to return
* the number of slabs found in the ebufp parameter.
*/
if (bufp == NULL)
*(int *)ebufp = foundit;
return (retsp);
}
/*
* Wrapper to hide alternate interface to smr_slaballoc_get()
*/
smr_slab_t *
smr_slaballoc_get_n(int domid, int *nslabs)
{
smr_slab_t *sp;
ASSERT(DSLAB_WRITE_HELD(domid));
sp = smr_slaballoc_get(domid, NULL, (caddr_t)nslabs);
return (sp);
}
/*
* Only called by master. Initialize slab pool based on local SMR.
* Returns number of slabs initialized.
* reserved_size = Length of area at the front of the NWR portion
* of the SMR to reserve and not make available for
* slab allocations. Must be an IDN_SMR_BUFSIZE multiple.
* reserved_area = Pointer to reserved area, if any.
*/
int
smr_slabpool_init(size_t reserved_size, caddr_t *reserved_area)
{
size_t nwr_available;
int minperpool, ntotslabs, nxslabs, nslabs;
register int p, pp;
register caddr_t bufp;
register smr_slab_t *sp;
ASSERT(IDN_GLOCK_IS_EXCL());
ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);
*reserved_area = NULL;
nwr_available = MB2B(IDN_NWR_SIZE) - reserved_size;
if ((idn.localid != IDN_GET_MASTERID()) ||
(nwr_available < IDN_SLAB_SIZE) ||
(idn.slabpool != NULL) ||
((reserved_size != 0) && (reserved_size & (IDN_SMR_BUFSIZE-1)))) {
return (-1);
}
idn.slabpool = GETSTRUCT(struct slabpool, 1);
idn.slabpool->ntotslabs = ntotslabs = nwr_available / IDN_SLAB_SIZE;
ASSERT(ntotslabs > 0);
minperpool = (ntotslabs < IDN_SLAB_MINPERPOOL) ?
1 : IDN_SLAB_MINPERPOOL;
idn.slabpool->npools = (ntotslabs + (minperpool - 1)) / minperpool;
if ((idn.slabpool->npools & 1) == 0) {
/*
* npools needs to be odd for hashing algorithm.
*/
idn.slabpool->npools++;
}
ASSERT(idn.slabpool->npools > 0);
minperpool = (ntotslabs < idn.slabpool->npools) ?
1 : (ntotslabs / idn.slabpool->npools);
/*
* Calculate the number of extra slabs that will need to
* be alloted to the pools. This number will be less than
* npools. Only one extra slab is allocated to each pool
* until we have assigned all the extra slabs.
*/
if (ntotslabs > (idn.slabpool->npools * minperpool))
nxslabs = ntotslabs - (idn.slabpool->npools * minperpool);
else
nxslabs = 0;
ASSERT((nxslabs >= 0) && (nxslabs < idn.slabpool->npools));
idn.slabpool->pool = GETSTRUCT(struct smr_slabtbl,
idn.slabpool->npools);
sp = GETSTRUCT(smr_slab_t, idn.slabpool->ntotslabs);
idn.slabpool->savep = sp;
bufp = idn.smr.vaddr + reserved_size;
for (p = nslabs = 0;
(p < idn.slabpool->npools) && (ntotslabs > 0);
p++, ntotslabs -= nslabs) {
nslabs = (ntotslabs < minperpool) ? ntotslabs : minperpool;
if (nxslabs > 0) {
nslabs++;
nxslabs--;
}
idn.slabpool->pool[p].sarray = sp;
for (pp = 0; pp < nslabs; pp++) {
sp->sl_next = NULL;
sp->sl_start = bufp;
sp->sl_end = bufp = sp->sl_start + IDN_SLAB_SIZE;
sp->sl_lock = 0;
sp->sl_domid = (short)IDN_NIL_DOMID;
sp++;
}
idn.slabpool->pool[p].nfree = nslabs;
idn.slabpool->pool[p].nslabs = nslabs;
}
ASSERT((ntotslabs == 0) && (nxslabs == 0));
/*
* We should be at the end of the SMR at this point.
*/
ASSERT(bufp == (idn.smr.vaddr + reserved_size
+ (idn.slabpool->ntotslabs * IDN_SLAB_SIZE)));
if (reserved_size != 0)
*reserved_area = idn.smr.vaddr;
return (0);
}
void
smr_slabpool_deinit()
{
if (idn.slabpool == NULL)
return;
FREESTRUCT(idn.slabpool->savep, smr_slab_t, idn.slabpool->ntotslabs);
FREESTRUCT(idn.slabpool->pool, struct smr_slabtbl,
idn.slabpool->npools);
FREESTRUCT(idn.slabpool, struct slabpool, 1);
idn.slabpool = NULL;
}
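/*
* Allocate and initialize the buffer list for the given slab, one
* smr_slabbuf_t per IDN_SMR_BUFSIZE buffer covering [sl_start, sl_end).
* All buffers start out on the free list. No-op if the list already
* exists.
*/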
void
smr_alloc_buflist(smr_slab_t *sp)
{
int n, nbufs;
caddr_t sbufp;
smr_slabbuf_t *hp, *bp;
if (sp->sl_head)
return;
nbufs = (sp->sl_end - sp->sl_start) / IDN_SMR_BUFSIZE;
ASSERT(nbufs > 0);
if (nbufs <= 0) {
sp->sl_head = sp->sl_free = sp->sl_inuse = NULL;
return;
}
hp = GETSTRUCT(smr_slabbuf_t, nbufs);
sbufp = sp->sl_start;
for (n = 0, bp = hp; n < nbufs; bp++, n++) {
bp->sb_bufp = sbufp;
bp->sb_domid = IDN_NIL_DOMID;
bp->sb_next = bp + 1;
sbufp += IDN_SMR_BUFSIZE;
}
(--bp)->sb_next = NULL;
sp->sl_head = sp->sl_free = hp;
sp->sl_inuse = NULL;
}
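/*
* Free the buffer list previously created by smr_alloc_buflist(),
* if any.
*/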
void
smr_free_buflist(smr_slab_t *sp)
{
int nbufs;
if (sp->sl_head == NULL)
return;
nbufs = (sp->sl_end - sp->sl_start) / IDN_SMR_BUFSIZE;
FREESTRUCT(sp->sl_head, smr_slabbuf_t, nbufs);
sp->sl_head = sp->sl_free = sp->sl_inuse = NULL;
}
/*
* Returns: Pointer to the reserved slab on success.
* NULL Failure to reserve a slab.
*/
static smr_slab_t *
smr_slab_reserve(int domid)
{
register int p, nextp, s, nexts;
register smr_slab_t *spa;
int startp, starts;
int foundone = 0;
int spl;
procname_t proc = "smr_slab_reserve";
p = startp = SMR_SLABPOOL_HASH(domid);
nextp = -1;
spl = splhi();
while ((nextp != startp) && !foundone) {
s = starts = SMR_SLAB_HASH(p, domid);
nexts = -1;
spa = &(idn.slabpool->pool[p].sarray[0]);
while ((nexts != starts) && !foundone) {
if (lock_try(&spa[s].sl_lock)) {
foundone = 1;
break;
}
nexts = SMR_SLAB_HASHSTEP(p, s);
s = nexts;
}
if (foundone)
break;
nextp = SMR_SLABPOOL_HASHSTEP(p);
p = nextp;
}
splx(spl);
if (foundone) {
ASSERT((&spa[s] >= idn.slabpool->savep) &&
(&spa[s] < (idn.slabpool->savep +
idn.slabpool->ntotslabs)));
spa[s].sl_domid = (short)domid;
ATOMIC_DEC(idn.slabpool->pool[p].nfree);
if (domid == idn.localid) {
smr_slab_t *nsp;
/*
* Caller is actually reserving a slab for
* itself, which means it will need the full
* slab structure to represent all of the I/O
* buffers. The "spa" is just a representative
* and doesn't contain the space to manage the
* individual buffers. Need to alloc a full-size
* struct.
* Note that this results in the returned
* smr_slab_t structure being unlocked.
*/
ASSERT(idn.localid == IDN_GET_MASTERID());
nsp = GETSTRUCT(smr_slab_t, 1);
nsp->sl_start = spa[s].sl_start;
nsp->sl_end = spa[s].sl_end;
smr_alloc_buflist(nsp);
spa = nsp;
PR_SMR("%s: allocated full slab struct for domain %d\n",
proc, domid);
} else {
/*
* Slab structure gets returned locked.
*/
spa += s;
}
PR_SMR("%s: allocated slab 0x%p (start=0x%p, size=%lu) for "
"domain %d\n", proc, (void *)spa, (void *)spa->sl_start,
spa->sl_end - spa->sl_start, domid);
} else {
PR_SMR("%s: FAILED to allocate for domain %d\n",
proc, domid);
spa = NULL;
}
return (spa);
}
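/*
* Return a previously reserved slab to the global pool. The
* representative entry is located by hashing on the domain and
* matching the slab's starting address; its ownership and lock are
* then cleared and the pool's free count is bumped. If the master is
* unreserving one of its own slabs, the full-size structure handed
* back by smr_slab_reserve() is freed as well.
*/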
static void
smr_slab_unreserve(int domid, smr_slab_t *sp)
{
register int p, nextp, s, nexts;
register smr_slab_t *spa;
int foundit = 0;
int startp, starts;
caddr_t bufp;
procname_t proc = "smr_slab_unreserve";
bufp = sp->sl_start;
p = startp = SMR_SLABPOOL_HASH(domid);
nextp = -1;
while ((nextp != startp) && !foundit) {
s = starts = SMR_SLAB_HASH(p, domid);
nexts = -1;
spa = &(idn.slabpool->pool[p].sarray[0]);
while ((nexts != starts) && !foundit) {
if (spa[s].sl_start == bufp) {
foundit = 1;
break;
}
nexts = SMR_SLAB_HASHSTEP(p, s);
s = nexts;
}
if (foundit)
break;
nextp = SMR_SLABPOOL_HASHSTEP(p);
p = nextp;
}
if (foundit) {
ASSERT((&spa[s] >= idn.slabpool->savep) &&
(&spa[s] < (idn.slabpool->savep +
idn.slabpool->ntotslabs)));
ASSERT(!lock_try(&spa[s].sl_lock));
ASSERT(spa[s].sl_domid == (short)domid);
spa[s].sl_next = NULL;
spa[s].sl_domid = (short)IDN_NIL_DOMID;
lock_clear(&spa[s].sl_lock);
ATOMIC_INC(idn.slabpool->pool[p].nfree);
PR_SMR("%s: freed (bufp=0x%p) for domain %d\n",
proc, (void *)bufp, domid);
if (domid == idn.localid) {
/*
* Caller is actually unreserving a slab of its
* own. Note that only the master calls this
* routine. Since the master's local slab
* structures do not get entered into the global
* "representative" pool, we need to free up the
* data structure that was passed in.
*/
ASSERT(idn.localid == IDN_GET_MASTERID());
ASSERT(sp != &spa[s]);
smr_free_buflist(sp);
FREESTRUCT(sp, smr_slab_t, 1);
} else {
ASSERT(sp == &spa[s]);
}
} else {
/*
* Couldn't find slab entry for given buf!
*/
PR_SMR("%s: FAILED to free (bufp=0x%p) for domain %d\n",
proc, (void *)bufp, domid);
}
}
/*
* The Reap Protocol:
* master slave
* ------ -----
* smr_slab_reap_global
* - idn_broadcast_cmd(SLABREAP) -> idn_recv_cmd(SLABREAP)
* . idn_local_cmd(SLABREAP) - idn_recv_slabreap_req
* - smr_slab_reap . smr_slab_reap
* . smr_slaballoc_get_n - smr_slaballoc_get_n
* . smr_slab_free - smr_slab_free
* - smr_slab_free_local . smr_slab_free_remote
* . smr_slab_unreserve
* <- - idn_send_cmd(SLABFREE)
* idn_recv_cmd(SLABFREE)
* - idn_recv_slabfree_req
* . smr_slaballoc_get
* . smr_slab_free
* - smr_slab_free_local
* . smr_slab_unreserve
* . idn_send_slabfree_resp -> idn_recv_cmd(SLABFREE | ack)
* - idn_recv_slabfree_resp
*
* idn_recv_cmd(SLABREAP | ack) <- . idn_send_slabreap_resp
* - idn_recv_slabreap_resp DONE
* DONE
*
* Check available slabs and if we're below the threshold, kick
* off reaping to all remote domains. There is no guarantee remote
* domains will be able to free up any.
*/
static void
smr_slab_reap_global()
{
register int p, npools;
register int total_free = 0;
register struct smr_slabtbl *tblp;
static clock_t reap_last = 0;
procname_t proc = "smr_slab_reap_global";
clock_t now;
ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);
DSLAB_LOCK_SHARED(idn.localid);
if (idn_domain[idn.localid].dslab_state != DSLAB_STATE_LOCAL) {
PR_SMR("%s: only allowed by master (%d)\n",
proc, IDN_GET_MASTERID());
DSLAB_UNLOCK(idn.localid);
return;
}
DSLAB_UNLOCK(idn.localid);
now = ddi_get_lbolt();
if ((now > 0) && (now > reap_last) &&
((now - reap_last) < IDN_REAP_INTERVAL))
return;
reap_last = now;
ASSERT(idn.slabpool);
npools = idn.slabpool->npools;
tblp = idn.slabpool->pool;
for (p = 0; p < npools; tblp++, p++)
total_free += tblp->nfree;
if (total_free <= IDN_SLAB_THRESHOLD) {
int diff, reap_per_domain;
PR_SMR("%s: kicking off reaping "
"(total_free = %d, min = %d)\n",
proc, total_free, IDN_SLAB_THRESHOLD);
diff = IDN_SLAB_THRESHOLD - total_free;
reap_per_domain = (diff < idn.ndomains) ?
1 : (diff / idn.ndomains);
idn_broadcast_cmd(IDNCMD_SLABREAP, reap_per_domain, 0, 0);
}
}
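/*
* Attempt to free up to *nslabs slabs from the local domain's pool.
* First reclaim any outstanding buffers from connected domains (and
* resync dio counts where I/O errors were recorded), then pull
* whatever free slabs can be found via smr_slaballoc_get_n() and
* free them. On return *nslabs is updated with the number of slabs
* actually freed. Only called on behalf of the local domain.
*/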
void
smr_slab_reap(int domid, int *nslabs)
{
register int d;
int nreclaimed;
smr_slab_t *sp;
domainset_t reapset;
procname_t proc = "smr_slab_reap";
/*
* Should only be called on behalf of local
* domain.
*/
if (domid != idn.localid) {
PR_SMR("%s: called by domain %d, should only be local (%d)\n",
proc, domid, idn.localid);
ASSERT(0);
return;
}
/*
* Try and reclaim some buffers so we can possibly
* free up some slabs.
*/
reapset = idn.domset.ds_connected;
IDN_GKSTAT_GLOBAL_EVENT(gk_reaps, gk_reap_last);
nreclaimed = 0;
for (d = 0; d < MAX_DOMAINS; d++) {
int nr;
idn_domain_t *dp;
if (!DOMAIN_IN_SET(reapset, d))
continue;
IDN_DLOCK_SHARED(d);
dp = &idn_domain[d];
if ((d == idn.localid) || (dp->dcpu < 0)) {
IDN_DUNLOCK(d);
continue;
}
/*
* Clean up any dead I/O errors if possible.
*/
if (dp->dioerr > 0) {
idn_domain_t *ldp;
register int cnt;
register smr_slabbuf_t *bp;
/*
* We need to grab the writer lock to prevent
* anybody from allocating buffers while we
* traverse the slabs outstanding.
*/
cnt = 0;
ldp = &idn_domain[idn.localid];
IDN_DLOCK_EXCL(idn.localid);
DSLAB_LOCK_EXCL(idn.localid);
for (sp = ldp->dslab; sp; sp = sp->sl_next)
for (bp = sp->sl_inuse; bp; bp = bp->sb_next)
if (bp->sb_domid == d)
cnt++;
DSLAB_UNLOCK(idn.localid);
ASSERT((dp->dio + dp->dioerr) >= cnt);
dp->dio = cnt;
dp->dioerr = 0;
IDN_DUNLOCK(idn.localid);
}
if ((dp->dstate == IDNDS_CONNECTED) &&
((nr = idn_reclaim_mboxdata(d, 0, -1)) > 0))
nreclaimed += nr;
IDN_DUNLOCK(d);
}
DSLAB_LOCK_EXCL(domid);
sp = smr_slaballoc_get_n(domid, nslabs);
if (sp) {
IDN_GKSTAT_ADD(gk_reap_count, (ulong_t)(*nslabs));
smr_slab_free(domid, sp);
}
DSLAB_UNLOCK(domid);
}
/*
* ---------------------------------------------------------------------
* Remap the (IDN) shared memory region to a new physical address.
* Caller is expected to have performed an ecache flush if needed.
* ---------------------------------------------------------------------
*/
void
smr_remap(struct as *as, register caddr_t vaddr,
register pfn_t new_pfn, uint_t mblen)
{
tte_t tte;
size_t blen;
pgcnt_t p, npgs;
procname_t proc = "smr_remap";
if (va_to_pfn(vaddr) == new_pfn) {
PR_REMAP("%s: vaddr (0x%p) already mapped to pfn (0x%lx)\n",
proc, (void *)vaddr, new_pfn);
return;
}
blen = MB2B(mblen);
npgs = btopr(blen);
ASSERT(npgs != 0);
PR_REMAP("%s: va = 0x%p, pfn = 0x%lx, npgs = %ld, mb = %d MB (%ld)\n",
proc, (void *)vaddr, new_pfn, npgs, mblen, blen);
/*
* Unmap the SMR virtual address from its current
* mapping.
*/
hat_unload(as->a_hat, vaddr, blen, HAT_UNLOAD_UNLOCK);
if (new_pfn == PFN_INVALID)
return;
/*
* Map the SMR to the new physical address space,
* presumably a remote pfn. Cannot use hat_devload
* because it will think pfn represents non-memory,
* i.e. space, since it may be beyond its physmax.
*/
for (p = 0; p < npgs; p++) {
sfmmu_memtte(&tte, new_pfn, PROT_READ | PROT_WRITE | HAT_NOSYNC,
TTE8K);
sfmmu_tteload(as->a_hat, &tte, vaddr, NULL, HAT_LOAD_LOCK);
vaddr += MMU_PAGESIZE;
new_pfn++;
}
PR_REMAP("%s: remapped %ld pages (expected %ld)\n",
proc, npgs, btopr(MB2B(mblen)));
}