/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include "dapl.h"
#include "dapl_tavor_wr.h"
#include "dapl_hash.h"
#include "dapl_tavor_ibtf_impl.h"
static dapls_tavor_wrid_entry_t *dapli_tavor_wrid_find_match(
dapls_tavor_workq_hdr_t *, tavor_hw_cqe_t *);
static dapls_tavor_wrid_list_hdr_t *dapli_tavor_wrid_get_list(uint32_t, int);
static void dapli_tavor_wrid_reaplist_add(ib_cq_handle_t,
dapls_tavor_workq_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_wqhdr_find(ib_cq_handle_t,
uint_t, uint_t);
static uint32_t dapli_tavor_wrid_get_wqeaddrsz(dapls_tavor_workq_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_list_reap(
dapls_tavor_wrid_list_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_wqhdr_create(ib_cq_handle_t,
uint_t, uint_t, uint_t);
static void dapli_tavor_wrid_wqhdr_add(dapls_tavor_workq_hdr_t *,
dapls_tavor_wrid_list_hdr_t *);
static void dapli_tavor_wrid_wqhdr_remove(dapls_tavor_workq_hdr_t *,
dapls_tavor_wrid_list_hdr_t *);
static void dapli_tavor_wrid_wqhdr_lock_both(ib_qp_handle_t);
static void dapli_tavor_wrid_wqhdr_unlock_both(ib_qp_handle_t);
static DAT_RETURN dapli_tavor_cq_wqhdr_add(ib_cq_handle_t,
dapls_tavor_workq_hdr_t *);
static void dapli_tavor_cq_wqhdr_remove(ib_cq_handle_t,
dapls_tavor_workq_hdr_t *);
/*
* dapls_tavor_wrid_get_entry()
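 *    Given a CQE, find the matching WRID entry on the CQ's list of
 *    work queue headers and return the 64-bit work request ID that the
 *    consumer originally posted. For "failed" completions the entry is
 *    also copied out through "wre" for later error CQE processing.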
*/
uint64_t
dapls_tavor_wrid_get_entry(ib_cq_handle_t cq, tavor_hw_cqe_t *cqe,
uint_t send_or_recv, uint_t error, dapls_tavor_wrid_entry_t *wre)
{
dapls_tavor_workq_hdr_t *wq;
dapls_tavor_wrid_entry_t *wre_tmp;
uint64_t wrid;
uint_t qpnum;
/* Lock the list of work queues associated with this CQ */
dapl_os_lock(&cq->cq_wrid_wqhdr_lock);
/* Find the work queue for this QP number (send or receive side) */
qpnum = TAVOR_CQE_QPNUM_GET(cqe);
wq = dapli_tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
dapl_os_assert(wq != NULL);
/*
* Regardless of whether the completion is the result of a "success"
* or a "failure", we lock the list of "containers" and attempt to
 * search for the first matching completion (i.e. the first WR
* with a matching WQE addr and size). Once we find it, we pull out
* the "wrid" field and return it (see below). Note: One possible
* future enhancement would be to enable this routine to skip over
* any "unsignaled" completions to go directly to the next "signaled"
* entry on success. XXX
*/
dapl_os_lock(&wq->wq_wrid_lock->wrl_lock);
wre_tmp = dapli_tavor_wrid_find_match(wq, cqe);
/*
* If this is a "successful" completion, then we assert that this
* completion must be a "signaled" completion.
*/
dapl_os_assert(error || (wre_tmp->wr_signaled_dbd &
TAVOR_WRID_ENTRY_SIGNALED));
/*
* If the completion is a "failed" completion, then we save away the
* contents of the entry (into the "wre" field passed in) for use
* in later CQE processing. Note: We use the
* dapli_tavor_wrid_get_wqeaddrsz() function to grab "wqeaddrsz" from
* the next entry in the container.
* This is required for error processing (where updating these fields
 * properly is necessary for correct handling of the "error" CQE)
*/
if (error && (wre != NULL)) {
*wre = *wre_tmp;
wre->wr_wqeaddrsz = dapli_tavor_wrid_get_wqeaddrsz(wq);
}
/* Pull out the WRID and return it */
wrid = wre_tmp->wr_wrid;
dapl_os_unlock(&wq->wq_wrid_lock->wrl_lock);
dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);
return (wrid);
}
/*
* dapli_tavor_wrid_find_match()
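 *    Walk the chain of WRID list "containers" hanging off a work queue
 *    header and return the first entry whose "wqeaddrsz" matches the
 *    one in the CQE. Emptied containers (other than the current "post"
 *    container) are dropped from future searches along the way.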
*/
static dapls_tavor_wrid_entry_t *
dapli_tavor_wrid_find_match(dapls_tavor_workq_hdr_t *wq, tavor_hw_cqe_t *cqe)
{
dapls_tavor_wrid_entry_t *curr = NULL;
dapls_tavor_wrid_list_hdr_t *container;
uint32_t wqeaddr_size;
uint32_t head, tail, size;
int found = 0, last_container;
/* dapl_os_assert(MUTEX_HELD(&wq->wq_wrid_lock)); */
/* Pull the "wqeaddrsz" information from the CQE */
wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cqe);
/*
* Walk the "containers" list(s), find first WR with a matching WQE
* addr. If the current "container" is not the last one on the list,
* i.e. not the current one to which we are posting new WRID entries,
* then we do not attempt to update the "q_head", "q_tail", and
* "q_full" indicators on the main work queue header. We do, however,
* update the "head" and "full" indicators on the individual containers
* as we go. This is imperative because we need to be able to
* determine when the current container has been emptied (so that we
* can move on to the next container).
*/
container = wq->wq_wrid_poll;
while (container != NULL) {
/* Is this the last/only "container" on the list */
last_container = (container != wq->wq_wrid_post) ? 0 : 1;
/*
* First check if we are on an SRQ. If so, we grab the entry
* and break out. Since SRQ wridlist's are never added to
* reaplist, they can only be the last container.
*/
if (container->wl_srq_en) {
dapl_os_assert(last_container == 1);
curr = dapli_tavor_wrid_find_match_srq(container, cqe);
break;
}
/*
* Grab the current "head", "tail" and "size" fields before
* walking the list in the current container. Note: the "size"
* field here must always be a power-of-2. The "full"
* parameter is checked (and updated) here to distinguish the
* "queue full" condition from "queue empty".
*/
head = container->wl_head;
tail = container->wl_tail;
size = container->wl_size;
while ((head != tail) || (container->wl_full)) {
container->wl_full = 0;
curr = &container->wl_wre[head];
head = ((head + 1) & (size - 1));
/*
* If the current entry's "wqeaddrsz" matches the one
* we're searching for, then this must correspond to
* the work request that caused the completion. Set
* the "found" flag and bail out.
*/
if (curr->wr_wqeaddrsz == wqeaddr_size) {
found = 1;
break;
}
}
/*
* If the current container is empty (having reached here the
* "head == tail" condition can only mean that the container
* is empty), then NULL out the "wrid_old_tail" field (see
* tavor_post_send() and tavor_post_recv() for more details)
* and (potentially) remove the current container from future
* searches.
*/
if (head == tail) {
container->wl_wre_old_tail = NULL;
/*
* If this wasn't the last "container" on the chain,
* i.e. the one to which new WRID entries will be
* added, then remove it from the list.
* Note: we don't "lose" the memory pointed to by this
* because we should have already put this container
* on the "reapable" list (from where it will later be
* pulled).
*/
if (!last_container) {
wq->wq_wrid_poll = container->wl_next;
}
}
/* Update the head index for the container */
container->wl_head = head;
/*
 * If the entry was found in this container, then bail out of
 * the loop. Else reset the "curr" pointer and move on to the
* next container (if there is one). Note: the only real
* reason for setting "curr = NULL" here is so that the ASSERT
* below can catch the case where no matching entry was found
* on any of the lists.
*/
if (found) {
break;
} else {
curr = NULL;
container = container->wl_next;
}
}
/*
* Update work queue header's "head" and "full" conditions to match
* the last entry on the container list. (Note: Only if we're pulling
* entries from the last work queue portion of the list, i.e. not from
* the previous portions that may be the "reapable" list.)
*/
if (last_container) {
wq->wq_head = wq->wq_wrid_post->wl_head;
wq->wq_full = wq->wq_wrid_post->wl_full;
}
/* Ensure that we've actually found what we were searching for */
dapl_os_assert(curr != NULL);
return (curr);
}
/*
 * dapli_tavor_wrid_find_match_srq()
* Context: Can be called from interrupt or base context.
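 *    Match a CQE to its WRID entry on an SRQ wridlist. The WQE address
 *    from the CQE indexes directly into the entry array; the WQE is
 *    also returned to the SRQ free list here.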
*/
dapls_tavor_wrid_entry_t *
dapli_tavor_wrid_find_match_srq(dapls_tavor_wrid_list_hdr_t *wl,
tavor_hw_cqe_t *cqe)
{
dapls_tavor_wrid_entry_t *wre;
uint32_t wqe_index;
uint32_t wqe_addr;
uint32_t qsize_msk;
uint32_t tail, next_tail;
/* Grab the WQE addr out of the CQE */
wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cqe) & 0xFFFFFFC0;
/*
* Given the 'wqe_addr' just calculated and the srq buf address, we
* find the 'wqe_index'. The 'wre' returned below contains the WRID
* that we are looking for. This indexes into the wre_list for this
* specific WQE.
*/
wqe_index = TAVOR_SRQ_WQ_INDEX(wl->wl_srq_desc_addr, wqe_addr,
wl->wl_srq_wqesz);
/* ASSERT on impossible wqe_index values */
dapl_os_assert(wqe_index < wl->wl_size);
/* Put this WQE back on the free list */
qsize_msk = wl->wl_size - 1;
tail = wl->wl_freel_tail;
next_tail = (tail + 1) & qsize_msk;
wl->wl_freel_entries++;
dapl_os_assert(wl->wl_freel_entries <= wl->wl_size);
/* Get the descriptor (IO Address) of the WQE to be built */
wl->wl_free_list[tail] = wqe_addr;
wl->wl_freel_tail = next_tail;
/* Using the index, return the Work Request ID Entry (wre) */
wre = &wl->wl_wre[wqe_index];
return (wre);
}
/*
* dapls_tavor_wrid_cq_reap()
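 *    Walk the CQ's "reapable" list and free the WRID list containers
 *    found there, along with any work queue headers that are emptied
 *    as a result.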
*/
void
dapls_tavor_wrid_cq_reap(ib_cq_handle_t cq)
{
dapls_tavor_workq_hdr_t *consume_wqhdr;
dapls_tavor_wrid_list_hdr_t *container, *to_free;
/* dapl_os_assert(MUTEX_HELD(&cq->cq_lock)); */
/* Lock the list of work queues associated with this CQ */
dapl_os_lock(&cq->cq_wrid_wqhdr_lock);
/* Walk the "reapable" list and free up containers */
container = cq->cq_wrid_reap_head;
while (container != NULL) {
to_free = container;
container = container->wl_reap_next;
/*
* If reaping the WRID list containers pulls the last
* container from the given work queue header, then we free
* the work queue header as well.
*/
consume_wqhdr = dapli_tavor_wrid_list_reap(to_free);
if (consume_wqhdr != NULL) {
dapli_tavor_cq_wqhdr_remove(cq, consume_wqhdr);
}
}
/* Once finished reaping, we reset the CQ's reap list */
cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);
}
/*
* dapls_tavor_wrid_cq_force_reap()
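 *    Free all WRID list containers associated with this CQ, both those
 *    already on the "reapable" list and those still chained to live
 *    work queue headers. Used when the CQ itself is being freed.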
*/
void
dapls_tavor_wrid_cq_force_reap(ib_cq_handle_t cq)
{
DAPL_HASH_DATA curr;
DAT_RETURN retval;
dapls_tavor_workq_hdr_t *to_free_wqhdr;
dapls_tavor_wrid_list_hdr_t *container, *to_free;
/* dapl_os_assert(MUTEX_HELD(&cq->cq_lock)); */
/*
* The first step is to walk the "reapable" list and free up those
* containers. This is necessary because the containers on the
* reapable list are not otherwise connected to the work queue headers
* anymore.
*/
dapls_tavor_wrid_cq_reap(cq);
/* Now lock the list of work queues associated with this CQ */
dapl_os_lock(&cq->cq_wrid_wqhdr_lock);
/*
* Walk the list of work queue headers and free up all the WRID list
* containers chained to it. Note: We don't need to grab the locks
* for each of the individual WRID lists here because the only way
 * things could be added or removed from the list at this point would be
 * through the posting of a work request to a QP. But if we've come this
 * far, then we can be assured that there are no longer any QPs
 * associated with the CQ that we are trying to free.
*/
retval = dapls_hash_iterate(cq->cq_wrid_wqhdr_list,
DAPL_HASH_ITERATE_INIT, &curr);
dapl_os_assert(retval == DAT_SUCCESS);
while (curr != NULL) {
to_free_wqhdr = (dapls_tavor_workq_hdr_t *)curr;
container = ((dapls_tavor_workq_hdr_t *)curr)->wq_wrid_poll;
retval = dapls_hash_iterate(cq->cq_wrid_wqhdr_list,
DAPL_HASH_ITERATE_NEXT, &curr);
dapl_os_assert(retval == DAT_SUCCESS);
while (container != NULL) {
to_free = container;
container = container->wl_next;
/*
* If reaping the WRID list containers pulls the last
* container from the given work queue header, then
* we free the work queue header as well. Note: we
* ignore the return value because we know that the
* work queue header should always be freed once the
* list of containers has come to an end.
*/
(void) dapli_tavor_wrid_list_reap(to_free);
if (container == NULL) {
dapli_tavor_cq_wqhdr_remove(cq, to_free_wqhdr);
}
}
}
	dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);
}
/*
* dapli_tavor_wrid_get_list()
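 *    Allocate and initialize a WRID list "container" for a work queue
 *    of "qsize" entries. Note: "qsize" must be a power of two because
 *    the head/tail arithmetic below masks with (size - 1). For SRQs
 *    the container also embeds the wrid lock and a free descriptor
 *    list.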
*/
static dapls_tavor_wrid_list_hdr_t *
dapli_tavor_wrid_get_list(uint32_t qsize, int wrid_for_srq)
{
dapls_tavor_wrid_list_hdr_t *wridlist;
dapls_tavor_wrid_entry_t *wl_wre;
uint32_t *wl_freel;
uint32_t size;
uint32_t wl_wre_size;
uint32_t wl_freel_size;
wridlist = NULL;
wl_wre = NULL;
wl_freel = NULL;
size = wl_wre_size = wl_freel_size = 0;
/*
* The WRID list "container" consists of the dapls_tavor_wrid_list_hdr_t
* which holds the pointers necessary for maintaining the "reapable"
* list, chaining together multiple "containers" old and new, and
* tracking the head, tail, size, etc. for each container. The
* "container" also holds all the tavor_wrid_entry_t's, one for
* each entry on the corresponding work queue.
*/
/*
* For wridlist associated with SRQs the wridlock needs to be
* allocated and initialized here.
*/
size = sizeof (dapls_tavor_wrid_list_hdr_t);
if (wrid_for_srq) {
size = size + sizeof (dapls_tavor_wrid_lock_t);
}
wridlist = dapl_os_alloc(size);
if (wridlist == NULL) {
goto bail;
}
if (wrid_for_srq) {
wridlist->wl_lock = (dapls_tavor_wrid_lock_t *)(
(uintptr_t)wridlist + sizeof (dapls_tavor_wrid_list_hdr_t));
dapl_os_lock_init(&wridlist->wl_lock->wrl_lock);
wridlist->wl_lock->wrl_on_srq = wrid_for_srq;
} else {
wridlist->wl_lock = NULL;
}
wl_wre_size = qsize * sizeof (dapls_tavor_wrid_entry_t);
wl_wre = dapl_os_alloc(wl_wre_size);
if (wl_wre == NULL) {
goto bail;
}
if (wrid_for_srq) { /* memory for the SRQ free list */
wl_freel_size = qsize * sizeof (uint32_t);
wl_freel = dapl_os_alloc(wl_freel_size);
if (wl_freel == NULL) {
goto bail;
}
}
/* Complete the "container" initialization */
wridlist->wl_size = qsize;
wridlist->wl_full = 0;
wridlist->wl_head = 0;
wridlist->wl_tail = 0;
wridlist->wl_wre = wl_wre;
wridlist->wl_wre_old_tail = NULL;
wridlist->wl_reap_next = NULL;
wridlist->wl_next = NULL;
wridlist->wl_prev = NULL;
if (wrid_for_srq) {
wridlist->wl_srq_en = 1;
wridlist->wl_free_list = (uint32_t *)wl_freel;
wridlist->wl_freel_head = 0;
wridlist->wl_freel_tail = 0;
wridlist->wl_freel_entries = qsize;
} else {
wridlist->wl_srq_en = 0;
wridlist->wl_free_list = NULL;
wridlist->wl_freel_head = 0;
wridlist->wl_freel_tail = 0;
wridlist->wl_freel_entries = 0;
wridlist->wl_srq_wqesz = 0;
wridlist->wl_srq_desc_addr = 0;
}
return (wridlist);
bail:
if (wridlist) {
if (wrid_for_srq) {
dapl_os_lock_destroy(&wridlist->wl_lock->wrl_lock);
}
dapl_os_free(wridlist, size);
}
if (wl_wre) {
dapl_os_free(wl_wre, wl_wre_size);
}
if (wl_freel) {
dapl_os_free(wl_freel, wl_freel_size);
}
return (NULL);
}
/*
* dapli_tavor_wrid_reaplist_add()
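 *    Move the work queue's current "post" container onto the CQ's
 *    "reapable" list, from which it will later be freed by
 *    dapls_tavor_wrid_cq_reap().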
*/
static void
dapli_tavor_wrid_reaplist_add(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wq)
{
/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */
dapl_os_lock(&wq->wq_wrid_lock->wrl_lock);
/*
* Add the "post" container (the last one on the current chain) to
* the CQ's "reapable" list
*/
if ((cq->cq_wrid_reap_head == NULL) &&
(cq->cq_wrid_reap_tail == NULL)) {
cq->cq_wrid_reap_head = wq->wq_wrid_post;
cq->cq_wrid_reap_tail = wq->wq_wrid_post;
} else {
cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
cq->cq_wrid_reap_tail = wq->wq_wrid_post;
}
dapl_os_unlock(&wq->wq_wrid_lock->wrl_lock);
}
/*
* dapli_tavor_wrid_wqhdr_find()
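 *    Look up the work queue header for the given QP number and
 *    direction (send or recv) on this CQ.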
*/
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_wqhdr_find(ib_cq_handle_t cq, uint_t qpn, uint_t send_or_recv)
{
DAPL_HASH_DATA curr;
DAPL_HASH_KEY key;
DAT_RETURN status;
/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */
	/*
	 * Look up the CQ's work queue header hash table for a send or recv
	 * queue with the given QP number. The key construction here must
	 * match the one used when headers are added in
	 * dapli_tavor_cq_wqhdr_add().
	 */
key = (DAPL_HASH_KEY)(((uint64_t)send_or_recv << 32) | (uint32_t)qpn);
status = dapls_hash_search(cq->cq_wrid_wqhdr_list, key, &curr);
if (status == DAT_SUCCESS) {
return ((dapls_tavor_workq_hdr_t *)curr);
} else {
return (NULL);
}
}
/*
* dapli_tavor_wrid_get_wqeaddrsz()
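 *    Return the "wqeaddrsz" field of the next unpolled entry on the
 *    work queue's "poll" container, or zero if that container is
 *    empty.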
*/
static uint32_t
dapli_tavor_wrid_get_wqeaddrsz(dapls_tavor_workq_hdr_t *wq)
{
dapls_tavor_wrid_entry_t *wre;
uint32_t wqeaddrsz;
uint32_t head;
/*
* If the container is empty, then there is no next entry. So just
* return zero. Note: the "head == tail" condition here can only
* mean that the container is empty because we have previously pulled
* something from the container.
*
* If the container is not empty, then find the next entry and return
* the contents of its "wqeaddrsz" field.
*/
if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
wqeaddrsz = 0;
} else {
/*
* We don't need to calculate the "next" head pointer here
* because "head" should already point to the next entry on
* the list (since we just pulled something off - in
* dapli_tavor_wrid_find_match() - and moved the head index
* forward.)
*/
head = wq->wq_wrid_poll->wl_head;
wre = &wq->wq_wrid_poll->wl_wre[head];
wqeaddrsz = wre->wr_wqeaddrsz;
}
return (wqeaddrsz);
}
/*
* dapli_tavor_wrid_list_reap()
* Note: The "wqhdr_list_lock" must be held.
*/
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_list_reap(dapls_tavor_wrid_list_hdr_t *wridlist)
{
dapls_tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL;
dapls_tavor_wrid_list_hdr_t *prev, *next;
/* Get the back pointer to the work queue header (see below) */
wqhdr = wridlist->wl_wqhdr;
dapl_os_lock(&wqhdr->wq_wrid_lock->wrl_lock);
/* Unlink the WRID list "container" from the work queue list */
prev = wridlist->wl_prev;
next = wridlist->wl_next;
if (prev != NULL) {
prev->wl_next = next;
}
if (next != NULL) {
next->wl_prev = prev;
}
/*
* If the back pointer to the work queue header shows that it
* was pointing to the entry we are about to remove, then the work
* queue header is reapable as well.
*/
if ((wqhdr->wq_wrid_poll == wridlist) &&
(wqhdr->wq_wrid_post == wridlist)) {
consume_wqhdr = wqhdr;
}
/* Be sure to update the "poll" and "post" container pointers */
if (wqhdr->wq_wrid_poll == wridlist) {
wqhdr->wq_wrid_poll = next;
}
if (wqhdr->wq_wrid_post == wridlist) {
wqhdr->wq_wrid_post = NULL;
}
/*
 * Calculate the size and free the container. Note: for SRQs the
 * wridlist is not freed here; it is freed when the SRQ itself is freed.
*/
if (!wridlist->wl_srq_en) {
if (wridlist->wl_wre) {
dapl_os_free(wridlist->wl_wre, wridlist->wl_size *
sizeof (dapls_tavor_wrid_entry_t));
}
dapl_os_assert(wridlist->wl_free_list == NULL);
dapl_os_free(wridlist, sizeof (dapls_tavor_wrid_list_hdr_t));
}
dapl_os_unlock(&wqhdr->wq_wrid_lock->wrl_lock);
return (consume_wqhdr);
}
/*
* dapls_tavor_srq_wrid_init()
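 *    Allocate the wridlist for an SRQ and seed its free list with the
 *    descriptor (IO) addresses of all the SRQ's WQEs.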
*/
DAT_RETURN
dapls_tavor_srq_wrid_init(ib_srq_handle_t srq)
{
dapls_tavor_wrid_list_hdr_t *wridlist;
int i;
wridlist = dapli_tavor_wrid_get_list(srq->srq_wq_numwqe, 1);
if (wridlist == NULL) {
srq->srq_wridlist = NULL;
return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
}
/* initialize the free list with the descriptor addresses */
wridlist->wl_free_list[0] = srq->srq_wq_desc_addr;
for (i = 1; i < srq->srq_wq_numwqe; i++) {
wridlist->wl_free_list[i] = wridlist->wl_free_list[i-1] +
srq->srq_wq_wqesz;
}
wridlist->wl_srq_wqesz = srq->srq_wq_wqesz;
wridlist->wl_srq_desc_addr = srq->srq_wq_desc_addr;
srq->srq_wridlist = wridlist;
return (DAT_SUCCESS);
}
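/*
 * dapls_tavor_srq_wrid_free()
 *    Free the wridlist associated with an SRQ: the WRID entry array,
 *    the free descriptor list, the embedded lock and the container
 *    itself.
 */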
void
dapls_tavor_srq_wrid_free(ib_srq_handle_t srq)
{
dapls_tavor_wrid_list_hdr_t *wridlist;
size_t size = 0;
wridlist = srq->srq_wridlist;
if (wridlist) {
dapl_os_assert(wridlist->wl_srq_en == 1);
if (wridlist->wl_wre) {
dapl_os_free(wridlist->wl_wre, wridlist->wl_size *
sizeof (dapls_tavor_wrid_entry_t));
}
if (wridlist->wl_free_list) {
dapl_os_free(wridlist->wl_free_list, wridlist->wl_size *
sizeof (uint32_t));
}
if (wridlist->wl_lock) {
dapl_os_assert(wridlist->wl_lock->wrl_on_srq == 1);
dapl_os_lock_destroy(&wridlist->wl_lock->wrl_lock);
size = sizeof (dapls_tavor_wrid_lock_t);
}
size = size; /* pacify lint */
dapl_os_free(wridlist, size +
sizeof (dapls_tavor_wrid_list_hdr_t));
srq->srq_wridlist = NULL;
}
}
/*
* dapls_tavor_wrid_init()
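 *    Set up WRID tracking for a new QP: find or create the send and
 *    recv work queue headers on the relevant CQs and chain a fresh
 *    WRID list container to each. QPs attached to an SRQ share the
 *    SRQ's wridlist on the recv side instead of allocating their own.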
*/
DAT_RETURN
dapls_tavor_wrid_init(ib_qp_handle_t qp)
{
dapls_tavor_workq_hdr_t *swq;
dapls_tavor_workq_hdr_t *rwq;
dapls_tavor_wrid_list_hdr_t *s_wridlist;
dapls_tavor_wrid_list_hdr_t *r_wridlist;
uint_t create_new_swq = 0;
uint_t create_new_rwq = 0;
/*
* For each of this QP's Work Queues, make sure we have a (properly
* initialized) Work Request ID list attached to the relevant
* completion queue. Grab the CQ lock(s) before manipulating the
* lists.
*/
dapli_tavor_wrid_wqhdr_lock_both(qp);
swq = dapli_tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_num,
TAVOR_WR_SEND);
if (swq == NULL) {
/* Couldn't find matching work queue header, create it */
create_new_swq = 1;
swq = dapli_tavor_wrid_wqhdr_create(qp->qp_sq_cqhdl,
qp->qp_num, TAVOR_WR_SEND, 1);
if (swq == NULL) {
/*
* If we couldn't find/allocate space for the workq
* header, then drop the lock(s) and return failure.
*/
dapli_tavor_wrid_wqhdr_unlock_both(qp);
return (DAT_INSUFFICIENT_RESOURCES);
}
}
qp->qp_sq_wqhdr = swq;
swq->wq_size = qp->qp_sq_numwqe;
swq->wq_head = 0;
swq->wq_tail = 0;
swq->wq_full = 0;
/*
* Allocate space for the dapls_tavor_wrid_entry_t container
*/
s_wridlist = dapli_tavor_wrid_get_list(swq->wq_size, 0);
if (s_wridlist == NULL) {
/*
* If we couldn't allocate space for tracking the WRID
* entries, then cleanup the workq header from above (if
* necessary, i.e. if we created the workq header). Then
* drop the lock(s) and return failure.
*/
if (create_new_swq) {
dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
}
dapli_tavor_wrid_wqhdr_unlock_both(qp);
return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
}
s_wridlist->wl_wqhdr = swq;
/* Chain the new WRID list container to the workq hdr list */
dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
dapli_tavor_wrid_wqhdr_add(swq, s_wridlist);
dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);
/*
* Now we repeat all the above operations for the receive work queue
*/
rwq = dapli_tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_num,
TAVOR_WR_RECV);
if (rwq == NULL) {
create_new_rwq = 1;
/* if qp is attached to an SRQ don't need to alloc wrid_lock */
rwq = dapli_tavor_wrid_wqhdr_create(qp->qp_rq_cqhdl,
qp->qp_num, TAVOR_WR_RECV, qp->qp_srq_enabled ? 0 : 1);
if (rwq == NULL) {
/*
* If we couldn't find/allocate space for the workq
* header, then free all the send queue resources we
* just allocated and setup (above), drop the lock(s)
* and return failure.
*/
dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
dapli_tavor_wrid_wqhdr_remove(swq, s_wridlist);
dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);
if (create_new_swq) {
dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
swq);
}
dapli_tavor_wrid_wqhdr_unlock_both(qp);
return (DAT_INSUFFICIENT_RESOURCES |
DAT_RESOURCE_MEMORY);
}
}
qp->qp_rq_wqhdr = rwq;
rwq->wq_size = qp->qp_rq_numwqe;
rwq->wq_head = 0;
rwq->wq_tail = 0;
rwq->wq_full = 0;
/*
* Allocate space for the dapls_tavor_wrid_entry_t container
* For qp associated with SRQs the SRQ wridlist is used
*/
if (qp->qp_srq_enabled) {
/* Use existing srq_wridlist pointer */
r_wridlist = qp->qp_srq->srq_wridlist;
dapl_os_assert(r_wridlist != NULL);
/* store the wl_lock in the wqhdr */
rwq->wq_wrid_lock = r_wridlist->wl_lock;
dapl_os_assert(rwq->wq_wrid_lock != NULL);
} else {
/* Allocate memory for the r_wridlist */
r_wridlist = dapli_tavor_wrid_get_list(rwq->wq_size, 0);
}
if (r_wridlist == NULL) {
/*
* If we couldn't allocate space for tracking the WRID
* entries, then cleanup all the stuff from above. Then
* drop the lock(s) and return failure.
*/
dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
dapli_tavor_wrid_wqhdr_remove(swq, s_wridlist);
dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);
if (create_new_swq) {
dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
}
if (create_new_rwq) {
dapli_tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
}
dapli_tavor_wrid_wqhdr_unlock_both(qp);
return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
}
/* For SRQ based QPs r_wridlist does not point to recv wqhdr */
if (!qp->qp_srq_enabled) {
r_wridlist->wl_wqhdr = rwq;
}
/* Chain the new WRID list "container" to the workq hdr list */
dapl_os_lock(&rwq->wq_wrid_lock->wrl_lock);
dapli_tavor_wrid_wqhdr_add(rwq, r_wridlist);
dapl_os_unlock(&rwq->wq_wrid_lock->wrl_lock);
dapli_tavor_wrid_wqhdr_unlock_both(qp);
return (DAT_SUCCESS);
}
/*
* dapls_tavor_wrid_cleanup()
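 *    Called when a QP is being freed. The send (and, for non-SRQ QPs,
 *    recv) WRID containers are moved to the CQ's "reapable" list; for
 *    SRQ-based QPs the recv-side wridlist is detached and the work
 *    queue header is freed immediately.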
*/
void
dapls_tavor_wrid_cleanup(DAPL_EP *ep, ib_qp_handle_t qp)
{
/*
* For each of this QP's Work Queues, move the WRID "container" to
* the "reapable" list. Although there may still be unpolled
* entries in these containers, it is not a big deal. We will not
* reap the list until either the Poll CQ command detects an empty
* condition or the CQ itself is freed. Grab the CQ lock(s) before
* manipulating the lists.
*/
dapli_tavor_wrid_wqhdr_lock_both(qp);
dapli_tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
/*
* Repeat the above operation for the Recv work queue "container".
* However for qps with SRQ we flush the cq entries, remove the
* wridlist and wqhdr.
* Then drop the CQ lock(s) and return
*/
if (qp->qp_srq_enabled) {
/*
* Pull off all (if any) entries for this QP from CQ. This
* only includes entries that have not yet been polled
*/
dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
DAPL_FLUSH(ep)(qp);
/* Remove wridlist from WQHDR */
dapli_tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
qp->qp_rq_wqhdr->wq_wrid_post);
dapl_os_assert(qp->qp_rq_wqhdr->wq_wrid_post == NULL);
dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
/* Free the WQHDR */
dapli_tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
} else {
dapli_tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
}
dapli_tavor_wrid_wqhdr_unlock_both(qp);
}
/*
* dapli_tavor_wrid_wqhdr_create()
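 *    Allocate and initialize a work queue header for the given QP
 *    number and direction, optionally with an embedded wrid lock, and
 *    add it to the CQ's work queue header hash table.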
*/
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_wqhdr_create(ib_cq_handle_t cq, uint_t qpn,
uint_t send_or_recv, uint_t alloc_wrl)
{
dapls_tavor_workq_hdr_t *wqhdr_tmp;
size_t size, aligned_size;
/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */
/*
* Allocate space for a work queue header structure and initialize it.
* Each work queue header structure includes a "wq_wrid_lock"
* which needs to be initialized.
*
* Note: the address smashing is needed to ensure wq_wrid_lock is
* 8-byte aligned, which is not always the case on 32-bit sparc.
*/
size = (sizeof (dapls_tavor_workq_hdr_t) + 0x7) & ~0x7;
aligned_size = size;
if (alloc_wrl) {
/* for non-srq wqhdr the lock is allocated with the wqhdr */
size = size + sizeof (dapls_tavor_wrid_lock_t);
}
wqhdr_tmp = dapl_os_alloc(size);
if (wqhdr_tmp == NULL) {
return (NULL);
}
if (alloc_wrl) {
wqhdr_tmp->wq_wrid_lock = (dapls_tavor_wrid_lock_t *)
(((uintptr_t)wqhdr_tmp + aligned_size) & ~0x7);
dapl_os_lock_init(&wqhdr_tmp->wq_wrid_lock->wrl_lock);
/* wrl allocated with wqhdr don't have srq enabled */
wqhdr_tmp->wq_wrid_lock->wrl_on_srq = 0;
}
wqhdr_tmp->wq_qpn = qpn;
wqhdr_tmp->wq_send_or_recv = send_or_recv;
wqhdr_tmp->wq_wrid_poll = NULL;
wqhdr_tmp->wq_wrid_post = NULL;
/* Chain the newly allocated work queue header to the CQ's list */
if (dapli_tavor_cq_wqhdr_add(cq, wqhdr_tmp) != DAT_SUCCESS) {
if (alloc_wrl) {
dapl_os_lock_destroy(&wqhdr_tmp->wq_wrid_lock->
wrl_lock);
}
dapl_os_free(wqhdr_tmp, size);
wqhdr_tmp = NULL;
}
return (wqhdr_tmp);
}
/*
* dapli_tavor_wrid_wqhdr_add()
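 *    Chain a new WRID list container to the tail ("post" end) of the
 *    work queue header's container list.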
*/
static void
dapli_tavor_wrid_wqhdr_add(dapls_tavor_workq_hdr_t *wqhdr,
dapls_tavor_wrid_list_hdr_t *wridlist)
{
/* dapl_os_assert(MUTEX_HELD(&wqhdr->wq_wrid_lock)); */
/* Chain the new WRID list "container" to the work queue list */
if ((wqhdr->wq_wrid_post == NULL) &&
(wqhdr->wq_wrid_poll == NULL)) {
wqhdr->wq_wrid_poll = wridlist;
wqhdr->wq_wrid_post = wridlist;
} else {
wqhdr->wq_wrid_post->wl_next = wridlist;
wridlist->wl_prev = wqhdr->wq_wrid_post;
wqhdr->wq_wrid_post = wridlist;
}
}
/*
* dapli_tavor_wrid_wqhdr_remove()
* Note: this is only called to remove the most recently added WRID list
* container.
*/
static void
dapli_tavor_wrid_wqhdr_remove(dapls_tavor_workq_hdr_t *wqhdr,
dapls_tavor_wrid_list_hdr_t *wridlist)
{
dapls_tavor_wrid_list_hdr_t *prev, *next;
/* dapl_os_assert(MUTEX_HELD(&wqhdr->wq_wrid_lock)); */
/* Unlink the WRID list "container" from the work queue list */
prev = wridlist->wl_prev;
next = wridlist->wl_next;
if (prev != NULL) {
prev->wl_next = next;
}
if (next != NULL) {
next->wl_prev = prev;
}
/*
* Update any pointers in the work queue hdr that may point to this
* WRID list container
*/
if (wqhdr->wq_wrid_post == wridlist) {
wqhdr->wq_wrid_post = prev;
}
if (wqhdr->wq_wrid_poll == wridlist) {
wqhdr->wq_wrid_poll = NULL;
}
}
/*
* dapli_tavor_wrid_wqhdr_lock_both()
*/
static void
dapli_tavor_wrid_wqhdr_lock_both(ib_qp_handle_t qp)
{
ib_cq_handle_t sq_cq, rq_cq;
sq_cq = qp->qp_sq_cqhdl;
rq_cq = qp->qp_rq_cqhdl;
/*
* If both work queues (send and recv) share a completion queue, then
* grab the common lock. If they use different CQs (hence different
* "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
 * receive. We release them consistently (in the reverse order) in
 * dapli_tavor_wrid_wqhdr_unlock_both() below to avoid introducing any
 * kind of deadlock condition.
*/
if (sq_cq == rq_cq) {
dapl_os_lock(&sq_cq->cq_wrid_wqhdr_lock);
} else {
dapl_os_lock(&sq_cq->cq_wrid_wqhdr_lock);
dapl_os_lock(&rq_cq->cq_wrid_wqhdr_lock);
}
}
/*
* dapli_tavor_wrid_wqhdr_unlock_both()
*/
static void
dapli_tavor_wrid_wqhdr_unlock_both(ib_qp_handle_t qp)
{
ib_cq_handle_t sq_cq, rq_cq;
sq_cq = qp->qp_sq_cqhdl;
rq_cq = qp->qp_rq_cqhdl;
/*
 * See dapli_tavor_wrid_wqhdr_lock_both() above for more detail
*/
if (sq_cq == rq_cq) {
dapl_os_unlock(&sq_cq->cq_wrid_wqhdr_lock);
} else {
dapl_os_unlock(&rq_cq->cq_wrid_wqhdr_lock);
dapl_os_unlock(&sq_cq->cq_wrid_wqhdr_lock);
}
}
/*
* dapli_tavor_cq_wqhdr_add()
*/
static DAT_RETURN
dapli_tavor_cq_wqhdr_add(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wqhdr)
{
DAPL_HASH_KEY key;
/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */
/*
 * Insert the work queue header into the CQ's hash table of work
 * queue headers, keyed on the (send_or_recv, qpn) pair.
*/
key = (DAPL_HASH_KEY)(((uint64_t)wqhdr->wq_send_or_recv << 32) |
wqhdr->wq_qpn);
return (dapls_hash_insert(cq->cq_wrid_wqhdr_list, key, wqhdr));
}
/*
 * dapli_tavor_cq_wqhdr_remove()
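 *    Remove the work queue header from the CQ's hash table and free
 *    it, destroying the embedded wrid lock if one was allocated along
 *    with the header.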
*/
static void
dapli_tavor_cq_wqhdr_remove(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wqhdr)
{
DAPL_HASH_DATA curr;
DAPL_HASH_KEY key;
size_t size = 0;
/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */
/* Remove "wqhdr" from the work queue header list on "cq" */
key = (DAPL_HASH_KEY)(((uint64_t)wqhdr->wq_send_or_recv << 32) |
wqhdr->wq_qpn);
(void) dapls_hash_remove(cq->cq_wrid_wqhdr_list, key, &curr);
size = (sizeof (dapls_tavor_workq_hdr_t) + 0x7) & ~0x7;
if (wqhdr->wq_wrid_lock && (!wqhdr->wq_wrid_lock->wrl_on_srq)) {
dapl_os_lock_destroy(&wqhdr->wq_wrid_lock->wrl_lock);
size += sizeof (dapls_tavor_wrid_lock_t);
}
/* Free the memory associated with "wqhdr" */
dapl_os_free(wqhdr, size);
}
/*
* dapls_tavor_srq_wrid_resize() is called to resize the wridlist
 * associated with SRQs as a result of dat_srq_resize().
*
* Returns: DAT_TRUE if successful, otherwise DAT_FALSE
*/
DAT_BOOLEAN
dapls_tavor_srq_wrid_resize(ib_srq_handle_t srq_handle, uint32_t new_size)
{
dapls_tavor_wrid_list_hdr_t *wridlist;
dapls_tavor_wrid_entry_t *old_wl_wre;
dapls_tavor_wrid_entry_t *new_wl_wre;
uint32_t *old_wl_freel;
uint32_t *new_wl_freel;
uint32_t old_size;
uint32_t idx;
uint32_t prev_idx;
uint32_t i;
wridlist = srq_handle->srq_wridlist;
if (wridlist == NULL) {
return (DAT_FALSE);
}
dapl_os_assert(wridlist->wl_srq_en);
dapl_os_lock(&wridlist->wl_lock->wrl_lock);
old_wl_wre = wridlist->wl_wre;
old_wl_freel = wridlist->wl_free_list;
old_size = wridlist->wl_size;
new_wl_wre = (dapls_tavor_wrid_entry_t *)dapl_os_alloc(new_size *
sizeof (dapls_tavor_wrid_entry_t));
if (new_wl_wre == NULL) {
goto bail;
}
new_wl_freel = dapl_os_alloc(new_size * sizeof (uint32_t));
if (new_wl_freel == NULL) {
goto bail;
}
/*
 * We just need to copy the old WREs to the new array. Since the
* descriptors are relatively addressed the descriptor to index
* mapping doesn't change.
*/
(void) dapl_os_memcpy(&new_wl_wre[0], &old_wl_wre[0],
old_size * sizeof (dapls_tavor_wrid_entry_t));
/*
* Copy the old free list to the new one
*/
idx = wridlist->wl_freel_head;
for (i = 0; i < wridlist->wl_freel_entries; i++) {
new_wl_freel[i] = old_wl_freel[idx];
idx = (idx + 1) % old_size;
}
/*
* Add the new entries in wl_wre to the new free list
*/
idx = wridlist->wl_freel_entries;
new_wl_freel[idx] = wridlist->wl_srq_desc_addr + old_size *
wridlist->wl_srq_wqesz;
prev_idx = idx;
idx = (idx + 1) % new_size;
for (i = 0; i < new_size - old_size - 1; i++) {
new_wl_freel[idx] = new_wl_freel[prev_idx] +
wridlist->wl_srq_wqesz;
prev_idx = idx;
idx = (idx + 1) % new_size;
}
wridlist->wl_size = new_size;
wridlist->wl_wre = new_wl_wre;
wridlist->wl_free_list = new_wl_freel;
wridlist->wl_freel_head = 0;
wridlist->wl_freel_tail = idx;
wridlist->wl_freel_entries = wridlist->wl_freel_entries + new_size -
old_size;
dapl_os_unlock(&wridlist->wl_lock->wrl_lock);
if (old_wl_wre) {
dapl_os_free(old_wl_wre, old_size *
sizeof (dapls_tavor_wrid_entry_t));
}
if (old_wl_freel) {
dapl_os_free(old_wl_freel, old_size * sizeof (uint32_t));
}
return (DAT_TRUE);
bail:
dapl_os_unlock(&wridlist->wl_lock->wrl_lock);
if (new_wl_wre) {
dapl_os_free(new_wl_wre, new_size *
sizeof (dapls_tavor_wrid_entry_t));
}
return (DAT_FALSE);
}