/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file may contain confidential information of
 * Mellanox Technologies, Ltd. and should not be distributed in source
 * form without approval from Sun Legal.
 */

#include "dapl.h"
#include "dapl_tavor_hw.h"
#include "dapl_tavor_wr.h"
#include "dapl_tavor_ibtf_impl.h"

/*
 * Function signatures
 */
extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
    uint_t, uint_t, dapls_tavor_wrid_entry_t *);
extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
extern DAPL_OS_LOCK g_tavor_uar_lock;

#ifndef _LP64
extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
#endif

static int dapli_tavor_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_send_linknext(ibt_send_wr_t *, uint64_t *,
    boolean_t, uint32_t, uint_t, uint64_t *, tavor_sw_wqe_dbinfo_t *);
static DAT_RETURN dapli_tavor_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_recv_linknext(uint64_t *, boolean_t, uint32_t,
    uint_t, uint64_t *);
static int dapli_tavor_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);
static int dapli_tavor_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);

/* exported to other HCAs */
extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
    uint32_t, uint_t);
extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);
/*
 * Note: The 64-bit doorbells need to be written atomically.
 * In 32-bit libraries we must use a special assembly routine, because
 * compiler-generated code splits the store into two word writes.
 */
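
/*
 * Illustrative sketch (not part of the driver logic): on a 32-bit
 * target a plain 64-bit store such as
 *
 *	uar->cq = HTOBE_64(doorbell);
 *
 * is compiled into two 32-bit stores, so the device could sample the
 * doorbell register between the two halves.  Hence the lock-protected
 * word-by-word stores on i386 and the dapls_atomic_assign_64()
 * assembly helper on other 32-bit platforms below.
 */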

#if defined(_LP64) || defined(__lint)
/* use a macro to ensure inlining on S10 amd64 compiler */
#define	dapli_tavor_cq_doorbell(ia_uar, cq_cmd, cqn, cq_param)		\
	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(			\
	    ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |		\
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param)
#else

/*
 * dapli_tavor_cq_doorbell()
 * Takes the specified cq cmd and cq number and rings the cq doorbell
 */
static void
dapli_tavor_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
	/* 32 bit version */
#elif defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->cq[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->cq[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->cq);
#endif
}
#pragma inline(dapli_tavor_cq_doorbell)

#endif /* _LP64 */

#if defined(_LP64) || defined(__lint)
#define	dapli_tavor_qp_send_doorbell(ia_uar, nda, nds, qpn, fence, nopcode) \
	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(			\
	    (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<		\
	    TAVOR_QPSNDDB_NDA_SHIFT) |					\
	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |		\
	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |	\
	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds)
#else

/*
 * dapli_tavor_qp_send_doorbell()
 * Takes the specified next descriptor information, qp number, opcode and
 * rings the send doorbell
 */
static void
dapli_tavor_qp_send_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
	    TAVOR_QPSNDDB_NDA_SHIFT) |
	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
#else
#if defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->send[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->send[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->send);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_send_doorbell)
#endif /* _LP64 */

#if defined(_LP64) || defined(__lint)

#define	dapli_tavor_qp_recv_doorbell(ia_uar, nda, nds, qpn, credits)	\
	((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(			\
	    (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<		\
	    TAVOR_QPRCVDB_NDA_SHIFT) |					\
	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |		\
	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits)
#else

/*
 * dapli_tavor_qp_recv_doorbell()
 * Takes the specified next descriptor information, qp number and
 * rings the recv doorbell
 */
static void
dapli_tavor_qp_recv_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
	    TAVOR_QPRCVDB_NDA_SHIFT) |
	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(doorbell);
#else
#if defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->recv[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->recv[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->recv);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_recv_doorbell)
#endif /* _LP64 */


/*
 * dapls_tavor_max_inline()
 * Return the max inline value that should be used.
 * Env variable DAPL_MAX_INLINE can override the default.
 * If it's not set (or set to -1), default behavior is used.
 * If it's zero or negative (except -1) inline is not done.
 */
int
dapls_tavor_max_inline(void)
{
	static int max_inline_env = -2;

	/* Check the env exactly once, otherwise return previous value. */
	if (max_inline_env != -2)
		return (max_inline_env);

	max_inline_env = dapl_os_get_env_val("DAPL_MAX_INLINE", -1);
	if (max_inline_env != -1)
		if (max_inline_env <= 0)
			max_inline_env = 0;	/* no inlining */
	return (max_inline_env);
}
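
/*
 * Usage sketch (hypothetical settings, assuming the consumer's shell
 * environment): DAPL_MAX_INLINE=0 disables inlining entirely,
 * DAPL_MAX_INLINE=64 caps inline sends at 64 bytes, and leaving the
 * variable unset (or set to -1) selects the default heuristic in
 * dapls_ib_max_request_iov() below.
 */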

/*
 * dapls_ib_max_request_iov(), aka, max send sgl size.
 * The send queue's scatter/gather list is used for "inline" data.
 *
 * By default, compute reasonable send queue size based on #iovs, #wqes,
 * max_iovs, and max inline byte count.  If the #wqes is large, then we
 * limit how much the SGL (space for inline data) can take.  The heuristic
 * is to increase the memory for the send queue to a maximum of 32KB:
 *
 *	< 128 wqes	increase to at most 256 minus header
 *	< 256 wqes	increase to at most 128 minus header
 *	>= 256 wqes	use SGL unaltered
 *
 * If the env is supplied (max_inline >= 0), use it without checking.
 */
int
dapls_ib_max_request_iov(int iovs, int wqes, int max_iovs,
    int max_inline_bytes)
{
	int ret_iovs;

	if (max_inline_bytes > 0) {
		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
	} else if (wqes < 128) {
		max_inline_bytes = 256 - TAVOR_INLINE_HEADER_SIZE_MAX;
		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
	} else if (wqes < 256) {
		max_inline_bytes = 128 - TAVOR_INLINE_HEADER_SIZE_MAX;
		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
	} else {
		ret_iovs = iovs;
	}

	if (ret_iovs > max_iovs)	/* do not exceed max */
		ret_iovs = max_iovs;
	if (iovs > ret_iovs)		/* never decrease iovs */
		ret_iovs = iovs;
	return (ret_iovs);
}
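
/*
 * Worked example (hypothetical values): assuming a 16-byte
 * tavor_hw_wqe_sgl_t and a TAVOR_INLINE_HEADER_SIZE_MAX of 28, a
 * 100-entry send queue yields max_inline_bytes = 256 - 28 = 228 and
 * thus 228 / 16 = 14 SGL slots, which is then clamped to "max_iovs"
 * but never allowed to drop below the caller's requested "iovs".
 */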

/*
 * dapli_tavor_wqe_send_build()
 * Constructs a WQE for a given ibt_send_wr_t
 */
static int
dapli_tavor_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_snd_wqe_remaddr_t *rc;
	tavor_hw_snd_wqe_bind_t *bn;
	tavor_hw_wqe_sgl_t *ds;
	ibt_wr_ds_t *sgl;
	uint32_t nds;
	uint32_t len, total_len;
	uint32_t tavor_num_mpt_mask;
	uint32_t new_rkey;
	uint32_t old_rkey;
	int i, num_ds;
	int max_inline_bytes = -1;

	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;

	/*
	 * RC is the only supported transport in UDAPL
	 * For RC requests, we allow "Send", "RDMA Read", "RDMA Write"
	 */
	switch (wr->wr_opcode) {
	case IBT_WRC_SEND:
		/*
		 * If this is a Send request, then all we need is
		 * the Data Segment processing below.
		 * Initialize the information for the Data Segments
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
		break;
	case IBT_WRC_RDMAW:
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
		/* FALLTHROUGH */
	case IBT_WRC_RDMAR:
		if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
			qp->qp_sq_inline = 0;
		/*
		 * If this is an RDMA Read or RDMA Write request, then fill
		 * in the "Remote Address" header fields.
		 */
		rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Remote Address Segment for the WQE, using
		 * the information from the RC work request.
		 */
		TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);

		/* Update "ds" for filling in Data Segments (below) */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
		    sizeof (tavor_hw_snd_wqe_remaddr_t));
		break;
	case IBT_WRC_BIND:
		/*
		 * Generate a new R_key:
		 * Increment the upper "unconstrained" bits while keeping
		 * the lower "constrained" bits the same, since they
		 * represent the MPT index.
		 */
		old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
		tavor_num_mpt_mask = (uint32_t)(1 << qp->qp_num_mpt_shift) - 1;
		new_rkey = (old_rkey >> qp->qp_num_mpt_shift);
		new_rkey++;
		new_rkey = ((new_rkey << qp->qp_num_mpt_shift) |
		    (old_rkey & tavor_num_mpt_mask));
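		/*
		 * Worked example (hypothetical values): with an 8-bit MPT
		 * index (qp_num_mpt_shift == 8), old_rkey 0x1234 gives
		 * tavor_num_mpt_mask 0xff; the "unconstrained" part 0x12
		 * is incremented to 0x13 and recombined with the MPT
		 * index 0x34, producing new_rkey 0x1334.
		 */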

		wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;

		bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Bind Memory Window Segments for the WQE,
		 * using the information from the RC Bind memory
		 * window work request.
		 */
		TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);

		/*
		 * Update the "ds" pointer.  Even though the "bind"
		 * operation requires no SGLs, this is necessary to
		 * facilitate the correct descriptor size calculations
		 * (below).
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
		    sizeof (tavor_hw_snd_wqe_bind_t));
		break;
	default:
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "dapli_tavor_wqe_send_build: invalid wr_opcode=%d\n",
		    wr->wr_opcode);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based on
	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
	 * Start by checking for a valid number of SGL entries.
	 */
	if (nds > qp->qp_sq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Send Work Request, fill in the Send WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */

	if (max_inline_bytes != -1) {		/* compute total_len */
		total_len = 0;
		for (i = 0; i < nds; i++)
			total_len += sgl[i].ds_len;
		if (total_len > max_inline_bytes)
			max_inline_bytes = -1;	/* too big, do not "inline" */
	}
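	/*
	 * Inline layout sketch: rather than pointer/length/lkey SGL
	 * entries, an inline WQE carries a single 32-bit header word
	 * (the total byte count OR'd with TAVOR_WQE_SGL_INLINE_MASK)
	 * followed immediately by the payload bytes, which are copied
	 * in below and rounded up to 16-byte chunks.
	 */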
	if (max_inline_bytes != -1) {		/* do "inline" */
		uint8_t *dst = (uint8_t *)((uint32_t *)ds + 1);
		*(uint32_t *)ds =
		    HTOBE_32(total_len | TAVOR_WQE_SGL_INLINE_MASK);
		for (i = 0; i < nds; i++) {
			if ((len = sgl[i].ds_len) == 0) {
				continue;
			}
			(void) dapl_os_memcpy(dst,
			    (void *)(uintptr_t)sgl[i].ds_va, len);
			dst += len;
		}
		/* Return the size of descriptor (in 16-byte chunks) */
		*size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
	} else {
		for (i = 0; i < nds; i++) {
			if (sgl[i].ds_len == 0) {
				continue;
			}

			/*
			 * Fill in the Data Segment(s) for the current WQE,
			 * using the information contained in the
			 * scatter-gather list of the work request.
			 */
			TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
			num_ds++;
		}

		/* Return the size of descriptor (in 16-byte chunks) */
		*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_send_linknext()
 * Takes a WQE and links it to the prev WQE chain
 */
static void
dapli_tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, uint64_t *curr_addr,
    boolean_t ns, uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
    tavor_sw_wqe_dbinfo_t *dbinfo)
{
	uint64_t next, ctrl;
	uint32_t nopcode, fence;

	next = 0;
	ctrl = 0;

	/* Set the "c" (i.e. "signaled") bit appropriately */
	if (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
		ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
	}

	/* Set the "s" (i.e. "solicited") bit appropriately */
	if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
		ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
	}
	/* Set the "e" (i.e. "event") bit if notification is needed */
	if (!ns) {
		ctrl = ctrl | TAVOR_WQE_RCV_EVENT_MASK;
	}

	/*
	 * The "i" bit is unused since uDAPL doesn't support
	 * immediate data
	 */

	/* initialize the ctrl and next fields of the current descriptor */
	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

	/*
	 * Calculate the "next" field of the prev descriptor.  This amounts
	 * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
	 * fields (see tavor_hw.h for more).
	 */

	/*
	 * Determine the value for the Tavor WQE "nopcode" field
	 * by using the IBTF opcode from the work request
	 */
	switch (curr_wr->wr_opcode) {
	case IBT_WRC_RDMAW:
		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
		break;

	case IBT_WRC_SEND:
		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
		break;

	case IBT_WRC_RDMAR:
		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
		break;

	case IBT_WRC_BIND:
		nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
		break;
	default:
		/* Unsupported opcodes in UDAPL */
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "dapli_tavor_wqe_send_linknext: invalid wr_opcode=%d\n",
		    curr_wr->wr_opcode);
		return;
	}

	next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
	next = next | ((uint64_t)nopcode << 32);
	fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
	if (fence) {
		next = next | TAVOR_WQE_SEND_FENCE_MASK;
	}
	next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
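
	/*
	 * Layout sketch of the 64-bit "next" word built above (see
	 * tavor_hw.h for the authoritative field definitions): the upper
	 * half carries the next WQE address (TAVOR_WQE_NDA_MASK) and the
	 * "nopcode", the fence flag lives at TAVOR_WQE_SEND_FENCE_MASK,
	 * and the low bits hold the next descriptor size in 16-byte
	 * chunks (TAVOR_WQE_NDS_MASK) plus the "dbd" bit set just below.
	 */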

	/*
	 * Since a send queue doorbell will be rung for the next
	 * WQE on the chain, set the current WQE's "dbd" bit.
	 * Note: We also update the "dbinfo" structure here to pass
	 * back information about what should (later) be included
	 * in the send queue doorbell.
	 */
	next = next | TAVOR_WQE_DBD_MASK;
	dbinfo->db_nopcode = nopcode;
	dbinfo->db_fence = fence;

	/*
	 * Since the send queue doorbell will be rung for the next WQE on
	 * the chain, update the prev WQE's "next" field and return.
	 */
	if (prev_addr != NULL) {
		TAVOR_WQE_LINKFIRST(prev_addr, next);
	}
}


/*
 * dapli_tavor_wqe_recv_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_wqe_sgl_t *ds;
	int i;
	int num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > qp->qp_rq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/* Return the size of descriptor (in 16-byte chunks) */
	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;

	return (DAT_SUCCESS);
}


/*
 * dapli_tavor_wqe_recv_linknext()
 * Links a recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_recv_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr)
{
	uint64_t next;
	uint64_t ctrl = 0;

	/*
	 * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
	 * at a time).  If there is no next descriptor (i.e. if the current
	 * descriptor is the last WQE on the chain), then set the "next"
	 * field to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
	 * requires the "dbd" bit to be set to one for all Recv WQEs.
	 * In either case, we must add a single bit in the "reserved" field
	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
	 * workaround for a known Tavor errata that can cause Recv WQEs with
	 * zero in the NDA field to behave improperly.
	 *
	 * If notification suppression is not desired then we set
	 * the "E" bit in the ctrl field.
	 */

	next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
	if (!ns) {	/* notification needed - so set the "E" bit */
		ctrl = TAVOR_WQE_RCV_EVENT_MASK;
	}

	/* update the WQE */
	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

	if (prev_addr != NULL) {
		/*
		 * Calculate the "next" field of the descriptor.  This amounts
		 * to setting up the "next_wqe_addr", "dbd", and "nds" fields
		 * (see tavor_hw.h for more).
		 */
		next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

		/*
		 * Since this WQE is to be linked to the previous
		 * descriptor, update the previous WQE's "next" field
		 * while leaving this WQE's "ctrl" fields untouched.
		 */
		TAVOR_WQE_LINKFIRST(prev_addr, next);
	}
}

/*
 * dapli_tavor_wqe_srq_build()
 * Builds an SRQ recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
{
	tavor_hw_wqe_sgl_t *ds;
	ibt_wr_ds_t end_sgl;
	int i;
	int num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > srq->srq_wq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/*
	 * For SRQ, if the number of data segments is less than the maximum
	 * specified at alloc, then we have to fill in a special "key" entry in
	 * the sgl entry after the last valid one in this post request.  We do
	 * that here.
	 */
	if (num_ds < srq->srq_wq_sgl) {
		end_sgl.ds_va = (ib_vaddr_t)0;
		end_sgl.ds_len = (ib_msglen_t)0;
		end_sgl.ds_key = (ibt_lkey_t)1;
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_srq_linknext()
 * Links a srq recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_srq_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint64_t *prev_addr)
{
	uint64_t next;
	uint64_t ctrl = 0;

	/*
	 * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
	 * at a time).  If there is no next descriptor (i.e. if the current
	 * descriptor is the last WQE on the chain), then set the "next"
	 * field to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
	 * requires the "dbd" bit to be set to one for all Recv WQEs.
	 * In either case, we must add a single bit in the "reserved" field
	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
	 * workaround for a known Tavor errata that can cause Recv WQEs with
	 * zero in the NDA field to behave improperly.
	 *
	 * If notification suppression is not desired then we set
	 * the "E" bit in the ctrl field.
	 */

	next = TAVOR_RCV_WQE_NDA0_WA_MASK;
	if (!ns) {	/* notification needed - so set the "E" bit */
		ctrl = TAVOR_WQE_RCV_EVENT_MASK;
	}

	/* update the WQE */
	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

	if (prev_addr != NULL) {
		/*
		 * Calculate the "next" field of the descriptor.  This amounts
		 * to setting up the "next_wqe_addr", "dbd", and "nds" fields
		 * (see tavor_hw.h for more).
		 */
		next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

		/*
		 * Since this WQE is to be linked to the previous
		 * descriptor, update the previous WQE's "next" field
		 * while leaving this WQE's "ctrl" fields untouched.
		 */
		TAVOR_WQE_LINKFIRST(prev_addr, next);
	}
}

/*
 * dapli_tavor_cq_peek()
 * Peeks into a given CQ to check if there are any events that can be
 * polled.  It returns, through "num_cqe", the number of CQEs that can
 * be polled.
 */
static void
dapli_tavor_cq_peek(ib_cq_handle_t cq, int *num_cqe)
{
	tavor_hw_cqe_t *cqe;
	uint32_t imm_eth_pkey_cred;
	uint32_t cons_indx;
	uint32_t wrap_around_mask;
	uint32_t polled_cnt;
	uint_t doorbell_cnt;
	uint_t opcode;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Count entries in the CQ until we find an entry owned by
	 * the hardware.
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		opcode = TAVOR_CQE_OPCODE_GET(cqe);
		/* Error CQEs map to multiple work completions */
		if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
		    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
			imm_eth_pkey_cred =
			    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
			doorbell_cnt =
			    imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
			polled_cnt += (doorbell_cnt + 1);
		} else {
			polled_cnt++;
		}
		/* Increment the consumer index */
		cons_indx = (cons_indx + 1) & wrap_around_mask;

		/* Update the pointer to the next CQ entry */
		cqe = &cq->cq_addr[cons_indx];
	}

	*num_cqe = polled_cnt;
}

/*
 * dapli_tavor_cq_poll()
 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
 * array that is passed in.
 */
static DAT_RETURN
dapli_tavor_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
{
	tavor_hw_cqe_t *cqe;
	uint32_t cons_indx;
	uint32_t wrap_around_mask;
	uint32_t polled_cnt;
	uint32_t num_to_increment;
	DAT_RETURN dat_status;
	int status;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process each
	 * entry by calling dapli_tavor_cq_cqe_consume() and updating the
	 * CQ consumer index.  Note: We only update the consumer index if
	 * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE that corresponds to more
	 * than one completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		status = dapli_tavor_cq_cqe_consume(cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_addr[cons_indx];
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	dat_status = DAT_SUCCESS;
	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_size) -
			    cq->cq_consindx) - 1;
		}
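		/*
		 * Example: with cq_size 256, advancing cq_consindx from 250
		 * to a wrapped cons_indx of 2 consumes entries 250..255 and
		 * 0..1, so num_to_increment is ((2 + 256) - 250) - 1 = 7,
		 * i.e. eight entries minus one.
		 */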
		cq->cq_consindx = cons_indx;
		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_num, num_to_increment);
	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.
		 */
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}

	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	return (dat_status);
}

/*
 * dapli_tavor_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
 * that is passed in.  See above for more comments/details.
 */
static DAT_RETURN
dapli_tavor_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
{
	tavor_hw_cqe_t *cqe;
	uint32_t cons_indx;
	DAT_RETURN dat_status;
	int status;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * If the current entry is owned by software, process it by calling
	 * dapli_tavor_cq_cqe_consume() and update the CQ consumer index.
	 * Note: We only update the consumer index if
	 * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE that corresponds to more
	 * than one completion).
	 */
	if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		status = dapli_tavor_cq_cqe_consume(cq, cqe, wc_p);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);

			/* Increment the consumer index */
			cq->cq_consindx =
			    (cons_indx + 1) & (cq->cq_size - 1);
			dapli_tavor_cq_doorbell(cq->cq_iauar,
			    TAVOR_CQDB_INCR_CONSINDX,
			    cq->cq_num, 0);
		}
		dat_status = DAT_SUCCESS;
	} else {
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}
	return (dat_status);
}

/*
 * dapli_tavor_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
 */
static int
dapli_tavor_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
	uint_t flags;
	uint_t type;
	uint_t opcode;
	int status;

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call dapli_tavor_cq_errcqe_consume() and
	 * return whatever status it returns.  Otherwise, this is a
	 * successful completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = dapli_tavor_cq_errcqe_consume(cqhdl, cqe, wc);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
	    TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {

		/*
		 * Send CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 * case TAVOR_CQE_SND_RDMAWR_IMM:
		 * case TAVOR_CQE_SND_SEND_IMM:
		 * case TAVOR_CQE_SND_ATOMIC_CS:
		 * case TAVOR_CQE_SND_ATOMIC_FA:
		 */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/*
		 * Receive CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 *
		 * case TAVOR_CQE_RCV_RECV_IMM:
		 * case TAVOR_CQE_RCV_RECV_IMM2:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
		 */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;
		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;
	wc->wc_flags = flags;
	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * dapli_tavor_cq_errcqe_consume()
 */
static int
dapli_tavor_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
	dapls_tavor_wrid_entry_t wre;
	uint32_t next_wqeaddr;
	uint32_t imm_eth_pkey_cred;
	uint_t nextwqesize, dbd;
	uint_t doorbell_cnt, status;
	uint_t opcode = TAVOR_CQE_OPCODE_GET(cqe);

	dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
	    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
	    TAVOR_CQE_WQEADDRSZ_GET(cqe));

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
	    (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
	    TAVOR_COMPLETION_RECV, 1, &wre);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 * case TAVOR_CQE_INV_EEC_NUM_ERR:
	 * case TAVOR_CQE_INV_EEC_STATE_ERR:
	 * case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;
	}
	wc->wc_status = status;
	wc->wc_type = 0;
	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling"
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get doorbell info, WQE address, size for the next WQE
	 * from the "wre" (which was filled in above in the call to the
	 * tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 * Return status to indicate that doorbell and sync may be
		 * necessary.
		 */
		return (TAVOR_CQ_SYNC_AND_DB);

	} else {
		/*
		 * Recycle the CQE for use in the next PollCQ() call.
		 * Decrement the doorbell count, modify the error status,
		 * and update the WQE address and size (to point to the
		 * next WQE on the chain).  Put these updated entries back
		 * into the CQE.
		 * Despite the fact that we have updated the CQE, it is not
		 * necessary for us to attempt to sync this entry just yet
		 * as we have not changed the "hardware's view" of the
		 * entry (i.e. we have not modified the "owner" bit, which
		 * is all that the Tavor hardware really cares about).
		 */
		doorbell_cnt = doorbell_cnt - dbd;
		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cqe,
		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
		TAVOR_CQE_WQEADDRSZ_SET(cqe,
		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
		dapl_dbg_log(DAPL_DBG_TYPE_EVD,
		    "errcqe_consume: recycling cqe.eth=%x, wqe=%x\n",
		    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
		    TAVOR_CQE_WQEADDRSZ_GET(cqe));
		return (TAVOR_CQ_RECYCLE_ENTRY);
	}
}

/*
 * dapli_tavor_cq_notify()
 * This function is used for arming the CQ by ringing the CQ doorbell.
 */
static DAT_RETURN
dapli_tavor_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
{
	uint32_t cqnum;

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 */
	cqnum = cq->cq_num;
	if (flags == IB_NOTIFY_ON_NEXT_COMP) {
		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_CQ,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
		dapli_tavor_cq_doorbell(cq->cq_iauar,
		    TAVOR_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IB_NOTIFY_ON_NEXT_NCOMP) {
		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_NCQ,
		    cqnum, param);
	} else {
		return (DAT_INVALID_PARAMETER);
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_send()
 */
static DAT_RETURN
dapli_tavor_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
{
	tavor_sw_wqe_dbinfo_t dbinfo;
	dapls_tavor_wrid_list_hdr_t *wridlist;
	dapls_tavor_wrid_entry_t *wre_last;
	uint32_t desc;
	uint64_t *wqe_addr;
	uint32_t desc_sz;
	uint32_t wqeaddrsz, signaled_dbd = 0;
	uint32_t head, tail, next_tail, qsize_msk;
	int status;
	ib_qp_handle_t qp;

	if ((ep->qp_state == IBT_STATE_RESET) ||
	    (ep->qp_state == IBT_STATE_INIT) ||
	    (ep->qp_state == IBT_STATE_RTR)) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_send: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}

	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
	tail = qp->qp_sq_wqhdr->wq_tail;
	head = qp->qp_sq_wqhdr->wq_head;

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * then no more WQEs can be posted, return an error
	 */
	if (qp->qp_sq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue full" condition.
	 * If we detect that the current work request is going to fill the
	 * work queue, then we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_sq_wqhdr->wq_full = 1;
	}

	/*
	 * Get the user virtual address of the location where the next
	 * Send WQE should be built
	 */
	wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);

	/*
	 * Call tavor_wqe_send_build() to build the WQE at the given address.
	 * This routine uses the information in the ibt_send_wr_t and
	 * returns the size of the WQE when it returns.
	 */
	status = dapli_tavor_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (status);
	}

	/*
	 * Get the descriptor (io address) corresponding to the location
	 * where the Send WQE was built.
	 */
	desc = TAVOR_QP_SQ_DESC(qp, tail);

	dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
	    desc <= (qp->qp_sq_desc_addr +
	    qp->qp_sq_numwqe * qp->qp_sq_wqesz));

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddrsz" and "signaled_dbd" values to pass to
	 * dapli_tavor_wrid_add_entry()
	 */
	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);

	if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
		signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
	}

	dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
	    signaled_dbd);

	/*
	 * Now link the wqe to the old chain (if there was one)
	 */
	dapli_tavor_wqe_send_linknext(wr, wqe_addr, ns, desc, desc_sz,
	    qp->qp_sq_lastwqeaddr, &dbinfo);

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_sq_lastwqeaddr = wqe_addr;
	qp->qp_sq_wqhdr->wq_tail = next_tail;

	/* Ring the doorbell */
	dapli_tavor_qp_send_doorbell(qp->qp_iauar, desc, desc_sz,
	    qp->qp_num, dbinfo.db_fence, dbinfo.db_nopcode);

	dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_recv()
 */
static DAT_RETURN
dapli_tavor_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
{
	dapls_tavor_wrid_list_hdr_t *wridlist;
	dapls_tavor_wrid_entry_t *wre_last;
	ib_qp_handle_t qp;
	DAT_RETURN status;
	uint32_t desc;
	uint64_t *wqe_addr;
	uint32_t desc_sz;
	uint32_t wqeaddrsz;
	uint32_t head, tail, next_tail, qsize_msk;

	if (ep->qp_state == IBT_STATE_RESET) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_recv: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}
	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist = qp->qp_rq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
	tail = qp->qp_rq_wqhdr->wq_tail;
	head = qp->qp_rq_wqhdr->wq_head;

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * doorbell.
	 */

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * then no more WQEs can be posted.  So return an error.
	 */
	if (qp->qp_rq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue
	 * full" condition.  If we detect that the current
	 * work request is going to fill the work queue, then
	 * we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_rq_wqhdr->wq_full = 1;
	}

	/* Get the descriptor (IO Address) of the WQE to be built */
	desc = TAVOR_QP_RQ_DESC(qp, tail);
	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);

	/*
	 * Call tavor_wqe_recv_build() to build the WQE at the given
	 * address.  This routine uses the information in the
	 * ibt_recv_wr_t and returns the size of the WQE.
	 */
	status = dapli_tavor_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddrsz" and "signaled_dbd" values to pass to
	 * dapli_tavor_wrid_add_entry().
	 * Note: all Recv WQEs are essentially "signaled"
	 */
	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);
	dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
	    (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);

	/*
	 * Now link the chain to the old chain (if there was one)
	 * and ring the doorbell for the recv work queue.
	 */
	dapli_tavor_wqe_recv_linknext(wqe_addr, ns, desc, desc_sz,
	    qp->qp_rq_lastwqeaddr);

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_rq_lastwqeaddr = wqe_addr;
	qp->qp_rq_wqhdr->wq_tail = next_tail;

	/* Ring the doorbell */
	dapli_tavor_qp_recv_doorbell(qp->qp_iauar, desc, desc_sz,
	    qp->qp_num, 1);

	dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_srq()
 */
static DAT_RETURN
dapli_tavor_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
{
	ib_srq_handle_t srq;
	DAT_RETURN status;
	uint32_t desc;
	uint64_t *wqe_addr;
	uint64_t *last_wqe_addr;
	uint32_t head, next_head, qsize_msk;
	uint32_t wqe_index;


	srq = srqp->srq_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * doorbell.
	 */

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * i.e. there are no free entries, then no more WQEs can be posted.
	 * So return an error.
	 */
	if (srq->srq_wridlist->wl_freel_entries == 0) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/* Save away some initial SRQ state */
	qsize_msk = srq->srq_wridlist->wl_size - 1;
	head = srq->srq_wridlist->wl_freel_head;

	next_head = (head + 1) & qsize_msk;

	/* Get the descriptor (IO Address) of the WQE to be built */
	desc = srq->srq_wridlist->wl_free_list[head];

	wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
	    srq->srq_wq_wqesz);

	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);

	/*
	 * Call dapli_tavor_wqe_srq_build() to build the WQE at the given
	 * address.  This routine uses the information in the
	 * ibt_recv_wr_t and returns the size of the WQE.
	 */
	status = dapli_tavor_wqe_srq_build(srq, wr, wqe_addr);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (status);
	}

	/*
	 * Add a WRID entry to the WRID list.
	 */
	dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);

	if (srq->srq_wq_lastwqeindex == -1) {
		last_wqe_addr = NULL;
	} else {
		last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
		    srq->srq_wq_lastwqeindex);
	}
	/*
	 * Now link the chain to the old chain (if there was one)
	 * and ring the doorbell for the SRQ.
	 */
	dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);

	/* Update some of the state in the SRQ */
	srq->srq_wq_lastwqeindex = wqe_index;
	srq->srq_wridlist->wl_freel_head = next_head;
	srq->srq_wridlist->wl_freel_entries--;
	dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
	    srq->srq_wridlist->wl_size);

	/* Ring the doorbell - for SRQ nds = 0 */
	dapli_tavor_qp_recv_doorbell(srq->srq_iauar, desc, 0,
	    srq->srq_num, 1);

	dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);

	return (DAT_SUCCESS);
}
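
/*
 * Free-list bookkeeping example for the routine above (hypothetical
 * state): with wl_size 8 and wl_freel_head 5, posting one WQE consumes
 * wl_free_list[5], advances the head to (5 + 1) & 7 == 6, and decrements
 * wl_freel_entries; dapli_tavor_wrid_find_match_srq() later returns
 * entries to this free list as completions are processed.
 */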

/*
 * dapli_tavor_wrid_add_entry()
 */
extern void
dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *wq, uint64_t wrid,
    uint32_t wqeaddrsz, uint_t signaled_dbd)
{
	dapls_tavor_wrid_entry_t *wre_tmp;
	uint32_t head, tail, size;

	/*
	 * Find the entry in the container pointed to by the "tail" index.
	 * Add all of the relevant information to that entry, including WRID,
	 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
	 * and/or doorbelled.
	 */
	head = wq->wq_wrid_post->wl_head;
	tail = wq->wq_wrid_post->wl_tail;
	size = wq->wq_wrid_post->wl_size;
	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
	wre_tmp->wr_wrid = wrid;
	wre_tmp->wr_wqeaddrsz = wqeaddrsz;
	wre_tmp->wr_signaled_dbd = signaled_dbd;

	/*
	 * Update the "wrid_old_tail" pointer to point to the entry we just
	 * inserted into the queue.  By tracking this pointer (the pointer to
	 * the most recently inserted entry) it will be possible later in the
	 * PostSend() and PostRecv() code paths to find the entry that needs
	 * its "doorbelled" flag set (see the comments in
	 * dapli_tavor_post_recv() and/or dapli_tavor_post_send()).
	 */
	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;

	/* Update the tail index */
	tail = ((tail + 1) & (size - 1));
	wq->wq_wrid_post->wl_tail = tail;

	/*
	 * If the "tail" index has just wrapped over into the "head" index,
	 * then we have filled the container.  We use the "full" flag to
	 * indicate this condition and to distinguish it from the "empty"
	 * condition (where head and tail are also equal).
	 */
	if (head == tail) {
		wq->wq_wrid_post->wl_full = 1;
	}
}

/*
 * dapli_tavor_wrid_add_entry_srq()
 */
extern void
dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t srq, uint64_t wrid,
    uint32_t wqe_index)
{
	dapls_tavor_wrid_entry_t *wre;

	/* ASSERT on impossible wqe_index values */
	dapl_os_assert(wqe_index < srq->srq_wq_numwqe);

	/*
	 * Setup the WRE.
	 *
	 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
	 * And we set the WRE to be signaled_dbd so that on poll CQ we can
	 * find this information and associate the WRID to the WQE found on
	 * the CQE.
	 * Note: all Recv WQEs are essentially "signaled"
	 */
	wre = &srq->srq_wridlist->wl_wre[wqe_index];
	wre->wr_wrid = wrid;
	wre->wr_signaled_dbd = (uint32_t)TAVOR_WRID_ENTRY_SIGNALED;
}
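
/*
 * Note the contrast with dapli_tavor_wrid_add_entry() above: the QP work
 * queues track WRIDs in a circular list advanced by head/tail indices,
 * whereas the SRQ stores each WRID directly in the WRE slot named by
 * "wqe_index", since SRQ WQEs can complete and be recycled out of order.
 */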

/*
 * dapli_tavor_cq_srq_entries_flush()
 */
static void
dapli_tavor_cq_srq_entries_flush(ib_qp_handle_t qp)
{
	ib_cq_handle_t cq;
	dapls_tavor_workq_hdr_t *wqhdr;
	tavor_hw_cqe_t *cqe;
	tavor_hw_cqe_t *next_cqe;
	uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t new_indx, check_indx, indx;
	uint32_t num_to_increment;
	int cqe_qpnum, cqe_type;
	int outstanding_cqes, removed_cqes;
	int i;

	/* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	dapl_os_assert(wqhdr->wq_wrid_post != NULL);
	dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Loop through the CQ looking for entries owned by software.  If an
	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to know how many entries total we have on our CQ.  We use
	 * this value further down to know how many entries to loop through
	 * looking for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_addr[tail_cons_indx];
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know how many
	 * CQEs in total there can be.  Set the 'check_indx' and the
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'.
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_addr[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.  We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_num &&
		    cqe_type == TAVOR_COMPLETION_RECV) {
			/* Add back to SRQ free list */
			(void) dapli_tavor_wrid_find_match_srq(
			    wqhdr->wq_wrid_post, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_addr[new_indx];
				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				(void) dapl_os_memcpy(next_cqe, cqe,
				    sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}
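
	/*
	 * Compaction sketch: scanning backward from the newest CQE,
	 * entries belonging to this QP's SRQ are released to the SRQ free
	 * list, while all other entries are copied down toward the newest
	 * slots.  "new_indx" ends up just below the compacted survivors,
	 * and every slot from the old consumer index through "new_indx"
	 * is handed back to hardware below.
	 */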

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to the
		 * index specified by 'new_indx'.  This 'new_indx' will be used
		 * as the new consumer index after we mark all freed entries as
		 * having HW ownership.  We do that here.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_addr[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);
		}
	}

	/*
	 * Update consumer index to be the 'new_indx'.  This moves it past all
	 * removed entries.  Because 'new_indx' is pointing to the last
	 * previously valid SW owned entry, we add 1 to point the cons_indx to
	 * the first HW owned entry.
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_size) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;

		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_num, num_to_increment);
	}
}

/* ARGSUSED */
static void
dapli_tavor_qp_init(ib_qp_handle_t qp)
{
}

/* ARGSUSED */
static void
dapli_tavor_cq_init(ib_cq_handle_t cq)
{
}

/* ARGSUSED */
static void
dapli_tavor_srq_init(ib_srq_handle_t srq)
{
}

void
dapls_init_funcs_tavor(DAPL_HCA *hca_ptr)
{
	hca_ptr->post_send = dapli_tavor_post_send;
	hca_ptr->post_recv = dapli_tavor_post_recv;
	hca_ptr->post_srq = dapli_tavor_post_srq;
	hca_ptr->cq_peek = dapli_tavor_cq_peek;
	hca_ptr->cq_poll = dapli_tavor_cq_poll;
	hca_ptr->cq_poll_one = dapli_tavor_cq_poll_one;
	hca_ptr->cq_notify = dapli_tavor_cq_notify;
	hca_ptr->srq_flush = dapli_tavor_cq_srq_entries_flush;
	hca_ptr->qp_init = dapli_tavor_qp_init;
	hca_ptr->cq_init = dapli_tavor_cq_init;
	hca_ptr->srq_init = dapli_tavor_srq_init;
	hca_ptr->hermon_resize_cq = 0;
}