hermon_wr.c revision 9c865d645a5c60028aed172eea31ca36d81a2ff1
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* hermon_wr.c
* Hermon Work Request Processing Routines
*
* Implements all the routines necessary to provide the PostSend(),
* PostRecv() and PostSRQ() verbs. Also contains all the code
* necessary to implement the Hermon WRID tracking mechanism.
*/
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/avl.h>
#include <sys/ib/adapters/hermon/hermon.h>
static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
ibt_recv_wr_t *wr, uint64_t *desc);
static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
ibt_recv_wr_t *wr, uint64_t *desc);
static void hermon_wqe_sync(void *hdl, uint_t sync_from,
uint_t sync_to, uint_t sync_type, uint_t flag);
static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
uint_t send_or_recv);
static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
hermon_workq_avl_t *wqavl);
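/*
* Invalid (null) scatter entry appended to receive WQEs to terminate
* the scatter list, per the PRM (see hermon_wqe_recv_build() and
* hermon_wqe_srq_build() below).
*/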
static ibt_wr_ds_t null_sgl = { 0, 0x00000100, 0 };
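/*
* hermon_post_send_ud()
* Context: Can be called from interrupt or base context.
*/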
static int
hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
hermon_hw_snd_wqe_ud_t *ud;
hermon_workq_hdr_t *wq;
hermon_ahhdl_t ah;
ibt_ud_dest_t *dest;
uint64_t *desc;
uint32_t desc_sz;
uint32_t signaled_dbd, solicited;
uint32_t head, tail, next_tail, qsize_msk;
uint32_t hdrmwqes;
uint32_t nopcode, fence, immed_data = 0;
hermon_hw_wqe_sgl_t *ds, *old_ds;
ibt_wr_ds_t *sgl;
uint32_t nds, dnds;
int i, j, last_ds, num_ds, status;
uint32_t *wqe_start;
int sectperwqe;
uint_t posted_cnt = 0;
/* initialize the FMA retry loop */
hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
/* make sure we see any update of wq_head */
membar_consumer();
/* Save away some initial QP state */
wq = qp->qp_sq_wqhdr;
qsize_msk = wq->wq_mask;
hdrmwqes = qp->qp_sq_hdrmwqes; /* in WQEs */
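/*
* Note: sectperwqe here is the WQE size in 32-bit words; the headroom
* invalidation below stamps the first word of each 64-byte section
* after the first.
*/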
sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
tail = wq->wq_tail;
head = wq->wq_head;
status = DDI_SUCCESS;
post_next:
/*
* Check for "queue full" condition. If the queue
* is already full, then no more WQEs can be posted.
* So break out, ring a doorbell (if necessary) and
* return an error
*/
if (wq->wq_full != 0) {
status = IBT_QP_FULL;
goto done;
}
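/*
* Compute the next "tail index" and check for "queue full" (including
* the headroom WQEs).  If this work request will fill the work queue,
* mark the condition; an exact equality test is enough since we
* advance one WQE at a time.
*/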
next_tail = (tail + 1) & qsize_msk;
if (((tail + hdrmwqes) & qsize_msk) == head) {
wq->wq_full = 1;
}
desc = HERMON_QP_SQ_ENTRY(qp, tail);
ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
sizeof (hermon_hw_snd_wqe_ud_t));
nds = wr->wr_nds;
sgl = wr->wr_sgl;
num_ds = 0;
/* need to know the count of destination nds for backward loop */
for (dnds = 0, i = 0; i < nds; i++) {
if (sgl[i].ds_len != 0)
dnds++;
}
/*
* Build a Send or Send_LSO WQE
*/
if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
int total_len;
nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
if (wr->wr.ud_lso.lso_hdr_sz > 60) {
nopcode |= (1 << 6); /* ReRead bit must be set */
}
dest = wr->wr.ud_lso.lso_ud_dest;
ah = (hermon_ahhdl_t)dest->ud_ah;
if (ah == NULL) {
status = IBT_AH_HDL_INVALID;
goto done;
}
HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
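/*
* total_len: the inline LSO header plus its leading 4-byte segment
* word, rounded up to a 16-byte boundary.
*/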
total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
if ((uintptr_t)ds + total_len + (nds * 16) >
(uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
status = IBT_QP_SGL_LEN_INVALID;
goto done;
}
old_ds = ds;
bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
wr->wr.ud_lso.lso_hdr_sz);
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
i = 0;
} else if (wr->wr_opcode == IBT_WRC_SEND) {
if (wr->wr_flags & IBT_WR_SEND_IMMED) {
nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
immed_data = wr->wr.ud.udwr_immed;
} else {
nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
}
dest = wr->wr.ud.udwr_dest;
ah = (hermon_ahhdl_t)dest->ud_ah;
if (ah == NULL) {
status = IBT_AH_HDL_INVALID;
goto done;
}
HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
i = 0;
} else {
status = IBT_QP_OP_TYPE_INVALID;
goto done;
}
if (nds > qp->qp_sq_sgl) {
status = IBT_QP_SGL_LEN_INVALID;
goto done;
}
for (last_ds = num_ds, j = i; j < nds; j++) {
if (sgl[j].ds_len != 0)
last_ds++; /* real last ds of wqe to fill */
}
desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
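/*
* Fill in the Data Segments in reverse order (last non-zero SGL entry
* first), skipping any zero-length entries; a zero "byte_cnt" would be
* interpreted by the hardware as a 2GB transfer.
*/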
for (j = nds; --j >= i; ) {
if (sgl[j].ds_len == 0) {
continue;
}
/*
* Fill in the Data Segment(s) for the current WQE, using the
* information contained in the scatter-gather list of the
* work request.
*/
last_ds--;
HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
}
membar_producer();
if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
wr->wr.ud_lso.lso_hdr_sz);
}
fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
(wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
solicited, signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
wq->wq_wrid[tail] = wr->wr_id;
tail = next_tail;
/* Update some of the state in the QP */
wq->wq_tail = tail;
membar_producer();
/* Now set the ownership bit and opcode (first dword). */
HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
posted_cnt++;
if (--num_wr > 0) {
/* do the invalidate of the headroom */
wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
(tail + hdrmwqes) & qsize_msk);
for (i = 16; i < sectperwqe; i += 16) {
wqe_start[i] = 0xFFFFFFFF;
}
wr++;
goto post_next;
}
done:
if (posted_cnt != 0) {
ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
membar_producer();
/* the FMA retry loop starts for the Hermon doorbell register. */
hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
fm_status, fm_test_num);
HERMON_UAR_DOORBELL(state, uarhdl,
(uint64_t *)(void *)&state->hs_uar->send,
(uint64_t)qp->qp_ring);
/* the FMA retry loop ends. */
hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
fm_status, fm_test_num);
/* do the invalidate of the headroom */
wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
(tail + hdrmwqes) & qsize_msk);
for (i = 16; i < sectperwqe; i += 16) {
wqe_start[i] = 0xFFFFFFFF;
}
}
if (num_posted != NULL)
*num_posted = posted_cnt;
mutex_exit(&qp->qp_sq_lock);
return (status);
pio_error:
mutex_exit(&qp->qp_sq_lock);
hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
return (ibc_get_ci_failure(0));
}
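/*
* hermon_post_send_rc()
* Context: Can be called from interrupt or base context.
*/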
static int
hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
uint64_t *desc;
hermon_workq_hdr_t *wq;
uint32_t desc_sz;
uint32_t signaled_dbd, solicited;
uint32_t head, tail, next_tail, qsize_msk;
uint32_t hdrmwqes;
int status;
uint32_t nopcode, fence, immed_data = 0;
hermon_hw_snd_wqe_remaddr_t *rc;
hermon_hw_snd_wqe_atomic_t *at;
hermon_hw_snd_wqe_bind_t *bn;
hermon_hw_wqe_sgl_t *ds;
ibt_wr_ds_t *sgl;
uint32_t nds;
int i, last_ds, num_ds;
uint32_t *wqe_start;
int sectperwqe;
uint_t posted_cnt = 0;
/* initialize the FMA retry loop */
hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
/* make sure we see any update of wq_head */
membar_consumer();
/* Save away some initial QP state */
wq = qp->qp_sq_wqhdr;
qsize_msk = wq->wq_mask;
hdrmwqes = qp->qp_sq_hdrmwqes; /* in WQEs */
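/* WQE size in 32-bit words (see the note in hermon_post_send_ud()) */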
sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
tail = wq->wq_tail;
head = wq->wq_head;
status = DDI_SUCCESS;
post_next:
/*
* Check for "queue full" condition. If the queue
* is already full, then no more WQEs can be posted.
* So break out, ring a doorbell (if necessary) and
* return an error
*/
if (wq->wq_full != 0) {
status = IBT_QP_FULL;
goto done;
}
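/*
* Compute the next "tail index" and check for "queue full", including
* the headroom WQEs (see hermon_post_send_ud() for details).
*/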
next_tail = (tail + 1) & qsize_msk;
if (((tail + hdrmwqes) & qsize_msk) == head) {
wq->wq_full = 1;
}
desc = HERMON_QP_SQ_ENTRY(qp, tail);
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
nds = wr->wr_nds;
sgl = wr->wr_sgl;
num_ds = 0;
/*
* Validate the operation type. For RC requests, we allow
* "Send", "RDMA Read", "RDMA Write", various "Atomic"
* operations, and memory window "Bind"
*/
switch (wr->wr_opcode) {
default:
status = IBT_QP_OP_TYPE_INVALID;
goto done;
case IBT_WRC_SEND:
if (wr->wr_flags & IBT_WR_SEND_IMMED) {
nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
immed_data = wr->wr.rc.rcwr.send_immed;
} else {
nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
}
break;
/*
* If this is an RDMA Read or RDMA Write request, then fill
* in the "Remote Address" header fields.
*/
case IBT_WRC_RDMAW:
if (wr->wr_flags & IBT_WR_SEND_IMMED) {
nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
} else {
nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
}
/* FALLTHROUGH */
case IBT_WRC_RDMAR:
if (wr->wr_opcode == IBT_WRC_RDMAR)
nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
/*
* Build the Remote Address Segment for the WQE, using
* the information from the RC work request.
*/
HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
/* Update "ds" for filling in Data Segments (below) */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
sizeof (hermon_hw_snd_wqe_remaddr_t));
break;
/*
* If this is one of the Atomic type operations (i.e.
* Compare-Swap or Fetch-Add), then fill in both the "Remote
* Address" header fields and the "Atomic" header fields.
*/
case IBT_WRC_CSWAP:
nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
/* FALLTHROUGH */
case IBT_WRC_FADD:
if (wr->wr_opcode == IBT_WRC_FADD)
nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
sizeof (hermon_hw_snd_wqe_remaddr_t));
/*
* Build the Remote Address and Atomic Segments for
* the WQE, using the information from the RC Atomic
* work request.
*/
HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
/* Update "ds" for filling in Data Segments (below) */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
sizeof (hermon_hw_snd_wqe_atomic_t));
/*
* Update "nds" and "sgl" because Atomic requests have
* only a single Data Segment.
*/
nds = 1;
sgl = wr->wr_sgl;
break;
/*
* If this is memory window Bind operation, then we call the
* hermon_wr_bind_check() routine to validate the request and
* to generate the updated RKey. If this is successful, then
* we fill in the WQE's "Bind" header fields.
*/
case IBT_WRC_BIND:
nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
status = hermon_wr_bind_check(state, wr);
if (status != DDI_SUCCESS)
goto done;
bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
/*
* Build the Bind Memory Window Segments for the WQE,
* using the information from the RC Bind memory
* window work request.
*/
HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
/*
* Update the "ds" pointer. Even though the "bind"
* operation requires no SGLs, this is necessary to
* facilitate the correct descriptor size calculations
* (below).
*/
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
sizeof (hermon_hw_snd_wqe_bind_t));
nds = 0;
}
/*
* Now fill in the Data Segments (SGL) for the Send WQE based
* on the values set up above (i.e. "sgl", "nds", and the "ds"
* pointer).  Start by checking for a valid number of SGL entries.
*/
if (nds > qp->qp_sq_sgl) {
status = IBT_QP_SGL_LEN_INVALID;
goto done;
}
for (last_ds = num_ds, i = 0; i < nds; i++) {
if (sgl[i].ds_len != 0)
last_ds++; /* real last ds of wqe to fill */
}
desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
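/*
* Fill in the Data Segments in reverse order, skipping zero-length
* SGL entries (a zero "byte_cnt" means a 2GB transfer to the HW).
*/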
for (i = nds; --i >= 0; ) {
if (sgl[i].ds_len == 0) {
continue;
}
/*
* Fill in the Data Segment(s) for the current WQE, using the
* information contained in the scatter-gather list of the
* work request.
*/
last_ds--;
HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
}
fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
(wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
wq->wq_wrid[tail] = wr->wr_id;
tail = next_tail;
/* Update some of the state in the QP */
wq->wq_tail = tail;
membar_producer();
/* Now set the ownership bit and opcode (first dword). */
HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
posted_cnt++;
if (--num_wr > 0) {
/* do the invalidate of the headroom */
wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
(tail + hdrmwqes) & qsize_msk);
for (i = 16; i < sectperwqe; i += 16) {
wqe_start[i] = 0xFFFFFFFF;
}
wr++;
goto post_next;
}
done:
if (posted_cnt != 0) {
ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
membar_producer();
/* the FMA retry loop starts for the Hermon doorbell register. */
hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
fm_status, fm_test_num);
/* Ring the doorbell */
HERMON_UAR_DOORBELL(state, uarhdl,
(uint64_t *)(void *)&state->hs_uar->send,
(uint64_t)qp->qp_ring);
/* the FMA retry loop ends. */
hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
fm_status, fm_test_num);
/* do the invalidate of the headroom */
wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
(tail + hdrmwqes) & qsize_msk);
for (i = 16; i < sectperwqe; i += 16) {
wqe_start[i] = 0xFFFFFFFF;
}
}
/*
* Update the "num_posted" return value (if necessary).
* Then drop the locks and return success.
*/
if (num_posted != NULL) {
*num_posted = posted_cnt;
}
mutex_exit(&qp->qp_sq_lock);
return (status);
pio_error:
mutex_exit(&qp->qp_sq_lock);
hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
return (ibc_get_ci_failure(0));
}
/*
* hermon_post_send()
* Context: Can be called from interrupt or base context.
*/
int
hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
ibt_send_wr_t *curr_wr;
hermon_workq_hdr_t *wq;
hermon_ahhdl_t ah;
uint64_t *desc, *prev;
uint32_t desc_sz;
uint32_t signaled_dbd, solicited;
uint32_t head, tail, next_tail, qsize_msk;
uint32_t sync_from, sync_to;
uint32_t hdrmwqes;
uint_t currindx, wrindx, numremain;
uint_t chainlen;
uint_t posted_cnt, maxstat;
uint_t total_posted;
int status;
uint32_t nopcode, fence, immed_data = 0;
uint32_t prev_nopcode;
/* initialize the FMA retry loop */
hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
/*
* Check for user-mappable QP memory. Note: We do not allow kernel
* clients to post to QP memory that is accessible directly by the
* user. If the QP memory is user accessible, then return an error.
*/
if (qp->qp_is_umap) {
return (IBT_QP_HDL_INVALID);
}
mutex_enter(&qp->qp_lock);
/*
* Check QP state. Can not post Send requests from the "Reset",
* "Init", or "RTR" states
*/
if ((qp->qp_state == HERMON_QP_RESET) ||
(qp->qp_state == HERMON_QP_INIT) ||
(qp->qp_state == HERMON_QP_RTR)) {
mutex_exit(&qp->qp_lock);
return (IBT_QP_STATE_INVALID);
}
mutex_exit(&qp->qp_lock);
mutex_enter(&qp->qp_sq_lock);
if (qp->qp_is_special)
goto post_many;
/* Use these optimized functions most of the time */
if (qp->qp_serv_type == HERMON_QP_UD)
return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
if (qp->qp_serv_type == HERMON_QP_RC)
return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
if (qp->qp_serv_type == HERMON_QP_UC)
goto post_many;
mutex_exit(&qp->qp_sq_lock);
return (IBT_QP_SRV_TYPE_INVALID);
post_many:
/* general loop for non-optimized posting */
/* make sure we see any update of wq_head */
membar_consumer();
/* Save away some initial QP state */
wq = qp->qp_sq_wqhdr;
qsize_msk = wq->wq_mask;
tail = wq->wq_tail;
head = wq->wq_head;
hdrmwqes = qp->qp_sq_hdrmwqes; /* in WQEs */
/* Initialize posted_cnt */
posted_cnt = 0;
total_posted = 0;
/*
* For each ibt_send_wr_t in the wr[] list passed in, parse the
* request and build a Send WQE.  NOTE: Because we are potentially
* building a chain of WQEs to post, we want to build them all first,
* and set the valid (HW Ownership) bit on all but the first.
* However, we do not want to validate the first one until the
* entire chain of WQEs has been built.  Then, as a final step, we
* set the valid bit in the first, flush if needed, and ring the
* appropriate doorbell.  NOTE: the doorbell ring may NOT be needed
* if the HCA is already processing, but the doorbell ring will be
* done regardless.  NOTE ALSO: It is possible for more Work Requests
* to be posted than the HW will support at one shot.  If this
* happens, we need to be able to post and ring several chains here
* until the entire request is complete.  NOTE ALSO: the term "chain"
* is used to differentiate it from the Work Request List passed in,
* and because that's the terminology from previous generations of
* HCA - but the WQEs are not, in fact, chained together for Hermon.
*/
wrindx = 0;
numremain = num_wr;
status = DDI_SUCCESS;
while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
/*
* For the first WQE on a new chain we need "prev" to point
* to the current descriptor.
*/
prev = HERMON_QP_SQ_ENTRY(qp, tail);
/*
* Unlike Tavor & Arbel, "tail" maintains the index of the next
* (i.e. this) WQE to be posted.  Since there is no backward linking
* in Hermon, we can always just look ahead.
*/
/*
* Before we begin, save the current "tail index" for later
* DMA sync
*/
/* NOTE: don't need to go back one like arbel/tavor */
sync_from = tail;
/*
* Break the request up into lists that are less than or
* equal to the maximum number of WQEs that can be posted
* per doorbell ring - 256 currently
*/
chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
HERMON_QP_MAXDESC_PER_DB : numremain;
numremain -= chainlen;
for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
/*
* Check for "queue full" condition. If the queue
* is already full, then no more WQEs can be posted.
* So break out, ring a doorbell (if necessary) and
* return an error
*/
if (wq->wq_full != 0) {
status = IBT_QP_FULL;
break;
}
/*
* Increment the "tail index". Check for "queue
* full" condition incl. headroom. If we detect that
* the current work request is going to fill the work
* queue, then we mark this condition and continue.
* Don't need >=, because going one-by-one we have to
* hit it exactly sooner or later
*/
next_tail = (tail + 1) & qsize_msk;
if (((tail + hdrmwqes) & qsize_msk) == head) {
wq->wq_full = 1;
}
/*
* Get the address of the location where the next
* Send WQE should be built
*/
desc = HERMON_QP_SQ_ENTRY(qp, tail);
/*
* Call hermon_wqe_send_build() to build the WQE
* at the given address. This routine uses the
* information in the ibt_send_wr_t list (wr[]) and
* returns the size of the WQE when it returns.
*/
status = hermon_wqe_send_build(state, qp,
&wr[wrindx], desc, &desc_sz);
if (status != DDI_SUCCESS) {
break;
}
/*
* Now, build the Ctrl Segment based on
* what was just done
*/
curr_wr = &wr[wrindx];
switch (curr_wr->wr_opcode) {
case IBT_WRC_RDMAW:
if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
nopcode =
HERMON_WQE_SEND_NOPCODE_RDMAWI;
immed_data =
hermon_wr_get_immediate(curr_wr);
} else {
nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
}
break;
case IBT_WRC_SEND:
if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
immed_data =
hermon_wr_get_immediate(curr_wr);
} else {
nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
}
break;
case IBT_WRC_SEND_LSO:
nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
break;
case IBT_WRC_RDMAR:
nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
break;
case IBT_WRC_CSWAP:
nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
break;
case IBT_WRC_FADD:
nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
break;
case IBT_WRC_BIND:
nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
break;
}
fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
/*
* now, build up the control segment, leaving the
* owner bit as it is
*/
if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
(curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
signaled_dbd = 1;
} else {
signaled_dbd = 0;
}
if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
solicited = 1;
else
solicited = 0;
if (qp->qp_is_special) {
ah = (hermon_ahhdl_t)
curr_wr->wr.ud.udwr_dest->ud_ah;
mutex_enter(&ah->ah_lock);
maxstat = ah->ah_udav->max_stat_rate;
HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
signaled_dbd, maxstat, ah->ah_udav->rlid,
qp, ah->ah_udav->sl);
mutex_exit(&ah->ah_lock);
} else {
HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
fence, immed_data, solicited,
signaled_dbd, curr_wr->wr_flags &
IBT_WR_SEND_CKSUM, qp);
}
wq->wq_wrid[tail] = curr_wr->wr_id;
/*
* If this is not the first descriptor on the current
* chain, then set the ownership bit.
*/
if (currindx != 0) { /* not the first */
membar_producer();
HERMON_SET_SEND_WQE_OWNER(qp,
(uint32_t *)desc, nopcode);
} else
prev_nopcode = nopcode;
/*
* Update the current "tail index" and increment
* "posted_cnt"
*/
tail = next_tail;
posted_cnt++;
}
/*
* If we reach here and there are one or more WQEs which have
* been successfully built as a chain, we have to finish up
* and prepare them for writing to the HW
* The steps are:
* 1. do the headroom fixup
* 2. add in the size of the headroom for the sync
* 3. write the owner bit for the first WQE
* 4. sync them
* 5. fix up the structures
* 6. hit the doorbell in UAR
*/
if (posted_cnt != 0) {
ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
/*
* Save away updated "tail index" for the DMA sync
* including the headroom that will be needed
*/
sync_to = (tail + hdrmwqes) & qsize_msk;
/* do the invalidate of the headroom */
hermon_wqe_headroom(tail, qp);
/* Do a DMA sync for current send WQE(s) */
hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_SEND,
DDI_DMA_SYNC_FORDEV);
/* Update some of the state in the QP */
wq->wq_tail = tail;
total_posted += posted_cnt;
posted_cnt = 0;
membar_producer();
/*
* Now set the ownership bit of the first
* one in the chain
*/
HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
prev_nopcode);
/* the FMA retry loop starts for the Hermon doorbell. */
hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
fm_status, fm_test);
HERMON_UAR_DOORBELL(state, uarhdl,
(uint64_t *)(void *)&state->hs_uar->send,
(uint64_t)qp->qp_ring);
/* the FMA retry loop ends. */
hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
fm_status, fm_test);
}
}
/*
* Update the "num_posted" return value (if necessary).
* Then drop the locks and return success.
*/
if (num_posted != NULL) {
*num_posted = total_posted;
}
mutex_exit(&qp->qp_sq_lock);
return (status);
pio_error:
mutex_exit(&qp->qp_sq_lock);
hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
return (ibc_get_ci_failure(0));
}
/*
* hermon_post_recv()
* Context: Can be called from interrupt or base context.
*/
int
hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
uint64_t *desc;
hermon_workq_hdr_t *wq;
uint32_t head, tail, next_tail, qsize_msk;
uint32_t sync_from, sync_to;
uint_t wrindx;
uint_t posted_cnt;
int status;
/*
* Check for user-mappable QP memory. Note: We do not allow kernel
* clients to post to QP memory that is accessible directly by the
* user. If the QP memory is user accessible, then return an error.
*/
if (qp->qp_is_umap) {
return (IBT_QP_HDL_INVALID);
}
/* Initialize posted_cnt */
posted_cnt = 0;
mutex_enter(&qp->qp_lock);
/*
* Check if QP is associated with an SRQ
*/
if (qp->qp_srq_en == HERMON_QP_SRQ_ENABLED) {
mutex_exit(&qp->qp_lock);
return (IBT_SRQ_IN_USE);
}
/*
* Check QP state. Can not post Recv requests from the "Reset" state
*/
if (qp->qp_state == HERMON_QP_RESET) {
mutex_exit(&qp->qp_lock);
return (IBT_QP_STATE_INVALID);
}
/* Check that work request transport type is valid */
if ((qp->qp_serv_type != HERMON_QP_UD) &&
(qp->qp_serv_type != HERMON_QP_RC) &&
(qp->qp_serv_type != HERMON_QP_UC)) {
mutex_exit(&qp->qp_lock);
return (IBT_QP_SRV_TYPE_INVALID);
}
mutex_exit(&qp->qp_lock);
mutex_enter(&qp->qp_rq_lock);
/*
* A membar_consumer() (to see any update of the WRID list) is not
* needed here because the mutex_enter() above has the same effect.
*/
/* Save away some initial QP state */
wq = qp->qp_rq_wqhdr;
qsize_msk = wq->wq_mask;
tail = wq->wq_tail;
head = wq->wq_head;
wrindx = 0;
status = DDI_SUCCESS;
/*
* Before we begin, save the current "tail index" for later
* DMA sync
*/
sync_from = tail;
for (wrindx = 0; wrindx < num_wr; wrindx++) {
if (wq->wq_full != 0) {
status = IBT_QP_FULL;
break;
}
next_tail = (tail + 1) & qsize_msk;
if (next_tail == head) {
wq->wq_full = 1;
}
desc = HERMON_QP_RQ_ENTRY(qp, tail);
status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
if (status != DDI_SUCCESS) {
break;
}
wq->wq_wrid[tail] = wr[wrindx].wr_id;
qp->qp_rq_wqecntr++;
tail = next_tail;
posted_cnt++;
}
if (posted_cnt != 0) {
/* Save away updated "tail index" for the DMA sync */
sync_to = tail;
hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_RECV,
DDI_DMA_SYNC_FORDEV);
wq->wq_tail = tail;
membar_producer(); /* ensure wrids are visible */
/* Update the doorbell record w/ wqecntr */
HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
qp->qp_rq_wqecntr & 0xFFFF);
}
if (num_posted != NULL) {
*num_posted = posted_cnt;
}
mutex_exit(&qp->qp_rq_lock);
return (status);
}
/*
* hermon_post_srq()
* Context: Can be called from interrupt or base context.
*/
int
hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
uint64_t *desc;
hermon_workq_hdr_t *wq;
uint_t indx, wrindx;
uint_t posted_cnt;
int status;
mutex_enter(&srq->srq_lock);
/*
* Check for user-mappable SRQ memory. Note:  We do not allow kernel
* clients to post to SRQ memory that is accessible directly by the
* user.  If the SRQ memory is user accessible, then return an error.
*/
if (srq->srq_is_umap) {
mutex_exit(&srq->srq_lock);
return (IBT_SRQ_HDL_INVALID);
}
/*
* Check SRQ state. Can not post Recv requests when SRQ is in error
*/
if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
mutex_exit(&srq->srq_lock);
return (IBT_QP_STATE_INVALID);
}
status = DDI_SUCCESS;
posted_cnt = 0;
wq = srq->srq_wq_wqhdr;
indx = wq->wq_head;
for (wrindx = 0; wrindx < num_wr; wrindx++) {
if (indx == wq->wq_tail) {
status = IBT_QP_FULL;
break;
}
desc = HERMON_SRQ_WQE_ADDR(srq, indx);
wq->wq_wrid[indx] = wr[wrindx].wr_id;
status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
if (status != DDI_SUCCESS) {
break;
}
hermon_wqe_sync(srq, indx, indx + 1,
HERMON_WR_SRQ, DDI_DMA_SYNC_FORDEV);
posted_cnt++;
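/*
* The SRQ WQEs are kept on a free list; the "next" field of this
* descriptor holds the index of the next free WQE (see also
* hermon_wrid_get_entry(), which returns WQEs to this list).
*/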
indx = htons(((uint16_t *)desc)[1]);
wq->wq_head = indx;
}
if (posted_cnt != 0) {
srq->srq_wq_wqecntr += posted_cnt;
membar_producer(); /* ensure wrids are visible */
/* Ring the doorbell w/ wqecntr */
HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
srq->srq_wq_wqecntr & 0xFFFF);
}
if (num_posted != NULL) {
*num_posted = posted_cnt;
}
mutex_exit(&srq->srq_lock);
return (status);
}
/*
* hermon_wqe_send_build()
* Context: Can be called from interrupt or base context.
*/
static int
hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
hermon_hw_snd_wqe_ud_t *ud;
hermon_hw_snd_wqe_remaddr_t *rc;
hermon_hw_snd_wqe_atomic_t *at;
hermon_hw_snd_wqe_remaddr_t *uc;
hermon_hw_snd_wqe_bind_t *bn;
hermon_hw_wqe_sgl_t *ds, *old_ds;
ibt_ud_dest_t *dest;
ibt_wr_ds_t *sgl;
hermon_ahhdl_t ah;
uint32_t nds;
int i, j, last_ds, num_ds, status;
int tmpsize;
ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
/* Initialize the information for the Data Segments */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
nds = wr->wr_nds;
sgl = wr->wr_sgl;
num_ds = 0;
i = 0;
/*
* Building a Send WQE depends first and foremost on the transport
* type of the Work Request (i.e. UD, RC, or UC)
*/
switch (wr->wr_trans) {
case IBT_UD_SRV:
/* Ensure that work request transport type matches QP type */
if (qp->qp_serv_type != HERMON_QP_UD) {
return (IBT_QP_SRV_TYPE_INVALID);
}
/*
* Validate the operation type. For UD requests, only the
* "Send" and "Send LSO" operations are valid.
*/
if (wr->wr_opcode != IBT_WRC_SEND &&
wr->wr_opcode != IBT_WRC_SEND_LSO) {
return (IBT_QP_OP_TYPE_INVALID);
}
/*
* If this is a Special QP (QP0 or QP1), then we need to
* build MLX WQEs instead. So jump to hermon_wqe_mlx_build()
* and return whatever status it returns
*/
if (qp->qp_is_special) {
if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
return (IBT_QP_OP_TYPE_INVALID);
}
status = hermon_wqe_mlx_build(state, qp,
wr, desc, size);
return (status);
}
/*
* Otherwise, if this is a normal UD Send request, then fill
* all the fields in the Hermon UD header for the WQE. Note:
* to do this we'll need to extract some information from the
* Address Handle passed with the work request.
*/
ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
if (wr->wr_opcode == IBT_WRC_SEND) {
dest = wr->wr.ud.udwr_dest;
} else {
dest = wr->wr.ud_lso.lso_ud_dest;
}
ah = (hermon_ahhdl_t)dest->ud_ah;
if (ah == NULL) {
return (IBT_AH_HDL_INVALID);
}
/*
* Build the Unreliable Datagram Segment for the WQE, using
* the information from the address handle and the work
* request.
*/
/* mutex_enter(&ah->ah_lock); */
if (wr->wr_opcode == IBT_WRC_SEND) {
HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
} else { /* IBT_WRC_SEND_LSO */
HERMON_WQE_BUILD_UD(qp, ud, ah,
wr->wr.ud_lso.lso_ud_dest);
}
/* mutex_exit(&ah->ah_lock); */
/* Update "ds" for filling in Data Segments (below) */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
sizeof (hermon_hw_snd_wqe_ud_t));
if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
int total_len;
total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
if ((uintptr_t)ds + total_len + (nds * 16) >
(uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
return (IBT_QP_SGL_LEN_INVALID);
bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
wr->wr.ud_lso.lso_hdr_sz);
old_ds = ds;
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
for (; i < nds; i++) {
if (sgl[i].ds_len == 0)
continue;
HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
&sgl[i]);
num_ds++;
i++;
break;
}
membar_producer();
HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
wr->wr.ud_lso.lso_hdr_sz);
}
break;
case IBT_RC_SRV:
/* Ensure that work request transport type matches QP type */
if (qp->qp_serv_type != HERMON_QP_RC) {
return (IBT_QP_SRV_TYPE_INVALID);
}
/*
* Validate the operation type. For RC requests, we allow
* "Send", "RDMA Read", "RDMA Write", various "Atomic"
* operations, and memory window "Bind"
*/
if ((wr->wr_opcode != IBT_WRC_SEND) &&
(wr->wr_opcode != IBT_WRC_RDMAR) &&
(wr->wr_opcode != IBT_WRC_RDMAW) &&
(wr->wr_opcode != IBT_WRC_CSWAP) &&
(wr->wr_opcode != IBT_WRC_FADD) &&
(wr->wr_opcode != IBT_WRC_BIND)) {
return (IBT_QP_OP_TYPE_INVALID);
}
/*
* If this is a Send request, then all we need to do is break
* out here and begin the Data Segment processing below
*/
if (wr->wr_opcode == IBT_WRC_SEND) {
break;
}
/*
* If this is an RDMA Read or RDMA Write request, then fill
* in the "Remote Address" header fields.
*/
if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
(wr->wr_opcode == IBT_WRC_RDMAW)) {
rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
/*
* Build the Remote Address Segment for the WQE, using
* the information from the RC work request.
*/
HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
/* Update "ds" for filling in Data Segments (below) */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
sizeof (hermon_hw_snd_wqe_remaddr_t));
break;
}
/*
* If this is one of the Atomic type operations (i.e.
* Compare-Swap or Fetch-Add), then fill in both the "Remote
* Address" header fields and the "Atomic" header fields.
*/
if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
(wr->wr_opcode == IBT_WRC_FADD)) {
rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
sizeof (hermon_hw_snd_wqe_remaddr_t));
/*
* Build the Remote Address and Atomic Segments for
* the WQE, using the information from the RC Atomic
* work request.
*/
HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
/* Update "ds" for filling in Data Segments (below) */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
sizeof (hermon_hw_snd_wqe_atomic_t));
/*
* Update "nds" and "sgl" because Atomic requests have
* only a single Data Segment (and they are encoded
* somewhat differently in the work request).
*/
nds = 1;
sgl = wr->wr_sgl;
break;
}
/*
* If this is memory window Bind operation, then we call the
* hermon_wr_bind_check() routine to validate the request and
* to generate the updated RKey. If this is successful, then
* we fill in the WQE's "Bind" header fields.
*/
if (wr->wr_opcode == IBT_WRC_BIND) {
status = hermon_wr_bind_check(state, wr);
if (status != DDI_SUCCESS) {
return (status);
}
bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
/*
* Build the Bind Memory Window Segments for the WQE,
* using the information from the RC Bind memory
* window work request.
*/
HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
/*
* Update the "ds" pointer. Even though the "bind"
* operation requires no SGLs, this is necessary to
* facilitate the correct descriptor size calculations
* (below).
*/
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
sizeof (hermon_hw_snd_wqe_bind_t));
nds = 0;
}
break;
case IBT_UC_SRV:
/* Ensure that work request transport type matches QP type */
if (qp->qp_serv_type != HERMON_QP_UC) {
return (IBT_QP_SRV_TYPE_INVALID);
}
/*
* Validate the operation type. For UC requests, we only
* allow "Send", "RDMA Write", and memory window "Bind".
* Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
* operations
*/
if ((wr->wr_opcode != IBT_WRC_SEND) &&
(wr->wr_opcode != IBT_WRC_RDMAW) &&
(wr->wr_opcode != IBT_WRC_BIND)) {
return (IBT_QP_OP_TYPE_INVALID);
}
/*
* If this is a Send request, then all we need to do is break
* out here and begin the Data Segment processing below
*/
if (wr->wr_opcode == IBT_WRC_SEND) {
break;
}
/*
* If this is an RDMA Write request, then fill in the "Remote
* Address" header fields.
*/
if (wr->wr_opcode == IBT_WRC_RDMAW) {
uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
/*
* Build the Remote Address Segment for the WQE, using
* the information from the UC work request.
*/
HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
/* Update "ds" for filling in Data Segments (below) */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
sizeof (hermon_hw_snd_wqe_remaddr_t));
break;
}
/*
* If this is memory window Bind operation, then we call the
* hermon_wr_bind_check() routine to validate the request and
* to generate the updated RKey. If this is successful, then
* we fill in the WQE's "Bind" header fields.
*/
if (wr->wr_opcode == IBT_WRC_BIND) {
status = hermon_wr_bind_check(state, wr);
if (status != DDI_SUCCESS) {
return (status);
}
bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
sizeof (hermon_hw_snd_wqe_ctrl_t));
/*
* Build the Bind Memory Window Segments for the WQE,
* using the information from the UC Bind memory
* window work request.
*/
HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
/*
* Update the "ds" pointer. Even though the "bind"
* operation requires no SGLs, this is necessary to
* facilitate the correct descriptor size calculations
* (below).
*/
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
sizeof (hermon_hw_snd_wqe_bind_t));
nds = 0;
}
break;
default:
return (IBT_QP_SRV_TYPE_INVALID);
}
/*
* Now fill in the Data Segments (SGL) for the Send WQE based on
* the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
* Start by checking for a valid number of SGL entries.
*/
if (nds > qp->qp_sq_sgl) {
return (IBT_QP_SGL_LEN_INVALID);
}
/*
* For each SGL in the Send Work Request, fill in the Send WQE's data
* segments. Note: We skip any SGL with zero size because Hermon
* hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
* the encoding for zero means a 2GB transfer.
*/
for (last_ds = num_ds, j = i; j < nds; j++) {
if (sgl[j].ds_len != 0)
last_ds++; /* real last ds of wqe to fill */
}
/*
* Return the size of the descriptor (in 16-byte chunks).
* For Hermon, we want them (for now) to be on stride-size
* boundaries, which was implicit in Tavor/Arbel.
*/
tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
*size = tmpsize >> 0x4;
for (j = nds; --j >= i; ) {
if (sgl[j].ds_len == 0) {
continue;
}
/*
* Fill in the Data Segment(s) for the current WQE, using the
* information contained in the scatter-gather list of the
* work request.
*/
last_ds--;
HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
}
return (DDI_SUCCESS);
}
/*
* hermon_wqe_mlx_build()
* Context: Can be called from interrupt or base context.
*/
static int
hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
hermon_ahhdl_t ah;
hermon_hw_udav_t *udav;
ib_lrh_hdr_t *lrh;
ib_grh_t *grh;
ib_bth_hdr_t *bth;
ib_deth_hdr_t *deth;
hermon_hw_wqe_sgl_t *ds;
ibt_wr_ds_t *sgl;
uint8_t *mgmtclass, *hpoint, *hcount;
uint32_t nds, offset, pktlen;
uint32_t desc_sz;
int i, num_ds;
int tmpsize;
ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
/* Initialize the information for the Data Segments */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
sizeof (hermon_hw_mlx_wqe_nextctrl_t));
/*
* Pull the address handle from the work request. The UDAV will
* be used to answer some questions about the request.
*/
ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
if (ah == NULL) {
return (IBT_AH_HDL_INVALID);
}
mutex_enter(&ah->ah_lock);
udav = ah->ah_udav;
/*
* If the request is for QP1 and the destination LID is equal to
* the Permissive LID, then return an error. This combination is
* not allowed
*/
if ((udav->rlid == IB_LID_PERMISSIVE) &&
(qp->qp_is_special == HERMON_QP_GSI)) {
mutex_exit(&ah->ah_lock);
return (IBT_AH_HDL_INVALID);
}
/*
* Calculate the size of the packet headers, including the GRH
* (if necessary)
*/
desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
sizeof (ib_deth_hdr_t);
if (udav->grh) {
desc_sz += sizeof (ib_grh_t);
}
/*
* Begin to build the first "inline" data segment for the packet
* headers. Note: By specifying "inline" we can build the contents
* of the MAD packet headers directly into the work queue (as part
* descriptor). This has the advantage of both speeding things up
* and of not requiring the driver to allocate/register any additional
* memory for the packet headers.
*/
HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
desc_sz += 4;
/*
* Build Local Route Header (LRH)
* We start here by building the LRH into a temporary location.
* When we have finished we copy the LRH data into the descriptor.
*
* Notice that the VL values are hardcoded. This is not a problem
* because VL15 is decided later based on the value in the MLX
* transport "next/ctrl" header (see the "vl15" bit below), and it
* is otherwise (meaning for QP1) chosen from the SL-to-VL table
* values. This rule does not hold for loopback packets however
* (all of which bypass the SL-to-VL tables) and it is the reason
* that non-QP0 MADs are setup with VL hardcoded to zero below.
*
* Notice also that Source LID is hardcoded to the Permissive LID
* (0xFFFF). This is also not a problem because if the Destination
* LID is not the Permissive LID, then the "slr" value in the MLX
* transport "next/ctrl" header will be set to zero and the hardware
* will pull the LID from value in the port.
*/
lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
pktlen = (desc_sz + 0x100) >> 2;
HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
/*
* Build Global Route Header (GRH)
* This is only built if necessary as defined by the "grh" bit in
* the address vector. Note: We also calculate the offset to the
* next header (BTH) based on whether or not the "grh" bit is set.
*/
if (udav->grh) {
/*
* If the request is for QP0, then return an error. The
* combination of global routing (GRH) and QP0 is not allowed.
*/
if (qp->qp_is_special == HERMON_QP_SMI) {
mutex_exit(&ah->ah_lock);
return (IBT_AH_HDL_INVALID);
}
grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
} else {
bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
}
mutex_exit(&ah->ah_lock);
/*
* Build Base Transport Header (BTH)
* Notice that the M, PadCnt, and TVer fields are all set
* to zero implicitly.  This is true for all Management Datagrams
* (MADs), whether GSI or SMI.
*/
HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
/*
* Build Datagram Extended Transport Header (DETH)
*/
deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
HERMON_WQE_BUILD_MLX_DETH(deth, qp);
/* Ensure that the Data Segment is aligned on a 16-byte boundary */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
nds = wr->wr_nds;
sgl = wr->wr_sgl;
num_ds = 0;
/*
* Now fill in the Data Segments (SGL) for the MLX WQE based on the
* values set up above (i.e. "sgl", "nds", and the "ds" pointer).
* Start by checking for a valid number of SGL entries.
*/
if (nds > qp->qp_sq_sgl) {
return (IBT_QP_SGL_LEN_INVALID);
}
/*
* For each SGL in the Send Work Request, fill in the MLX WQE's data
* segments. Note: We skip any SGL with zero size because Hermon
* hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
* the encoding for zero means a 2GB transfer. Because of this special
* encoding in the hardware, we mask the requested length with
* HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
* zero.)
*/
mgmtclass = hpoint = hcount = NULL;
offset = 0;
for (i = 0; i < nds; i++) {
if (sgl[i].ds_len == 0) {
continue;
}
/*
* Fill in the Data Segment(s) for the MLX send WQE, using
* the information contained in the scatter-gather list of
* the work request.
*/
HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
/*
* Search through the contents of all MADs posted to QP0 to
* initialize pointers to the places where Directed Route "hop
* pointer", "hop count", and "mgmtclass" would be. Hermon
* needs these updated (i.e. incremented or decremented, as
* necessary) by software.
*/
if (qp->qp_is_special == HERMON_QP_SMI) {
HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
offset, sgl[i].ds_va, sgl[i].ds_len);
HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
offset, sgl[i].ds_va, sgl[i].ds_len);
HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
offset, sgl[i].ds_va, sgl[i].ds_len);
offset += sgl[i].ds_len;
}
num_ds++;
}
/*
* Hermon's Directed Route MADs need to have the "hop pointer"
* incremented/decremented (as necessary) depending on whether it is
* currently less than or greater than the "hop count" (i.e. whether
* the MAD is a request or a response.)
*/
if (qp->qp_is_special == HERMON_QP_SMI) {
HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
*hpoint, *hcount);
}
/*
* Now fill in the ICRC Data Segment. This data segment is inlined
* just like the packet headers above, but it is only four bytes and
* set to zero (to indicate that we wish the hardware to generate the ICRC).
*/
HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
num_ds++;
/*
* Return the size of the descriptor (in 16-byte chunks).
* For Hermon, we want them (for now) to be on stride-size
* boundaries, which was implicit in Tavor/Arbel.
*/
tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
*size = tmpsize >> 0x04;
return (DDI_SUCCESS);
}
/*
* hermon_wqe_recv_build()
* Context: Can be called from interrupt or base context.
*/
/* ARGSUSED */
static int
hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
ibt_recv_wr_t *wr, uint64_t *desc)
{
hermon_hw_wqe_sgl_t *ds;
int i, num_ds;
ASSERT(MUTEX_HELD(&qp->qp_rq_lock));
/*
* Fill in the Data Segments (SGL) for the Recv WQE.  There is no
* ctrl segment to reserve space for on the recv queue for Hermon,
* but we will need to append an invalid (null) scatter pointer
* per the PRM.
*/
ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
num_ds = 0;
/* Check for valid number of SGL entries */
if (wr->wr_nds > qp->qp_rq_sgl) {
return (IBT_QP_SGL_LEN_INVALID);
}
/*
* For each SGL in the Recv Work Request, fill in the Recv WQE's data
* segments. Note: We skip any SGL with zero size because Hermon
* hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
* the encoding for zero means a 2GB transfer. Because of this special
* encoding in the hardware, we mask the requested length with
* HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
* zero.)
*/
for (i = 0; i < wr->wr_nds; i++) {
if (wr->wr_sgl[i].ds_len == 0) {
continue;
}
/*
* Fill in the Data Segment(s) for the receive WQE, using the
* information contained in the scatter-gather list of the
* work request.
*/
HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
num_ds++;
}
/* put the null sgl pointer as well if needed */
if (num_ds < qp->qp_rq_sgl) {
HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
}
return (DDI_SUCCESS);
}
/*
* hermon_wqe_srq_build()
* Context: Can be called from interrupt or base context.
*/
/* ARGSUSED */
static int
hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
ibt_recv_wr_t *wr, uint64_t *desc)
{
hermon_hw_wqe_sgl_t *ds;
int i, num_ds;
ASSERT(MUTEX_HELD(&srq->srq_lock));
/* Fill in the Data Segments (SGL) for the Recv WQE */
ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
sizeof (hermon_hw_srq_wqe_next_t));
num_ds = 0;
/* Check for valid number of SGL entries */
if (wr->wr_nds > srq->srq_wq_sgl) {
return (IBT_QP_SGL_LEN_INVALID);
}
/*
* For each SGL in the Recv Work Request, fill in the Recv WQE's data
* segments. Note: We skip any SGL with zero size because Hermon
* hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
* the encoding for zero means a 2GB transfer. Because of this special
* encoding in the hardware, we mask the requested length with
* HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
* zero.)
*/
for (i = 0; i < wr->wr_nds; i++) {
if (wr->wr_sgl[i].ds_len == 0) {
continue;
}
/*
* Fill in the Data Segment(s) for the receive WQE, using the
* information contained in the scatter-gather list of the
* work request.
*/
HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
num_ds++;
}
/*
* put in the null sgl pointer as well, if needed
*/
if (num_ds < srq->srq_wq_sgl) {
HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
}
return (DDI_SUCCESS);
}
/*
* hermon_wr_get_immediate()
* Context: Can be called from interrupt or base context.
*/
static uint32_t
hermon_wr_get_immediate(ibt_send_wr_t *wr)
{
/*
* This routine extracts the "immediate data" from the appropriate
* location in the IBTF work request. Because of the way the
* work request structure is defined, the location for this data
* depends on the actual work request operation type.
*/
/* For RDMA Write, test if RC or UC */
if (wr->wr_opcode == IBT_WRC_RDMAW) {
if (wr->wr_trans == IBT_RC_SRV) {
return (wr->wr.rc.rcwr.rdma.rdma_immed);
} else { /* IBT_UC_SRV */
return (wr->wr.uc.ucwr.rdma.rdma_immed);
}
}
/* For Send, test if RC, UD, or UC */
if (wr->wr_opcode == IBT_WRC_SEND) {
if (wr->wr_trans == IBT_RC_SRV) {
return (wr->wr.rc.rcwr.send_immed);
} else if (wr->wr_trans == IBT_UD_SRV) {
return (wr->wr.ud.udwr_immed);
} else { /* IBT_UC_SRV */
return (wr->wr.uc.ucwr.send_immed);
}
}
/*
* If any other type of request, then immediate is undefined
*/
return (0);
}
/*
* hermon_wqe_headroom()
* Context: can be called from interrupt or base, currently only from
* base context.
* Routine that fills in the headroom for the Send Queue
*/
static void
hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
{
uint32_t *wqe_start, *wqe_top, *wqe_base, qsize;
int hdrmwqes, wqesizebytes, sectperwqe;
uint32_t invalue;
int i, j;
qsize = qp->qp_sq_bufsz;
wqesizebytes = 1 << qp->qp_sq_log_wqesz;
sectperwqe = wqesizebytes >> 6; /* 64 bytes/section */
hdrmwqes = qp->qp_sq_hdrmwqes;
wqe_base = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
wqe_top = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
for (i = 0; i < hdrmwqes; i++) {
for (j = 0; j < sectperwqe; j++) {
if (j == 0) { /* 1st section of wqe */
/* preserve ownership bit */
invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
wqe_start) | 0x7FFFFFFF;
} else {
/* or just invalidate it */
invalue = 0xFFFFFFFF;
}
ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
wqe_start += 16; /* move 64 bytes */
}
if (wqe_start == wqe_top) /* hit the end of the queue */
wqe_start = wqe_base; /* wrap to start */
}
}
/*
* hermon_wqe_sync()
* Context: Can be called from interrupt or base context.
*/
static void
hermon_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
uint_t sync_type, uint_t flag)
{
hermon_qphdl_t qp;
hermon_srqhdl_t srq;
uint64_t *wqe_from, *wqe_to;
uint64_t *wq_base, *wq_top, *qp_base;
ddi_dma_handle_t dmahdl;
off_t offset;
size_t length;
uint32_t qsize;
int status;
if (sync_type == HERMON_WR_SRQ) {
srq = (hermon_srqhdl_t)hdl;
/* Get the DMA handle from SRQ context */
dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
/* get base addr of the buffer */
qp_base = (uint64_t *)(void *)srq->srq_wq_buf;
} else {
qp = (hermon_qphdl_t)hdl;
/* Get the DMA handle from QP context */
dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
/* Determine the base address of the QP buffer */
if (qp->qp_sq_baseaddr == 0) {
qp_base = (uint64_t *)(void *)(qp->qp_sq_buf);
} else {
qp_base = (uint64_t *)(void *)(qp->qp_rq_buf);
}
}
/*
* Depending on the type of the work queue, we grab information
* about the address ranges we need to DMA sync.
*/
if (sync_type == HERMON_WR_SEND) {
wqe_from = HERMON_QP_SQ_ENTRY(qp, sync_from);
wqe_to = HERMON_QP_SQ_ENTRY(qp, sync_to);
qsize = qp->qp_sq_bufsz;
wq_base = HERMON_QP_SQ_ENTRY(qp, 0);
wq_top = HERMON_QP_SQ_ENTRY(qp, qsize);
} else if (sync_type == HERMON_WR_RECV) {
wqe_from = HERMON_QP_RQ_ENTRY(qp, sync_from);
wqe_to = HERMON_QP_RQ_ENTRY(qp, sync_to);
qsize = qp->qp_rq_bufsz;
wq_base = HERMON_QP_RQ_ENTRY(qp, 0);
wq_top = HERMON_QP_RQ_ENTRY(qp, qsize);
} else {
wqe_from = HERMON_SRQ_WQ_ENTRY(srq, sync_from);
wqe_to = HERMON_SRQ_WQ_ENTRY(srq, sync_to);
qsize = srq->srq_wq_bufsz;
wq_base = HERMON_SRQ_WQ_ENTRY(srq, 0);
wq_top = HERMON_SRQ_WQ_ENTRY(srq, qsize);
}
/*
* There are two possible cases for the beginning and end of the WQE
* chain we are trying to sync. Either this is the simple case, where
* the end of the chain is below the beginning of the chain, or it is
* the "wrap-around" case, where the end of the chain has wrapped over
* the end of the queue. In the former case, we simply need to
* calculate the span from beginning to end and sync it. In the latter
* case, however, we need to calculate the span from the top of the
* work queue to the end of the chain and sync that, and then we need
* to find the other portion (from beginning of chain to end of queue)
* and sync that as well. Note: if the "top to end" span is actually
* zero length, then we don't do a DMA sync because a zero length DMA
* sync unnecessarily syncs the entire work queue.
*/
if (wqe_to > wqe_from) {
/* "From Beginning to End" */
offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
status = ddi_dma_sync(dmahdl, offset, length, flag);
if (status != DDI_SUCCESS) {
return;
}
} else {
/* "From Top to End" */
offset = (off_t)((uintptr_t)wq_base - (uintptr_t)qp_base);
length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wq_base);
if (length) {
status = ddi_dma_sync(dmahdl, offset, length, flag);
if (status != DDI_SUCCESS) {
return;
}
}
/* "From Beginning to Bottom" */
offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
length = (size_t)((uintptr_t)wq_top - (uintptr_t)wqe_from);
status = ddi_dma_sync(dmahdl, offset, length, flag);
if (status != DDI_SUCCESS) {
return;
}
}
}
/*
* hermon_wr_bind_check()
* Context: Can be called from interrupt or base context.
*/
/* ARGSUSED */
static int
hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
{
ibt_bind_flags_t bind_flags;
uint64_t vaddr, len;
uint64_t reg_start_addr, reg_end_addr;
hermon_mwhdl_t mw;
hermon_mrhdl_t mr;
hermon_rsrc_t *mpt;
uint32_t new_rkey;
/* Check for a valid Memory Window handle in the WR */
mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
if (mw == NULL) {
return (IBT_MW_HDL_INVALID);
}
/* Check for a valid Memory Region handle in the WR */
mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
if (mr == NULL) {
return (IBT_MR_HDL_INVALID);
}
mutex_enter(&mr->mr_lock);
mutex_enter(&mw->mr_lock);
/*
* Check here to see if the memory region has already been partially
* deregistered as a result of a hermon_umap_umemlock_cb() callback.
* If so, this is an error, return failure.
*/
if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
mutex_exit(&mr->mr_lock);
mutex_exit(&mw->mr_lock);
return (IBT_MR_HDL_INVALID);
}
/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
mutex_exit(&mr->mr_lock);
mutex_exit(&mw->mr_lock);
return (IBT_MR_RKEY_INVALID);
}
/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
mutex_exit(&mr->mr_lock);
mutex_exit(&mw->mr_lock);
return (IBT_MR_LKEY_INVALID);
}
/*
* Now check for valid "vaddr" and "len". Note: We don't check the
* "vaddr" range when "len == 0" (i.e. on unbind operations)
*/
len = wr->wr.rc.rcwr.bind->bind_len;
if (len != 0) {
vaddr = wr->wr.rc.rcwr.bind->bind_va;
reg_start_addr = mr->mr_bindinfo.bi_addr;
reg_end_addr = mr->mr_bindinfo.bi_addr +
(mr->mr_bindinfo.bi_len - 1);
if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
mutex_exit(&mr->mr_lock);
mutex_exit(&mw->mr_lock);
return (IBT_MR_VA_INVALID);
}
vaddr = (vaddr + len) - 1;
if (vaddr > reg_end_addr) {
mutex_exit(&mr->mr_lock);
mutex_exit(&mw->mr_lock);
return (IBT_MR_LEN_INVALID);
}
}
/*
* Validate the bind access flags. Remote Write and Atomic access for
* the Memory Window require that Local Write access be set in the
* corresponding Memory Region.
*/
bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
if (((bind_flags & IBT_WR_BIND_WRITE) ||
(bind_flags & IBT_WR_BIND_ATOMIC)) &&
!(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
mutex_exit(&mr->mr_lock);
mutex_exit(&mw->mr_lock);
return (IBT_MR_ACCESS_REQ_INVALID);
}
/* Calculate the new RKey for the Memory Window */
mpt = mw->mr_mptrsrcp;
new_rkey = hermon_mr_keycalc(mpt->hr_indx);
new_rkey = hermon_mr_key_swap(new_rkey);
wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
mw->mr_rkey = new_rkey;
mutex_exit(&mr->mr_lock);
mutex_exit(&mw->mr_lock);
return (DDI_SUCCESS);
}
/*
* hermon_wrid_from_reset_handling()
* Context: Can be called from interrupt or base context.
*/
/* ARGSUSED */
int
hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
hermon_workq_hdr_t *swq, *rwq;
uint_t qp_srq_en;
if (qp->qp_is_umap)
return (DDI_SUCCESS);
/* grab the cq lock(s) to modify the wqavl tree */
mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
#ifdef __lock_lint
mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#else
if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#endif
/* Chain the newly allocated work queue header to the CQ's list */
hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
swq = qp->qp_sq_wqhdr;
swq->wq_head = 0;
swq->wq_tail = 0;
swq->wq_full = 0;
/*
* Now we repeat all the above operations for the receive work queue,
* or shared receive work queue.
*
* Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
*/
qp_srq_en = qp->qp_srq_en;
#ifdef __lock_lint
mutex_enter(&qp->qp_srqhdl->srq_lock);
#else
if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
mutex_enter(&qp->qp_srqhdl->srq_lock);
} else {
rwq = qp->qp_rq_wqhdr;
rwq->wq_head = 0;
rwq->wq_tail = 0;
rwq->wq_full = 0;
qp->qp_rq_wqecntr = 0;
}
#endif
hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
#ifdef __lock_lint
mutex_exit(&qp->qp_srqhdl->srq_lock);
#else
if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
mutex_exit(&qp->qp_srqhdl->srq_lock);
}
#endif
#ifdef __lock_lint
mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
#else
if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
#endif
mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
return (DDI_SUCCESS);
}
/*
* hermon_wrid_to_reset_handling()
* Context: Can be called from interrupt or base context.
*/
int
hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
uint_t qp_srq_en;
if (qp->qp_is_umap)
return (DDI_SUCCESS);
/*
* If there are unpolled entries in these CQs, they are
* polled/flushed.
* Grab the CQ lock(s) before manipulating the lists.
*/
mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
#ifdef __lock_lint
mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#else
if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#endif
qp_srq_en = qp->qp_srq_en;
#ifdef __lock_lint
mutex_enter(&qp->qp_srqhdl->srq_lock);
#else
if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
mutex_enter(&qp->qp_srqhdl->srq_lock);
}
#endif
/*
* Flush the entries on the CQ for this QP's QPN.
*/
hermon_cq_entries_flush(state, qp);
#ifdef __lock_lint
mutex_exit(&qp->qp_srqhdl->srq_lock);
#else
if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
mutex_exit(&qp->qp_srqhdl->srq_lock);
}
#endif
hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
#ifdef __lock_lint
mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
#else
if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
#endif
mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
return (IBT_SUCCESS);
}
/*
* hermon_wrid_get_entry()
* Context: Can be called from interrupt or base context.
*/
uint64_t
hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
{
hermon_workq_avl_t *wqa;
hermon_workq_hdr_t *wq;
uint64_t wrid;
uint_t send_or_recv, qpnum;
uint32_t indx;
/*
* Determine whether this CQE is a send or receive completion.
*/
send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);
/* Find the work queue for this QP number (send or receive side) */
qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
wq = wqa->wqa_wq;
/*
* Regardless of whether the completion is the result of a "success"
* or a "failure", we lock the list of "containers" and attempt to
* search for the first matching completion (i.e. the first WR
* with a matching WQE addr and size). Once we find it, we pull out
* the "wrid" field and return it (see below). XXX Note: One possible
* future enhancement would be to enable this routine to skip over
* any "unsignaled" completions to go directly to the next "signaled"
* entry on success.
*/
indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
wrid = wq->wq_wrid[indx];
if (wqa->wqa_srq_en) {
struct hermon_sw_srq_s *srq;
uint64_t *desc;
/* put wqe back on the srq free list */
srq = wqa->wqa_srq;
mutex_enter(&srq->srq_lock);
desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
((uint16_t *)desc)[1] = htons(indx);
wq->wq_tail = indx;
mutex_exit(&srq->srq_lock);
} else {
wq->wq_head = (indx + 1) & wq->wq_mask;
wq->wq_full = 0;
}
return (wrid);
}
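/*
* hermon_wrid_workq_compare()
* Context: Can be called from interrupt or base context.
*/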
int
hermon_wrid_workq_compare(const void *p1, const void *p2)
{
hermon_workq_compare_t *cmpp;
hermon_workq_avl_t *curr;
cmpp = (hermon_workq_compare_t *)p1;
curr = (hermon_workq_avl_t *)p2;
if (cmpp->cmp_qpn < curr->wqa_qpn)
return (-1);
else if (cmpp->cmp_qpn > curr->wqa_qpn)
return (+1);
else if (cmpp->cmp_type < curr->wqa_type)
return (-1);
else if (cmpp->cmp_type > curr->wqa_type)
return (+1);
else
return (0);
}
/*
* hermon_wrid_wqavl_find()
* Context: Can be called from interrupt or base context.
*/
static hermon_workq_avl_t *
hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
{
hermon_workq_avl_t *curr;
hermon_workq_compare_t cmp;
/*
* Search the CQ's work queue list (an AVL tree) for a send or recv
* queue with the same QP number and work queue type.
*/
cmp.cmp_qpn = qpn;
cmp.cmp_type = wq_type;
#ifdef __lock_lint
hermon_wrid_workq_compare(NULL, NULL);
#endif
curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
return (curr);
}
/*
* hermon_wrid_wqhdr_create()
* Context: Can be called from base context.
*/
/* ARGSUSED */
hermon_workq_hdr_t *
hermon_wrid_wqhdr_create(int bufsz)
{
hermon_workq_hdr_t *wqhdr;
/*
* Allocate space for the wqhdr, and an array to record all the wrids.
*/
wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
if (wqhdr == NULL) {
return (NULL);
}
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
if (wqhdr->wq_wrid == NULL) {
kmem_free(wqhdr, sizeof (*wqhdr));
return (NULL);
}
wqhdr->wq_size = bufsz;
wqhdr->wq_mask = bufsz - 1;
return (wqhdr);
}
void
hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
{
kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
kmem_free(wqhdr, sizeof (*wqhdr));
}
/*
* hermon_cq_workq_add()
* Context: Can be called from interrupt or base context.
*/
static void
hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
{
hermon_workq_compare_t cmp;
avl_index_t where;
cmp.cmp_qpn = wqavl->wqa_qpn;
cmp.cmp_type = wqavl->wqa_type;
#ifdef __lock_lint
hermon_wrid_workq_compare(NULL, NULL);
#endif
(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
}
/*
* hermon_cq_workq_remove()
* Context: Can be called from interrupt or base context.
*/
static void
hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
{
#ifdef __lock_lint
hermon_wrid_workq_compare(NULL, NULL);
#endif
avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
}