/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* tavor_srq.c
* Tavor Shared Receive Queue Processing Routines
*
* Implements all the routines necessary for allocating, freeing, querying,
* modifying, and posting to shared receive queues.
*/
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/ib/adapters/tavor/tavor.h>
/*
* Used by tavor_srq_numcalc() below to fill in the "unconstrained" portion
* of the Tavor shared receive queue number
*/
static uint_t tavor_debug_srqnum_cnt = 0x00000000;
static void tavor_srq_numcalc(tavor_state_t *state, uint32_t indx,
uint32_t *key);
static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
/*
* tavor_srq_alloc()
* Context: Can be called only from user or kernel context.
*/
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
uint_t sleepflag, tavor_srq_options_t *op)
{
ibt_srq_hdl_t ibt_srqhdl;
tavor_pdhdl_t pd;
ibt_srq_sizes_t *sizes;
ibt_srq_sizes_t *real_sizes;
tavor_srqhdl_t *srqhdl;
ibt_srq_flags_t flags;
tavor_rsrc_t *srqc, *rsrc;
tavor_hw_srqc_t srqc_entry;
uint32_t *buf;
tavor_srqhdl_t srq;
tavor_umap_db_entry_t *umapdb;
ibt_mr_attr_t mr_attr;
tavor_mr_options_t mr_op;
tavor_mrhdl_t mr;
uint64_t addr;
uint64_t value, srq_desc_off;
uint32_t lkey;
uint32_t log_srq_size;
uint32_t uarpg;
uint_t wq_location, dma_xfer_mode, srq_is_umap;
int flag, status;
char *errormsg;
uint_t max_sgl;
uint_t wqesz;
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))
TAVOR_TNF_ENTER(tavor_srq_alloc);
/*
* Check the "options" flag. Currently this flag tells the driver
* whether the SRQ's work queues should come from normal system memory
* or be allocated from HCA-attached local DDR memory.
*/
if (op == NULL) {
wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
} else {
wq_location = op->srqo_wq_loc;
}
/*
* Extract the necessary info from the tavor_srq_info_t structure
*/
real_sizes = srqinfo->srqi_real_sizes;
sizes = srqinfo->srqi_sizes;
pd = srqinfo->srqi_pd;
ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
flags = srqinfo->srqi_flags;
srqhdl = srqinfo->srqi_srqhdl;
/*
* Determine whether SRQ is being allocated for userland access or
* whether it is being allocated for kernel access. If the SRQ is
* being allocated for userland access, then lookup the UAR doorbell
* page number for the current process. Note: If this is not found
* (e.g. if the process has not previously open()'d the Tavor driver),
* then an error is returned.
*/
srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
if (srq_is_umap) {
status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
if (status != DDI_SUCCESS) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
/* No resources have been allocated yet, so skip all cleanup */
goto srqalloc_fail;
}
uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
}
/* Increase PD refcnt */
tavor_pd_refcnt_inc(pd);
/* Allocate an SRQ context entry */
status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
if (status != DDI_SUCCESS) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
goto srqalloc_fail1;
}
/* Allocate the SRQ Handle entry */
status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
if (status != DDI_SUCCESS) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
goto srqalloc_fail2;
}
srq = (tavor_srqhdl_t)rsrc->tr_addr;
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))
/* Calculate the SRQ number */
tavor_srq_numcalc(state, srqc->tr_indx, &srq->srq_srqnum);
/*
* If this will be a user-mappable SRQ, then allocate an entry for
* the "userland resources database". This will later be added to
* the database (after all further SRQ operations are successful).
* If we fail here, we must undo the reference counts and the
* previous resource allocation.
*/
if (srq_is_umap) {
umapdb = tavor_umap_db_alloc(state->ts_instance,
srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
(uint64_t)(uintptr_t)rsrc);
if (umapdb == NULL) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
goto srqalloc_fail3;
}
}
/*
* Calculate the appropriate size for the SRQ.
* Note: All Tavor SRQs must be a power-of-2 in size. Also
* they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
* is to round the requested size up to the next highest power-of-2
*/
sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
log_srq_size = highbit(sizes->srq_wr_sz);
if ((sizes->srq_wr_sz & (sizes->srq_wr_sz - 1)) == 0) {
log_srq_size = log_srq_size - 1;
}
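/*
 * For example (values illustrative): a request for 1000 WRs gives
 * highbit(1000) == 10, so the real size becomes 2^10 = 1024; a request
 * for exactly 1024 gives highbit(1024) == 11, and the power-of-2 test
 * above subtracts one, yielding 2^10 = 1024 again (no extra rounding).
 */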
/*
* Next we verify that the rounded-up size is valid (i.e. consistent
* with the device limits and/or software-configured limits). If not,
* then obviously we have a lot of cleanup to do before returning.
*/
if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
goto srqalloc_fail4;
}
/*
* Next we verify that the requested number of SGL is valid (i.e.
* consistent with the device limits and/or software-configured
* limits). If not, then obviously the same cleanup needs to be done.
*/
max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
if (sizes->srq_sgl_sz > max_sgl) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
goto srqalloc_fail4;
}
/*
* Determine the SRQ's WQE sizes. This depends on the requested
* number of SGLs. Note: This also has the side-effect of
* calculating the real number of SGLs (for the calculated WQE size)
*/
tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
&srq->srq_wq_sgl);
/*
* Allocate the memory for SRQ work queues. Note: The location from
* which we will allocate these work queues has been passed in through
* the tavor_qp_options_t structure. Since Tavor work queues are not
* allowed to cross a 32-bit (4GB) boundary, the alignment of the work
* queue memory is very important. We used to allocate work queues
* (the combined receive and send queues) so that they would be aligned
* on their combined size. That alignment guaranteed that they would
* never cross the 4GB boundary (Tavor work queues are on the order of
* MBs at maximum). Now we are able to relax this alignment constraint
* by ensuring that the IB address assigned to the queue memory (as a
* result of the tavor_mr_register() call) is offset from zero.
* Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
* guarantee the alignment, but when attempting to use IOMMU bypass
* mode we found that we were not allowed to specify any alignment that
* was more restrictive than the system page size. So we avoided this
* constraint by passing two alignment values, one for the memory
* allocation itself and the other for the DMA handle (for later bind).
* This used to cause more memory than necessary to be allocated (in
* order to guarantee the more restrictive alignment constraint). But
* by guaranteeing the zero-based IB virtual address for the queue, we
* are able to conserve this memory.
*
* Note: If SRQ is not user-mappable, then it may come from either
* kernel system memory or from HCA-attached local DDR memory.
*
* Note2: We align this queue on a pagesize boundary. This is required
* to make sure that all the resulting IB addresses will start at 0, for
* a zero-based queue. By making sure we are aligned on at least a
* page, any offset we use into our queue will be the same as when we
* perform tavor_srq_modify() operations later.
*/
wqesz = (1 << srq->srq_wq_log_wqesz);
srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
srq->srq_wqinfo.qa_bind_align = PAGESIZE;
if (srq_is_umap) {
srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
} else {
srq->srq_wqinfo.qa_location = wq_location;
}
status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
if (status != DDI_SUCCESS) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
goto srqalloc_fail4;
}
buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
/*
* Register the memory for the SRQ work queues. The memory for the SRQ
* must be registered in the Tavor TPT tables. This gives us the LKey
* to specify in the SRQ context later. Note: If the work queue is to
* be allocated from DDR memory, then only a "bypass" mapping is
* appropriate. And if the SRQ memory is user-mappable, then we force
* DDI_DMA_CONSISTENT mapping. Also, in order to meet the alignment
* restriction, we pass the "mro_bind_override_addr" flag in the call
* to tavor_mr_register(). This guarantees that the resulting IB vaddr
* will be zero-based (modulo the offset into the first page). If we
* fail here, we still have a fair amount of resource and reference
* count cleanup to do.
*/
flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
IBT_MR_NOSLEEP;
mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
mr_attr.mr_len = srq->srq_wqinfo.qa_size;
mr_attr.mr_as = NULL;
mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
if (srq_is_umap) {
mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
} else {
if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
mr_op.mro_bind_type =
state->ts_cfg_profile->cp_iommu_bypass;
dma_xfer_mode =
state->ts_cfg_profile->cp_streaming_consistent;
if (dma_xfer_mode == DDI_DMA_STREAMING) {
mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
}
} else {
mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
}
}
mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
mr_op.mro_bind_override_addr = 1;
status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
if (status != DDI_SUCCESS) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
goto srqalloc_fail5;
}
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
addr = mr->mr_bindinfo.bi_addr;
lkey = mr->mr_lkey;
/*
* Calculate the offset between the kernel virtual address space
* and the IB virtual address space. This will be used when
* posting work requests to properly initialize each WQE.
*/
srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
(uint64_t)mr->mr_bindinfo.bi_addr;
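/*
 * For instance (addresses illustrative only): with the queue buffer at
 * kernel virtual address 0x300001000 and a zero-based IB virtual
 * address of 0x0 from the registration above, "srq_desc_off" would be
 * 0x300001000; adding a WQE's IB address to this offset recovers that
 * WQE's kernel virtual address.
 */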
/*
* Create WQL and Wridlist for use by this SRQ
*/
srq->srq_wrid_wql = tavor_wrid_wql_create(state);
if (srq->srq_wrid_wql == NULL) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
goto srqalloc_fail6;
}
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))
srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
if (srq->srq_wridlist == NULL) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
goto srqalloc_fail7;
}
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))
srq->srq_wridlist->wl_srq_en = 1;
srq->srq_wridlist->wl_free_list_indx = -1;
/*
* Fill in all the return arguments (if necessary). This includes
* real queue size and real SGLs.
*/
if (real_sizes != NULL) {
real_sizes->srq_wr_sz = (1 << log_srq_size);
real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
}
/*
* Fill in the SRQC entry. This is the final step before passing
* ownership of the SRQC entry to the Tavor hardware. We use all of
* the information collected/calculated above to fill in the
* requisite portions of the SRQC. Note: If this SRQ is going to be
* used for userland access, then we need to set the UAR page number
* appropriately (otherwise it's a "don't care")
*/
bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
srqc_entry.wqe_addr_h = (addr >> 32);
srqc_entry.next_wqe_addr_l = 0;
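/* the "ds" field is the WQE (descriptor) size in 16-byte chunks */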
srqc_entry.ds = (wqesz >> 4);
srqc_entry.state = TAVOR_SRQ_STATE_HW_OWNER;
srqc_entry.pd = pd->pd_pdnum;
srqc_entry.lkey = lkey;
srqc_entry.wqe_cnt = 0;
if (srq_is_umap) {
srqc_entry.uar = uarpg;
} else {
srqc_entry.uar = 0;
}
/*
* Write the SRQC entry to hardware. Lastly, we pass ownership of
* the entry to the hardware (using the Tavor SW2HW_SRQ firmware
* command). Note: In general, this operation shouldn't fail. But
* if it does, we have to undo everything we've done above before
* returning error.
*/
status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
sleepflag);
if (status != TAVOR_CMD_SUCCESS) {
cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
status);
TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
TAVOR_TNF_ERROR, "", tnf_uint, status, status);
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
goto srqalloc_fail8;
}
/*
* Fill in the rest of the Tavor SRQ handle. We can update
* the following fields for use in further operations on the SRQ.
*/
srq->srq_srqcrsrcp = srqc;
srq->srq_rsrcp = rsrc;
srq->srq_mrhdl = mr;
srq->srq_refcnt = 0;
srq->srq_is_umap = srq_is_umap;
srq->srq_uarpg = (srq->srq_is_umap) ? uarpg : 0;
srq->srq_umap_dhp = (devmap_cookie_t)NULL;
srq->srq_pdhdl = pd;
srq->srq_wq_lastwqeindx = -1;
srq->srq_wq_bufsz = (1 << log_srq_size);
srq->srq_wq_buf = buf;
srq->srq_desc_off = srq_desc_off;
srq->srq_hdlrarg = (void *)ibt_srqhdl;
srq->srq_state = 0;
srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;
/* Determine if later ddi_dma_sync will be necessary */
srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
/*
* Put SRQ handle in Tavor SRQNum-to-SRQhdl list. Then fill in the
* "srqhdl" and return success
*/
ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
state->ts_srqhdl[srqc->tr_indx] = srq;
/*
* If this is a user-mappable SRQ, then we need to insert the
* previously allocated entry into the "userland resources database".
* This will allow for later lookup during devmap() (i.e. mmap())
* calls.
*/
if (srq->srq_is_umap) {
tavor_umap_db_add(umapdb);
} else {
mutex_enter(&srq->srq_wrid_wql->wql_lock);
tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
mutex_exit(&srq->srq_wrid_wql->wql_lock);
}
*srqhdl = srq;
TAVOR_TNF_EXIT(tavor_srq_alloc);
return (status);
/*
* The following is cleanup for all possible failure cases in this routine
*/
srqalloc_fail8:
kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
sizeof (tavor_wrid_entry_t));
kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
TAVOR_WARNING(state, "failed to deregister SRQ memory");
}
srqalloc_fail5:
tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
if (srq_is_umap) {
tavor_umap_db_free(umapdb);
}
srqalloc_fail3:
tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
tavor_pd_refcnt_dec(pd);
srqalloc_fail:
TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
tnf_string, msg, errormsg);
TAVOR_TNF_EXIT(tavor_srq_alloc);
return (status);
}
/*
* tavor_srq_free()
* Context: Can be called only from user or kernel context.
*/
/* ARGSUSED */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
tavor_rsrc_t *srqc, *rsrc;
tavor_umap_db_entry_t *umapdb;
uint64_t value;
tavor_srqhdl_t srq;
tavor_mrhdl_t mr;
tavor_pdhdl_t pd;
tavor_hw_srqc_t srqc_entry;
uint32_t srqnum;
uint32_t size;
uint_t maxprot;
int status;
TAVOR_TNF_ENTER(tavor_srq_free);
/*
* Pull all the necessary information from the Tavor Shared Receive
* Queue handle. This is necessary here because the resource for the
* SRQ handle is going to be freed up as part of this operation.
*/
srq = *srqhdl;
mutex_enter(&srq->srq_lock);
srqc = srq->srq_srqcrsrcp;
rsrc = srq->srq_rsrcp;
pd = srq->srq_pdhdl;
mr = srq->srq_mrhdl;
srqnum = srq->srq_srqnum;
/*
* If there are QPs still associated with the SRQ (i.e. its reference
* count is non-zero), then return an error. Otherwise, we continue on
* below while still holding the SRQ lock.
*/
if (srq->srq_refcnt != 0) {
mutex_exit(&srq->srq_lock);
TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
tnf_int, refcnt, srq->srq_refcnt);
TAVOR_TNF_EXIT(tavor_srq_free);
return (IBT_SRQ_IN_USE);
}
/*
* If this was a user-mappable SRQ, then we need to remove its entry
* from the "userland resources database". If it is also currently
* mmap()'d out to a user process, then we need to call
* devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
* We also need to invalidate the SRQ tracking information for the
* user mapping.
*/
if (srq->srq_is_umap) {
status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
&umapdb);
if (status != DDI_SUCCESS) {
mutex_exit(&srq->srq_lock);
TAVOR_WARNING(state, "failed to find in database");
TAVOR_TNF_EXIT(tavor_srq_free);
return (ibc_get_ci_failure(0));
}
tavor_umap_db_free(umapdb);
if (srq->srq_umap_dhp != NULL) {
maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
status = devmap_devmem_remap(srq->srq_umap_dhp,
state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
maxprot, DEVMAP_MAPPING_INVALID, NULL);
if (status != DDI_SUCCESS) {
mutex_exit(&srq->srq_lock);
TAVOR_WARNING(state, "failed in SRQ memory "
"devmap_devmem_remap()");
TAVOR_TNF_EXIT(tavor_srq_free);
return (ibc_get_ci_failure(0));
}
srq->srq_umap_dhp = (devmap_cookie_t)NULL;
}
}
/*
* Put NULL into the Tavor SRQNum-to-SRQHdl list. This will allow any
* in-progress events to detect that the SRQ corresponding to this
* number has been freed.
*/
state->ts_srqhdl[srqc->tr_indx] = NULL;
mutex_exit(&srq->srq_lock);
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));
/*
* Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
* firmware command). If the ownership transfer fails for any reason,
* then it is an indication that something (either in HW or SW) has
* gone seriously wrong.
*/
status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
if (status != TAVOR_CMD_SUCCESS) {
TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
status);
TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
TAVOR_TNF_ERROR, "", tnf_uint, status, status);
TAVOR_TNF_EXIT(tavor_srq_free);
return (IBT_FAILURE);
}
/*
* Deregister the memory for the Shared Receive Queue. If this fails
* for any reason, then it is an indication that something (either
* in HW or SW) has gone seriously wrong. So we print a warning
* message and return.
*/
status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
sleepflag);
if (status != DDI_SUCCESS) {
TAVOR_WARNING(state, "failed to deregister SRQ memory");
TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
TAVOR_TNF_EXIT(tavor_srq_free);
return (IBT_FAILURE);
}
/* Calculate the size and free the wridlist container */
if (srq->srq_wridlist != NULL) {
size = (srq->srq_wridlist->wl_size *
sizeof (tavor_wrid_entry_t));
kmem_free(srq->srq_wridlist->wl_wre, size);
kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
/*
* Release the reference to the WQL; if this is the last reference,
* this call also has the side effect of freeing up the
* 'srq_wrid_wql' memory.
*/
tavor_wql_refcnt_dec(srq->srq_wrid_wql);
}
/* Free the memory for the SRQ */
tavor_queue_free(state, &srq->srq_wqinfo);
/* Free the Tavor SRQ Handle */
tavor_rsrc_free(state, &rsrc);
/* Free the SRQC entry resource */
tavor_rsrc_free(state, &srqc);
/* Decrement the reference count on the protection domain (PD) */
tavor_pd_refcnt_dec(pd);
/* Set the srqhdl pointer to NULL and return success */
*srqhdl = NULL;
TAVOR_TNF_EXIT(tavor_srq_free);
return (DDI_SUCCESS);
}
/*
* tavor_srq_modify()
* Context: Can be called only from user or kernel context.
*/
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
uint_t *real_size, uint_t sleepflag)
{
tavor_qalloc_info_t new_srqinfo, old_srqinfo;
tavor_rsrc_t *mtt, *mpt, *old_mtt;
tavor_bind_info_t bind;
tavor_bind_info_t old_bind;
tavor_rsrc_pool_info_t *rsrc_pool;
tavor_mrhdl_t mr;
tavor_hw_mpt_t mpt_entry;
tavor_wrid_entry_t *wre_new, *wre_old;
uint64_t mtt_ddrbaseaddr, mtt_addr;
uint64_t srq_desc_off;
uint32_t *buf, srq_old_bufsz;
uint32_t wqesz;
uint_t max_srq_size;
uint_t dma_xfer_mode, mtt_pgsize_bits;
uint_t srq_sync, log_srq_size, maxprot;
uint_t wq_location;
int status;
char *errormsg;
TAVOR_TNF_ENTER(tavor_srq_modify);
/*
* Check the "inddr" flag. This flag tells the driver whether or not
* the SRQ's work queues should be come from normal system memory or
* whether they should be allocated from DDR memory.
*/
wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;
/*
* If the size requested is larger than the device capability, return
* an error (IBT_HCA_WR_EXCEEDED)
*/
max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
if (size > max_srq_size) {
TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
TAVOR_TNF_ERROR, "");
TAVOR_TNF_EXIT(tavor_srq_modify);
return (IBT_HCA_WR_EXCEEDED);
}
/*
* Calculate the appropriate size for the SRQ.
* Note: All Tavor SRQs must be a power-of-2 in size. Also
* they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
* is to round the requested size up to the next highest power-of-2
*/
size = max(size, TAVOR_SRQ_MIN_SIZE);
log_srq_size = highbit(size);
if ((size & (size - 1)) == 0) {
log_srq_size = log_srq_size - 1;
}
/*
* Next we verify that the rounded-up size is valid (i.e. consistent
* with the device limits and/or software-configured limits).
*/
if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
goto srqmodify_fail;
}
/*
* Allocate the memory for newly resized Shared Receive Queue.
*
* Note: If SRQ is not user-mappable, then it may come from either
* kernel system memory or from HCA-attached local DDR memory.
*
* Note2: We align this queue on a pagesize boundary. This is required
* to make sure that all the resulting IB addresses will start at 0,
* for a zero-based queue. By making sure we are aligned on at least a
* page, any offset we use into our queue will be the same as it was
* when we allocated it at tavor_srq_alloc() time.
*/
wqesz = (1 << srq->srq_wq_log_wqesz);
new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
new_srqinfo.qa_alloc_align = PAGESIZE;
new_srqinfo.qa_bind_align = PAGESIZE;
if (srq->srq_is_umap) {
new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
} else {
new_srqinfo.qa_location = wq_location;
}
status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
if (status != DDI_SUCCESS) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
goto srqmodify_fail;
}
buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
/*
* Allocate the memory for the new WRE list. This will be used later
* when we resize the wridlist based on the new SRQ size.
*/
wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
sizeof (tavor_wrid_entry_t), sleepflag);
if (wre_new == NULL) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
"failed wre_new alloc");
goto srqmodify_fail;
}
/*
* Fill in the "bind" struct. This struct provides the majority
* of the information that will be used to distinguish between an
* "addr" binding (as is the case here) and a "buf" binding (see
* below). The "bind" struct is later passed to tavor_mr_mem_bind()
* which does most of the "heavy lifting" for the Tavor memory
* registration routines.
*/
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
bzero(&bind, sizeof (tavor_bind_info_t));
bind.bi_type = TAVOR_BINDHDL_VADDR;
bind.bi_addr = (uint64_t)(uintptr_t)buf;
bind.bi_len = new_srqinfo.qa_size;
bind.bi_as = NULL;
bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
if (srq->srq_is_umap) {
bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
} else {
if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
bind.bi_bypass =
state->ts_cfg_profile->cp_iommu_bypass;
dma_xfer_mode =
state->ts_cfg_profile->cp_streaming_consistent;
if (dma_xfer_mode == DDI_DMA_STREAMING) {
bind.bi_flags |= IBT_MR_NONCOHERENT;
}
} else {
bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
}
}
status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
&mtt_pgsize_bits);
if (status != DDI_SUCCESS) {
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(status, "failed mtt bind");
kmem_free(wre_new, (1 << log_srq_size) *
sizeof (tavor_wrid_entry_t));
tavor_queue_free(state, &new_srqinfo);
goto srqmodify_fail;
}
/*
* Calculate the offset between the kernel virtual address space
* and the IB virtual address space. This will be used when
* posting work requests to properly initialize each WQE.
*
* Note: bind addr is zero-based (from alloc) so we calculate the
* correct new offset here.
*/
bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
(uint64_t)bind.bi_addr;
/*
* Get the base address for the MTT table. This will be necessary
* below when we are modifying the MPT entry.
*/
rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
/*
* Fill in the MPT entry. This is the final step before passing
* ownership of the MPT entry to the Tavor hardware. We use all of
* the information collected/calculated above to fill in the
* requisite portions of the MPT.
*/
bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
mpt_entry.reg_win_len = bind.bi_len;
mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
mpt_entry.mttseg_addr_h = mtt_addr >> 32;
mpt_entry.mttseg_addr_l = mtt_addr >> 6;
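/*
 * Note: the low word of the MTT address is stored shifted down by
 * six bits, i.e. in 64-byte units, which presumes that MTT segments
 * are aligned on (at least) 64-byte boundaries.
 */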
/*
* Now we grab the SRQ lock. Since we will be updating the actual
* SRQ location and the producer/consumer indexes, we should hold
* the lock.
*
* We do a TAVOR_NOSLEEP here (and below), though, because we are
* holding the "srq_lock" and if we got raised to interrupt level
* by priority inversion, we would not want to block in this routine
* waiting for success.
*/
mutex_enter(&srq->srq_lock);
/*
* Copy old entries to new buffer
*/
srq_old_bufsz = srq->srq_wq_bufsz;
bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
/* Determine if later ddi_dma_sync will be necessary */
srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
/* Sync entire "new" SRQ for use by hardware (if necessary) */
if (srq_sync) {
(void) ddi_dma_sync(bind.bi_dmahdl, 0,
new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
}
/*
* Setup MPT information for use in the MODIFY_MPT command
*/
mr = srq->srq_mrhdl;
mutex_enter(&mr->mr_lock);
mpt = srq->srq_mrhdl->mr_mptrsrcp;
/*
* MODIFY_MPT
*
* If this fails for any reason, then it is an indication that
* something (either in HW or SW) has gone seriously wrong. So we
* print a warning message and return.
*/
status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
if (status != TAVOR_CMD_SUCCESS) {
cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
status);
TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
TAVOR_TNF_ERROR, "", tnf_uint, status, status);
TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
(void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
srq->srq_mrhdl->mr_mttrsrcp);
kmem_free(wre_new, (1 << log_srq_size) *
sizeof (tavor_wrid_entry_t));
tavor_queue_free(state, &new_srqinfo);
mutex_exit(&mr->mr_lock);
mutex_exit(&srq->srq_lock);
return (ibc_get_ci_failure(0));
}
/*
* Update the Tavor Shared Receive Queue handle with all the new
* information. At the same time, save away all the necessary
* information for freeing up the old resources
*/
old_srqinfo = srq->srq_wqinfo;
old_mtt = srq->srq_mrhdl->mr_mttrsrcp;
bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
sizeof (tavor_bind_info_t));
/* Now set the new info */
srq->srq_wqinfo = new_srqinfo;
srq->srq_wq_buf = buf;
srq->srq_wq_bufsz = (1 << log_srq_size);
bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
srq->srq_mrhdl->mr_mttrsrcp = mtt;
srq->srq_desc_off = srq_desc_off;
srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
/* Update MR mtt pagesize */
mr->mr_logmttpgsz = mtt_pgsize_bits;
mutex_exit(&mr->mr_lock);
#ifdef __lock_lint
mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
if (srq->srq_wrid_wql != NULL) {
mutex_enter(&srq->srq_wrid_wql->wql_lock);
}
#endif
/*
* Initialize new wridlist, if needed.
*
* If a wridlist is already set up on an SRQ (i.e. a QP associated with
* the SRQ has moved "from_reset"), then we must update this wridlist
* based on the new SRQ size. We allocate a new set of Work Request ID
* Entries sized to the new SRQ, copy the old entries over to the new
* list, and re-initialize the SRQ wridlist (in the non-umap case)
*/
wre_old = NULL;
if (srq->srq_wridlist != NULL) {
wre_old = srq->srq_wridlist->wl_wre;
bcopy(wre_old, wre_new, srq_old_bufsz *
sizeof (tavor_wrid_entry_t));
/* Setup new sizes in wre */
srq->srq_wridlist->wl_wre = wre_new;
srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;
if (!srq->srq_is_umap) {
tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
srq_old_bufsz);
}
}
#ifdef __lock_lint
mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
if (srq->srq_wrid_wql != NULL) {
mutex_exit(&srq->srq_wrid_wql->wql_lock);
}
#endif
/*
* If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
* to a user process, then we need to call devmap_devmem_remap() to
* invalidate the mapping to the SRQ memory. We also need to
* invalidate the SRQ tracking information for the user mapping.
*
* Note: The remap failure really shouldn't ever happen. So, if it
* does, it is an indication that something has gone seriously wrong.
* So we print a warning message and return error (knowing, of course,
* that the "old" SRQ memory will be leaked)
*/
if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
status = devmap_devmem_remap(srq->srq_umap_dhp,
state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
DEVMAP_MAPPING_INVALID, NULL);
if (status != DDI_SUCCESS) {
mutex_exit(&srq->srq_lock);
TAVOR_WARNING(state, "failed in SRQ memory "
"devmap_devmem_remap()");
/* We can, however, free the memory for old wre */
if (wre_old != NULL) {
kmem_free(wre_old, srq_old_bufsz *
sizeof (tavor_wrid_entry_t));
}
TAVOR_TNF_EXIT(tavor_srq_modify);
return (ibc_get_ci_failure(0));
}
srq->srq_umap_dhp = (devmap_cookie_t)NULL;
}
/*
* Drop the SRQ lock now. The only thing left to do is to free up
* the old resources.
*/
mutex_exit(&srq->srq_lock);
/*
* Unbind the MTT entries.
*/
status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
if (status != DDI_SUCCESS) {
TAVOR_WARNING(state, "failed to unbind old SRQ memory");
/* Set "status" and "errormsg" and goto failure */
TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
"failed to unbind (old)");
goto srqmodify_fail;
}
/* Free the memory for old wre */
if (wre_old != NULL) {
kmem_free(wre_old, srq_old_bufsz *
sizeof (tavor_wrid_entry_t));
}
/* Free the memory for the old SRQ */
tavor_queue_free(state, &old_srqinfo);
/*
* Fill in the return arguments (if necessary). This includes the
* real new shared receive queue size.
*/
if (real_size != NULL) {
*real_size = (1 << log_srq_size);
}
TAVOR_TNF_EXIT(tavor_srq_modify);
return (DDI_SUCCESS);
srqmodify_fail:
TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
tnf_string, msg, errormsg);
TAVOR_TNF_EXIT(tavor_srq_modify);
return (status);
}
/*
* tavor_srq_numcalc()
* Context: Can be called from interrupt or base context.
*/
static void
tavor_srq_numcalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
{
uint32_t tmp, log_num_srq;
/*
* Generate a simple key from counter. Note: We increment this
* static variable _intentionally_ without any kind of mutex around
* it. First, single-threading all operations through a single lock
* would be a bad idea (from a performance point-of-view). Second,
* the upper "unconstrained" bits don't really have to be unique
* because the lower bits are guaranteed to be (although we do make a
* best effort to ensure that they are). Third, the window for the
* race (where both threads read and update the counter at the same
* time) is incredibly small.
*/
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_srqnum_cnt))
log_num_srq = state->ts_cfg_profile->cp_log_num_srq;
tmp = (tavor_debug_srqnum_cnt++) << log_num_srq;
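/*
 * The key's low "log_num_srq" bits are the constrained SRQC table
 * index; the bits above them carry the unconstrained counter value.
 * For example, with cp_log_num_srq == 10, the low 10 bits identify
 * the SRQ and the upper bits change on each allocation, which is what
 * lets tavor_srqhdl_from_srqnum() help detect stale events.
 */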
*key = (tmp | indx) & TAVOR_CQ_MAXNUMBER_MSK;
}
/*
* tavor_srq_refcnt_inc()
* Context: Can be called from interrupt or base context.
*/
void
tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
{
mutex_enter(&srq->srq_lock);
TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
tnf_uint, refcnt, srq->srq_refcnt);
srq->srq_refcnt++;
mutex_exit(&srq->srq_lock);
}
/*
* tavor_srq_refcnt_dec()
* Context: Can be called from interrupt or base context.
*/
void
tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
{
mutex_enter(&srq->srq_lock);
srq->srq_refcnt--;
TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
tnf_uint, refcnt, srq->srq_refcnt);
mutex_exit(&srq->srq_lock);
}
/*
* tavor_srqhdl_from_srqnum()
* Context: Can be called from interrupt or base context.
*
* This routine is important because changing the unconstrained
* portion of the SRQ number is critical to the detection of a
* potential race condition in the SRQ handler code (i.e. the case
* where an SRQ is freed and alloc'd again before an event for the
* "old" SRQ can be handled).
*
* While this is not a perfect solution (not sure that one exists)
* it does help to mitigate the chance that this race condition will
* cause us to deliver a "stale" event to the new SRQ owner. Note:
* this solution does not scale well because the number of constrained
* bits increases (and, hence, the number of unconstrained bits
* decreases) as the number of supported SRQs grows. For small and
* intermediate values, it should hopefully provide sufficient
* protection.
*/
tavor_srqhdl_t
tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
{
uint_t srqindx, srqmask;
/* Calculate the SRQ table index from the srqnum */
srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
srqindx = srqnum & srqmask;
return (state->ts_srqhdl[srqindx]);
}
/*
* tavor_srq_sgl_to_logwqesz()
* Context: Can be called from interrupt or base context.
*/
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
uint_t max_size, log2, actual_sgl;
TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);
switch (wq_type) {
case TAVOR_QP_WQ_TYPE_RECVQ:
/*
* Use requested maximum SGL to calculate max descriptor size
* (while guaranteeing that the descriptor size is a
* power-of-2 cachelines).
*/
max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
log2 = highbit(max_size);
if ((max_size & (max_size - 1)) == 0) {
log2 = log2 - 1;
}
/* Make sure descriptor is at least the minimum size */
log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
/* Calculate actual number of SGL (given WQE size) */
actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
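/*
 * For example (illustrative only): each SGL entry occupies 16 bytes,
 * so num_sgl == 3 gives max_size = TAVOR_QP_WQE_MLX_RCV_HDRS + 48.
 * Because the WQE size is rounded up to a power-of-2, the final WQE
 * may have room for more SGL entries than were requested, which is
 * why "actual_sgl" is recomputed from the final WQE size.
 */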
break;
default:
TAVOR_WARNING(state, "unexpected work queue type");
TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
TAVOR_TNF_ERROR, "");
/* Fall back to the minimum size so the outputs are well-defined */
log2 = TAVOR_QP_WQE_LOG_MINIMUM;
actual_sgl = 0;
break;
}
/* Fill in the return values */
*logwqesz = log2;
*max_sgl = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);
TAVOR_TNF_EXIT(tavor_srq_sgl_to_logwqesz);
}