ibd.c revision 03494a9880d80f834bec10a1e8f0a2f8f7c97bf4
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* An implementation of the IPoIB standard based on PSARC 2001/289.
*/
#include <sys/mac_provider.h>
#include <sys/multidata.h>
/*
* and performance studies.
*
* none: h/w (Tavor) and driver does not do checksum, IP software must.
* partial: driver does data checksum, IP must provide pseudo header.
* perf_partial: driver uses IP provided pseudo cksum as data checksum
* (thus, real checksumming is not done).
*/
typedef enum {
/*
* Per interface tunable parameters.
*/
/* should be less than max Tavor CQsize and be 2^n - 1 */
uint_t ibd_fifo_depth = 0;
/*
* The driver can use separate CQs for send and receive queues.
* While using separate CQs, it is possible to put the send CQ
* in polling mode, ie not to enable notifications on that CQ.
* If both CQs are interrupt driven, currently it is not possible
* for their handlers to be invoked concurrently (since Tavor ties
* both interrupts to the same PCI intr line); but the handlers
* are not coded with a single interrupt cpu assumption (eg
* id_num_intrs is incremented atomically).
*
* The driver private struct uses id_scq_hdl to track the separate
* CQ being used for send; the id_rcq_hdl tracks the receive CQ
* if using separate CQs, or it tracks the single CQ when using
* combined CQ. The id_wcs completion array is used in the combined
* CQ case, and for fetching Rx completions in the separate CQs case;
* the id_txwcs is used to fetch Tx completions in the separate CQs
* case.
*/
uint_t ibd_txcomp_poll = 0;
/*
* the softintr is introduced to avoid Event Queue overflow. It
* should not have heavy load in CQ event handle function.
* If service fifos is enabled, this is not required, because
* mac_rx() will be called by service threads.
*/
/*
* Initial number of IBA resources allocated.
*/
#define IBD_NUM_RWQE ibd_num_rwqe
#define IBD_NUM_SWQE ibd_num_swqe
#define IBD_NUM_AH ibd_num_ah
/* when <= threshold, it's faster to copy to a premapped buffer */
/*
* When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
* allocate a new WQE to put on the rxlist. This value must be <=
*/
#define IBD_RX_THRESHOLD ibd_rx_threshold
/*
* Hash table size for the active AH list.
*/
#define IBD_HASH_SIZE ibd_hash_size
#define IBD_TXPOLL_THRESHOLD 64
/*
*/
#define IBD_SEND 0
#define IBD_RECV 1
/*
*/
{ \
}
#define IBD_CLEAR_SCOPE_PKEY(maddr) \
{ \
}
/*
* when free tx wqes >= threshold and reschedule flag is set,
* ibd will call mac_tx_update to re-enable Tx.
*/
#define IBD_TX_UPDATE_THRESHOLD 1
/* Driver State Pointer */
void *ibd_list;
/* Required system entry points */
/* Required driver entry points for GLDv3 */
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_unicst(void *, const uint8_t *);
static int ibd_m_promisc(void *, boolean_t);
/* Private driver entry points for GLDv3 */
static uint_t ibd_tx_recycle(char *);
static void ibd_state_fini(ibd_state_t *);
static int ibd_drv_init(ibd_state_t *);
static void ibd_drv_fini(ibd_state_t *);
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
static int ibd_init_txlist(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_freemsg_cb(char *);
static int ibd_acache_init(ibd_state_t *);
static void ibd_acache_fini(ibd_state_t *);
static void ibd_async_unsetprom(ibd_state_t *);
static void ibd_async_setprom(ibd_state_t *);
static void ibd_async_txsched(ibd_state_t *);
static void ibd_async_work(ibd_state_t *);
ipoib_mac_t *);
#ifdef RUN_PERFORMANCE
static void ibd_perf(ibd_state_t *);
#endif
/* Module Driver Info */
static struct modldrv ibd_modldrv = {
&mod_driverops, /* This one is a driver */
"InfiniBand GLDv3 Driver", /* short description */
&ibd_dev_ops /* driver specific ops */
};
/* Module Linkage */
static struct modlinkage ibd_modlinkage = {
};
/*
* Module Info passed to IBTL during IBT_ATTACH.
* NOTE: This data must be static (i.e. IBTL just keeps a pointer to this
* data).
*/
static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
NULL,
"IPIB"
};
/*
* Async operation types.
*/
#define ASYNC_GETAH 1
#define ASYNC_JOIN 2
#define ASYNC_LEAVE 3
#define ASYNC_PROMON 4
#define ASYNC_PROMOFF 5
#define ASYNC_REAP 6
#define ASYNC_TRAP 8
#define ASYNC_SCHED 9
#define ASYNC_LINK 10
#define ASYNC_EXIT 11
/*
* Async operation states
*/
#define NOTSTARTED 0
#define ONGOING 1
#define COMPLETED 2
#define ERRORED 3
#define ROUTERED 4
#define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF
#define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB)
static mac_callbacks_t ib_m_callbacks = {
NULL,
};
#ifdef DEBUG
int ibd_debuglevel = 100;
/*
 * Emit a driver debug message when its level `l` meets the configured
 * ibd_debuglevel threshold; messages below the threshold are dropped.
 *
 * NOTE(review): as visible here the body only performs the level check
 * and returns — `fmt` and the varargs are never consumed, so no output
 * is actually produced. The formatting/output logic (presumably a
 * vsprintf/cmn_err sequence) appears to have been elided from this
 * copy of the file — TODO confirm against the original source.
 */
static void
debug_print(int l, char *fmt, ...)
{
if (l < ibd_debuglevel)
return;
}
#define DPRINT debug_print
#else /* DEBUG */
#define INCRXPACK 0
#define INCTXPACK 0
#define DPRINT
#endif /* DEBUG */
/*
* Common routine to print warning messages; adds in hca guid, port number
* and pkey to be able to identify the IBA interface.
*/
static void
{
char ibd_print_buf[256];
int len;
0, "hca-guid", 0);
"%s%d: HCA GUID %016llx port %d PKEY %02x ",
}
/* warlock directives */
#ifdef DEBUG
#endif
int
_init()
{
int status;
/*
* Sanity check some parameter settings. Tx completion polling
* only makes sense with separate CQs for Tx and Rx.
*/
"Setting ibd_txcomp_poll = 0 for combined CQ");
ibd_txcomp_poll = 0;
}
if (status != 0) {
return (status);
}
if (status != 0) {
return (status);
}
return (0);
}
int
{
}
int
_fini()
{
int status;
if (status != 0)
return (status);
return (0);
}
/*
* Convert the GID part of the mac address from network byte order
* to host order.
*/
static void
{
}
/*
* Create the IPoIB address in network byte order from host order inputs.
*/
static void
{
}
/*
* Send to the appropriate all-routers group when the IBA multicast group
* does not exist, based on whether the target group is v4 or v6.
*/
static boolean_t
{
/*
* Copy the first 4 bytes in without assuming any alignment of
* input mac address; this will have IPoIB signature, flags and
* scope bits.
*/
/*
*/
else
/*
* Does not have proper bits in the mgid address.
*/
return (retval);
}
/*
* Implementation of various (software) flavors of send and receive side
* checksumming.
*/
#define IBD_CKSUM_SEND(mp) { \
\
if (ibd_csum_send == IBD_CSUM_NONE) \
goto punt_send; \
\
/* \
* Query IP whether Tx cksum needs to be done. \
*/ \
\
if (flags == HCK_PARTIALCKSUM) { \
if (ibd_csum_send == IBD_CSUM_PARTIAL) { \
*up = 0; \
/* \
* SGL. Applicable only for a single SGL \
* within the range of buf. \
*/ \
} else { \
} \
} \
punt_send: \
; \
}
#define IBD_CKSUM_RECV(mp) { \
ipoib_hdr_t *ipibh; \
\
if (ibd_csum_recv == IBD_CSUM_NONE) \
goto punt_recv; \
\
goto punt_recv; \
\
goto punt_recv; \
else \
goto punt_recv; \
\
flags = HCK_PARTIALCKSUM; \
\
if (ibd_csum_recv == IBD_CSUM_PARTIAL) { \
} else { \
} \
punt_recv: \
; \
}
/*
* Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
* padding by default at the end. The routine which is doing is nce_xmit()
* in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
* the packet comes down from IP layer to the IBD driver, it is in the
* following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
* This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
* machdr is not 4 byte aligned and had 2 bytes of padding at the end.
*
* The send routine at IBD driver changes this packet as follows:
* [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
* followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
* aligned.
*
* At the receiving side again ibd_process_rx takes the above packet and
* removes the two bytes of front padding and inserts it at the end. This
* is since the IP layer does not understand padding at the front.
*/
uchar_t *nd_lla_ptr; \
nd_opt_hdr_t *opt; \
int i; \
\
len -= sizeof (nd_neighbor_advert_t); \
(len != 0)) { \
+ IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
if (type == 0) { \
for (i = IPOIB_ADDRL; i > 0; i--) \
*(nd_lla_ptr + i + 1) = \
*(nd_lla_ptr + i - 1); \
} else { \
for (i = 0; i < IPOIB_ADDRL; i++) \
*(nd_lla_ptr + i) = \
*(nd_lla_ptr + i + 2); \
} \
*(nd_lla_ptr + i) = 0; \
*(nd_lla_ptr + i + 1) = 0; \
} \
}
/*
* The service fifo code is copied verbatim from Cassini. This can be
* enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu.
*/
typedef struct _srv_fifo_t {
} srv_fifo_t, *p_srv_fifo_t;
static int
{
int status;
} else
return (status);
}
static void
{
if (handle->objs_pending != 0)
}
static caddr_t
{
#ifndef __lock_lint
callb_generic_cpr, "srv_fifo");
#endif /* !__lock_lint */
return (handle->drain_func_arg);
}
static void
{
#ifndef __lock_lint
#endif /* ! _lock_lint */
thread_exit();
}
static int
{
int status;
if (status == DDI_SUCCESS) {
if (ptr) {
else
handle->objs_pending++;
} else
if (signal)
} else {
}
}
return (status);
}
static int
{
int status;
if (status == DDI_SUCCESS) {
if (handle->objs_pending == 0) {
#ifndef __lock_lint
#endif /* !_lock_lint */
}
if (handle->objs_pending > 0) {
else
handle->objs_pending--;
}
} else {
if (handle->objs_pending) {
else
handle->objs_pending--;
} else
}
return (status);
}
/*
* [un]map_rx_srv_fifos has been modified from its CE version.
*/
static void
{
/*
* Hand off to GLDv3.
*/
}
}
static p_srv_fifo_t *
{
int i, inst_taskqs, depth;
/*
* Default behavior on both sparc and amd cpus in terms of
* of worker thread is as follows: (N) indicates worker thread
* not enabled , (Y) indicates worker thread enabled. Default of
* ibd_srv_fifo is set to 0xffff. The default behavior can be
* overridden by setting ibd_srv_fifos to 0 or 1 as shown below.
* Worker thread model assigns lower priority to network
* processing making system more usable at higher network
* loads.
* ________________________________________________________
* |Value of ibd_srv_fifo | 0 | 1 | 0xffff| 0 | 1 | 0xfffff |
* |----------------------|---|---|-------|---|---|---------|
* | | Sparc | x86 |
* |----------------------|---|---|-------|---|---|---------|
* | Single CPU |N | Y | N | N | Y | N |
* |----------------------|---|---|-------|---|---|---------|
* | Multi CPU |N | Y | Y | N | Y | Y |
* |______________________|___|___|_______|___|___|_________|
*/
(ibd_srv_fifos == 0)) {
*nfifos = 0;
return ((p_srv_fifo_t *)1);
}
*nfifos = inst_taskqs;
KM_SLEEP);
/*
* If the administrator has specified a fifo depth, use
* that, else just decide what should be the depth.
*/
if (ibd_fifo_depth == 0)
else
for (i = 0; i < inst_taskqs; i++)
if (_ddi_srv_fifo_create(&srv_fifos[i],
break;
if (i < inst_taskqs)
goto map_rx_srv_fifos_fail1;
goto map_rx_srv_fifos_exit;
i--;
for (; i >= 0; i--) {
}
return (srv_fifos);
}
static void
{
int i;
/*
* If this interface was not using service fifos, quickly return.
*/
if (inst_taskqs == 0)
return;
for (i = 0; i < inst_taskqs; i++) {
}
}
/*
* Choose between sending up the packet directly and handing off
* to a service thread.
*/
static void
{
/*
* Quick path if the interface is not using service fifos.
*/
if (nfifos == 0) {
return;
}
/*
* Is the packet big enough to look at the IPoIB header
* and basic IP header to determine whether it is an
* IPv4 packet?
*/
sizeof (struct ip))) {
/*
* Is the packet an IP(v4) packet?
*/
/*
* TCP or UDP packet? We use the UDP header, since
* the first few words of both headers are laid out
*/
/*
* Are we within limits of this packet? If
* so, use the destination port to hash to
* a service thread.
*/
sizeof (*tran_hdr)))
}
}
}
/*
* packet up in interrupt context, reducing latency.
*/
if (tnum == -1) {
goto hand_off;
}
B_TRUE) != DDI_SUCCESS)
}
/*
* Address handle entries maintained by the driver are kept in the
* free and active lists. Each entry starts out in the free list;
* it migrates to the active list when primed using ibt_get_paths()
* and ibt_modify_ud_dest() for transmission to a specific destination.
* In the active list, the entry has a reference count indicating the
* number of ongoing/uncompleted transmits that reference it. The
* entry is left in the active list even after the reference count
* goes to 0, since successive transmits can find it there and do
* not need to set up another entry (ie the path information is
* cached using the active list). Entries on the active list are
* also hashed using the destination link address as a key for faster
* lookups during transmits.
*
* For any destination address (unicast or multicast, whatever the
* join states), there will be at most one entry in the active list.
* Entries with a 0 reference count on the active list can be reused
* for a transmit to a new destination, if the free list is empty.
*
* active list does not need a lock (all operations are done by the
* async thread) but updates to the reference count are atomically
* done (increments done by Tx path, decrements by the Tx callback handler).
*/
#define IBD_ACACHE_GET_FREE(state) \
int _ret_; \
}
}
#define IBD_ACACHE_GET_ACTIVE(state) \
/*
* Membership states for different mcg's are tracked by two lists:
* the "non" list is used for promiscuous mode, when all mcg traffic
* needs to be inspected. This type of membership is never used for
* transmission, so there can not be an AH in the active list
* corresponding to a member in this list. This list does not need
* any protection, since all operations are performed by the async
* thread.
*
* "Full" and "SendOnly" membership is tracked using a single list,
* the "full" list. This is because this single list can then be
* searched during transmit to a multicast group (if an AH for the
* mcg is not found in the active list), since at least one type
* of membership must be present before initiating the transmit.
* This list is also emptied during driver detach, since sendonly
* membership acquired during transmit is dropped at detach time
* this list are done only by the async thread, but it is also
* searched in program context (see multicast disable case), thus
* the id_mc_mutex protects the list. The driver detach path also
* deconstructs the "full" list, but it ensures that the async
* thread will not be accessing the list (by blocking out mcg
* trap handling and making sure no more Tx reaping will happen).
*
* Currently, an IBA attach is done in the SendOnly case too,
* although this is not required.
*/
/*
* AH and MCE active list manipulation:
*
* Multicast disable requests and MCG delete traps are two cases
* where the active AH entry for the mcg (if any unreferenced one exists)
* will be moved to the free list (to force the next Tx to the mcg to
* join the MCG in SendOnly mode). Port up handling will also move AHs
* from active to free list.
*
* In the case when some transmits are still pending on an entry
* for an mcg, but a multicast disable has already been issued on the
* mcg, there are some options to consider to preserve the join state
* to ensure the emitted packet is properly routed on the IBA fabric.
* For the AH, we can
* 1. take out of active list at multicast disable time.
* 2. take out of active list only when last pending Tx completes.
* For the MCE, we can
* 3. take out of active list at multicast disable time.
* 4. take out of active list only when last pending Tx completes.
* 5. move from active list to stale list at multicast disable time.
* We choose to use 2,4. We use option 4 so that if a multicast enable
* is tried before the pending Tx completes, the enable code finds the
* mce in the active list and just has to make sure it will not be reaped
* (ie the mcg leave done) when the pending Tx does complete. Alternatively,
* a stale list (#5) that would be checked in the enable code would need
* to be implemented. Option 2 is used, because otherwise, a Tx attempt
* after the multicast disable would try to put an AH in the active list,
* and associate the mce it finds in the active list to this new AH,
* whereas the mce is already associated with the previous AH (taken off
* the active list), and will be removed once the pending Tx's complete
* (unless a reference count on mce's is implemented). One implication of
* using 2,4 is that new Tx's posted before the pending Tx's complete will
* grab new references on the AH, further delaying the leave.
*
* In the case of mcg delete (or create) trap when the port is sendonly
* joined, the AH and MCE handling is different: the AH and MCE has to be
* immediately taken off the active lists (forcing a join and path lookup
* at the next Tx is the only guaranteed means of ensuring a proper Tx
* to an mcg as it is repeatedly created and deleted and goes thru
* reincarnations).
*
* When a port is already sendonly joined, and a multicast enable is
* attempted, the same mce structure is promoted; this ensures only a
* single mce on the active list tracks the most powerful join state.
*
* In the case of port up event handling, the MCE for sendonly membership
* is freed up, and the ACE is put into the free list as soon as possible
* (depending on whether posted Tx's have completed). For fullmembership
* MCE's though, the ACE is similarly handled; but the MCE is kept around
* (a re-JOIN is attempted) only if the DLPI leave has not already been
* done; else the mce is deconstructed (mc_fullreap case).
*
* MCG creation and deletion trap handling:
*
* These traps are unreliable (meaning sometimes the trap might never
* be delivered to the subscribed nodes) and may arrive out-of-order
* since they use UD transport. An alternative to relying on these
* unreliable traps is to poll for mcg presence every so often, but
* instead of doing that, we try to be as conservative as possible
* while handling the traps, and hope that the traps do arrive at
* the subscribed nodes soon. Note that if a node is fullmember
* trap for that mcg (by fullmember definition); if it does, it is
* an old trap from a previous incarnation of the mcg.
*
* Whenever a trap is received, the driver cleans up its sendonly
* membership to the group; we choose to do a sendonly leave even
* on a creation trap to handle the case of a prior deletion of the mcg
* having gone unnoticed. Consider an example scenario:
* T1: MCG M is deleted, and fires off deletion trap D1.
* T2: MCG M is recreated, fires off creation trap C1, which is lost.
* T3: Node N tries to transmit to M, joining in sendonly mode.
* T4: MCG M is deleted, and fires off deletion trap D2.
* T5: N receives a deletion trap, but can not distinguish D1 from D2.
* If the trap is D2, then a LEAVE is not required, since the mcg
* is already deleted; but if it is D1, a LEAVE is required. A safe
* approach is to always LEAVE, but the SM may be confused if it
* receives a LEAVE without a prior JOIN.
*
* Management of the non-membership to an mcg is similar to the above,
* except that if the interface is in promiscuous mode, it is required
* to attempt to re-join the mcg after receiving a trap. Unfortunately,
* if the re-join attempt fails (in which case a warning message needs
* to be printed), it is not clear whether it failed due to the mcg not
* mcg is also racy at best. Thus, the driver just prints a warning
* message when it can not rejoin after receiving a create trap, although
* this might be (on rare occasions) a mis-warning if the create trap is
* received after the mcg was deleted.
*/
/*
* Implementation of atomic "recycle" bits and reference count
* on address handles. This utilizes the fact that max reference
* count on any handle is limited by number of send wqes, thus
* high bits in the ac_ref field can be used as the recycle bits,
* and only the low bits hold the number of pending Tx requests.
* This atomic AH reference counting allows the Tx completion
* handler not to acquire the id_ac_mutex to process every completion,
* thus reducing lock contention problems between completion and
* the Tx path.
*/
#define CYCLEVAL 0x80000
#define GET_REF_CYCLE(ace) ( \
/* \
* Make sure "cycle" bit is set. \
*/ \
)
}
#define SET_CYCLE_IF_REF(ace) ( \
CYCLEVAL ? \
/* \
* Clear the "cycle" bit we just set; \
* ref count known to be 0 from above. \
*/ \
/* \
* We set "cycle" bit; let caller know. \
*/ \
B_TRUE \
)
#define DEC_REF_DO_CYCLE(ace) ( \
CYCLEVAL ? \
/* \
* Ref count known to be 0 from above. \
*/ \
B_TRUE : \
B_FALSE \
)
static void *
{
return (lhead);
}
/*
* This is always guaranteed to be able to queue the work.
*/
static void
{
/* Initialize request */
/*
* Queue provided slot onto request pool.
*/
/* Go, fetch, async thread */
}
/*
* Main body of the per interface async thread.
*/
static void
{
callb_generic_cpr, "ibd_async_work");
for (;;) {
/*
* Once we have done the operation, there is no
* guarantee the request slot is going to be valid,
* it might be freed up (as in ASYNC_LEAVE,REAP,TRAP).
*/
/* Perform the request */
case ASYNC_GETAH:
break;
case ASYNC_REAP:
/*
* the req buf contains in mce
* structure, so we do not need
* to free it here.
*/
break;
case ASYNC_LEAVE:
case ASYNC_JOIN:
break;
case ASYNC_PROMON:
break;
case ASYNC_PROMOFF:
break;
case ASYNC_TRAP:
break;
case ASYNC_SCHED:
break;
case ASYNC_LINK:
break;
case ASYNC_EXIT:
#ifndef __lock_lint
#endif /* !__lock_lint */
return;
}
} else {
/*
* Nothing to do: wait till new request arrives.
*/
#ifndef __lock_lint
#endif /* !__lock_lint */
}
}
/*NOTREACHED*/
}
/*
* Return when it is safe to queue requests to the async daemon; primarily
* for subnet trap and async event handling. Disallow requests before the
* daemon is created, and when interface deinitialization starts.
*/
static boolean_t
/*
 * NOTE(review): the function-name/parameter line appears to have been
 * elided from this copy — from the preceding comment this is presumably
 * ibd_async_safe(ibd_state_t *state); TODO confirm against the original.
 * Refuses (B_FALSE) when trap/event processing has been stopped for
 * detach; otherwise counts this caller as an in-progress trap handler
 * (id_trap_inprog) and allows it (B_TRUE). Presumably id_trap_mutex is
 * held around this — the locking lines are not visible here; verify.
 */
{
if (state->id_trap_stop) {
return (B_FALSE);
}
state->id_trap_inprog++;
return (B_TRUE);
}
/*
* Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
* trap or event handling to complete to kill the async thread and deconstruct
*/
static void
{
if (--state->id_trap_inprog == 0)
}
/*
* Hash functions:
* ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
* ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
* These operate on mac addresses input into ibd_send, but there is no
* guarantee on the alignment of the ipoib_mac_t structure.
*/
/*ARGSUSED*/
static uint_t
{
/*
* If the input address is 4 byte aligned, we can just dereference
* it. This is most common, since IP will send in a 4 byte aligned
* IP header, which implies the 24 byte IPoIB pseudo header will be
* 4 byte aligned too.
*/
if ((ptraddr & 3) == 0)
return (hval);
}
static int
{
return (0);
else
return (1);
}
/*
* Initialize all the per interface caches and lists; AH cache,
* MCG list etc.
*/
static int
{
int i;
for (i = 0; i < IBD_NUM_AH; i++, ce++) {
return (DDI_FAILURE);
} else {
}
}
return (DDI_SUCCESS);
}
static void
{
}
}
}
/*
* Search AH active hash list for a cached path to input destination.
* If we are "just looking", hold == F. When we are in the Tx path,
* we set hold == T to grab a reference on the AH so that it can not
* be recycled to a new destination while the Tx request is posted.
*/
static ibd_ace_t *
{
/*
* Do hash search.
*/
if (hold)
return (ptr);
}
return (NULL);
}
/*
* This is called by the tx side; if an initialized AH is found in
* the active list, it is locked down and can be used; if no entry
* is found, an async request is queued to do path resolution.
*/
static ibd_ace_t *
{
/*
* Only attempt to print when we can; in the mdt pattr case, the
* address is not aligned properly.
*/
DPRINT(4,
"ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
return (ptr);
}
/*
* Implementation of a single outstanding async request; if
* the operation is not started yet, queue a request and move
* to ongoing state. Remember in id_ah_addr for which address
* we are queueing the request, in case we need to flag an error;
* Any further requests, for the same or different address, until
* the operation completes, is sent back to GLDv3 to be retried.
* The async thread will update id_ah_op with an error indication
* or will set it to indicate the next look up can start; either
* way, it will mac_tx_update() so that all blocked requests come
* back here.
*/
/*
* We did not even find the entry; queue a request
* for it.
*/
}
/*
* Check the status of the pathrecord lookup request
* we had queued before.
*/
state->id_ah_error++;
} else {
/*
* ROUTERED case: We need to send to the
* all-router MCG. If we can find the AH for
* the mcg, the Tx will be attempted. If we
* do not find the AH, we return NORESOURCES
* to retry.
*/
numwqe);
}
/*
* This case can happen when we get a higher band
* packet. The easiest way is to reset the state machine
* to accommodate the higher priority packet.
*/
}
return (ptr);
}
/*
* Grab a not-currently-in-use AH/PathRecord from the active
* list to recycle to a new destination. Only the async thread
* executes this code.
*/
static ibd_ace_t *
{
/*
* Do plain linear search.
*/
/*
* Note that it is possible that the "cycle" bit
* is set on the AH w/o any reference count. The
* mcg must have been deleted, and the tx cleanup
* just decremented the reference count to 0, but
* hasn't gotten around to grabbing the id_ac_mutex
* to move the AH into the free list.
*/
break;
}
}
return (ptr);
}
/*
* Invoked to clean up AH from active list in case of multicast
* disable and to handle sendonly memberships during mcg traps.
* And for port up processing for multicast and unicast AHs.
* Normally, the AH is taken off the active list, and put into
* the free list to be recycled for a new destination. In case
* Tx requests on the AH have not completed yet, the AH is marked
* for reaping (which will put the AH on the free list) once the Tx's
* complete; in this case, depending on the "force" input, we take
* out the AH from the active list right now, or leave it also for
* the reap operation. Returns TRUE if the AH is taken off the active
* list (and either put into the free list right now, or arranged for
* later), FALSE otherwise.
*/
static boolean_t
{
/*
* Note that the AH might already have the cycle bit set
* on it; this might happen if sequences of multicast
* enables and disables are coming so fast, that posted
* Tx's to the mcg have not completed yet, and the cycle
* bit is set successively by each multicast disable.
*/
if (SET_CYCLE_IF_REF(acactive)) {
if (!force) {
/*
* The ace is kept on the active list, further
* Tx's can still grab a reference on it; the
* ace is reaped when all pending Tx's
* referencing the AH complete.
*/
} else {
/*
* In the mcg trap case, we always pull the
* AH from the active list. And also the port
*/
}
} else {
/*
* Determined the ref count is 0, thus reclaim
* immediately after pulling out the ace from
* the active list.
*/
}
}
return (ret);
}
/*
* Helper function for async path record lookup. If we are trying to
* Tx to a MCG, check our membership, possibly trying to join the
* group if required. If that fails, try to send the packet to the
* all router group (indicated by the redirect output), pointing
* the input mac address to the router mcg address.
*/
static ibd_mce_t *
{
/*
* Check the FullMember+SendOnlyNonMember list.
* Since we are the only one who manipulates the
* id_mc_full list, no locks are needed.
*/
return (mce);
}
/*
* Not found; try to join(SendOnlyNonMember) and attach.
*/
NULL) {
return (mce);
}
/*
* MCGroup not present; try to join the all-router group. If
* any of the following steps succeed, we will be redirecting
* to the all router group.
*/
return (NULL);
/*
* Are we already joined to the router group?
*/
"group\n");
return (mce);
}
/*
* Can we join(SendOnlyNonMember) the router group?
*/
NULL) {
return (mce);
}
return (NULL);
}
/*
* Async path record lookup code.
*/
static void
{
int ret = NOTSTARTED;
/*
* Check whether we are trying to transmit to a MCG.
* In that case, we need to make sure we are a member of
* the MCG.
*/
/*
* If we can not find or join the group or even
* redirect, error out.
*/
NULL) {
return;
}
/*
* If we got redirected, we need to determine whether
* the AH for the new mcg is in the cache already, and
* not pull it in then; otherwise proceed to get the
* path for the new mcg. There is no guarantee that
* if the AH is currently in the cache, it will still be
* there when we look in ibd_acache_lookup(), but that's
* okay, we will come back here.
*/
if (redirected) {
"%08X:%08X:%08X:%08X:%08X",
return;
}
}
}
/*
* Get an AH from the free list.
*/
/*
* No free ones; try to grab an unreferenced active
* one. Maybe we need to make the active list LRU,
* but that will create more work for Tx callbacks.
* Is there a way of not having to pull out the
* entry from the active list, but just indicate it
* is being recycled? Yes, but that creates one more
* check in the fast lookup path.
*/
/*
* Pretty serious shortage now.
*/
"slot\n");
return;
}
/*
* We could check whether ac_mce points to a SendOnly
* member and drop that membership now. Or do it lazily
* at detach time.
*/
}
/*
* Update the entry.
*/
goto error;
}
goto error;
}
/*
* mce is set whenever an AH is being associated with a
* MCG; this will come in handy when we leave the MCG. The
* lock protects Tx fastpath from scanning the active list.
*/
return;
/*
* We might want to drop SendOnly membership here if we
* joined above. The lock protects Tx callbacks inserting
* into the free list.
*/
}
/*
* While restoring port's presence on the subnet on a port up, it is possible
* that the port goes down again.
*/
static void
{
/*
* this on a link down, since we will be unable to do SA operations,
* defaulting to the lowest speed. Also notice that we update our
* notion of speed before calling mac_link_update(), which will do
* necessary higher level notifications for speed changes.
*/
}
/*
* Do all the work required to establish our presence on
* the subnet.
*/
if (opcode == IBD_LINK_UP_ABSENT) {
/*
* If in promiscuous mode ...
*/
/*
* Drop all nonmembership.
*/
/*
* Then, try to regain nonmembership to all mcg's.
*/
}
/*
* Drop all sendonly membership (which also gets rid of the
* AHs); try to reacquire all full membership.
*/
else
}
/*
* Recycle all active AHs to free list (and if there are
* pending posts, make sure they will go into the free list
* once the Tx's complete). Grab the lock to prevent
* concurrent Tx's as well as Tx cleanups.
*/
B_TRUE);
/*
* If this is for an mcg, it must be for a fullmember,
* since we got rid of send-only members above when
* processing the mce list.
*/
/*
* Check if the fullmember mce needs to be torn down,
* ie whether the DLPI disable has already been done.
* If so, do some of the work of tx_cleanup, namely
* causing leave (which will fail), detach and
* mce-freeing. tx_cleanup will put the AH into free
* list. The reason to duplicate some of this
* tx_cleanup work is because we want to delete the
* AH right now instead of waiting for tx_cleanup, to
* force subsequent Tx's to reacquire an AH.
*/
}
}
/*
* mac handle is guaranteed to exist since driver does ibt_close_hca()
* (which stops further events from being delivered) before
* mac_unregister(). At this point, it is guaranteed that mac_register
* has already been done.
*/
}
/*
* When the link is notified up, we need to do a few things, based
* on the port's current p_init_type_reply claiming a reinit has been
* done or not. The reinit steps are:
* 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
* the old Pkey and GID0 are correct.
* 2. Register for mcg traps (already done by ibmf).
* 3. If PreservePresenceReply indicates the SM has restored port's presence
* in subnet, nothing more to do. Else go to next steps (on async daemon).
* 4. Give up all sendonly memberships.
* 5. Acquire all full memberships.
* 6. In promiscuous mode, acquire all non memberships.
* 7. Recycle all AHs to free list.
*/
static void
{
/*
* Do not send a request to the async daemon if it has not
* yet been created or is being destroyed. If the async
* daemon has not yet been created, we still need to track
* last known state of the link. If this code races with the
* detach path, then we are assured that the detach path has
* not yet done the ibt_close_hca (which waits for all async
* events to complete). If the code races with the attach path,
* the initialization path has already set these up and created
* IBTF resources based on the values.
*/
/*
* If the init code in ibd_drv_init hasn't yet set up the
*/
return;
}
if (code == IBT_EVENT_PORT_UP) {
" ibt_query_port()\n");
return;
}
/*
* If the link already went down by the time the handler gets
* are not valid.
*/
/*
* In InitTypeReply, check if NoLoadReply ==
*/
if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) &&
(!badup)) {
/*
* Check that the subnet part of GID0 has not changed.
*/
sizeof (ib_gid_t)) != 0)
/*
*/
}
/*
* In InitTypeReply, if PreservePresenceReply indicates the SM
* has ensured that the port's presence in mcg, traps etc is
* intact, nothing more to do.
*/
if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
if (badup)
}
if (!ibd_async_safe(state)) {
return;
}
if (code == IBT_ERROR_PORT_DOWN)
}
/*
* invocations of the handler. IBTL might coalesce link transition events,
* invoke the handler with last known state
*/
static void
{
switch (code) {
break;
case IBT_ERROR_CQ:
break;
case IBT_ERROR_PORT_DOWN:
case IBT_EVENT_PORT_UP:
/*
* Events will be delivered to all instances that have
* done ibt_open_hca() but not yet done ibt_close_hca().
* Only need to do work for our port; IBTF will deliver
* events for other ports on the hca we have ibt_open_hca'ed
* too. Note that ibd_drv_init() initializes id_port before
* doing ibt_open_hca().
*/
break;
break;
case IBT_HCA_ATTACH_EVENT:
case IBT_HCA_DETACH_EVENT:
/*
* When a new card is plugged to the system, attach_event is
* invoked. Additionally, a cfgadm needs to be run to make the
* card known to the system, and an ifconfig needs to be run to
* plumb up any ibd interfaces on the card. In the case of card
* unplug, a cfgadm is run that will trigger any RCM scripts to
* unplumb the ibd interfaces on the card; when the card is
* actually unplugged, the detach_event is invoked;
* additionally, if any ibd instances are still active on the
* card (eg there were no associated RCM scripts), driver's
* detach routine is invoked.
*/
break;
default:
break;
}
}
/*
* Attach device to the IO framework.
*/
static int
{
int instance;
int err;
switch (cmd) {
case DDI_ATTACH:
break;
case DDI_RESUME:
/* This driver does not support resume */
default:
return (DDI_FAILURE);
}
/*
* Allocate soft device data structure
*/
return (DDI_FAILURE);
/* pre ibt_attach() soft state initialization */
goto attach_fail_state_init;
}
/* alloc rx soft intr */
if ((ibd_rx_softintr == 1) &&
}
/* alloc tx soft intr */
if ((ibd_tx_softintr == 1) &&
}
/* "attach" to IBTL */
goto attach_fail_ibt_attach;
}
/* Finish initializing this driver */
goto attach_fail_drv_init;
}
/*
* Initialize pointers to device specific functions which will be
* used by the generic layer.
*/
goto attach_fail_drv_init;
}
/*
* Register ourselves with the GLDv3 interface
*/
if (err != 0) {
goto attach_fail_mac_register;
}
/*
* Setup the handler we will use for regular DLPI stuff. Its important
* to setup the recv handler after registering with gldv3.
*/
IBT_SUCCESS) {
}
/*
* and start the async thread, both of which are required for the
* trap handler to function properly. Enable the trap handler to
* queue requests to the async thread after the mac_register, because
* the async daemon invokes mac_tx_update(), which must be done after
* mac_register().
*/
/*
* Indicate link status to GLDv3 and higher layers. By default,
* we assume we are in up state (which must have been true at
* least at the time the broadcast mcg's were probed); if there
* async handler will have updated last known state, which we
* use to tell GLDv3. The async handler will not send any
* notifications to GLDv3 till we reach here in the initialization
* sequence.
*/
return (DDI_SUCCESS);
/* Attach failure points, cleanup */
if (ibd_tx_softintr == 1)
if (ibd_rx_softintr == 1)
return (DDI_FAILURE);
}
/*
* Detach device from the IO framework.
*/
static int
{
int status;
int instance;
switch (cmd) {
case DDI_DETACH:
break;
case DDI_SUSPEND:
default:
return (DDI_FAILURE);
}
/*
* First, stop receive interrupts; this stops the
* driver from handing up buffers to higher layers.
* Wait for receive buffers to be returned; give up
* after 5 seconds.
*/
status = 50;
if (--status == 0) {
goto failed;
}
}
goto failed;
}
if (ibd_rx_softintr == 1)
if (ibd_tx_softintr == 1)
"driver detach time");
return (DDI_SUCCESS);
/*
* turned off the notification. Turn on notifications. There
* is a race in that we do not reap completions that come in
* after the poll and before notifications get turned on. That
* that will reap any missed completions.
*/
return (DDI_FAILURE);
}
/*
* Pre ibt_attach() driver initialization
*/
static int
{
char buf[64];
state->id_trap_inprog = 0;
return (DDI_SUCCESS);
}
/*
* Post ibt_detach() driver deconstruction
*/
static void
{
}
/*
* Fetch IBA parameters for the network device from IB nexus.
*/
static int
{
/*
* Get the IBA Pkey ... allow only fullmembers, per IPoIB spec.
* Note that the default partition is also allowed.
*/
0, "port-pkey", IB_PKEY_INVALID_LIMITED);
"partition\n");
return (DDI_FAILURE);
}
/*
* ... the IBA port ...
*/
0, "port-number", 0);
return (DDI_FAILURE);
}
/*
* ... and HCA GUID.
*/
0, "hca-guid", 0);
if (*hca_guid == 0) {
"guid\n");
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
/*
* Fetch link speed from SA for snmp ifspeed reporting.
*/
static uint64_t
{
int ret;
/*
* Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
* translates to 2 Gbps data rate. Thus, 1X single data rate is
* 2000000000. Start with that as default.
*/
ifspeed = 2000000000;
/*
* Get the port speed from Loopback path information.
*/
goto earlydone;
if (num_paths < 1)
goto earlydone;
/*
* In case SA does not return an expected value, report the default
* speed as 1X.
*/
ret = 1;
case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */
ret = 1;
break;
case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */
ret = 4;
break;
case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */
ret = 12;
break;
case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */
ret = 2;
break;
case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */
ret = 8;
break;
case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */
ret = 16;
break;
case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */
ret = 24;
break;
case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */
ret = 32;
break;
case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */
ret = 48;
break;
}
return (ifspeed);
}
/*
* Search input mcg list (id_mc_full or id_mc_non) for an entry
* representing the input mcg mgid.
*/
static ibd_mce_t *
{
/*
* Do plain linear search.
*/
sizeof (ib_gid_t)) == 0)
return (ptr);
}
return (NULL);
}
/*
* Execute IBA JOIN.
*/
static ibt_status_t
{
}
/*
* This code JOINs the port in the proper way (depending on the join
* It also attaches the QPN to the mcg so it can receive those mcg
* packets. This code makes sure not to attach the mcg to the QP if
* that has been previously done due to the mcg being joined with a
* different join state, even though this is not required by SWG_0216,
* refid 3610.
*/
static ibd_mce_t *
{
/*
* For enable_multicast Full member joins, we need to do some
* extra work. If there is already an mce on the list that
* indicates full membership, that means the membership has
* not yet been dropped (since the disable_multicast was issued)
* because there are pending Tx's to the mcg; in that case, just
* mark the mce not to be reaped when the Tx completion queues
* an async reap operation.
*
* If there is already an mce on the list indicating sendonly
* membership, try to promote to full membership. Be careful
* not to deallocate the old mce, since there might be an AH
* pointing to it; instead, update the old mce with new data
* that tracks the full membership.
*/
return (omce);
} else {
}
}
/*
* Allocate the ibd_mce_t to track this JOIN.
*/
return (NULL);
}
/*
* Is an IBA attach required? Not if the interface is already joined
* to the mcg in a different appropriate join state.
*/
if (jstate == IB_MC_JSTATE_NON) {
} else if (jstate == IB_MC_JSTATE_FULL) {
} else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
}
if (do_attach) {
/*
* Do the IBA attach.
*/
"%d\n", ibt_status);
/*
* NOTE that we should probably preserve the join info
* in the list and later try to leave again at detach
* time.
*/
return (NULL);
}
}
/*
* Insert the ibd_mce_t in the proper list.
*/
if (jstate == IB_MC_JSTATE_NON) {
} else {
/*
* Set up the mc_req fields used for reaping the
* mcg in case of delayed tx completion (see
* ibd_tx_cleanup()). Also done for sendonly join in
* case we are promoted to fullmembership later and
* keep using the same mce.
*/
/*
* Check whether this is the case of trying to join
* full member, and we were already joined send only.
* We try to drop our SendOnly membership, but it is
* possible that the mcg does not exist anymore (and
* the subnet trap never reached us), so the leave
* operation might fail.
*/
sizeof (ibt_mcg_info_t));
return (omce);
}
}
return (mce);
}
/*
* Called during port up event handling to attempt to reacquire full
* membership to an mcg. Stripped down version of ibd_join_group().
* Note that it is possible that the mcg might have gone away, and
* gets recreated at this point.
*/
static void
{
/*
* If the mc_fullreap flag is set, or this join fails, a subsequent
* that by adding a boolean flag into ibd_mce_t, if required.
*/
if (mce->mc_fullreap)
return;
"multicast gid %016llx:%016llx",
}
/*
* This code handles delayed Tx completion cleanups for mcg's to which
* disable_multicast has been issued, regular mcg related cleanups during
* disable_multicast, disable_promiscous and mcg traps, as well as
* cleanups during driver detach time. Depending on the join state,
* it deletes the mce from the appropriate list and issues the IBA
* is left on the active list for a subsequent Tx completion cleanup.
*/
static void
{
/*
* Before detaching, we must check whether the other list
* contains the mcg; if we detach blindly, the consumer
* who set up the other list will also stop receiving
* traffic.
*/
if (jstate == IB_MC_JSTATE_FULL) {
/*
* The following check is only relevant while coming
* from the Tx completion path in the reap case.
*/
if (!mce->mc_fullreap)
return;
} else if (jstate == IB_MC_JSTATE_NON) {
} else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
}
/*
* If we are reacting to a mcg trap and leaving our sendonly or
* non membership, the mcg is possibly already gone, so attempting
* to leave might fail. On the other hand, we must try to leave
* anyway, since this might be a trap from long ago, and we could
* have potentially sendonly joined to a recent incarnation of
* the mcg and are about to lose track of this information.
*/
if (do_detach) {
}
}
/*
* Async code executed due to multicast and promiscuous disable requests
* and mcg trap handling; also executed during driver detach. Mostly, a
* leave and detach is done; except for the fullmember case when Tx
* requests are pending, whence arrangements are made for subsequent
* cleanup on Tx completion.
*/
static void
{
if (jstate == IB_MC_JSTATE_NON) {
/*
* In case we are handling a mcg trap, we might not find
* the mcg in the non list.
*/
return;
} else {
/*
* In case we are handling a mcg trap, make sure the trap
* is not arriving late; if we have an mce that indicates
* that we are already a fullmember, that would be a clear
* indication that the trap arrived late (ie, is for a
* previous incarnation of the mcg).
*/
if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
return;
} else {
/*
* If join group failed, mce will be NULL here.
* This is because in GLDv3 driver, set multicast
* will always return success.
*/
return;
}
/*
* If no pending Tx's remain that reference the AH
* for the mcg, recycle it from active to free list.
* Else in the IB_MC_JSTATE_FULL case, just mark the AH,
* so the last completing Tx will cause an async reap
* operation to be invoked, at which time we will drop our
* membership to the mcg so that the pending Tx's complete
* successfully. Refer to comments on "AH and MCE active
* list manipulation" at top of this file. The lock protects
* against Tx fast path and Tx cleanup code.
*/
}
if (recycled) {
}
}
/*
* Find the broadcast address as defined by IPoIB; implicitly
* determines the IBA scope, mtu, tclass etc of the link the
* interface is going to be a member of.
*/
static ibt_status_t
{
int i, mcgmtu;
/*
* Look for the IPoIB broadcast group.
*/
break;
}
}
if (!found) {
return (IBT_FAILURE);
}
/*
* Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
*/
"greater than port's maximum MTU %d", mcgmtu,
return (IBT_FAILURE);
}
return (IBT_SUCCESS);
}
/*
* Post ibt_attach() initialization.
*/
static int
{
char pathname[OBP_MAXPATHLEN];
/*
* Initialize id_port before ibt_open_hca because of
*/
return (DDI_FAILURE);
return (DDI_FAILURE);
}
&port_infosz);
return (DDI_FAILURE);
}
/*
* If the link already went down by the time we get here, give up;
* we can not even get the gid since that is not valid. We would
* fail in ibd_find_bgroup() anyway.
*/
return (DDI_FAILURE);
}
/*
* This verifies the Pkey ibnexus handed us is still valid.
* This is also the point from which the pkey table for the
* port must hold the exact pkey value at the exact index
*/
return (DDI_FAILURE);
}
}
goto drv_init_fail_alloc_pd;
}
/* Initialize the parallel ARP cache and AHs */
goto drv_init_fail_acache;
}
/*
* Check various tunable limits.
*/
} else {
}
/*
* First, check #r/s wqes against max channel size.
*/
else
else
/*
* Theoretically, there is no point in having more than #rwqe
* plus #swqe cqe's, except that the CQ will be signalled for
* overflow when the last wqe completes, if none of the previous
* cqe's have been polled. Thus, we allocate just a few less wqe's
* to make sure such overflow does not occur.
*/
if (ibd_separate_cqs == 1) {
/*
* Allocate Receive CQ.
*/
} else {
}
"requested size and supportable CQ size is less "
"than the required threshold %d",
goto drv_init_fail_min_rwqes;
}
goto drv_init_fail_alloc_rcq;
}
/*
* Allocate Send CQ.
*/
} else {
}
goto drv_init_fail_alloc_scq;
}
} else {
/*
*/
} else {
state->id_num_swqe);
}
"requested size and supportable CQ size is less "
"than the required threshold %d",
goto drv_init_fail_min_rwqes;
}
goto drv_init_fail_alloc_rcq;
}
}
/*
* Print message in case we could not allocate as many wqe's
* as was requested. Note that in the combined CQ case, we will
* get the following message.
*/
"\n");
goto drv_init_fail_alloc_chan;
}
DDI_SUCCESS) {
goto drv_init_fail_query_chan;
}
/* Initialize the Transmit buffer list */
}
/* Setup the handler we will use for regular DLPI stuff */
IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
" ibt_enable_cq_notify()\n");
goto drv_init_fail_cq_notify;
}
}
/* Create the service fifos before we start receiving */
goto drv_init_fail_srv_fifo;
}
/* Initialize the Receive buffer list */
}
/* Join to IPoIB broadcast group as required by IPoIB */
goto drv_init_fail_join_group;
}
/* Create the async thread */
/* Do we have to specially leave the group? */
}
/*
* The local mac address is now known. Create the IPoIB
* address.
*/
/*
* Similarly, program in the broadcast mac address.
*/
return (DDI_SUCCESS);
if (ibd_separate_cqs == 1)
return (DDI_FAILURE);
}
/*
* Allocate the statically allocated Tx buffer list.
*/
static int
{
int i;
for (i = 0; i < state->id_num_swqe; i++) {
"ibd_alloc_swqe()\n");
return (DDI_FAILURE);
}
/* add to list */
} else {
}
}
return (DDI_SUCCESS);
}
/*
* Free the statically allocated Tx buffer list.
*/
static void
{
}
}
/*
* Allocate a single send wqe and register it so it is almost
* ready to be posted to the hardware.
*/
static int
{
/* alloc copy buffer, must be max size to handle multiple mblk case */
IBT_SUCCESS) {
return (DDI_FAILURE);
}
/* These are set in send */
return (DDI_SUCCESS);
}
/*
* Free an allocated send wqe.
*/
static void
{
return;
}
}
/*
* Post a rwqe to the hardware and add it to the Rx list. The
* "recycle" parameter indicates whether an old rwqe is being
* recycled, or this is a new one.
*/
static int
{
/*
* Here we should add dl_cnt before post recv, because we would
* have to make sure dl_cnt has already updated before
* corresponding ibd_process_rx() is called.
*/
IBT_SUCCESS) {
return (DDI_FAILURE);
}
/*
* Buffers being recycled are already in the list.
*/
if (recycle)
return (DDI_SUCCESS);
} else {
}
return (DDI_SUCCESS);
}
/*
* Allocate the statically allocated Rx buffer list.
*/
static int
{
int i;
for (i = 0; i < state->id_num_rwqe; i++) {
return (DDI_FAILURE);
}
return (DDI_FAILURE);
}
}
return (DDI_SUCCESS);
}
/*
* Free the statically allocated Rx buffer list.
*
*/
static void
{
}
}
/*
* Allocate a single recv wqe and register it so it is almost
* ready to be posted to the hardware.
*/
static int
{
return (DDI_FAILURE);
}
return (DDI_FAILURE);
}
NULL) {
return (DDI_FAILURE);
}
IBT_SUCCESS) {
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
/*
* Free an allocated recv wqe.
*/
static void
{
return;
}
/*
* should not be recycled. The freemsg() will invoke
* ibd_freemsg_cb().
*/
}
}
/*
* Delete the rwqe being freed from the rx list.
*/
static void
{
else
else
}
/*
* Pre ibt_detach() deconstruction.
*/
static void
{
/*
* Desubscribe from trap notices; we will be tearing down
* the mcg lists soon. Make sure the trap handler does nothing
* even if it is invoked (ie till we invoke ibt_detach()).
*/
while (state->id_trap_inprog > 0)
/*
* Flushing the channel ensures that all pending WQE's
* are marked with flush_error and handed to the CQ. It
* does not guarantee the invocation of the CQ handler.
* This call is guaranteed to return successfully for UD QPNs.
*/
/*
* We possibly need a loop here to wait for all the Tx
* callbacks to happen. The Tx handlers will retrieve
* held resources like AH ac_ref count, registered memory
* and possibly ASYNC_REAP requests. Rx interrupts were already
* turned off (in ibd_detach()); turn off Tx interrupts and
* poll. By the time the polling returns an empty indicator,
* we are sure we have seen all pending Tx callbacks. Note
* that after the ibt_set_cq_handler() returns, the old handler
* is guaranteed not to be invoked anymore.
*/
if (ibd_separate_cqs == 1)
/*
* No more async requests will be posted since the device has been
* unregistered; completion handlers have been turned off, so Tx
* handler will not cause any more ASYNC_REAP requests. Queue a
* request for the async thread to exit, which will be serviced
* after any pending ones. This can take a while, specially if the
* SM is unreachable, since IBMF will slowly timeout each SM request
* issued by the async thread. Reap the thread before continuing on,
* we do not want it to be lingering in modunloaded code.
*/
/*
* We can not be in promiscuous mode anymore, upper layers
* would have made a request to disable it (if ever set previously)
* before the detach is allowed to progress to this point; and the
* async thread would have processed that request by now. Thus the
* nonmember list is guaranteed empty at this point.
*/
/*
* membership to the broadcast group, and any nonmembership
* acquired during transmits. We do this after the Tx completion
* handlers are done, since those might result in some late
* leaves; this also eliminates a potential race with that
* has also been suppressed at this point. Thus, no locks
* are required while traversing the mc full list.
*/
}
/*
* Kill the channel now; guaranteed to return successfully
* for UD QPNs.
*/
/*
* Kill the CQ; all completion handlers are guaranteed to
* have terminated by the time this returns. Since we killed
* the QPN above, we can not receive the IBT_CQ_BUSY error.
*/
if (ibd_separate_cqs == 1) {
}
/*
* We killed the receive interrupts, thus, we will not be
* required to handle received packets anymore. Thus, kill
* service threads since they are not going to be used anymore.
*/
/*
* till all handlers are guaranteed to have completed.
*/
/*
* Clean up the active AH hash list.
*/
/*
* Free parallel ARP cache and AHs; we are sure all of these
* resources have been released by the Tx completion handler.
*/
/*
* We freed the QPN, all the MRs and AHs. This step should not
* fail; print a warning message if it does fail, due to a bug
* in the driver.
*/
}
/*
* threaded and nonreentrant for this CQ. When using combined CQ,
* this handles Tx and Rx completions. With separate CQs, this handles
* only Rx completions.
*/
/* ARGSUSED */
static void
{
if (ibd_rx_softintr == 1)
else
}
/*
* Separate CQ handler for Tx completions, when the Tx CQ is in
* interrupt driven mode.
*/
/* ARGSUSED */
static void
{
if (ibd_tx_softintr == 1)
else
(void) ibd_tx_recycle((char *)state);
}
/*
* on a kernel thread (handling can thus block) and can be invoked
* concurrently. The handler can be invoked anytime after it is
* registered and before ibt_detach().
*/
/* ARGSUSED */
static void
{
/*
* The trap handler will get invoked once for every event for
* every port. The input "gid" is the GID0 of the port the
* trap came in on; we just need to act on traps that came
* to our port, meaning the port on which the ipoib interface
* resides. Since ipoib uses GID0 of the port, we just match
* the gids to check whether we need to handle the trap.
*/
return;
switch (code) {
case IBT_SM_EVENT_UNAVAILABLE:
/*
* If we are in promiscuous mode or have
* sendnonmembers, we need to print a warning
* message right now. Else, just store the
* information, print when we enter promiscuous
* mode or attempt nonmember send. We might
* also want to stop caching sendnonmember.
*/
"degraded due to unavailability of multicast "
"traps");
break;
case IBT_SM_EVENT_AVAILABLE:
/*
* If we printed a warning message above or
* while trying to nonmember send or get into
* promiscuous mode, print an okay message.
*/
"restored due to availability of multicast "
"traps");
break;
case IBT_SM_EVENT_MCG_CREATED:
case IBT_SM_EVENT_MCG_DELETED:
/*
* First check if the instance is being
* [de]initialized; back off then, without doing
* anything more, since we are not sure if the
* async thread is around, or whether we might
* be racing with the detach code in ibd_drv_fini()
* that scans the mcg list.
*/
if (!ibd_async_safe(state))
return;
break;
}
}
static void
{
/*
* Atomically search the nonmember and sendonlymember lists and
* delete.
*/
/*
* mcg. Given the unreliable out-of-order mode of trap
* delivery, we can never be sure whether it is a problem
* if the join fails. Thus, we warn the admin of a failure
* if this was a creation trap. Note that the trap might
* actually be reporting a long past event, and the mcg
* might already have been deleted, thus we might be warning
* in vain.
*/
"new multicast gid %016llx:%016llx",
}
/*
* Free the request slot allocated by the subnet event thread.
*/
}
/*
* GLDv3 entry point to get capabilities.
*/
static boolean_t
{
switch (cap) {
case MAC_CAPAB_HCKSUM: {
if (ibd_csum_send > IBD_CSUM_NONE)
else
return (B_FALSE);
break;
}
default:
return (B_FALSE);
}
return (B_TRUE);
}
/*
* GLDv3 entry point to start hardware.
*/
/* ARGSUSED */
static int
ibd_m_start(void *arg)
{
return (0);
}
/*
* GLDv3 entry point to stop hardware from receiving packets.
*/
/* ARGSUSED */
static void
ibd_m_stop(void *arg)
{
#ifdef RUN_PERFORMANCE
#endif
}
/*
* GLDv3 entry point to modify device's mac address. We do not
* allow address modifications.
*/
static int
{
return (0);
else
return (EINVAL);
}
/*
* of here on the async thread.
*/
static void
{
if (op == ASYNC_JOIN) {
}
} else {
/*
* Here, we must search for the proper mcg_info and
* use that to leave the group.
*/
}
}
/*
* This function queues the operation to the async thread and
* return success for a valid multicast address.
*/
static int
{
/*
* The incoming multicast address might not be aligned properly
* on a 4 byte boundary to be considered an ipoib_mac_t. We force
* it to look like one though, to get the offsets of the mc gid,
* since we know we are not going to dereference any values with
* the ipoib_mac_t pointer.
*/
/*
* Check validity of MCG address. We could additionally check
* mcg, but since this operation is only invokable by privileged
* programs anyway, we allow the flexibility to those dlpi apps.
* Note that we do not validate the "scope" of the IBA mcg.
*/
return (EINVAL);
/*
* fill in multicast pkey and scope
*/
/*
* nothing (ie we stay JOINed to the broadcast group done in
* ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically
* requires to be joined to broadcast groups at all times.
* ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
* depends on this.
*/
return (0);
return (ENOMEM);
if (add) {
} else {
}
return (0);
}
/*
* The blocking part of the IBA promiscuous operations are done
* out of here on the async thread. The dlpireq parameter indicates
* whether this invocation is due to a dlpi request or due to
*/
static void
{
}
}
/*
* The blocking part of the IBA promiscuous operations are done
* out of here on the async thread. The dlpireq parameter indicates
* whether this invocation is due to a dlpi request or due to
*/
static void
{
/*
* Obtain all active MC groups on the IB fabric with
* specified criteria (scope + Pkey + Qkey + mtu).
*/
IBT_SUCCESS) {
"groups");
goto done;
}
/*
* Iterate over the returned mcg's and join as NonMember
* to the IP mcg's.
*/
for (i = 0; i < numg; i++) {
/*
* Do a NonMember JOIN on the MC group.
*/
"multicast gid %016llx:%016llx",
}
done:
}
/*
* GLDv3 assumes phys state receives more packets than multi state,
* which is not true for IPoIB. Thus, treat the multi and phys
* promiscuous states the same way to work with GLDv3's assumption.
*/
static int
{
return (ENOMEM);
if (on) {
} else {
}
return (0);
}
/*
* GLDv3 entry point for gathering statistics.
*/
static int
{
switch (stat) {
case MAC_STAT_IFSPEED:
break;
case MAC_STAT_MULTIRCV:
break;
case MAC_STAT_BRDCSTRCV:
break;
case MAC_STAT_MULTIXMT:
break;
case MAC_STAT_BRDCSTXMT:
break;
case MAC_STAT_RBYTES:
break;
case MAC_STAT_IPACKETS:
break;
case MAC_STAT_OBYTES:
break;
case MAC_STAT_OPACKETS:
break;
case MAC_STAT_NORCVBUF:
break;
case MAC_STAT_OERRORS:
break;
case MAC_STAT_IERRORS:
*val = 0;
break;
case MAC_STAT_NOXMTBUF:
break;
default:
return (ENOTSUP);
}
return (0);
}
/*
* Tx reschedule
*/
static void
{
/*
* For poll mode, if ibd is out of Tx wqe, reschedule to collect
* the CQEs. Otherwise, just return for out of Tx wqe.
*/
if (ibd_txcomp_poll == 1) {
return;
}
return;
}
if (state->id_sched_needed) {
}
}
/*
* Release one or more chained send wqes back into free list.
*/
static void
{
/*
* Add back on Tx list for reuse.
*/
}
} else {
}
}
/*
* Acquire send wqe from free list.
* Returns error number and send wqe pointer.
*/
static int
{
int rc = 0;
/*
* Check and reclaim some of the completed Tx requests.
* If someone else is already in this code and pulling Tx
* completions, no need to poll, since the current lock holder
* will do the work anyway. Normally, we poll for completions
* every few Tx attempts, but if we are short on Tx descriptors,
* we always try to poll.
*/
if ((ibd_txcomp_poll == 1) &&
}
/*
* Grab required transmit wqes.
*/
} else {
/*
* If we did not find the number we were looking for, flag
* no resource. Adjust list appropriately in either case.
*/
}
return (rc);
}
/*
* The passed in packet has this format:
* IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
*/
static boolean_t
{
if (ibd_txcomp_poll == 1) {
goto ibd_send_fail;
}
return (B_FALSE);
}
/*
* Obtain an address handle for the destination.
*/
} else {
DPRINT(5,
"ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
/*
* for the poll mode, it is probably some cqe pending in the
* cq. So ibd has to poll cq here, otherwise acache probably
* may not be recycled.
*/
if (ibd_txcomp_poll == 1) {
}
/*
* Here if ibd_acache_lookup() returns EFAULT, it means ibd
* can not find a path for the specific dest address. We
* should get rid of this kind of packet. With the normal
* case, ibd will return the packet to upper layer and wait
* for AH creating.
*/
else {
}
goto ibd_send_fail;
}
/*
* For ND6 packets, padding is at the front of the source lladdr.
* Insert the padding at front.
*/
sizeof (ib_header_info_t))) {
goto ibd_send_fail;
}
}
sizeof (ib_header_info_t));
"failure ");
goto ibd_send_fail;
}
sizeof (ib_header_info_t));
}
/* LINTED: E_CONSTANT_CONDITION */
}
}
;
/*
* GLDv3 will check mtu. We do checksum related work here.
*/
/*
* Copy the data to preregistered buffers, or register the buffer.
*/
(pktsize > IBD_TX_COPY_THRESHOLD)) {
if (ibt_status != IBT_SUCCESS) {
/*
* We do not expect any error other than
* IBT_INSUFF_RESOURCE.
*/
if (ibt_status != IBT_INSUFF_RESOURCE)
"failed in ibt_register_mem()",
/*
* Deregister already registered memory;
* fallback to copying the mblk.
*/
goto ibd_copy_path;
}
}
} else {
}
}
/*
* Queue the wqe to hardware.
*/
if (ibt_status != IBT_SUCCESS) {
/*
* We should not fail here; but just in case we do, we
* print out a warning to log.
*/
}
if (dofree)
return (B_TRUE);
else {
}
}
if (dofree)
return (ret);
}
/*
* GLDv3 entry point for transmitting datagram.
*/
static mblk_t *
{
/* Send fail */
break;
}
}
return (mp);
}
/*
* this handles Tx and Rx completions. With separate CQs, this handles
* only Rx completions.
*/
static uint_t
{
/*
* Poll for completed entries; the CQ will not interrupt any
* more for incoming (or transmitted) packets.
*/
/*
* Now enable CQ notifications; all packets that arrive now
* (or complete transmission) will cause new interrupts.
*/
IBT_SUCCESS) {
/*
* We do not expect a failure here.
*/
}
/*
* Repoll to catch all packets that might have arrived after
* we finished the first poll loop and before interrupts got
* armed.
*/
return (DDI_INTR_CLAIMED);
}
/*
* Common code for interrupt handling as well as for polling
* for all completed wqe's while detaching.
*/
static void
{
int i;
/*
* In some cases (eg detaching), this code can be invoked on
* any cpu after disabling cq notification (thus no concurrency
* exists). Apart from that, the following applies normally:
* The receive completion handling is always on the Rx interrupt
* cpu. Transmit completion handling could be from any cpu if
* Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
* is interrupt driven. Combined completion handling is always
* on the interrupt cpu. Thus, lock accordingly and use the
* proper completion array.
*/
if (ibd_separate_cqs == 1) {
} else {
}
} else {
}
/*
* Channel being torn down.
*/
/*
* Only invoke the Tx handler to
* release possibly held resources
* like AH refcount etc. Can not
* invoke Rx handler because it might
* try adding buffers to the Rx pool
* when we are trying to deinitialize.
*/
continue;
} else {
"ibd_intr: Bad CQ status",
}
}
}
} else {
}
}
}
}
/*
* Deregister the mr associated with a given mblk.
*/
static void
{
int i;
/*
* We do not expect any errors here.
*/
}
}
}
/*
* Common code that deals with clean ups after a successful or
* erroneous transmission attempt.
*/
static void
{
/*
 * If this was a dynamic registration in ibd_send(),
 * deregister now.
 */
}
/*
 * Drop the reference count on the AH; it can be reused
 * now for a different destination if there are no more
 * posted sends that will use it. This can be eliminated
 * if we can always associate each Tx buffer with an AH.
 * The ace can be null if we are cleaning up from the
 * ibd_send() error path.
 */
/*
 * The recycling logic can be eliminated from here
 * and put into the async thread if we create another
 * list to hold ACE's for unjoined mcg's.
 */
if (DEC_REF_DO_CYCLE(ace)) {
/*
 * Check with the lock taken: we decremented
 * reference count without the lock, and some
 * transmitter might already have bumped the
 * reference count (possible in case of multicast
 * disable when we leave the AH on the active
 * list). If not still 0, get out, leaving the
 * recycle bit intact.
 *
 * Atomically transition the AH from active
 * to free list, and queue a work request to
 * leave the group and destroy the mce. No
 * transmitter can be looking at the AH or
 * the MCE in between, since we have the
 * ac_mutex lock. In the SendOnly reap case,
 * it is not necessary to hold the ac_mutex
 * and recheck the ref count (since the AH was
 * taken off the active list), we just do it
 * to have uniform processing with the Full
 * reap case.
 */
if (GET_REF_CYCLE(ace) == 0) {
/*
 * Identify the case of fullmember reap as
 * opposed to mcg trap reap. Also, port up
 * might set ac_mce to NULL to indicate Tx
 * cleanup should do no more than put the
 * AH in the free list (see ibd_async_link).
 */
/*
 * mc_req was initialized at mce
 * creation time.
 */
}
}
}
}
/*
 * Release the send wqe for reuse.
 */
}
/*
* Processing to be done after receipt of a packet; hand off to GLD
* in the format expected by GLD.
* The recvd packet has this format: 2b sap :: 00 :: data.
*/
static void
{
/*
 * Track number handed to upper layer, and number still
 * available to receive packets.
 */
/*
 * Adjust write pointer depending on how much data came in.
 */
/*
 * the IB link will deliver one of the IB link layer
 * headers, called the Global Routing Header (GRH).
 * ibd driver uses the information in GRH to build the
 * Header_info structure and pass it with the datagram up
 * to GLDv3.
 * If the GRH is not valid, indicate to GLDv3 by setting
 * the VerTcFlow field to 0.
 */
/* if it is a loopback packet, just drop it. */
IPOIB_ADDRL) == 0) {
return;
}
sizeof (ipoib_mac_t));
} else {
}
} else {
/*
 * It can not be an IBA multicast packet. Must have been
 * unicast for us. Just copy the interface address to dst.
 */
sizeof (ipoib_mac_t));
}
/*
 * lladdr. However the inet6 layer is not aware of it, hence remove
 * the padding from such packets.
 */
sizeof (ipoib_hdr_t))) {
return;
}
sizeof (ipoib_pgrh_t));
}
IPV6_HDR_LEN + len) {
IPV6_HDR_LEN + len)) {
" failed");
return;
}
sizeof (ipoib_pgrh_t) +
sizeof (ipoib_hdr_t));
}
/* LINTED: E_CONSTANT_CONDITION */
}
}
/*
 * does hardware checksum, we will pull the checksum from the
 * work completion structure here.
 * on interrupt cpu.
 */
/*
 * Possibly replenish the Rx pool if needed.
 */
if (rxcnt < IBD_RX_THRESHOLD) {
state->id_rx_short++;
DDI_FAILURE) {
return;
}
}
}
}
/*
* Callback code invoked from STREAMs when the recv data buffer is free
* for recycling.
*/
static void
ibd_freemsg_cb(char *arg)
{
/*
* If the wqe is being destructed, do not attempt recycling.
*/
return;
}
/*
* Upper layer has released held mblk.
*/
/*
* There are already enough buffers on the Rx ring.
* Free this one up.
*/
} else {
return;
}
/*
* Post back to h/w. We could actually have more than
* id_num_rwqe WQEs on the list if there were multiple
* ibd_freemsg_cb() calls outstanding (since the lock is
* not held the entire time). This will start getting
* corrected over subsequent ibd_freemsg_cb() calls.
*/
return;
}
}
}
static uint_t
ibd_tx_recycle(char *arg)
{
/*
 * Poll for completed entries; the CQ will not interrupt any
 * more for completed packets.
 */
/*
 * Now enable CQ notifications; all completions originating now
 * will cause new interrupts.
 */
IBT_SUCCESS) {
/*
 * We do not expect a failure here.
 */
}
/*
 * Repoll to catch all packets that might have completed after
 * we finished the first poll loop and before interrupts got
 * armed.
 */
/*
 * Call txsched to notify GLDv3 if required.
 */
return (DDI_INTR_CLAIMED);
}
#ifdef RUN_PERFORMANCE
/*
* To run the performance test, first do the "ifconfig ibdN plumb" on
* the Rx and Tx side. Then use mdb -kw to tweak the following variables:
* ibd_performance=1.
* ibd_receiver=1 on Rx side.
* ibd_sender=1 on Tx side.
* Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update
* make it drop into a 1 minute loop waiting for packets. An
*/
#define IBD_NUM_UNSIGNAL ibd_num_unsignal
#define IBD_TX_PKTSIZE ibd_tx_pktsize
#define IBD_TX_DATASIZE ibd_tx_datasize
static ibd_swqe_t **swqes;
/*
* Set these on Rx and Tx side to do performance run.
*/
static int ibd_performance = 0;
static int ibd_receiver = 0;
static int ibd_sender = 0;
static ipoib_mac_t ibd_dest;
/*
* Interrupt coalescing is achieved by asking for a completion intr
* only every ibd_num_unsignal'th packet.
*/
static int ibd_num_unsignal = 8;
/*
* How big is each packet?
*/
static int ibd_tx_pktsize = 2048;
/*
* Total data size to be transmitted.
*/
static volatile int num_completions;
/* ARGSUSED */
static void
{
if (ibd_receiver == 1)
else
/*
* Mark the handler as having run and possibly freed up some
* slots. Blocked sends can be retried.
*/
IBT_SUCCESS) {
if (ibd_receiver == 1) {
/*
* We can immediately recycle the buffer. No
* need to pass up to any IP layer ...
*/
for (i = 0; i < polled; i++) {
}
}
}
/*
* If we just repolled, we are done; exit.
*/
if (cq_enabled)
return;
/*
* Enable CQ.
*/
/*
* We do not expect a failure here.
*/
}
cq_enabled = B_TRUE;
/*
* Repoll for packets that came in after we finished previous
* poll loop but before we turned on notifications.
*/
goto repoll;
}
static void
{
return;
}
if ((packets % IBD_NUM_UNSIGNAL) != 0) {
/*
 * This is required to ensure the last packet will trigger
 * a CQ handler callback, thus we can spin waiting for all
 * packets to be received.
 */
"ibd_perf_tx: #Packets not multiple of Signal Grp size");
return;
}
num_completions = 0;
return;
}
return;
}
/*
 * Get the ud_dest for the destination.
 */
return;
}
/*
 * Set up the send buffer.
 */
return;
}
/*
 * This buffer can be used in the case when we want to
 * send data from the same memory area over and over;
 * it might help in reducing memory traffic.
 */
return;
}
/*
 * Allocate private send wqe's.
 */
for (i = 0; i < IBD_NUM_SWQE; i++) {
return;
}
#if 0
#else
#endif
/*
 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs
 * is marked to invoke the CQ handler. That is the only
 * way we come to know when the send queue can accept more
 * WRs.
 */
if (((i + 1) % IBD_NUM_UNSIGNAL) != 0)
}
/*
 * Post all the requests. We expect this stream of post's will
 * not overwhelm the hardware due to periodic completions and
 * pollings that happen out of ibd_perf_handler.
 * Post a set of requests, till the channel can accept; after
 * that, wait for the CQ handler to notify us that there is more
 * space.
 */
for (i = 0; i < IBD_NUM_SWQE; i++) {
if (stat == IBT_CHAN_FULL) {
/*
 * Spin till the CQ handler runs
 * and then try again.
 */
while (!cq_handler_ran)
;
goto retry;
}
goto done;
}
}
done:
/*
 * We should really be snapshotting when we get the last
 * completion.
 */
;
/*
 * Wait a sec for everything to get over.
 */
/*
 * Reset CQ handler to real one; free resources.
 */
if (ibd_separate_cqs == 0) {
} else {
if (ibd_txcomp_poll == 0)
state);
else
}
for (i = 0; i < IBD_NUM_SWQE; i++)
}
static void
{
return;
}
/*
* We do not need to allocate private recv wqe's. We will
* just use the regular ones.
*/
num_completions = 0;
/*
* Delay for a minute for all the packets to come in from
* transmitter.
*/
/*
* Reset CQ handler to real one; free resources.
*/
}
static void
{
if (ibd_performance == 0)
return;
if (ibd_receiver == 1) {
return;
}
if (ibd_sender == 1) {
return;
}
}
#endif /* RUN_PERFORMANCE */