/* ibd.h revision 03514dd70879e522caeae9bc4b36d18c43e15a43 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
#ifndef _SYS_IB_CLIENTS_IBD_H
#define _SYS_IB_CLIENTS_IBD_H
#ifdef __cplusplus
extern "C" {
#endif
/*
* Completion queue polling control
*/
/* a CQ poll pass is in progress */
#define IBD_CQ_POLLING 0x1
/* request that the CQ poll be re-run */
#define IBD_REDO_CQ_POLLING 0x2
/*
* Maximum length for returning chained mps back to crossbow.
* Also used as the maximum number of rx wc's polled at a time.
*/
#define IBD_MAX_RX_MP_LEN 16
/*
* When doing multiple-send-wr, this value determines how many to do at
* a time (in a single ibt_post_send).
*/
#define IBD_MAX_TX_POST_MULTIPLE 4
/*
* Flag bits for resources to reap
*/
#define IBD_RSRC_SWQE 0x1 /* send wqes */
#define IBD_RSRC_LSOBUF 0x2 /* LSO copy buffers */
#define IBD_RSRC_RC_SWQE 0x4 /* RC-mode send wqes */
#define IBD_RSRC_RC_TX_LARGEBUF 0x8 /* RC-mode Tx large buffers */
/*
* Async operation types: work items handed to the driver's async thread
* (carried in ibd_req_t.rq_op).
*/
#define IBD_ASYNC_GETAH 1 /* acquire an address handle for a destination */
#define IBD_ASYNC_JOIN 2 /* join a multicast group */
#define IBD_ASYNC_LEAVE 3 /* leave a multicast group */
#define IBD_ASYNC_PROMON 4 /* enable promiscuous mode */
#define IBD_ASYNC_PROMOFF 5 /* disable promiscuous mode */
#define IBD_ASYNC_REAP 6 /* reap resources (see IBD_RSRC_* flags) */
#define IBD_ASYNC_TRAP 7 /* handle an mcg creation/deletion trap */
#define IBD_ASYNC_SCHED 8 /* resume sends once resources are available */
#define IBD_ASYNC_LINK 9 /* process a link state change */
#define IBD_ASYNC_EXIT 10 /* terminate the async thread */
#define IBD_ASYNC_RC_TOO_BIG 11 /* RC: deal with a too-big packet */
#define IBD_ASYNC_RC_CLOSE_ACT_CHAN 12 /* RC: close an active-side channel */
#define IBD_ASYNC_RC_RECYCLE_ACE 13 /* RC: recycle an address cache entry */
#define IBD_ASYNC_RC_CLOSE_PAS_CHAN 14 /* RC: close a passive-side channel */
/*
* Driver-instance progress flags: each bit records completion of one
* initialization/startup step, so teardown paths can undo exactly the
* steps that were done.
*/
#define IBD_DRV_STATE_INITIALIZED 0x000001
#define IBD_DRV_RXINTR_ADDED 0x000002
#define IBD_DRV_TXINTR_ADDED 0x000004
#define IBD_DRV_IBTL_ATTACH_DONE 0x000008
#define IBD_DRV_HCA_OPENED 0x000010
#define IBD_DRV_PD_ALLOCD 0x000020
#define IBD_DRV_MAC_REGISTERED 0x000040
#define IBD_DRV_PORT_DETAILS_OBTAINED 0x000080
#define IBD_DRV_BCAST_GROUP_FOUND 0x000100
#define IBD_DRV_ACACHE_INITIALIZED 0x000200
#define IBD_DRV_CQS_ALLOCD 0x000400
#define IBD_DRV_UD_CHANNEL_SETUP 0x000800
#define IBD_DRV_TXLIST_ALLOCD 0x001000
#define IBD_DRV_SCQ_NOTIFY_ENABLED 0x002000
#define IBD_DRV_RXLIST_ALLOCD 0x004000
#define IBD_DRV_BCAST_GROUP_JOINED 0x008000
#define IBD_DRV_ASYNC_THR_CREATED 0x010000
#define IBD_DRV_RCQ_NOTIFY_ENABLED 0x020000
#define IBD_DRV_SM_NOTICES_REGISTERED 0x040000
#define IBD_DRV_STARTED 0x080000
/* RC (reliable connected) mode progress bits */
#define IBD_DRV_RC_SRQ_ALLOCD 0x100000
#define IBD_DRV_RC_LARGEBUF_ALLOCD 0x200000
#define IBD_DRV_RC_LISTEN 0x400000
#ifdef DEBUG
#define IBD_DRV_RC_PRIVATE_STATE 0x800000
#endif
#define IBD_DRV_IN_DELETION 0x1000000
#define IBD_DRV_IN_LATE_HCA_INIT 0x2000000
#define IBD_DRV_REQ_LIST_INITED 0x4000000
#define IBD_DRV_RC_TIMEOUT 0x8000000
/*
* Miscellaneous constants
*/
#define IBD_SEND 0
#define IBD_RECV 1
/* Tunables defaults and limits */
/* IPoIB link modes: Unreliable Datagram vs. Reliable Connected */
#define IBD_LINK_MODE_UD 0
#define IBD_LINK_MODE_RC 1
#define IBD_DEF_LINK_MODE IBD_LINK_MODE_RC
#define IBD_DEF_LSO_POLICY B_TRUE
#define IBD_DEF_NUM_LSO_BUFS 1024
#define IBD_DEF_CREATE_BCAST_GROUP B_TRUE
/* Default completion coalescing (interrupt moderation) parameters */
#define IBD_DEF_COALESCE_COMPLETIONS B_TRUE
#define IBD_DEF_UD_RX_COMP_COUNT 4
#define IBD_DEF_UD_RX_COMP_USEC 10
#define IBD_DEF_UD_TX_COMP_COUNT 16
#define IBD_DEF_UD_TX_COMP_USEC 300
#define IBD_DEF_RC_RX_COMP_COUNT 4
#define IBD_DEF_RC_RX_COMP_USEC 10
#define IBD_DEF_RC_TX_COMP_COUNT 10
#define IBD_DEF_RC_TX_COMP_USEC 300
/* Default bcopy thresholds (bytes) for Tx/Rx fast paths */
#define IBD_DEF_UD_TX_COPY_THRESH 4096
#define IBD_DEF_RC_RX_COPY_THRESH 4096
#define IBD_DEF_RC_TX_COPY_THRESH 4096
/* Default UD send/receive work queue depths */
#define IBD_DEF_UD_NUM_RWQE 4000
#define IBD_DEF_UD_NUM_SWQE 4000
#define IBD_DEF_RC_ENABLE_SRQ B_TRUE
#if defined(__i386)
/* smaller RC queue depths on 32-bit x86 — presumably to limit memory use */
#define IBD_DEF_RC_NUM_RWQE 511
#define IBD_DEF_RC_NUM_SWQE 255
#else
#define IBD_DEF_RC_NUM_RWQE 2047
#define IBD_DEF_RC_NUM_SWQE 511
#endif
/* Address handle cache defaults */
#define IBD_DEF_NUM_AH 256
#define IBD_DEF_HASH_SIZE 32
/* Tunable limits */
/*
* Lower/upper bounds for the corresponding IBD_DEF_* tunables above.
*/
#define IBD_MIN_NUM_LSO_BUFS 512
#define IBD_MAX_NUM_LSO_BUFS 4096
#define IBD_MIN_UD_TX_COPY_THRESH 2048
#define IBD_MAX_UD_TX_COPY_THRESH 65536
#define IBD_MIN_UD_NUM_SWQE 512
#define IBD_MAX_UD_NUM_SWQE 8000
#define IBD_MIN_UD_NUM_RWQE 512
#define IBD_MAX_UD_NUM_RWQE 8000
#define IBD_MIN_NUM_AH 32
#define IBD_MAX_NUM_AH 8192
#define IBD_MIN_HASH_SIZE 32
#define IBD_MAX_HASH_SIZE 1024
/* 32-bit x86 gets a smaller minimum, matching the smaller default above */
#if defined(__i386)
#define IBD_MIN_RC_NUM_SWQE 255
#else
#define IBD_MIN_RC_NUM_SWQE 511
#endif
#define IBD_MAX_RC_NUM_SWQE 8000
#define IBD_MIN_RC_NUM_RWQE 511
#define IBD_MAX_RC_NUM_RWQE 8000
#define IBD_MIN_RC_RX_COPY_THRESH 1500
#define IBD_MAX_RC_RX_COPY_THRESH 65520
#define IBD_MIN_RC_TX_COPY_THRESH 1500
#define IBD_MAX_RC_TX_COPY_THRESH 65520
/*
* Thresholds
*
* When waiting for resources (swqes or lso buffers) to become available,
* the first two thresholds below determine how long to wait before informing
* the network layer to start sending packets again. The IBD_TX_POLL_THRESH
* determines how low the available swqes should go before we start polling
* the completion queue.
*/
#define IBD_FREE_LSOS_THRESH 8
#define IBD_FREE_SWQES_THRESH 20
#define IBD_TX_POLL_THRESH 80
#ifdef DEBUG
void debug_print(int l, char *fmt, ...);
#define DPRINT debug_print
#else
/*
* Non-DEBUG builds: DPRINT(args) expands to "0 && (args)", so the
* short-circuit keeps the arguments from being evaluated and the
* compiler discards the whole statement.
*/
#define DPRINT 0 &&
#endif
/*
* AH and MCE active list manipulation:
*
* Multicast disable requests and MCG delete traps are two cases
* where the active AH entry for the mcg (if any unreferenced one exists)
* will be moved to the free list (to force the next Tx to the mcg to
* join the MCG in SendOnly mode). Port up handling will also move AHs
* from active to free list.
*
* In the case when some transmits are still pending on an entry
* for an mcg, but a multicast disable has already been issued on the
* mcg, there are some options to consider to preserve the join state
* to ensure the emitted packet is properly routed on the IBA fabric.
* For the AH, we can
* 1. take out of active list at multicast disable time.
* 2. take out of active list only when last pending Tx completes.
* For the MCE, we can
* 3. take out of active list at multicast disable time.
* 4. take out of active list only when last pending Tx completes.
* 5. move from active list to stale list at multicast disable time.
* We choose to use 2,4. We use option 4 so that if a multicast enable
* is tried before the pending Tx completes, the enable code finds the
* mce in the active list and just has to make sure it will not be reaped
* (ie the mcg leave done) when the pending Tx does complete. Alternatively,
* a stale list (#5) that would be checked in the enable code would need
* to be implemented. Option 2 is used, because otherwise, a Tx attempt
* after the multicast disable would try to put an AH in the active list,
* and associate the mce it finds in the active list to this new AH,
* whereas the mce is already associated with the previous AH (taken off
* the active list), and will be removed once the pending Tx's complete
* (unless a reference count on mce's is implemented). One implication of
* using 2,4 is that new Tx's posted before the pending Tx's complete will
* grab new references on the AH, further delaying the leave.
*
* In the case of mcg delete (or create) trap when the port is sendonly
* joined, the AH and MCE handling is different: the AH and MCE has to be
* immediately taken off the active lists (forcing a join and path lookup
* at the next Tx is the only guaranteed means of ensuring a proper Tx
* to an mcg as it is repeatedly created and deleted and goes thru
* reincarnations).
*
* When a port is already sendonly joined, and a multicast enable is
* attempted, the same mce structure is promoted; this ensures only a
* single mce on the active list tracks the most powerful join state.
*
* In the case of port up event handling, the MCE for sendonly membership
* is freed up, and the ACE is put into the free list as soon as possible
* (depending on whether posted Tx's have completed). For fullmembership
* MCE's though, the ACE is similarly handled; but the MCE is kept around
* (a re-JOIN is attempted) only if the DLPI leave has not already been
* done; else the mce is deconstructed (mc_fullreap case).
*
* MCG creation and deletion trap handling:
*
* These traps are unreliable (meaning sometimes the trap might never
* be delivered to the subscribed nodes) and may arrive out-of-order
* since they use UD transport. An alternative to relying on these
* unreliable traps is to poll for mcg presence every so often, but
* instead of doing that, we try to be as conservative as possible
* while handling the traps, and hope that the traps do arrive at
* the subscribed nodes soon. Note that if a node is fullmember
* joined to an mcg, it should never receive a creation/deletion
* trap for that mcg (by fullmember definition); if it does, it is
* an old trap from a previous incarnation of the mcg.
*
* Whenever a trap is received, the driver cleans up its sendonly
* membership to the group; we choose to do a sendonly leave even
* on a creation trap to handle the case of a prior deletion of the mcg
* having gone unnoticed. Consider an example scenario:
* T1: MCG M is deleted, and fires off deletion trap D1.
* T2: MCG M is recreated, fires off creation trap C1, which is lost.
* T3: Node N tries to transmit to M, joining in sendonly mode.
* T4: MCG M is deleted, and fires off deletion trap D2.
* T5: N receives a deletion trap, but can not distinguish D1 from D2.
* If the trap is D2, then a LEAVE is not required, since the mcg
* is already deleted; but if it is D1, a LEAVE is required. A safe
* approach is to always LEAVE, but the SM may be confused if it
* receives a LEAVE without a prior JOIN.
*
* Management of the non-membership to an mcg is similar to the above,
* except that if the interface is in promiscuous mode, it is required
* to attempt to re-join the mcg after receiving a trap. Unfortunately,
* if the re-join attempt fails (in which case a warning message needs
* to be printed), it is not clear whether it failed due to the mcg not
* existing any longer, or due to some fabric/hca problem; querying the
* SA to establish the presence or absence of the
* mcg is also racy at best. Thus, the driver just prints a warning
* message when it can not rejoin after receiving a create trap, although
* this might be (on rare occasions) a mis-warning if the create trap is
* received after the mcg was deleted.
*/
/*
* Implementation of atomic "recycle" bits and reference count
* on address handles. This utilizes the fact that max reference
* count on any handle is limited by number of send wqes, thus
* high bits in the ac_ref field can be used as the recycle bits,
* and only the low bits hold the number of pending Tx requests.
* This atomic AH reference counting allows the Tx completion
* handler not to acquire the id_ac_mutex to process every completion,
* thus reducing lock contention problems between completion and
* the Tx path.
*/
#define CYCLEVAL 0x80000
#define GET_REF_CYCLE(ace) ( \
/* \
* Make sure "cycle" bit is set. \
*/ \
)
}
#define SET_CYCLE_IF_REF(ace) ( \
CYCLEVAL ? \
/* \
* Clear the "cycle" bit we just set; \
* ref count known to be 0 from above. \
*/ \
/* \
* We set "cycle" bit; let caller know. \
*/ \
B_TRUE \
)
#define DEC_REF_DO_CYCLE(ace) ( \
/* \
* Ref count known to be 0 from above. \
*/ \
B_TRUE : \
B_FALSE \
)
/*
* Address handle entries maintained by the driver are kept in the
* free and active lists. Each entry starts out in the free list;
* it migrates to the active list when primed using ibt_get_paths()
* and ibt_modify_ud_dest() for transmission to a specific destination.
* In the active list, the entry has a reference count indicating the
* number of ongoing/uncompleted transmits that reference it. The
* entry is left in the active list even after the reference count
* goes to 0, since successive transmits can find it there and do
* not need to set up another entry (ie the path information is
* cached using the active list). Entries on the active list are
* also hashed using the destination link address as a key for faster
* lookups during transmits.
*
* For any destination address (unicast or multicast, whatever the
* join states), there will be at most one entry in the active list.
* Entries with a 0 reference count on the active list can be reused
* for a transmit to a new destination, if the free list is empty.
*
* active list does not need a lock (all operations are done by the
* async thread) but updates to the reference count are atomically
* done (increments done by Tx path, decrements by the Tx callback handler).
*/
#define IBD_ACACHE_GET_FREE(state) \
int _ret_; \
}
}
#define IBD_ACACHE_GET_ACTIVE(state) \
/*
* Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
* the front of the optional src/tgt link layer address; the IP stack adds
* padding by default at the end. The routine which is doing this is nce_xmit()
* in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
* the packet comes down from IP layer to the IBD driver, it is in the
* following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
* This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
* machdr is not 4 byte aligned and had 2 bytes of padding at the end.
*
* The send routine at IBD driver changes this packet as follows:
* [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
* followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
* aligned.
*
* At the receiving side again ibd_process_rx takes the above packet and
* removes the two bytes of front padding and inserts it at the end. This
* is since the IP layer does not understand padding at the front.
*/
uchar_t *nd_lla_ptr; \
nd_opt_hdr_t *opt; \
int i; \
\
len -= sizeof (nd_neighbor_advert_t); \
(len != 0)) { \
+ IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
for (i = IPOIB_ADDRL; i > 0; i--) \
*(nd_lla_ptr + i + 1) = \
*(nd_lla_ptr + i - 1); \
} else { \
for (i = 0; i < IPOIB_ADDRL; i++) \
*(nd_lla_ptr + i) = \
*(nd_lla_ptr + i + 2); \
} \
*(nd_lla_ptr + i) = 0; \
*(nd_lla_ptr + i + 1) = 0; \
} \
}
/*
* IETF defined IPoIB encapsulation header, with 2b of ethertype
* followed by 2 reserved bytes. This is at the start of the
* datagram sent to and received over the wire by the driver.
*/
typedef struct ipoib_header {
} ipoib_hdr_t;
#define IPOIB_HDRSIZE sizeof (struct ipoib_header)
/*
* IETF defined IPoIB link address; IBA QPN, followed by GID,
* which has a prefix and suffix, as reported via ARP.
*/
typedef struct ipoib_mac {
} ipoib_mac_t;
#define IPOIB_ADDRL sizeof (struct ipoib_mac)
/*
* Pseudo header prepended to datagram in DLIOCRAW transmit path
* and when GLD hands the datagram to the gldm_send entry point.
*/
typedef struct ipoib_ptxhdr {
/*
* The pseudo-GRH structure that sits before the data in the
* receive buffer, and is overlaid on top of the real GRH.
* The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
* does not hold valid information. If it is indicated valid,
* the driver must additionally provide the sender's qpn in
* network byte order in ipoib_sqpn, and not touch the
* remaining parts which were DMA'ed in by the IBA hardware.
*/
typedef struct ipoib_pgrh {
} ipoib_pgrh_t;
/*
* The GRH is also dma'ed into recv buffers, thus space needs
* to be allocated for them.
*/
#define IPOIB_GRH_SIZE sizeof (ipoib_pgrh_t)
/* support the RC (reliable connected) mode */
#define IBD_MAC_ADDR_RC 0x80000000
/* support the UC (unreliable connected) mode */
#define IBD_MAC_ADDR_UC 0x40000000
/* Service ID used for RC connection establishment */
#define IBD_RC_SERVICE_ID 0x100000000000000ULL
/*
* Legacy OFED had used a wrong service ID (one additional zero digit) for
* many years. To interop with legacy OFED, we support this wrong service ID
* here.
*/
#define IBD_RC_SERVICE_ID_OFED_INTEROP 0x1000000000000000ULL
/* Minimum completion queue size for RC channels */
#define IBD_RC_MIN_CQ_SIZE 0x7f
/* Number of ibt_wc_t provided for each RC channel */
#define IBD_RC_MAX_CQ_WC 0x3f
#include <sys/mac_provider.h>
/* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
typedef enum {
IBD_RC_STATE_INIT = 0,
/* Active side */
IBD_RC_STATE_ACT_REP_RECV, /* reply received */
IBD_RC_STATE_ACT_ESTAB, /* established, ready to send */
IBD_RC_STATE_ACT_REJECT, /* rejected */
/* Someone else is closing this channel, please don't re-close it */
/* Passive side */
IBD_RC_STATE_PAS_REQ_RECV, /* request received */
IBD_RC_STATE_PAS_ESTAB, /* established, ready to receive */
IBD_RC_STATE_PAS_REJECT, /* rejected */
/*
* Structure to encapsulate various types of async requests.
*/
typedef struct ibd_acache_rq {
int rq_op; /* what operation */
void *rq_ptr;
void *rq_ptr2;
} ibd_req_t;
typedef struct ibd_mcache {
} ibd_mce_t;
typedef struct ibd_acache_s {
/* For Reliable Connected mode */
struct ibd_rc_chan_s *ac_chan;
/* protect tx_too_big_ongoing */
/* Deal with too big packet */
} ibd_ace_t;
#define IBD_MAX_SQSEG 59
#define IBD_MAX_RQSEG 1
typedef enum {
typedef enum {
IBD_WQE_TXBUF = 1,
IBD_WQE_LSOBUF = 2,
IBD_WQE_MAPPED = 3,
#ifdef DEBUG
typedef struct ibd_rc_stat_s {
/* pkt size <= state->id_rc_tx_copy_thresh */
/* fail in ibt_map_mem_iov() */
/* succ in ibt_map_mem_iov() */
/* no swqe even after recycle */
/* no tx large buf even after recycle */
/* short swqe in ibd_send() */
/* call mac_tx_update() when there is enough swqe */
/* short large buf in ibd_send() */
/* call mac_tx_update() when there is enough Tx large buffers */
/* ace->ac_chan == NULL for unicast packet */
/* not in active established state */
#endif
typedef struct ibd_rc_chan_list_s {
/* This mutex protects chan_list and ibd_rc_chan_t.next */
struct ibd_rc_chan_s *chan_list;
typedef struct ibd_rc_tx_largebuf_s {
struct ibd_rc_tx_largebuf_s *lb_next;
/*
* Pre-registered copybuf used for send and receive
*/
typedef struct ibd_copybuf_s {
typedef struct ibd_wqe_s {
} ibd_wqe_t;
/*
* Send WQE
*/
typedef struct ibd_swqe_s {
} ibd_swqe_t;
/*
* Receive WQE
*/
typedef struct ibd_rwqe_s {
struct ibd_state_s *w_state;
struct ibd_rc_chan_s *w_chan;
} ibd_rwqe_t;
typedef struct ibd_list_s {
union {
} ustat;
} ibd_list_t;
/*
* LSO buffers
*
* Under normal circumstances we should never need to use any buffer
* that's larger than MTU. Unfortunately, IB HCA has limitations
* on the length of SGL that are much smaller than those for regular
* ethernet NICs. Since the network layer doesn't care to limit the
* number of mblk fragments in any send mp chain, we end up having to
* use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
* buffers occasionally.
*/
typedef struct ibd_lsobuf_s {
struct ibd_lsobuf_s *lb_next;
int lb_isfree;
} ibd_lsobuf_t;
typedef struct ibd_lsobkt_s {
} ibd_lsobkt_t;
#define IBD_PORT_DRIVER 0x1
#define IBD_PARTITION_OBJ 0x2
/*
* Posting to a single software rx post queue is contentious,
* so break it out to (multiple) an array of queues.
*
* Try to ensure rx_queue structs fall in different cache lines using a filler.
* Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
*/
#define RX_QUEUE_CACHE_LINE \
typedef struct ibd_rx_queue_s {
/*
* This structure maintains information per port per HCA
* (per network interface).
*/
typedef struct ibd_state_s {
int id_tx_busy;
int id_scq_poll_busy;
int id_rx_nqueues;
/*
* id_ud_num_rwqe
* Number of "receive WQE" elements that will be allocated and used
* by ibd. This parameter is limited by the maximum channel size of
* the HCA. Each buffer in the receive wqe will be of MTU size.
*/
int id_rcq_poll_busy;
int id_mtu;
struct list id_req_list;
struct list id_ah_active;
struct list id_ah_free;
char id_ah_op;
struct list id_mc_full;
char id_prom_op;
int id_sched_needed;
int id_sched_cnt;
int id_sched_lso_cnt;
/*
* id_ud_num_swqe
* Number of "send WQE" elements that will be allocated and used by
* ibd. When tuning this parameter, the size of pre-allocated, pre-
* mapped copy buffer in each of these send wqes must be taken into
* account. This copy buffer size is determined by the value of
* IBD_TX_BUF_SZ (this is currently set to the same value of
* ibd_tx_copy_thresh, but may be changed independently if needed).
*/
/* For Reliable Connected Mode */
int rc_mtu;
/*
* In IPoIB over Reliable Connected mode, its mac address is added
* an "IBD_MAC_ADDR_RC" prefix. But for loopback filter in function
* ibd_process_rx(), the input mac address should not include the
* "IBD_MAC_ADDR_RC" prefix.
*
* So, we introduce the rc_macaddr_loopback for the loopback filter in
* IPoIB over Reliable Connected mode.
*
* rc_macaddr_loopback = id_macaddr excludes "IBD_MAC_ADDR_RC" prefix.
*/
/* obsolete active channel list */
/* Send */
/*
* This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
* and ibd_rc_tx_largebuf_t->lb_next
*/
/* The chunk of whole Tx large buffers */
/* For SRQ */
/* For chained receive */
/* Protect rc_timeout_start and rc_timeout */
/* Counters for RC mode */
/* RX */
/*
* # of Received packets. These packets are directly transferred to GLD
* without copy it
*/
/*
* # of Received packets. We will allocate new buffers for these packet,
* copy their content into new buffers, then transfer to GLD
*/
#ifdef DEBUG
#endif
/* wc->wc_status != IBT_WC_SUCCESS */
/* Tx */
/* pkt size <= ibd_rc_tx_copy_thresh */
/* fail in ibt_map_mem_iov() */
/* succ in ibt_map_mem_iov() */
/* short swqe in ibd_send() */
/* call mac_tx_update when there is enough swqe */
/* short tx large copy buf in ibd_send() */
/* call mac_tx_update when there is enough Tx copy buf */
/* No swqe even after call swqe recycle function */
/* No large Tx buf even after call swqe recycle function */
/* Connection setup and close */
/* ace->ac_chan == NULL for unicast packet */
/* not in active established state */
/* Fail to close a channel because someone else is still using it */
/* RCQ is being invoked when closing RC channel */
/* the counter of reset RC channel */
/*
* Fail to stop this port because this port is connecting to a remote
* port
*/
#ifdef DEBUG
#endif
int id_port_inst;
struct ibd_state_s *id_next;
/*
* UD Mode Tunables
*
* id_ud_tx_copy_thresh
* This sets the threshold at which ibd will attempt to do a bcopy
* of the outgoing data into a pre-mapped buffer. IPoIB driver's
* send behavior is restricted by various parameters, so setting of
* this value must be made after careful considerations only. For
* instance, IB HCAs currently impose a relatively small limit
* (when compared to ethernet NICs) on the length of the SGL for
* transmit. On the other hand, the ip stack could send down mp
* chains that are quite long when LSO is enabled.
*
* id_num_lso_bufs
* Number of "larger-than-MTU" copy buffers to use for cases when the
* outgoing mblk chain is too fragmented to be used with
* ibt_map_mem_iov() and too large to be used with regular MTU-sized
* copy buffers. It is not recommended to tune this variable without
* understanding the application environment and/or the available memory
* resources. The size of each of these lso buffers is determined by the value of
* IBD_LSO_BUFSZ.
*
* id_num_ah
* Number of AH cache entries to allocate
*
* id_hash_size
* Hash table size for the active AH list
*
*/
/* RC Mode Tunables */
/*
* id_rc_tx_copy_thresh
* This sets the threshold at which ibd will attempt to do a bcopy
* of the outgoing data into a pre-mapped buffer.
*
* id_rc_rx_copy_thresh
* If (the size of incoming buffer <= id_rc_rx_copy_thresh), ibd
* will attempt to allocate a buffer and do a bcopy of the incoming
* data into the allocated buffer.
*
* id_rc_rx_rwqe_thresh
* If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd
* will attempt to allocate a buffer and do a bcopy of the incoming
* data into the allocated buffer.
*
* id_rc_num_swqe
* 1) Send CQ size = ibd_rc_num_swqe
* 2) The send queue size = ibd_rc_num_swqe -1
* 3) Number of pre-allocated Tx buffers for ibt_post_send() =
* ibd_rc_num_swqe - 1.
*
* id_rc_num_rwqe
* 1) For non-SRQ, we pre-post id_rc_num_rwqe number of WRs
* via ibt_post_receive() for receive queue of each RC channel.
* 2) For SRQ and non-SRQ, receive CQ size = id_rc_num_rwqe
*
* For SRQ
* If using SRQ, we allocate id_rc_num_srq number of buffers (the
* size of each buffer is equal to RC mtu). And post them by
* ibt_post_srq().
*
* id_rc_num_srq
* id_rc_num_srq should not be larger than id_rc_num_rwqe,
* otherwise it will cause a bug with the following warnings:
* NOTICE: hermon0: Device Error: EQE cq overrun or protection error
* NOTICE: hermon0: Device Error: EQE local work queue catastrophic
* error
* NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
* catastrophic channel error
* NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
* completion queue error
*/
} ibd_state_t;
/*
* Structures to track global IBTF data, data that is shared
* among the IBD device instances. This includes the one ibt_hdl
* and the list of service registrations.
*/
typedef struct ibd_service_s {
struct ibd_service_s *is_link;
typedef struct ibd_global_state_s {
typedef struct ibd_rc_msg_hello_s {
typedef struct ibd_rc_chan_s {
struct ibd_rc_chan_s *next;
/* channel hdl that we'll be using for Reliable Connected Mode */
struct ibd_state_s *state;
/* start address of Tx Buffers */
/* For chained send */
int tx_busy;
/* For tx buffer recycle */
int tx_poll_busy;
/* Rx */
/* For chained receive */
/*
* We need two channels for each connection.
* One channel for Tx; another channel for Rx.
* If "is_tx_chan == B_TRUE", this is a Tx channel.
*/
/*
* For the connection reaper routine ibd_rc_conn_timeout_call().
* "is_used == B_FALSE" indicates this RC channel has not been used for
* a long (=ibd_rc_conn_timeout) time.
*/
/*
* When closing this channel, we need to make sure
* "chan->rcq_invoking == 0".
*/
/*
* The following functions are defined in "ibd.c".
* They are also used by "ibd_cm.c"
*/
/* printf-style warning-message logger for a driver instance */
void ibd_print_warn(ibd_state_t *, char *, ...);
/*
* The following functions are defined in "ibd_cm.c".
* They are also used in "ibd.c".
*/
/* stop listening for incoming RC connection requests */
void ibd_rc_stop_listen(ibd_state_t *);
/* NOTE(review): orphan fragment — the start of this declaration is missing from this view */
uint64_t);
/* close every open RC channel of this instance */
void ibd_rc_close_all_chan(ibd_state_t *);
/* timeout callback for reaping long-unused RC connections */
void ibd_rc_conn_timeout_call(void *carg);
/* Receive Functions */
/* set up / tear down the SRQ (shared receive queue) buffer list */
int ibd_rc_init_srq_list(ibd_state_t *);
void ibd_rc_fini_srq_list(ibd_state_t *);
/* Send Functions */
/* release the pre-allocated Tx large-buffer list */
void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
/* reclaim resources of a completed RC send wqe — presumably; confirm in ibd_cm.c */
void ibd_rc_tx_cleanup(ibd_swqe_t *);
/* Others */
/* read RC-mode configuration/tunables into the state structure */
void ibd_rc_get_conf(ibd_state_t *);
/* initialize RC-mode statistics; nonzero return presumably indicates failure */
int ibd_rc_init_stats(ibd_state_t *);
#endif /* _KERNEL && !_BOOT */
#ifdef __cplusplus
}
#endif
#endif /* _SYS_IB_CLIENTS_IBD_H */