/* ibd.h revision 9469b38fa64cb68eac7cc7f4b7595b0de1fb77d6 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_IB_CLIENTS_IBD_H
#define _SYS_IB_CLIENTS_IBD_H
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Completion queue polling control
 */
#define IBD_CQ_POLLING 0x1
#define IBD_REDO_CQ_POLLING 0x2
/*
 * Maximum length for returning chained mps back to crossbow.
 * Also used as the maximum number of rx wc's polled at a time.
 */
#define IBD_MAX_RX_MP_LEN 16
/*
 * When doing multiple-send-wr, this value determines how many to do at
 * a time (in a single ibt_post_send).
 */
#define IBD_MAX_TX_POST_MULTIPLE 4
/*
 * Flag bits for resources to reap
 */
#define IBD_RSRC_SWQE 0x1
#define IBD_RSRC_LSOBUF 0x2
#define IBD_RSRC_RC_SWQE 0x4
#define IBD_RSRC_RC_TX_LARGEBUF 0x8
/*
 * Async operation types: request codes queued (in an ibd_req_t, see
 * rq_op below) to the driver's async thread, which performs all AH/MCE
 * list manipulation (see the "AH and MCE active list manipulation"
 * discussion further down).
 */
#define IBD_ASYNC_GETAH 1
#define IBD_ASYNC_JOIN 2
#define IBD_ASYNC_LEAVE 3
#define IBD_ASYNC_PROMON 4
#define IBD_ASYNC_PROMOFF 5
#define IBD_ASYNC_REAP 6
#define IBD_ASYNC_TRAP 7
#define IBD_ASYNC_SCHED 8
#define IBD_ASYNC_LINK 9
#define IBD_ASYNC_EXIT 10
#define IBD_ASYNC_RC_TOO_BIG 11
#define IBD_ASYNC_RC_CLOSE_ACT_CHAN 12
#define IBD_ASYNC_RC_RECYCLE_ACE 13
/*
 * Miscellaneous constants
 */
#define IBD_SEND 0
#define IBD_RECV 1
/*
 * Thresholds
 *
 * When waiting for resources (swqes or lso buffers) to become available,
 * the first two thresholds below determine how long to wait before informing
 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
 * determines how low the available swqes should go before we start polling
 * the completion queue.
 */
#define IBD_FREE_LSOS_THRESH 8
#define IBD_FREE_SWQES_THRESH 20
#define IBD_TX_POLL_THRESH 80
#ifdef DEBUG
void debug_print(int l, char *fmt, ...);
#define DPRINT debug_print
#else
/*
 * Non-DEBUG builds: DPRINT(args) expands to "0 && (args)", so the call
 * site still parses but the arguments are never evaluated and the whole
 * expression is optimized away.
 */
#define DPRINT 0 &&
#endif
/*
* AH and MCE active list manipulation:
*
* Multicast disable requests and MCG delete traps are two cases
* where the active AH entry for the mcg (if any unreferenced one exists)
* will be moved to the free list (to force the next Tx to the mcg to
* join the MCG in SendOnly mode). Port up handling will also move AHs
* from active to free list.
*
* In the case when some transmits are still pending on an entry
* for an mcg, but a multicast disable has already been issued on the
* mcg, there are some options to consider to preserve the join state
* to ensure the emitted packet is properly routed on the IBA fabric.
* For the AH, we can
* 1. take out of active list at multicast disable time.
* 2. take out of active list only when last pending Tx completes.
* For the MCE, we can
* 3. take out of active list at multicast disable time.
* 4. take out of active list only when last pending Tx completes.
* 5. move from active list to stale list at multicast disable time.
* We choose to use 2,4. We use option 4 so that if a multicast enable
* is tried before the pending Tx completes, the enable code finds the
* mce in the active list and just has to make sure it will not be reaped
* (ie the mcg leave done) when the pending Tx does complete. Alternatively,
* a stale list (#5) that would be checked in the enable code would need
* to be implemented. Option 2 is used, because otherwise, a Tx attempt
* after the multicast disable would try to put an AH in the active list,
* and associate the mce it finds in the active list to this new AH,
* whereas the mce is already associated with the previous AH (taken off
* the active list), and will be removed once the pending Tx's complete
* (unless a reference count on mce's is implemented). One implication of
* using 2,4 is that new Tx's posted before the pending Tx's complete will
* grab new references on the AH, further delaying the leave.
*
* In the case of mcg delete (or create) trap when the port is sendonly
* joined, the AH and MCE handling is different: the AH and MCE has to be
* immediately taken off the active lists (forcing a join and path lookup
* at the next Tx is the only guaranteed means of ensuring a proper Tx
* to an mcg as it is repeatedly created and deleted and goes thru
* reincarnations).
*
* When a port is already sendonly joined, and a multicast enable is
* attempted, the same mce structure is promoted; this ensures only a
* single mce on the active list tracks the most powerful join state.
*
* In the case of port up event handling, the MCE for sendonly membership
* is freed up, and the ACE is put into the free list as soon as possible
* (depending on whether posted Tx's have completed). For fullmembership
* MCE's though, the ACE is similarly handled; but the MCE is kept around
* (a re-JOIN is attempted) only if the DLPI leave has not already been
* done; else the mce is deconstructed (mc_fullreap case).
*
* MCG creation and deletion trap handling:
*
* These traps are unreliable (meaning sometimes the trap might never
* be delivered to the subscribed nodes) and may arrive out-of-order
* since they use UD transport. An alternative to relying on these
* unreliable traps is to poll for mcg presence every so often, but
* instead of doing that, we try to be as conservative as possible
* while handling the traps, and hope that the traps do arrive at
* the subscribed nodes soon. Note that if a node is fullmember
* joined to an mcg, it can not possibly receive a mcg create/delete
* trap for that mcg (by fullmember definition); if it does, it is
* an old trap from a previous incarnation of the mcg.
*
* Whenever a trap is received, the driver cleans up its sendonly
* membership to the group; we choose to do a sendonly leave even
* on a creation trap to handle the case of a prior deletion of the mcg
* having gone unnoticed. Consider an example scenario:
* T1: MCG M is deleted, and fires off deletion trap D1.
* T2: MCG M is recreated, fires off creation trap C1, which is lost.
* T3: Node N tries to transmit to M, joining in sendonly mode.
* T4: MCG M is deleted, and fires off deletion trap D2.
* T5: N receives a deletion trap, but can not distinguish D1 from D2.
* If the trap is D2, then a LEAVE is not required, since the mcg
* is already deleted; but if it is D1, a LEAVE is required. A safe
* approach is to always LEAVE, but the SM may be confused if it
* receives a LEAVE without a prior JOIN.
*
* Management of the non-membership to an mcg is similar to the above,
* except that if the interface is in promiscuous mode, it is required
* to attempt to re-join the mcg after receiving a trap. Unfortunately,
* if the re-join attempt fails (in which case a warning message needs
* to be printed), it is not clear whether it failed due to the mcg not
* existing, or some other transient condition; querying the SA for the
* mcg is also racy at best. Thus, the driver just prints a warning
* message when it can not rejoin after receiving a create trap, although
* this might be (on rare occasions) a mis-warning if the create trap is
* received after the mcg was deleted.
*/
/*
* Implementation of atomic "recycle" bits and reference count
* on address handles. This utilizes the fact that max reference
* count on any handle is limited by number of send wqes, thus
* high bits in the ac_ref field can be used as the recycle bits,
* and only the low bits hold the number of pending Tx requests.
* This atomic AH reference counting allows the Tx completion
* handler not to acquire the id_ac_mutex to process every completion,
* thus reducing lock contention problems between completion and
* the Tx path.
*/
#define CYCLEVAL 0x80000
/*
 * NOTE(review): the bodies of GET_REF_CYCLE, SET_CYCLE_IF_REF and
 * DEC_REF_DO_CYCLE below are not syntactically complete in this
 * revision -- the actual atomic expressions are missing and a stray
 * closing brace follows GET_REF_CYCLE. Per the comment above, these
 * macros atomically manipulate ac_ref: the low bits hold the pending
 * Tx count and the bit at CYCLEVAL is the "recycle" marker. Restore
 * the full macro text from a pristine copy of ibd.h before building.
 */
#define GET_REF_CYCLE(ace) ( \
/* \
* Make sure "cycle" bit is set. \
*/ \
)
}
#define SET_CYCLE_IF_REF(ace) ( \
CYCLEVAL ? \
/* \
* Clear the "cycle" bit we just set; \
* ref count known to be 0 from above. \
*/ \
/* \
* We set "cycle" bit; let caller know. \
*/ \
B_TRUE \
)
#define DEC_REF_DO_CYCLE(ace) ( \
/* \
* Ref count known to be 0 from above. \
*/ \
B_TRUE : \
B_FALSE \
)
/*
* Address handle entries maintained by the driver are kept in the
* free and active lists. Each entry starts out in the free list;
* it migrates to the active list when primed using ibt_get_paths()
* and ibt_modify_ud_dest() for transmission to a specific destination.
* In the active list, the entry has a reference count indicating the
* number of ongoing/uncompleted transmits that reference it. The
* entry is left in the active list even after the reference count
* goes to 0, since successive transmits can find it there and do
* not need to set up another entry (ie the path information is
* cached using the active list). Entries on the active list are
* also hashed using the destination link address as a key for faster
* lookups during transmits.
*
* For any destination address (unicast or multicast, whatever the
* join states), there will be at most one entry in the active list.
* Entries with a 0 reference count on the active list can be reused
* for a transmit to a new destination, if the free list is empty.
*
* active list does not need a lock (all operations are done by the
* async thread) but updates to the reference count are atomically
* done (increments done by Tx path, decrements by the Tx callback handler).
*/
/*
 * NOTE(review): the bodies of IBD_ACACHE_GET_FREE and
 * IBD_ACACHE_GET_ACTIVE are truncated in this revision (orphaned
 * braces, missing statements). They are expected to pop an address
 * handle entry off the free/active list of "state" (see the list
 * discussion above); restore from a pristine copy before building.
 */
#define IBD_ACACHE_GET_FREE(state) \
int _ret_; \
}
}
#define IBD_ACACHE_GET_ACTIVE(state) \
/*
* Padding for nd6 Neighbor Solicitation and Advertisement needs special
* handling: IP adds 2 bytes of padding by default at the end. The routine
* which does this is nce_xmit()
* in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
* the packet comes down from IP layer to the IBD driver, it is in the
* following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
* This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
* machdr is not 4 byte aligned and had 2 bytes of padding at the end.
*
* The send routine at IBD driver changes this packet as follows:
* [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
* followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
* aligned.
*
* At the receiving side again ibd_process_rx takes the above packet and
* removes the two bytes of front padding and inserts it at the end. This
* is since the IP layer does not understand padding at the front.
*/
/*
 * NOTE(review): the lines below are the tail of a macro whose #define
 * line and opening logic are missing from this revision. Per the
 * comment above, it re-pads the nd6 link-layer address option: on one
 * path it shifts the IPOIB_ADDRL-byte LLA up by 2 bytes (copying
 * backwards to handle overlap), on the other it shifts it down by 2
 * and zeroes the two freed trailing bytes. Incomplete as-is.
 */
uchar_t *nd_lla_ptr; \
nd_opt_hdr_t *opt; \
int i; \
\
len -= sizeof (nd_neighbor_advert_t); \
(len != 0)) { \
+ IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
for (i = IPOIB_ADDRL; i > 0; i--) \
*(nd_lla_ptr + i + 1) = \
*(nd_lla_ptr + i - 1); \
} else { \
for (i = 0; i < IPOIB_ADDRL; i++) \
*(nd_lla_ptr + i) = \
*(nd_lla_ptr + i + 2); \
} \
*(nd_lla_ptr + i) = 0; \
*(nd_lla_ptr + i + 1) = 0; \
} \
}
/*
* IETF defined IPoIB encapsulation header, with 2b of ethertype
* followed by 2 reserved bytes. This is at the start of the
* datagram sent to and received over the wire by the driver.
*/
typedef struct ipoib_header {
/* NOTE(review): members missing in this revision; per the comment */
/* above: 2 bytes of ethertype followed by 2 reserved bytes. */
} ipoib_hdr_t;
/* NOTE(review): sizeof an empty struct is invalid C; see note above. */
#define IPOIB_HDRSIZE sizeof (struct ipoib_header)
/*
 * IETF defined IPoIB link address; IBA QPN, followed by GID,
 * which has a prefix and suffix, as reported via ARP.
 */
typedef struct ipoib_mac {
/* NOTE(review): members missing in this revision; per the comment */
/* above: QPN, then GID prefix and suffix. */
} ipoib_mac_t;
#define IPOIB_ADDRL sizeof (struct ipoib_mac)
/*
 * Pseudo header prepended to datagram in DLIOCRAW transmit path
 * and when GLD hands the datagram to the gldm_send entry point.
 */
typedef struct ipoib_ptxhdr {
/* NOTE(review): struct body missing and the brace is never closed */
/* before the next typedef in this revision. */
/*
 * The pseudo-GRH structure that sits before the data in the
 * receive buffer, and is overlaid on top of the real GRH.
 * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
 * does not hold valid information. If it is indicated valid,
 * the driver must additionally provide the sender's qpn in
 * network byte order in ipoib_sqpn, and not touch the
 * remaining parts which were DMA'ed in by the IBA hardware.
 */
typedef struct ipoib_pgrh {
/* NOTE(review): members (ipoib_vertcflow, ipoib_sqpn, ...) missing */
/* in this revision -- see the comment above for their semantics. */
} ipoib_pgrh_t;
/*
 * The GRH is also dma'ed into recv buffers, thus space needs
 * to be allocated for them.
 */
#define IPOIB_GRH_SIZE sizeof (ipoib_pgrh_t)
/* support the RC (reliable connected) mode */
#define IBD_MAC_ADDR_RC 0x80000000
/* support the UC (unreliable connected) mode */
#define IBD_MAC_ADDR_UC 0x40000000
#define IBD_RC_SERVICE_ID 0x100000000000000ULL
/*
 * Legacy OFED had used a wrong service ID (one additional zero digit) for
 * many years. To interop with legacy OFED, we support this wrong service ID
 * here.
 */
#define IBD_RC_SERVICE_ID_OFED_INTEROP 0x1000000000000000ULL
#define IBD_RC_MIN_CQ_SIZE 0x7f
/* Number of ibt_wc_t provided for each RC channel */
#define IBD_RC_MAX_CQ_WC 0x3f
#include <sys/mac_provider.h>
/* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
typedef enum {
IBD_RC_STATE_INIT = 0,
/* Active side */
IBD_RC_STATE_ACT_REP_RECV, /* reply received */
IBD_RC_STATE_ACT_ESTAB, /* established, ready to send */
IBD_RC_STATE_ACT_REJECT, /* rejected */
/* Someone else is closing this channel, please don't re-close it */
/* Passive side */
IBD_RC_STATE_PAS_REQ_RECV, /* request received */
IBD_RC_STATE_PAS_ESTAB, /* established, ready to receive */
IBD_RC_STATE_PAS_REJECT, /* rejected */
/* NOTE(review): this enum is never closed in this revision -- several */
/* enumerators (e.g. the "closing" state the comment above refers to) */
/* and the closing "} <typedef-name>;" are missing. */
/*
 * Structure to encapsulate various types of async requests.
 * Queued to the async thread; rq_op selects the operation.
 */
typedef struct ibd_acache_rq {
int rq_op; /* what operation: one of the IBD_ASYNC_* codes above */
void *rq_ptr; /* op-specific argument -- TODO(review) confirm per-op use */
void *rq_ptr2; /* second op-specific argument -- TODO(review) confirm */
} ibd_req_t;
typedef struct ibd_mcache {
/* NOTE(review): members missing in this revision. */
} ibd_mce_t;
typedef struct ibd_acache_s {
/* For Reliable Connected mode */
struct ibd_rc_chan_s *ac_chan;
/* protect tx_too_big_ongoing */
/* Deal with too big packet */
/* NOTE(review): remaining members (including the ac_ref field the */
/* atomic recycle macros above operate on) are missing here; only */
/* their comments survived the truncation. */
} ibd_ace_t;
#define IBD_MAX_SQSEG 59
#define IBD_MAX_RQSEG 1
/*
 * NOTE(review): two back-to-back "typedef enum {" lines follow -- the
 * body and closing of the first enum were lost in this revision; the
 * second (IBD_WQE_*) lists WQE buffer types but is also never closed.
 */
typedef enum {
typedef enum {
IBD_WQE_TXBUF = 1,
IBD_WQE_LSOBUF = 2,
IBD_WQE_MAPPED = 3,
#ifdef DEBUG
/*
 * NOTE(review): the DEBUG-only RC statistics struct below lost its
 * member declarations; only the per-counter comments remain and the
 * struct is never closed before the #endif.
 */
typedef struct ibd_rc_stat_s {
/* pkt size <= ibd_rc_tx_copy_thresh */
/* fail in ibt_map_mem_iov() */
/* succ in ibt_map_mem_iov() */
/* no swqe even after recycle */
/* no tx large buf even after recycle */
/* short swqe in ibd_send() */
/* call mac_tx_update() when there is enough swqe */
/* short large buf in ibd_send() */
/* call mac_tx_update() when there is enough Tx large buffers */
/* ace->ac_chan == NULL for unicast packet */
/* not in active established state */
#endif
/*
 * NOTE(review): most struct bodies in this section are truncated in
 * this revision (members stripped, several typedefs never closed).
 * The surviving comments and member names are kept verbatim below.
 */
typedef struct ibd_rc_chan_list_s {
/* This mutex protects chan_list and ibd_rc_chan_t.next */
struct ibd_rc_chan_s *chan_list;
typedef struct ibd_rc_tx_largebuf_s {
struct ibd_rc_tx_largebuf_s *lb_next; /* free-list link (see mutex note on the state struct) */
/*
 * Pre-registered copybuf used for send and receive
 */
typedef struct ibd_copybuf_s {
typedef struct ibd_wqe_s {
} ibd_wqe_t;
/*
 * Send WQE
 */
typedef struct ibd_swqe_s {
} ibd_swqe_t;
/*
 * Receive WQE
 */
typedef struct ibd_rwqe_s {
struct ibd_state_s *w_state; /* back-pointer to owning per-port state */
struct ibd_rc_chan_s *w_chan; /* RC channel this rwqe belongs to -- TODO(review) confirm */
} ibd_rwqe_t;
typedef struct ibd_list_s {
union {
} ustat;
} ibd_list_t;
/*
 * LSO buffers
 *
 * Under normal circumstances we should never need to use any buffer
 * that's larger than MTU. Unfortunately, IB HCA has limitations
 * on the length of SGL that are much smaller than those for regular
 * ethernet NICs. Since the network layer doesn't care to limit the
 * number of mblk fragments in any send mp chain, we end up having to
 * use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
 * buffers occasionally.
 */
typedef struct ibd_lsobuf_s {
struct ibd_lsobuf_s *lb_next; /* next buffer in the bucket's list */
int lb_isfree; /* nonzero when the buffer is available -- TODO(review) confirm */
} ibd_lsobuf_t;
typedef struct ibd_lsobkt_s {
} ibd_lsobkt_t;
/*
 * Posting to a single software rx post queue is contentious,
 * so break it out to (multiple) an array of queues.
 *
 * Try to ensure rx_queue structs fall in different cache lines using a filler.
 * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
 */
/* NOTE(review): the RX_QUEUE_CACHE_LINE expansion and the rx_queue */
/* struct body are missing in this revision; the typedef is unclosed. */
#define RX_QUEUE_CACHE_LINE \
typedef struct ibd_rx_queue_s {
/*
 * This structure maintains information per port per HCA
 * (per network interface).
 *
 * NOTE(review): heavily truncated in this revision -- most members
 * were stripped, leaving only their comments. The surviving members
 * are documented (hedged where the code does not show the semantics).
 */
typedef struct ibd_state_s {
int id_tx_busy; /* Tx path busy flag -- TODO(review) confirm exact use */
int id_scq_poll_busy; /* send CQ poll in progress flag */
int id_rx_nqueues; /* number of software rx post queues (see above) */
int id_rcq_poll_busy; /* receive CQ poll in progress flag */
int id_mtu; /* interface MTU (UD mode) */
struct list id_req_list; /* queued ibd_req_t async requests */
struct list id_ah_active; /* active AH entries (see AH/MCE discussion) */
struct list id_ah_free; /* free AH entries */
char id_ah_op; /* state of outstanding AH operation -- TODO(review) confirm */
struct list id_mc_full; /* fullmember mcg entries -- TODO(review) confirm */
char id_prom_op; /* state of promiscuous-mode operation -- TODO(review) confirm */
int id_sched_needed; /* IBD_RSRC_* bits of resources Tx is waiting on */
int id_sched_cnt;
int id_sched_lso_cnt;
/* For Reliable Connected Mode */
int rc_mtu; /* MTU used on RC channels */
/*
 * In IPoIB over Reliable Connected mode, its mac address is added
 * an "IBD_MAC_ADDR_RC" prefix. But for loopback filter in function
 * ibd_process_rx(), the input mac address should not include the
 * "IBD_MAC_ADDR_RC" prefix.
 *
 * So, we introduce the rc_macaddr_loopback for the loopback filter in
 * IPoIB over Reliable Connected mode.
 *
 * rc_macaddr_loopback = id_macaddr excludes "IBD_MAC_ADDR_RC" prefix.
 */
/* obsolete active channel list */
/* Send */
/*
 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
 * and ibd_rc_tx_largebuf_t->lb_next
 */
/* The chunk of whole Tx large buffers */
/* For SRQ */
/* For chained receive */
/* Counters for RC mode */
/* RX */
/*
 * # of Received packets. These packets are directly transferred to GLD
 * without copy it
 */
/*
 * # of Received packets. We will allocate new buffers for these packet,
 * copy their content into new buffers, then transfer to GLD
 */
#ifdef DEBUG
#endif
/* # of invoke Receive CQ handler */
/* wc->wc_status != IBT_WC_SUCCESS */
/* Tx */
/* pkt size <= ibd_rc_tx_copy_thresh */
/* fail in ibt_map_mem_iov() */
/* succ in ibt_map_mem_iov() */
/* short swqe in ibd_send() */
/* call mac_tx_update when there is enough swqe */
/* short tx large copy buf in ibd_send() */
/* call mac_tx_update when there is enough Tx copy buf */
/* No swqe even after call swqe recycle function */
/* No large Tx buf even after call swqe recycle function */
/* # of invoke Send CQ handler */
/* Connection setup and close */
/* ace->ac_chan == NULL for unicast packet */
/* not in active established state */
/* the counter of reset RC channel */
#ifdef DEBUG
#endif
} ibd_state_t;
/*
 * Structures to track global IBTF data, data that is shared
 * among the IBD device instances. This includes the one ibt_hdl
 * and the list of service registrations.
 *
 * NOTE(review): every typedef in this section is truncated in this
 * revision -- members are stripped and none of the structs is closed.
 */
typedef struct ibd_service_s {
struct ibd_service_s *is_link; /* next registered service */
typedef struct ibd_global_state_s {
typedef struct ibd_rc_msg_hello_s {
typedef struct ibd_rc_chan_s {
struct ibd_rc_chan_s *next; /* link in chan_list; protected by the chan-list mutex */
/* channel hdl that we'll be using for Reliable Connected Mode */
struct ibd_state_s *state; /* back-pointer to owning per-port state */
/* used to judge duplicate connection */
/* start address of Tx Buffers */
/* For chained send */
int tx_busy; /* Tx in progress flag -- TODO(review) confirm exact use */
/* For tx buffer recycle */
int tx_poll_busy; /* Tx CQ poll in progress flag */
/* Rx */
/* For chained receive */
/*
 * We need two channels for each connection.
 * One channel for Tx; another channel for Rx.
 * If "is_tx_chan == B_TRUE", this is a Tx channel.
 */
/*
 * The following functions are defined in "ibd.c".
 * They are also used by "ibd_cm.c"
 */
void ibd_print_warn(ibd_state_t *, char *, ...);
/*
 * The following functions are defined in "ibd_cm.c".
 * They are also used in "ibd.c".
 */
void ibd_rc_stop_listen(ibd_state_t *);
/* NOTE(review): dangling fragment of a truncated prototype below -- */
/* the function name and leading parameters were lost; restore from a */
/* pristine copy of ibd.h. */
uint64_t);
void ibd_rc_close_all_chan(ibd_state_t *);
/* Receive Functions */
int ibd_rc_init_srq_list(ibd_state_t *);
void ibd_rc_fini_srq_list(ibd_state_t *);
/* Send Functions */
void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
void ibd_rc_tx_cleanup(ibd_swqe_t *);
/* Others */
void ibd_rc_get_conf(ibd_state_t *);
int ibd_rc_init_stats(ibd_state_t *);
/* NOTE(review): the matching "#if defined(_KERNEL) && !defined(_BOOT)" */
/* for this #endif is not present in this revision. */
#endif /* _KERNEL && !_BOOT */
#ifdef __cplusplus
}
#endif
#endif /* _SYS_IB_CLIENTS_IBD_H */