/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* An implementation of the IPoIB standard based on PSARC 2001/289.
*/
#include <sys/mac_provider.h>
#include <sys/multidata.h>
#include <sys/priv_names.h>
/*
* The write-up below includes details on the following:
* 1. The dladm administrative model.
* 2. Late HCA initialization feature.
* 3. Brussels support and its implications for the current architecture.
*
* 1. The dladm administrative model.
* ------------------------------------------
* With the dladm model, ibnex will create one ibd instance per port. These
* instances will be created independent of the port state.
*
* The ibd driver has two facets: one side works as the port driver and
* the other as the partition object driver.
*
* The port instance is a child of the HCA, and will have an entry in the devfs.
* A DDI attach only happens for the port driver, and its attach is
* handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
* handled in ibd_port_unattach().
*
* The partition object is only a registrant to the mac layer via mac_register()
* and does not have an entry in the device tree. There is no DDI softstate
* managed by the DDI framework for the partition objects. However, the state is
* managed inside the ibd driver, and every partition object hangs off the
* "ibd_objlist_head".
*
* The partition object first comes into existence when a user runs the
* 'create-part' subcommand of dladm. This is like invoking the attach entry
* point of the partition object. The partition object goes away with the
* 'delete-part' subcommand of dladm. This is like invoking the detach entry
* point of the partition object.
*
* The create-part and delete-part subcommands result in dld ioctls that end up
* calling ibd_create_partition() and ibd_delete_partition() respectively.
* These ioctls are registered with the dld layer in _init() via a call to
* dld_ioc_register().
*
* The port instance by itself cannot be plumbed. Only the partition
* objects can be plumbed, and they alone participate in I/O; the
* port driver does not.
*
* There are some info ioctls supported in ibd which are used by dladm(1M) to
* display useful information. The info entry point for ibd is
* ibd_get_partition_info().
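*
* For illustration, the administrative flow described above maps to dladm
* usage along the following lines (the link and pkey names are examples
* only, not taken from this file):
*
*	# dladm create-part -l ibp0 -P 0xffff part.ffff
*	# dladm show-part
*	# dladm delete-part part.ffff
*
* create-part/delete-part reach the driver through the dld ioctls mentioned
* above, while commands such as show-part can make use of the info ioctls
* serviced by ibd_get_partition_info().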
*
* 2. Late HCA initialization feature.
* ------------------------------------
* As mentioned in section 1, the user creates the partition objects via
* dladm(1M). It is possible that:
* a) The physical port itself is down and the SM cannot be reached.
* b) The PKEY specified by the user has not been created in the SM yet.
* c) An IPoIB broadcast group for the specified PKEY is not present.
*
* In all of the above cases, complete initialization of the partition object is
* not possible. However, the new model allows the creation of partition
* objects even in such cases but will defer the initialization for later.
* When such a partition object is plumbed, the link state will be displayed as
* "down".
* The driver, at this point, is listening to events that herald the
* availability of resources -
* i) LINK_UP when the link becomes available
* ii) PORT_CHANGE when the PKEY has been created
* iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
* created
* via ibd_async_handler() for events i) and ii), and via
* ibd_snet_notices_handler() for iii.
* The driver handles these events (as and when they arrive) and completes the
* initialization of the partition object and transitions it to a usable state.
*
* 3. Brussels support and its implications for the current architecture.
* ---------------------------------------------------------------------
* The Brussels support introduces two new interfaces to the ibd driver -
* ibd_m_getprop() and ibd_m_setprop().
* These interfaces allow setting and retrieval of certain properties.
* Some of them are public properties while most others are private properties
* meant to be used by developers. Tuning the latter can cause
* performance issues and should not be done without understanding the
* implications. All properties are specific to an instance of either the
* partition object or the port driver.
*
* The public properties are : mtu and linkmode.
* mtu is a read-only property.
* linkmode can take two values - UD and CM.
*
* Changing the linkmode requires some bookkeeping in the driver. The
* capabilities need to be re-reported to the mac layer. This is done by
* calling mac_capab_update(). The maxsdu is updated by calling
* mac_maxsdu_update2().
* The private properties retain their values across the change of linkmode.
* NOTE:
* - The port driver does not support any property apart from mtu.
* - All other properties are only meant for the partition object.
* - The properties cannot be set when an instance is plumbed. The
* instance has to be unplumbed to effect any setting.
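*
* For illustration, the linkmode property described above would typically
* be examined and changed with dladm along the following lines (the
* partition link name is an example only, and the instance must be
* unplumbed first):
*
*	# dladm show-linkprop -p linkmode part.ffff
*	# dladm set-linkprop -p linkmode=cm part.ffff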
*/
/*
* Driver wide tunables
*
* ibd_tx_softintr
* ibd_rx_softintr
* The softintr mechanism allows ibd to avoid event queue overflows if
* the receive/completion handlers are expensive. These are enabled
* by default.
*
* ibd_log_sz
* This specifies the size of the ibd log buffer in bytes. The buffer is
* allocated and logging is enabled only when IBD_LOGGING is defined.
*
*/
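/*
 * A sketch of how the driver-wide tunables described above are conventionally
 * declared; "enabled by default" above implies the value 1 for the softintr
 * tunables, while the types and the ibd_log_sz default shown here are
 * assumptions, not taken from this file:
 *
 *	int	ibd_rx_softintr = 1;	rx completions via soft interrupt
 *	int	ibd_tx_softintr = 1;	tx completions via soft interrupt
 *	uint_t	ibd_log_sz = 0x20000;	log buffer size (IBD_LOGGING only)
 */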
#ifdef IBD_LOGGING
#endif
#ifdef IBD_LOGGING
#endif
/* Post IBD_RX_POST_CNT receive work requests at a time. */
/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
/* Minimum number of receive work requests driver needs to always have */
/*
* LSO parameters
*/
/*
* Async operation states
*/
#define IBD_OP_NOTSTARTED 0
/*
* the OR of start and stop flag values.
*/
/*
* Miscellaneous constants
*/
#ifdef IBD_LOGGING
#endif
/*
* Enumerations for link states
*/
typedef enum {
/*
* Driver State Pointer
*/
void *ibd_list;
/*
* Driver Global Data
*/
/*
* Partition object list
*/
/*
* Logging
*/
#ifdef IBD_LOGGING
#endif
/*
* Required system entry points
*/
/*
* Required driver entry points for GLDv3
*/
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_promisc(void *, boolean_t);
static int ibd_m_unicst(void *, const uint8_t *);
static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);
/*
* Private driver entry points for GLDv3
*/
/*
* Initialization
*/
static int ibd_init_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static int ibd_acache_init(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_init(void);
#endif
/*
*/
static void ibd_state_fini(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_acache_fini(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_fini(void);
#endif
/*
* Allocation/acquire/map routines
*/
static int ibd_alloc_tx_copybufs(ibd_state_t *);
static int ibd_alloc_rx_copybufs(ibd_state_t *);
static int ibd_alloc_tx_lsobufs(ibd_state_t *);
uint32_t *);
/*
*/
static void ibd_free_tx_copybufs(ibd_state_t *);
static void ibd_free_rx_copybufs(ibd_state_t *);
static void ibd_free_rx_rsrcs(ibd_state_t *);
static void ibd_free_tx_lsobufs(ibd_state_t *);
/*
*/
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_freemsg_cb(char *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
    ibt_subnet_event_code_t, ibt_subnet_event_t *);
/*
*/
/*
* Threads
*/
static void ibd_async_work(ibd_state_t *);
/*
* Async tasks
*/
static void ibd_async_setprom(ibd_state_t *);
static void ibd_async_unsetprom(ibd_state_t *);
static void ibd_async_txsched(ibd_state_t *);
/*
* Async task helpers
*/
ipoib_mac_t *, ipoib_mac_t *);
static void ibd_async_done(ibd_state_t *);
/*
*/
static int ibd_record_capab(ibd_state_t *);
static int ibd_get_port_details(ibd_state_t *);
static int ibd_alloc_cqs(ibd_state_t *);
static int ibd_setup_ud_channel(ibd_state_t *);
static int ibd_start(ibd_state_t *);
static int ibd_port_attach(dev_info_t *);
static int ibd_part_busy(ibd_state_t *);
/*
* Miscellaneous helpers
*/
static int ibd_sched_poll(ibd_state_t *, int, int);
static void ibd_resume_transmission(ibd_state_t *);
static void *list_get_head(list_t *);
#ifdef IBD_LOGGING
static void ibd_log(const char *, ...);
#endif
/* Module Driver Info */
&mod_driverops, /* This one is a driver */
"InfiniBand GLDv3 Driver", /* short description */
&ibd_dev_ops /* driver specific ops */
};
/* Module Linkage */
};
/*
* Module (static) info passed to IBTL during ibt_attach
*/
NULL,
"IBPART"
};
NULL,
"IPIB"
};
/*
* GLDv3 entry points
*/
#define IBD_M_CALLBACK_FLAGS \
NULL,
NULL,
NULL,
NULL,
};
/* Private properties */
char *ibd_priv_props[] = {
"_ibd_broadcast_group",
"_ibd_coalesce_completions",
"_ibd_create_broadcast_group",
"_ibd_hash_size",
"_ibd_lso_enable",
"_ibd_num_ah",
"_ibd_num_lso_bufs",
"_ibd_rc_enable_srq",
"_ibd_rc_num_rwqe",
"_ibd_rc_num_srq",
"_ibd_rc_num_swqe",
"_ibd_rc_rx_comp_count",
"_ibd_rc_rx_comp_usec",
"_ibd_rc_rx_copy_thresh",
"_ibd_rc_rx_rwqe_thresh",
"_ibd_rc_tx_comp_count",
"_ibd_rc_tx_comp_usec",
"_ibd_rc_tx_copy_thresh",
"_ibd_ud_num_rwqe",
"_ibd_ud_num_swqe",
"_ibd_ud_rx_comp_count",
"_ibd_ud_rx_comp_usec",
"_ibd_ud_tx_comp_count",
"_ibd_ud_tx_comp_usec",
"_ibd_ud_tx_copy_thresh",
};
};
/*
*/
{ \
}
{ \
}
/*
* Rudimentary debugging support
*/
#ifdef DEBUG
void
{
if (l < ibd_debuglevel)
return;
}
#endif
/*
* Common routine to print warning messages; adds in hca guid, port number
* and pkey to be able to identify the IBA interface.
*/
void
{
int len;
0, "hca-guid", 0);
"%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
}
/*
* Warlock directives
*/
/*
* id_lso_lock
*
* state->id_lso->bkt_nfree may be accessed without a lock to
* determine the threshold at which we have to ask the nw layer
* to resume transmission (see ibd_resume_transmission()).
*/
/*
* id_scq_poll_lock
*/
/*
* id_txpost_lock
*/
/*
* id_acache_req_lock
*/
/*
* id_ac_mutex
*
* This mutex is actually supposed to protect id_ah_op as well,
* but this path of the code isn't clean (see update of id_ah_op
* in ibd_async_acache(), immediately after the call to
* ibd_async_mcache()). For now, we'll skip this check by
* declaring that id_ah_op is protected by some internal scheme
* that warlock isn't aware of.
*/
/*
* id_mc_mutex
*/
/*
* id_trap_lock
*/
/*
* id_prom_op
*/
/*
* id_sched_lock
*/
/*
* id_link_mutex
*/
/*
* id_tx_list.dl_mutex
*/
/*
* id_rx_list.dl_mutex
*/
/*
* rc_timeout_lock
*/
/*
* Items protected by atomic updates
*/
/*
* Non-mutex protection schemes for data elements. Almost all of
* these are non-shared items.
*/
/*
* ibd_rc_chan_s::next is protected by two mutexes:
* 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
* 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
*/
/*
* ibd_state_s.rc_tx_large_bufs_lock
*/
/*
* ibd_acache_s.tx_too_big_mutex
*/
/*
* tx_wqe_list.dl_mutex
*/
/*
* ibd_state_s.rc_ace_recycle_lock
*/
/*
* rc_srq_rwqe_list.dl_mutex
*/
/*
* Non-mutex protection schemes for data elements. They are counters
* for problem diagnosis. They do not need to be protected.
*/
#ifdef DEBUG
/*
* Non-mutex protection schemes for data elements. They are counters
* for problem diagnosis. They do not need to be protected.
*/
#endif
int
_init()
{
int status;
PAGESIZE), 0);
if (status != 0) {
return (status);
}
if (status != 0) {
return (status);
}
DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
return (EIO);
}
#ifdef IBD_LOGGING
ibd_log_init();
#endif
return (0);
}
int
{
}
int
_fini()
{
int status;
if (status != 0)
return (status);
#ifdef IBD_LOGGING
ibd_log_fini();
#endif
return (0);
}
/*
* Convert the GID part of the mac address from network byte order
* to host order.
*/
static void
{
}
/*
* Create the IPoIB address in network byte order from host order inputs.
*/
static void
{
}
/*
* Send to the appropriate all-routers group when the IBA multicast group
* does not exist, based on whether the target group is v4 or v6.
*/
static boolean_t
{
/*
* Copy the first 4 bytes in without assuming any alignment of
* input mac address; this will have IPoIB signature, flags and
* scope bits.
*/
/*
*/
else
/*
* Does not have proper bits in the mgid address.
*/
return (retval);
}
/*
* Membership states for different mcg's are tracked by two lists:
* the "non" list is used for promiscuous mode, when all mcg traffic
* needs to be inspected. This type of membership is never used for
* transmission, so there can not be an AH in the active list
* corresponding to a member in this list. This list does not need
* any protection, since all operations are performed by the async
* thread.
*
* "Full" and "SendOnly" membership is tracked using a single list,
* the "full" list. This is because this single list can then be
* searched during transmit to a multicast group (if an AH for the
* mcg is not found in the active list), since at least one type
* of membership must be present before initiating the transmit.
* This list is also emptied during driver detach, since sendonly
* membership acquired during transmit is dropped at detach time
* along with full memberships. Inserts/deletes to
* this list are done only by the async thread, but it is also
* searched in program context (see multicast disable case), thus
* the id_mc_mutex protects the list. The driver detach path also
* deconstructs the "full" list, but it ensures that the async
* thread will not be accessing the list (by blocking out mcg
* trap handling and making sure no more Tx reaping will happen).
*
* Currently, an IBA attach is done in the SendOnly case too,
* although this is not required.
*/
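/*
 * A sketch of the two tracking lists just described (the names match later
 * references in this file such as id_mc_full/id_mc_non; the declarations
 * shown are illustrative assumptions, the real ones live in the per-instance
 * ibd state):
 *
 *	list_t	id_mc_full;	Full + SendOnlyNonMember entries, id_mc_mutex
 *	list_t	id_mc_non;	NonMember entries, async thread only
 */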
static void *
{
return (lhead);
}
/*
* This is always guaranteed to be able to queue the work.
*/
void
{
/* Initialize request */
/*
* Queue provided slot onto request pool.
*/
/* Go, fetch, async thread */
}
/*
* Main body of the per interface async thread.
*/
static void
{
callb_generic_cpr, "ibd_async_work");
for (;;) {
/*
* If we are in late hca initialization mode, do not
* process any async request other than TRAP. TRAP is
* used for indicating creation of a broadcast group.
*/
goto free_req_and_continue;
}
/*
* Once we have done the operation, there is no
* guarantee the request slot is going to be valid;
* it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
* TRAP).
*
* Perform the request.
*/
case IBD_ASYNC_GETAH:
break;
case IBD_ASYNC_JOIN:
case IBD_ASYNC_LEAVE:
break;
case IBD_ASYNC_PROMON:
break;
case IBD_ASYNC_PROMOFF:
break;
case IBD_ASYNC_REAP:
/*
* the req buf is contained in the mce
* structure, so we do not need
* to free it here.
*/
break;
case IBD_ASYNC_TRAP:
break;
case IBD_ASYNC_SCHED:
break;
case IBD_ASYNC_LINK:
break;
case IBD_ASYNC_EXIT:
#ifndef __lock_lint
#else
#endif
return;
case IBD_ASYNC_RC_TOO_BIG:
ptr);
break;
break;
case IBD_ASYNC_RC_RECYCLE_ACE:
break;
break;
}
} else {
#ifndef __lock_lint
/*
* Nothing to do: wait till new request arrives.
*/
#endif
}
}
/*NOTREACHED*/
}
/*
* Return when it is safe to queue requests to the async daemon; primarily
* for subnet trap and async event handling. Disallow requests before the
* daemon is created, and when interface deinitialization starts.
*/
static boolean_t
{
if (state->id_trap_stop) {
return (B_FALSE);
}
state->id_trap_inprog++;
return (B_TRUE);
}
/*
* Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
* trap or event handling to complete to kill the async thread and deconstruct
* the mcg/ace list.
*/
static void
{
if (--state->id_trap_inprog == 0)
}
/*
* Hash functions:
* ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
* ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
* These operate on mac addresses input into ibd_send, but there is no
* guarantee on the alignment of the ipoib_mac_t structure.
*/
/*ARGSUSED*/
static uint_t
{
/*
* If the input address is 4 byte aligned, we can just dereference
* it. This is most common, since IP will send in a 4 byte aligned
* IP header, which implies the 24 byte IPoIB pseudo header will be
* 4 byte aligned too.
*/
if ((ptraddr & 3) == 0)
return (hval);
}
static int
{
return (0);
else
return (1);
}
/*
* Initialize all the per interface caches and lists; AH cache,
* MCG list etc.
*/
static int
{
int i;
return (DDI_FAILURE);
} else {
MUTEX_DRIVER, NULL);
}
}
return (DDI_SUCCESS);
}
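/*
 * The body of the AH cache setup above has been elided; the sketch below
 * shows how the hash and compare functions defined earlier can plug into
 * the kernel mod_hash framework. The function name, hash label and bucket
 * count are illustrative assumptions, not this driver's actual code.
 */
static mod_hash_t *
ibd_acache_hash_create_example(size_t nbuckets)
{
	/* qpn-based hashing (ibd_hash_by_id) with full-address compare */
	return (mod_hash_create_extended("ibd AH active hash", nbuckets,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP));
}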
static void
{
}
}
}
/*
* Search AH active hash list for a cached path to input destination.
* If we are "just looking", hold == F. When we are in the Tx path,
* we set hold == T to grab a reference on the AH so that it can not
* be recycled to a new destination while the Tx request is posted.
*/
{
/*
* Do hash search.
*/
if (hold)
return (ptr);
}
return (NULL);
}
/*
* This is called by the tx side; if an initialized AH is found in
* the active list, it is locked down and can be used; if no entry
* is found, an async request is queued to do path resolution.
*/
static ibd_ace_t *
{
/*
* Only attempt to print when we can; in the mdt pattr case, the
* address is not aligned properly.
*/
DPRINT(4,
"ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
}
return (ptr);
}
return (ptr);
}
/*
* Implementation of a single outstanding async request; if
* the operation is not started yet, queue a request and move
* to ongoing state. Remember in id_ah_addr for which address
* we are queueing the request, in case we need to flag an error;
* Any further requests, for the same or different address, until
* the operation completes, are sent back to GLDv3 to be retried.
* The async thread will update id_ah_op with an error indication
* or will set it to indicate the next look up can start; either
* way, it will mac_tx_update() so that all blocked requests come
* back here.
*/
/*
* We did not even find the entry; queue a request
* for it.
*/
}
/*
* Check the status of the pathrecord lookup request
* we had queued before.
*/
state->id_ah_error++;
} else {
/*
* IBD_OP_ROUTERED case: We need to send to the
* all-router MCG. If we can find the AH for
* the mcg, the Tx will be attempted. If we
* do not find the AH, we return NORESOURCES
* to retry.
*/
numwqe);
}
/*
* This case can happen when we get a higher band
* packet. The easiest way is to reset the state machine
* to accommodate the higher priority packet.
*/
}
return (ptr);
}
/*
* Grab a not-currently-in-use AH/PathRecord from the active
* list to recycle to a new destination. Only the async thread
* executes this code.
*/
static ibd_ace_t *
{
/*
* Do plain linear search.
*/
/*
* Note that it is possible that the "cycle" bit
* is set on the AH w/o any reference count. The
* mcg must have been deleted, and the tx cleanup
* just decremented the reference count to 0, but
* hasn't gotten around to grabbing the id_ac_mutex
* to move the AH into the free list.
*/
if (!try_rc_chan_recycle) {
}
} else {
break;
}
}
}
return (ptr);
}
/*
* Invoked to clean up AH from active list in case of multicast
* disable and to handle sendonly memberships during mcg traps.
* And for port up processing for multicast and unicast AHs.
* Normally, the AH is taken off the active list, and put into
* the free list to be recycled for a new destination. In case
* Tx requests on the AH have not completed yet, the AH is marked
* for reaping (which will put the AH on the free list) once the Tx's
* complete; in this case, depending on the "force" input, we take
* out the AH from the active list right now, or leave it also for
* the reap operation. Returns TRUE if the AH is taken off the active
* list (and either put into the free list right now, or arranged for
* later), FALSE otherwise.
*/
{
/*
* Note that the AH might already have the cycle bit set
* on it; this might happen if sequences of multicast
* enables and disables are coming so fast, that posted
* Tx's to the mcg have not completed yet, and the cycle
* bit is set successively by each multicast disable.
*/
if (SET_CYCLE_IF_REF(acactive)) {
if (!force) {
/*
* The ace is kept on the active list, further
* Tx's can still grab a reference on it; the
* ace is reaped when all pending Tx's
* referencing the AH complete.
*/
} else {
/*
* In the mcg trap case, we always pull the
* AH from the active list; the same applies in the port up case.
*/
}
} else {
/*
* Determined the ref count is 0, thus reclaim
* immediately after pulling out the ace from
* the active list.
*/
}
}
return (ret);
}
/*
* Helper function for async path record lookup. If we are trying to
* Tx to a MCG, check our membership, possibly trying to join the
* group if required. If that fails, try to send the packet to the
* all router group (indicated by the redirect output), pointing
* the input mac address to the router mcg address.
*/
static ibd_mce_t *
{
/*
* Check the FullMember+SendOnlyNonMember list.
* Since we are the only one who manipulates the
* id_mc_full list, no locks are needed.
*/
return (mce);
}
/*
* Not found; try to join(SendOnlyNonMember) and attach.
*/
NULL) {
return (mce);
}
/*
* MCGroup not present; try to join the all-router group. If
* any of the following steps succeed, we will be redirecting
* to the all router group.
*/
return (NULL);
/*
* Are we already joined to the router group?
*/
"group\n");
return (mce);
}
/*
* Can we join(SendOnlyNonMember) the router group?
*/
NULL) {
return (mce);
}
return (NULL);
}
/*
* Async path record lookup code.
*/
static void
{
/*
* Check whether we are trying to transmit to a MCG.
* In that case, we need to make sure we are a member of
* the MCG.
*/
/*
* If we can not find or join the group or even
* redirect, error out.
*/
NULL) {
return;
}
/*
* If we got redirected, we need to determine whether
* the AH for the new mcg is in the cache already, and
* not pull it in then; otherwise proceed to get the
* path for the new mcg. There is no guarantee that
* if the AH is currently in the cache, it will still be
* there when we look in ibd_acache_lookup(), but that's
* okay, we will come back here.
*/
if (redirected) {
"%08X:%08X:%08X:%08X:%08X",
return;
}
}
}
/*
* Get an AH from the free list.
*/
/*
* No free ones; try to grab an unreferenced active
* one. Maybe we need to make the active list LRU,
* but that will create more work for Tx callbacks.
* Is there a way of not having to pull out the
* entry from the active list, but just indicate it
* is being recycled? Yes, but that creates one more
* check in the fast lookup path.
*/
/*
* Pretty serious shortage now.
*/
"slot\n");
return;
}
/*
* We could check whether ac_mce points to a SendOnly
* member and drop that membership now. Or do it lazily
* at detach time.
*/
}
/*
* Update the entry.
*/
goto error;
}
goto error;
}
/*
* mce is set whenever an AH is being associated with a
* MCG; this will come in handy when we leave the MCG. The
* lock protects Tx fastpath from scanning the active list.
*/
/*
* initiate a RC mode connection for unicast address
*/
"ibd_rc_try_connect(ace=%p)", ce);
" channel");
state->rc_conn_fail++;
goto error;
}
}
return;
/*
* We might want to drop SendOnly membership here if we
* joined above. The lock protects Tx callbacks inserting
* into the free list.
*/
}
/*
* While restoring port's presence on the subnet on a port up, it is possible
* that the port goes down again.
*/
static void
{
/*
* On a link up, revalidate the link speed/width. There is no point doing
* this on a link down, since we will be unable to do SA operations,
* defaulting to the lowest speed. Also notice that we update our
* notion of speed before calling mac_link_update(), which will do
* necessary higher level notifications for speed changes.
*/
}
/*
* Do all the work required to establish our presence on
* the subnet.
*/
if (opcode == IBD_LINK_UP_ABSENT) {
/*
* If in promiscuous mode ...
*/
/*
* Drop all nonmembership.
*/
/*
* Then, try to regain nonmembership to all mcg's.
*/
}
/*
* Drop all sendonly membership (which also gets rid of the
* AHs); try to reacquire all full membership.
*/
else
}
/*
* Recycle all active AHs to free list (and if there are
* pending posts, make sure they will go into the free list
* once the Tx's complete). Grab the lock to prevent
* concurrent Tx's as well as Tx cleanups.
*/
} else {
"thread is closing it, ace=%p, "
"ac_chan=%p, chan_state=%d",
}
} else {
}
/*
* If this is for an mcg, it must be for a fullmember,
* since we got rid of send-only members above when
* processing the mce list.
*/
/*
* Check if the fullmember mce needs to be torn down,
* ie whether the DLPI disable has already been done.
* If so, do some of the work of tx_cleanup, namely
* causing leave (which will fail), detach and
* mce-freeing. tx_cleanup will put the AH into free
* list. The reason to duplicate some of this
* tx_cleanup work is because we want to delete the
* AH right now instead of waiting for tx_cleanup, to
* force subsequent Tx's to reacquire an AH.
*/
}
}
/*
* mac handle is guaranteed to exist since driver does ibt_close_hca()
* (which stops further events from being delivered) before
* mac_unregister(). At this point, it is guaranteed that mac_register
* has already been done.
*/
}
/*
* Check the pkey table to see if we can find the pkey we're looking for.
* Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
* failure.
*/
static int
{
return (0);
}
}
return (-1);
}
/*
* Late HCA Initialization:
* If plumb had succeeded without the availability of an active port or the
* pkey, and either of their availability is now being indicated via PORT_UP
* or PORT_CHANGE respectively, try a start of the interface.
*
* Normal Operation:
* When the link is notified up, we need to do a few things, based
* on the port's current p_init_type_reply claiming a reinit has been
* done or not. The reinit steps are:
* 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
* the old Pkey and GID0 are correct.
* 2. Register for mcg traps (already done by ibmf).
* 3. If PreservePresenceReply indicates the SM has restored port's presence
* in subnet, nothing more to do. Else go to next steps (on async daemon).
* 4. Give up all sendonly memberships.
* 5. Acquire all full memberships.
* 6. In promiscuous mode, acquire all non memberships.
* 7. Recycle all AHs to free list.
*/
static void
{
int ret;
/*
* Let's not race with a plumb or an unplumb; if we detect a
* pkey relocation event later on here, we may have to restart.
*/
/*
* If the link state is unknown, a plumb has not yet been attempted
* on the interface. Nothing to do.
*/
goto link_mod_return;
}
/*
* If link state is down because of plumb failure, and we are not in
* late HCA init, and we were not successfully plumbed, nothing to do.
*/
goto link_mod_return;
}
/*
* If this routine was called in response to a port down event,
* we just need to see if this should be informed.
*/
if (code == IBT_ERROR_PORT_DOWN) {
goto update_link_state;
}
/*
* If it's not a port down event we've received, try to get the port
* attributes first. If we fail here, the port is as good as down.
* Otherwise, if the link went down by the time the handler gets
* here, give up; the pkey and gid read from the port attributes
* are not valid and this is as bad as a port down anyway.
*/
goto update_link_state;
}
/*
* If in the previous attempt, the pkey was not found either due to the
* port state being down, or due to its absence in the pkey table,
* look for it now and try to start the interface.
*/
"init, ret=%d", ret);
}
goto link_mod_return;
}
/*
* Check the SM InitTypeReply flags. If both NoLoadReply and
* PreserveContentReply are 0, we don't know anything about the
* data loaded into the port attributes, so we need to verify
* if gid0 and pkey are still valid.
*/
if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
/*
* Check to see if the subnet part of GID0 has changed. If
* not, check the simple case first to see if the pkey
* index is the same as before; finally check to see if the
* pkey has been relocated to a different index in the table.
*/
/*
* Currently a restart is required if our pkey has moved
* in the pkey table. If we get the ibt_recycle_ud() to
* work as documented (expected), we may be able to
* avoid a complete restart. Note that we've already
* marked both the start and stop 'in-progress' flags,
* so it is ok to go ahead and do this restart.
*/
"ret=%d", ret);
}
goto link_mod_return;
} else {
}
}
if (port_infop) {
}
/*
* If we're reporting a link up, check InitTypeReply to see if
* the SM has ensured that the port's presence in mcg, traps,
* etc. is intact.
*/
if (new_link_state == LINK_STATE_DOWN) {
} else {
if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
} else {
}
}
/*
* If the old state is the same as the new state, and the SM indicated
* no change in the port parameters, nothing to do.
*/
goto link_mod_return;
}
/*
* Ok, so there was a link state change; see if it's safe to ask
* the async thread to do the work
*/
if (!ibd_async_safe(state)) {
goto link_mod_return;
}
/*
* Queue up a request for ibd_async_link() to handle this link
* state change event
*/
}
/*
* For the port up/down events, IBTL guarantees there will not be concurrent
* invocations of the handler. IBTL might coalesce link transition events,
* and not invoke the handler for each transition, but it will
* invoke the handler with the last known state.
*/
static void
{
switch (code) {
break;
case IBT_ERROR_CQ:
break;
case IBT_PORT_CHANGE_EVENT:
/*
* Events will be delivered to all instances that have
* done ibt_open_hca() but not yet done ibt_close_hca().
* Only need to do work for our port; IBTF will deliver
* events for other ports on the hca we have ibt_open_hca'ed
* too. Note that id_port is initialized in ibd_attach()
* before we do an ibt_open_hca() in ibd_attach().
*/
break;
}
break;
case IBT_ERROR_PORT_DOWN:
case IBT_CLNT_REREG_EVENT:
case IBT_EVENT_PORT_UP:
/*
* Events will be delivered to all instances that have
* done ibt_open_hca() but not yet done ibt_close_hca().
* Only need to do work for our port; IBTF will deliver
* events for other ports on the hca we have ibt_open_hca'ed
* too. Note that id_port is initialized in ibd_attach()
* before we do an ibt_open_hca() in ibd_attach().
*/
break;
break;
case IBT_HCA_ATTACH_EVENT:
case IBT_HCA_DETACH_EVENT:
/*
* When a new card is plugged to the system, attach_event is
* invoked. Additionally, a cfgadm needs to be run to make the
* card known to the system, and an ifconfig needs to be run to
* plumb up any ibd interfaces on the card. In the case of card
* unplug, a cfgadm is run that will trigger any RCM scripts to
* unplumb the ibd interfaces on the card; when the card is
* actually unplugged, the detach_event is invoked;
* additionally, if any ibd instances are still active on the
* card (eg there were no associated RCM scripts), driver's
* detach routine is invoked.
*/
break;
default:
break;
}
}
static int
{
int ret;
return (DDI_FAILURE);
}
/*
* Note that when we register with mac during attach, we don't
* have the id_macaddr yet, so we'll simply be registering a
* zero macaddr that we'll overwrite later during plumb (in
* ibd_m_start()). Similar is the case with id_mtu - we'll
* update the mac layer with the correct mtu during plumb.
*/
} else if (state->id_enable_rc) {
} else {
}
/*
* Register ourselves with the GLDv3 interface
*/
DPRINT(10,
"ibd_register_mac: mac_register() failed, ret=%d", ret);
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static int
{
/*
* Query the HCA and fetch its attributes
*/
/*
* 1. Set the Hardware Checksum capability. Currently we only consider
* full checksum offload.
*/
if (state->id_enable_rc) {
state->id_hwcksum_capab = 0;
} else {
== IBT_HCA_CKSUM_FULL) {
}
}
/*
* 2. Set LSO policy, capability and maximum length
*/
if (state->id_enable_rc) {
state->id_lso_maxlen = 0;
} else {
if (hca_attrs.hca_max_lso_size > 0) {
else
} else {
state->id_lso_maxlen = 0;
}
}
/*
* 3. Set Reserved L_Key capability
*/
} else {
/* If no reserved lkey, we will not use ibt_map_mem_iov */
}
/*
* 4. Set maximum sqseg value after checking to see if extended sgl
* size information is provided by the hca
*/
} else {
}
}
}
/*
* Translating the virtual address regions into physical regions
* for using the Reserved LKey feature results in a wr sgl that
* is a little longer. Since failing ibt_map_mem_iov() is costly,
* we'll fix a high-water mark (65%) for when we should stop.
*/
/*
* 5. Set number of recv and send wqes after checking hca maximum
* channel size. Store the max channel size in the state so that it
* can be referred to when the swqe/rwqe counts are modified through
* dladm.
*/
return (DDI_SUCCESS);
}
static int
{
return (DDI_FAILURE);
}
return (DDI_FAILURE);
}
/*
* "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
* connecting to a remote IPoIB port. We can't remove this port.
*/
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static void
{
/* make sure rx resources are freed */
if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
}
if (progress & IBD_DRV_MAC_REGISTERED) {
}
if (progress & IBD_DRV_ASYNC_THR_CREATED) {
/*
* No new async requests will be posted since the device
* link state has been marked as unknown; completion handlers
* have been turned off, so Tx handler will not cause any
* more IBD_ASYNC_REAP requests.
*
* Queue a request for the async thread to exit, which will
* be serviced after any pending ones. This can take a while,
* especially if the SM is unreachable, since IBMF will slowly
* timeout each SM request issued by the async thread. Reap
* the thread before continuing on, we do not want it to be
* lingering in modunloaded code.
*/
}
if (progress & IBD_DRV_REQ_LIST_INITED) {
}
if (progress & IBD_DRV_PD_ALLOCD) {
"protection domain, ret=%d", ret);
}
}
if (progress & IBD_DRV_HCA_OPENED) {
IBT_SUCCESS) {
"HCA device, ret=%d", ret);
}
}
if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
IBT_SUCCESS) {
"ibt_detach() failed, ret=%d", ret);
}
}
if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
IBT_SUCCESS) {
"failed, ret=%d", ret);
}
}
if (progress & IBD_DRV_TXINTR_ADDED) {
}
if (progress & IBD_DRV_RXINTR_ADDED) {
}
#ifdef DEBUG
if (progress & IBD_DRV_RC_PRIVATE_STATE) {
}
#endif
if (progress & IBD_DRV_STATE_INITIALIZED) {
}
}
int
{
int rv;
/*
* Initialize mutexes and condition variables
*/
return (DDI_FAILURE);
}
/*
* Allocate rx,tx softintr
*/
if (ibd_rx_softintr == 1) {
"ddi_add_softintr(id_rx), ret=%d", rv);
return (DDI_FAILURE);
}
}
if (ibd_tx_softintr == 1) {
"ddi_add_softintr(id_tx), ret=%d", rv);
return (DDI_FAILURE);
}
}
/*
* Attach to IBTL
*/
"ibt_attach(), ret=%d", ret);
return (DDI_FAILURE);
}
}
ret);
return (DDI_FAILURE);
}
/*
* Open the HCA
*/
ret);
return (DDI_FAILURE);
}
#ifdef DEBUG
/* Initialize Driver Counters for Reliable Connected Mode */
if (state->id_enable_rc) {
"ibd_rc_init_stats");
return (DDI_FAILURE);
}
}
#endif
/*
* Record capabilities
*/
(void) ibd_record_capab(state);
/*
* Allocate a protection domain on the HCA
*/
ret);
return (DDI_FAILURE);
}
/*
* We need to initialise the req_list that is required for the
* operation of the async_thread.
*/
/*
* Create the async thread; thread_create never fails.
*/
return (DDI_SUCCESS);
}
/*
* Attach device to the IO framework.
*/
static int
{
int ret;
switch (cmd) {
case DDI_ATTACH:
break;
default:
ret = DDI_FAILURE;
break;
}
return (ret);
}
/*
* Detach device from the IO framework.
*/
static int
{
int instance;
/*
*/
if (cmd != DDI_DETACH)
return (DDI_FAILURE);
/*
* Get the instance softstate
*/
/*
* Release all resources we're holding still. Note that if we'd
* done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
* so far, we should find all the flags we need in id_mac_state.
*/
}
/*
* Pre ibt_attach() driver initialization
*/
static int
{
state->id_trap_inprog = 0;
state->id_tx_busy = 0;
/* For Reliable Connected Mode */
MUTEX_DRIVER, NULL);
/*
* Make the default link mode RC. If this fails during connection
* setup, the link mode is automatically transitioned to UD.
* Also set the RC MTU.
*/
/* Initialize all tunables to default */
return (DDI_SUCCESS);
}
/*
* Post ibt_detach() driver deconstruction
*/
static void
{
/* For Reliable Connected Mode */
}
/*
* Fetch link speed from SA for snmp ifspeed reporting.
*/
static uint64_t
{
int ret;
/*
* Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
* translates to 2 Gbps data rate. Thus, 1X single data rate is
* 2000000000. Start with that as default.
*/
ifspeed = 2000000000;
/*
* Get the port speed from Loopback path information.
*/
goto earlydone;
if (num_paths < 1)
goto earlydone;
/*
* In case SA does not return an expected value, report the default
* speed as 1X.
*/
ret = 1;
case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */
ret = 1;
break;
case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */
ret = 4;
break;
case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */
ret = 12;
break;
case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */
ret = 2;
break;
case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */
ret = 8;
break;
case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */
ret = 16;
break;
case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */
ret = 24;
break;
case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */
ret = 32;
break;
case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */
ret = 48;
break;
}
return (ifspeed);
}
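/*
 * To illustrate the scaling implied above: the multiplier derived from the
 * IBT_SRATE_* value is applied to the 1X SDR base rate of 2000000000 bps
 * (the multiplication itself has been elided here). For example, 4X DDR
 * (IBT_SRATE_20) yields 8 * 2000000000 = 16000000000 bps.
 */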
/*
* Search input mcg list (id_mc_full or id_mc_non) for an entry
* representing the input mcg mgid.
*/
static ibd_mce_t *
{
/*
* Do plain linear search.
*/
sizeof (ib_gid_t)) == 0)
return (ptr);
}
return (NULL);
}
/*
* Execute IBA JOIN.
*/
static ibt_status_t
{
}
/*
* This code JOINs the port in the proper way (depending on the join
* state) so that the IBA fabric will forward mcg packets to/from the port.
* It also attaches the QPN to the mcg so it can receive those mcg
* packets. This code makes sure not to attach the mcg to the QP if
* that has been previously done due to the mcg being joined with a
* different join state, even though this is not required by SWG_0216,
* refid 3610.
*/
static ibd_mce_t *
{
/*
* For enable_multicast Full member joins, we need to do some
* extra work. If there is already an mce on the list that
* indicates full membership, that means the membership has
* not yet been dropped (since the disable_multicast was issued)
* because there are pending Tx's to the mcg; in that case, just
* mark the mce not to be reaped when the Tx completion queues
* an async reap operation.
*
* If there is already an mce on the list indicating sendonly
* membership, try to promote to full membership. Be careful
* not to deallocate the old mce, since there might be an AH
* pointing to it; instead, update the old mce with new data
* that tracks the full membership.
*/
return (omce);
} else {
}
}
/*
* Allocate the ibd_mce_t to track this JOIN.
*/
return (NULL);
}
/*
* Is an IBA attach required? Not if the interface is already joined
* to the mcg in a different appropriate join state.
*/
if (jstate == IB_MC_JSTATE_NON) {
} else if (jstate == IB_MC_JSTATE_FULL) {
} else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
}
if (do_attach) {
/*
* Do the IBA attach.
*/
"%d\n", ibt_status);
/*
* NOTE that we should probably preserve the join info
* in the list and later try to leave again at detach
* time.
*/
return (NULL);
}
}
/*
* Insert the ibd_mce_t in the proper list.
*/
if (jstate == IB_MC_JSTATE_NON) {
} else {
/*
* Set up the mc_req fields used for reaping the
* mcg in case of delayed tx completion (see
* ibd_tx_cleanup()). Also done for sendonly join in
* case we are promoted to fullmembership later and
* keep using the same mce.
*/
/*
* Check whether this is the case of trying to join
* full member, and we were already joined send only.
* We try to drop our SendOnly membership, but it is
* possible that the mcg does not exist anymore (and
* the subnet trap never reached us), so the leave
* operation might fail.
*/
sizeof (ibt_mcg_info_t));
return (omce);
}
}
return (mce);
}
/*
* Called during port up event handling to attempt to reacquire full
* membership to an mcg. Stripped down version of ibd_join_group().
* Note that it is possible that the mcg might have gone away, and
* gets recreated at this point.
*/
static void
{
/*
* If the mc_fullreap flag is set, or this join fails, a subsequent
* reap/leave is going to try to leave the group. We could prevent
* that by adding a boolean flag into ibd_mce_t, if required.
*/
if (mce->mc_fullreap)
return;
/* While reacquiring, leave and then join the MCG */
"multicast gid %016llx:%016llx",
}
/*
* This code handles delayed Tx completion cleanups for mcg's to which
* disable_multicast has been issued, regular mcg related cleanups during
* disable_multicast, disable_promiscuous and mcg traps, as well as
* cleanups during driver detach time. Depending on the join state,
* it deletes the mce from the appropriate list and issues the IBA
* leave/detach; except in the disable_multicast case, when the mce
* is left on the active list for a subsequent Tx completion cleanup.
*/
static void
{
/*
* Before detaching, we must check whether the other list
* contains the mcg; if we detach blindly, the consumer
* who set up the other list will also stop receiving
* traffic.
*/
if (jstate == IB_MC_JSTATE_FULL) {
/*
* The following check is only relevant while coming
* from the Tx completion path in the reap case.
*/
if (!mce->mc_fullreap)
return;
} else if (jstate == IB_MC_JSTATE_NON) {
} else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
}
/*
* If we are reacting to a mcg trap and leaving our sendonly or
* non membership, the mcg is possibly already gone, so attempting
* to leave might fail. On the other hand, we must try to leave
* anyway, since this might be a trap from long ago, and we could
* have potentially sendonly joined to a recent incarnation of
* the mcg and are about to lose track of this information.
*/
if (do_detach) {
}
}
/*
* Async code executed due to multicast and promiscuous disable requests
* and mcg trap handling; also executed during driver detach. Mostly, a
* leave and detach is done; except for the fullmember case when Tx
* requests are pending, whence arrangements are made for subsequent
* cleanup on Tx completion.
*/
static void
{
if (jstate == IB_MC_JSTATE_NON) {
/*
* In case we are handling a mcg trap, we might not find
* the mcg in the non list.
*/
return;
}
} else {
/*
* In case we are handling a mcg trap, make sure the trap
* is not arriving late; if we have an mce that indicates
* that we are already a fullmember, that would be a clear
* indication that the trap arrived late (ie, is for a
* previous incarnation of the mcg).
*/
if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
return;
}
} else {
/*
* If join group failed, mce will be NULL here.
* This is because in GLDv3 driver, set multicast
* will always return success.
*/
return;
}
}
/*
* If no pending Tx's remain that reference the AH
* for the mcg, recycle it from active to free list.
* Else in the IB_MC_JSTATE_FULL case, just mark the AH,
* so the last completing Tx will cause an async reap
* operation to be invoked, at which time we will drop our
* membership to the mcg so that the pending Tx's complete
* successfully. Refer to comments on "AH and MCE active
* list manipulation" at top of this file. The lock protects
* against Tx fast path and Tx cleanup code.
*/
}
if (recycled) {
}
}
/*
* Find the broadcast address as defined by IPoIB; implicitly
* determines the IBA scope, mtu, tclass etc of the link the
* interface is going to be a member of.
*/
static ibt_status_t
{
int i, mcgmtu;
int ret;
/*
* Look for the IPoIB broadcast group.
*/
break;
}
}
if (!found) {
if (state->id_create_broadcast_group) {
/*
* If we created the broadcast group, but failed to
* find it, we can't do anything except leave the
* one we created and return failure.
*/
if (state->id_bgroup_created) {
"absent. Unable to query after create.");
goto find_bgroup_fail;
}
/*
* Create the ipoib broadcast group if it didn't exist
*/
"absent, create failed: ret = %d\n", ret);
return (IBT_FAILURE);
}
goto query_bcast_grp;
} else {
return (IBT_FAILURE);
}
}
/*
* Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
*/
"greater than port's maximum MTU %d", mcgmtu,
goto find_bgroup_fail;
}
return (IBT_SUCCESS);
if (state->id_bgroup_created) {
}
return (IBT_FAILURE);
}
static int
{
/*
* Allocate one big chunk for all regular tx copy bufs
*/
}
sizeof (ibd_swqe_t), KM_SLEEP);
/*
* Do one memory registration on the entire txbuf area
*/
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static int
{
int i;
/*
* Allocate the lso bucket
*/
/*
* Allocate the entire lso memory and register it
*/
return (DDI_FAILURE);
}
/*
* Now allocate the buflist. Note that the elements in the buflist and
* the buffers in the lso memory have a permanent 1-1 relation, so we
* can always derive the address of a buflist entry from the address of
* an lso buffer.
*/
KM_SLEEP);
/*
* Set up the lso buf chain
*/
for (i = 0; i < state->id_num_lso_bufs; i++) {
memp += IBD_LSO_BUFSZ;
lbufp++;
}
/*
* Set up the LSO buffer information in ibd state
*/
return (DDI_SUCCESS);
}
/*
* Statically allocate Tx buffer list(s).
*/
static int
{
int i;
return (DDI_FAILURE);
}
/*
* Allocate and setup the swqe list
*/
/* These are set in send */
/* add to list */
}
return (DDI_SUCCESS);
}
static int
{
int i;
/*
* Determine how many bufs we'd need for the size requested
*/
num_needed++;
/*
* If we don't have enough lso bufs, return failure
*/
return (-1);
}
/*
* Pick the first 'num_needed' bufs from the free list
*/
for (i = 0; i < num_needed; i++) {
}
/*
* If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
* to adjust the last sgl entry's length. Since we know we need at least
* one, the i-1 use below is ok.
*/
if (frag_sz) {
}
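	/*
	 * For example, assuming an IBD_LSO_BUFSZ of 8192 bytes, a request
	 * for 10000 bytes needs two buffers and leaves frag_sz =
	 * 10000 - 8192 = 1808, so the second (last) sgl entry is trimmed
	 * from 8192 to 1808 bytes.
	 */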
/*
* Update nfree count and return
*/
*nds_p = num_needed;
return (0);
}
static void
{
int i;
for (i = 0; i < nds; i++) {
/*
* Figure out the buflist element this sgl buffer corresponds
* to and put it back at the head
*/
}
}
static void
{
/*
* Unregister txbuf mr
*/
}
/*
* Free txbuf memory
*/
sizeof (ibd_swqe_t));
}
static void
{
return;
}
/*
* First, free the buflist
*/
/*
* Unregister the LSO memory and free it
*/
DPRINT(10,
"ibd_free_lsobufs: ibt_deregister_mr failed");
}
/*
* Finally free the bucket
*/
}
/*
* Free the statically allocated Tx buffer list.
*/
static void
{
/*
* Free the allocated swqes
*/
}
/*
* post a list of rwqes, NULL terminated.
*/
static void
{
uint_t i;
while (rwqe) {
/* Post up to IBD_RX_POST_CNT receive work requests */
for (i = 0; i < IBD_RX_POST_CNT; i++) {
i++;
break;
}
}
/*
* If posting fails for some reason, we'll never receive
* completion intimation, so we'll need to cleanup. But
* we need to make sure we don't clean up nodes whose
* wrs have been successfully posted. We assume that the
* hca driver returns on the first failure to post and
* therefore the first 'num_posted' entries don't need
* cleanup here.
*/
num_posted = 0;
&num_posted);
if (ibt_status != IBT_SUCCESS) {
/* This cannot happen unless the device has an error. */
"posting multiple wrs failed: "
"requested=%d, done=%d, ret=%d",
num_posted - i);
}
}
}
/*
* Grab a list of rwqes from the array of lists, and post the list.
*/
static void
{
/* rotate through the rx_queue array, expecting an adequate number */
}
/* macro explained below */
/*
* Add a rwqe to one of the Rx lists. If the list is large enough
* (exactly IBD_RX_POST_CNT), post the list to the hardware.
*
* Note: one of 2^N lists is chosen via a hash. This is done
* because using one list is contentious. If the first list is busy
* (mutex_tryenter fails), use a second list (just call mutex_enter).
*
* The number 8 in RX_QUEUE_HASH is a random choice that provides
* even distribution of mapping rwqes to the 2^N queues.
*/
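/*
 * The RX_QUEUE_HASH macro itself has been elided; a plausible sketch, given
 * the explanation above (the exact shift amount and mask are assumptions):
 *
 *	#define	RX_QUEUE_HASH(rwqe) \
 *		(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
 */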
static void
{
/* Failed. Try a different queue ("ptr + 16" ensures that). */
}
/* only call ibt_post_recv() every Nth time through here */
return;
}
}
}
static int
{
int i;
/*
* Allocate one big chunk for all regular rx copy bufs
*/
sizeof (ibd_rwqe_t), KM_SLEEP);
sizeof (ibd_rx_queue_t), KM_SLEEP);
for (i = 0; i < state->id_rx_nqueues; i++) {
}
/*
* Do one memory registration on the entire rxbuf area
*/
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
/*
* Allocate the statically allocated Rx buffer list.
*/
static int
{
int i;
/* rx rsrcs were never freed. Just repost them */
/* allow freemsg_cb to free the rwqes */
"id_running was not 1\n");
}
"failed in desballoc()");
if (rwqe->rwqe_im_mblk) {
} else
}
return (DDI_FAILURE);
}
}
return (DDI_SUCCESS);
}
return (DDI_FAILURE);
/*
* Allocate and setup the rwqe list
*/
/* allow freemsg_cb to free the rwqes */
"id_running was not 1\n");
}
"failed in desballoc()");
}
/* remove reference to free'd rwqes */
return (DDI_FAILURE);
}
}
return (DDI_SUCCESS);
}
static void
{
int i;
/*
* Unregister rxbuf mr
*/
}
/*
* Free rxbuf memory
*/
for (i = 0; i < state->id_rx_nqueues; i++) {
}
sizeof (ibd_rx_queue_t));
sizeof (ibd_rwqe_t));
}
static void
{
/* already freed */
return;
}
}
/*
* Free the statically allocated Rx buffer list.
*/
static void
{
int i;
/* run through the rx_queue's, calling freemsg() */
for (i = 0; i < state->id_rx_nqueues; i++) {
}
}
/* cannot free rx resources unless gld returned everything */
}
/*
* Free an allocated recv wqe.
*/
/* ARGSUSED */
static void
{
/*
* desballoc() failed (no memory).
*
* This rwqe is placed on a free list so that it
* can be reinstated when memory is available.
*
* NOTE: no code currently exists to reinstate
* these "lost" rwqes.
*/
}
/*
* IBA Rx completion queue handler. Guaranteed to be single
* threaded and nonreentrant for this CQ.
*/
/* ARGSUSED */
static void
{
if (ibd_rx_softintr == 1) {
return;
} else {
}
} else
}
/*
* CQ handler for Tx completions, when the Tx CQ is in
* interrupt driven mode.
*/
/* ARGSUSED */
static void
{
if (ibd_tx_softintr == 1) {
return;
} else {
}
} else
}
/*
* The IBTF subnet notices handler below is invoked
* on a kernel thread (handling can thus block) and can be invoked
* concurrently. The handler can be invoked anytime after it is
* registered and before ibt_detach().
*/
/* ARGSUSED */
static void
{
/*
* The trap handler will get invoked once for every event for
* every port. The input "gid" is the GID0 of the port the
* trap came in on; we just need to act on traps that came
* to our port, meaning the port on which the ipoib interface
* resides. Since ipoib uses GID0 of the port, we just match
* the gids to check whether we need to handle the trap.
*/
return;
switch (code) {
case IBT_SM_EVENT_UNAVAILABLE:
/*
* If we are in promiscuous mode or have
* sendnonmembers, we need to print a warning
* message right now. Else, just store the
* information, print when we enter promiscuous
* mode or attempt nonmember send. We might
* also want to stop caching sendnonmember.
*/
"degraded due to unavailability of multicast "
"traps");
break;
case IBT_SM_EVENT_AVAILABLE:
/*
* If we printed a warning message above or
* while trying to nonmember send or get into
* promiscuous mode, print an okay message.
*/
"restored due to availability of multicast "
"traps");
break;
case IBT_SM_EVENT_MCG_CREATED:
case IBT_SM_EVENT_MCG_DELETED:
/*
* If it is a "deleted" event and we are in late hca
* init, nothing to do.
*/
IBD_DRV_IN_LATE_HCA_INIT) && (code ==
break;
}
/*
* First check if the instance is being
* [de]initialized; back off then, without doing
* anything more, since we are not sure if the
* async thread is around, or whether we might
* be racing with the detach code in ibd_m_stop()
* that scans the mcg list.
*/
if (!ibd_async_safe(state))
return;
break;
}
}
static void
{
int ret;
/*
* Check if we have already joined the IPoIB broadcast group for our
* PKEY. If joined, perform the rest of the operation.
* Else, the interface is not initialised. Do the initialisation here
* by calling ibd_start() and return.
*/
(code == IBT_SM_EVENT_MCG_CREATED)) {
/*
* If we are in late HCA init and a notification for the
* creation of a MCG came in, check if it is the IPoIB MCG for
* this pkey. If not, return.
*/
return;
}
/*
* Check if there is still a necessity to start the interface.
* It is possible that the user attempted unplumb at just about
* the same time, and if unplumb succeeded, we have nothing to
* do.
*/
"init, ret=%d", ret);
}
return;
}
/*
* Atomically search the nonmember and sendonlymember lists and
* delete.
*/
/*
* If we are in promiscuous mode, try to join the new
* mcg. Given the unreliable out-of-order mode of trap
* delivery, we can never be sure whether it is a problem
* if the join fails. Thus, we warn the admin of a failure
* if this was a creation trap. Note that the trap might
* actually be reporting a long past event, and the mcg
* might already have been deleted, thus we might be warning
* in vain.
*/
"new multicast gid %016llx:%016llx",
}
/*
* Free the request slot allocated by the subnet event thread.
*/
}
/*
* GLDv3 entry point to get capabilities.
*/
static boolean_t
{
return (B_FALSE);
switch (cap) {
case MAC_CAPAB_HCKSUM: {
/*
* We either do full checksum or not do it at all
*/
else
return (B_FALSE);
break;
}
case MAC_CAPAB_LSO: {
/*
* In addition to the capability and policy, since LSO
* relies on hw checksum, we'll not enable LSO if we
* don't have hw checksum. Of course, if the HCA doesn't
* provide the reserved lkey capability, enabling LSO will
* actually affect performance adversely, so we'll disable
* LSO even for that case.
*/
return (B_FALSE);
return (B_FALSE);
if (state->id_hca_res_lkey_capab == 0) {
"disabling LSO");
return (B_FALSE);
}
break;
}
default:
return (B_FALSE);
}
return (B_TRUE);
}
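/*
 * A minimal sketch of how the MAC_CAPAB_LSO case above typically fills in
 * its capability data for the GLDv3 framework. The helper name is
 * hypothetical and the code is illustrative, not this driver's exact logic.
 */
static void
ibd_fill_lso_capab_example(ibd_state_t *state, void *cap_data)
{
	mac_capab_lso_t *cap_lso = cap_data;

	/* advertise basic TCP/IPv4 LSO, bounded by the HCA's LSO maximum */
	cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
	cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
}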
/*
*/
static int
{
int err = 0;
/* Cannot set properties on a port driver */
return (ENOTSUP);
}
switch (pr_num) {
case MAC_PROP_IB_LINKMODE:
break;
}
break;
}
if (link_mode != IBD_LINK_MODE_UD &&
link_mode != IBD_LINK_MODE_RC) {
} else {
if (link_mode == IBD_LINK_MODE_RC) {
if (state->id_enable_rc) {
return (0);
}
/* inform MAC framework of new MTU */
} else {
if (!state->id_enable_rc) {
return (0);
}
state->id_enable_rc = 0;
}
(void) ibd_record_capab(state);
}
break;
case MAC_PROP_PRIVATE:
pr_valsize, pr_val);
break;
default:
break;
}
return (err);
}
static int
{
int err = 0;
switch (pr_num) {
case MAC_PROP_MTU:
break;
default:
return (ENOTSUP);
}
break;
}
switch (pr_num) {
case MAC_PROP_IB_LINKMODE:
break;
case MAC_PROP_PRIVATE:
pr_val);
break;
default:
break;
}
return (err);
}
static void
{
switch (pr_num) {
case MAC_PROP_IB_LINKMODE: {
break;
}
case MAC_PROP_MTU: {
min = 1500;
} else if (state->id_enable_rc) {
} else {
}
break;
}
case MAC_PROP_PRIVATE: {
int value;
return;
"_ibd_create_broadcast_group") == 0) {
} else {
return;
}
break;
}
} /* switch (pr_num) */
}
/* ARGSUSED2 */
static int
{
int err = 0;
long result;
return (EINVAL);
}
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
return (EINVAL);
}
if (result < IBD_MIN_NUM_LSO_BUFS ||
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
} else {
}
if (!state->rc_enable_srq) {
state->id_rc_num_srq = 0;
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_RC_NUM_RWQE ||
} else {
if (state->id_allow_coalesce_comp_tuning &&
/*
* If rx_rwqe_threshold is greater than the number of
* rwqes, pull it back to 25% of number of rwqes.
*/
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (!state->rc_enable_srq)
return (EINVAL);
if (result < IBD_MIN_RC_NUM_SRQ ||
} else
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_RC_NUM_SWQE ||
} else {
if (state->id_allow_coalesce_comp_tuning &&
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
} else {
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
if (result < 1) {
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_RC_RX_COPY_THRESH ||
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
} else {
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
} else {
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
if (result < 1)
else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_RC_TX_COPY_THRESH ||
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_UD_NUM_RWQE ||
} else {
} else {
}
if (state->id_allow_coalesce_comp_tuning &&
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_UD_NUM_SWQE ||
} else {
} else {
}
if (state->id_allow_coalesce_comp_tuning &&
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
} else {
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
if (result < 1) {
} else {
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
} else {
}
return (err);
}
if (!state->id_allow_coalesce_comp_tuning) {
return (ENOTSUP);
}
return (EINVAL);
}
if (result < 1) {
} else {
}
return (err);
}
return (EBUSY);
}
return (EINVAL);
}
if (result < IBD_MIN_UD_TX_COPY_THRESH ||
} else {
}
return (err);
}
return (ENOTSUP);
}
static int
void *pr_val)
{
int value;
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
err = 0;
goto done;
}
done:
if (err == 0) {
}
return (err);
}
static int
{
/*
* Query for port information
*/
"failed, ret=%d", ret);
return (ENETDOWN);
}
/*
* If the link is active, verify the pkey
*/
} else {
}
/*
* Now that the port is active, record the port speed
*/
} else {
state->id_link_speed = 0;
}
return (0);
}
static int
{
/*
* Theoretically, there is no point in having more than #rwqe
* plus #swqe cqe's, except that the CQ will be signaled for
* overflow when the last wqe completes, if none of the previous
* cqe's have been polled. Thus, we allocate just a few less wqe's
* to make sure such overflow does not occur.
*/
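/*
 * A minimal sketch of the policy described above: given the CQ size the
 * HCA actually granted, allocate a few fewer WQEs so that the CQ can
 * never be overflowed. The helper name and the slack value are
 * hypothetical, not part of this driver.
 */
static uint_t
ibd_sketch_num_wqe(uint_t real_cq_size, uint_t requested_wqe)
{
	uint_t slack = 2;	/* keep a couple of CQEs in reserve */
	uint_t limit = (real_cq_size > slack) ? (real_cq_size - slack) : 1;

	return ((requested_wqe < limit) ? requested_wqe : limit);
}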
/*
* Allocate Receive CQ.
*/
} else {
}
"failed, ret=%d\n", ret);
return (DDI_FAILURE);
}
"moderation failed, ret=%d\n", ret);
}
/* make the #rx wc's the same as max rx chain size */
/*
* Allocate Send CQ.
*/
} else {
}
"failed, ret=%d\n", ret);
return (DDI_FAILURE);
}
"moderation failed, ret=%d\n", ret);
}
/*
* Print message in case we could not allocate as many wqe's
* as was requested.
*/
if (num_rwqe_change) {
}
if (num_swqe_change) {
}
return (DDI_SUCCESS);
}
static int
{
if (state->id_hca_res_lkey_capab)
"failed, ret=%d\n", ret);
return (DDI_FAILURE);
}
&ud_chan_attr)) != IBT_SUCCESS) {
"failed, ret=%d\n", ret);
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static int
{
/*
* We need to mark the link state appropriately to prevent the
* ip layer from using this instance for any new transfers. Note
* that if the original state of the link was "up" when we're
* here, we'll set the final link state to "unknown", to behave
* in the same fashion as other ethernet drivers.
*/
if (cur_link_state == LINK_STATE_DOWN) {
} else {
}
if (progress & IBD_DRV_STARTED) {
}
if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
}
/* Stop listen under Reliable Connected Mode */
if (progress & IBD_DRV_RC_LISTEN) {
}
}
/* Stop timeout routine */
if (progress & IBD_DRV_RC_TIMEOUT) {
state->rc_timeout = 0;
if (tid != 0)
}
attempts = 100;
/*
* "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
* port is connecting to a remote IPoIB port. Wait for
* the end of this connecting operation.
*/
if (--attempts == 0) {
state->rc_stop_connect++;
break;
}
}
state->id_sched_needed = 0;
(void) ibd_rc_close_all_chan(state);
}
/*
* First, stop receive interrupts; this stops the driver from
* handing up buffers to higher layers. Wait for receive buffers
* to be returned and give up after 1 second.
*/
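/*
 * Hypothetical sketch of the bounded wait used below: poll a pending
 * count, sleeping 100ms per attempt for roughly one second, then give
 * up. The helper and its argument are illustrative only.
 */
static int
ibd_sketch_wait_drained(volatile uint_t *pending)
{
	int attempts = 10;

	while (*pending > 0) {
		if (--attempts == 0)
			return (-1);	/* still busy after ~1 second */
		delay(drv_usectohz(100000));	/* 100ms per attempt */
	}
	return (0);
}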
if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
attempts = 10;
0) > 0) {
if (--attempts == 0) {
/*
* There are pending bufs with the network
* layer and we have no choice but to wait
* for them to be done with. Reap all the
* completions posted since
* we turned off the notification and
* return failure.
*/
"reclaiming failed");
break;
}
}
}
if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
}
if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
/*
* "state->id_ah_op == IBD_OP_ONGOING"
* means this IPoIB port is connecting
* to a remote IPoIB port. We can't
* delete SRQ here.
*/
state->rc_stop_connect++;
"connecting");
} else {
state->id_mac_state &=
}
} else {
}
} else {
}
}
if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
while (state->id_trap_inprog > 0)
}
if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
/*
* Flushing the channel ensures that all pending WQE's
* are marked with flush_error and handed to the CQ. It
* does not guarantee the invocation of the CQ handler.
* This call is guaranteed to return successfully for
* UD QPNs.
*/
IBT_SUCCESS) {
"failed, ret=%d", ret);
}
/*
* Give some time for the TX CQ handler to process the
* completions.
*/
attempts = 10;
!= state->id_ud_num_swqe) {
if (--attempts == 0)
break;
}
state->id_ud_num_swqe) {
}
attempts = 10;
if (--attempts == 0)
break;
}
}
}
if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
/*
* Drop all residual full/non membership: this includes full
* membership to the broadcast group, and any nonmembership
* acquired during transmits. We do this after the Tx completion
* handlers are done, since those might result in some late
* leaves; this also eliminates a potential race with that
* path with respect to the mc full list. Trap handling
* has also been suppressed at this point. Thus, no locks
* are required while traversing the mc full list.
*/
}
}
if (progress & IBD_DRV_RXLIST_ALLOCD) {
}
if (progress & IBD_DRV_TXLIST_ALLOCD) {
}
if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
IBT_SUCCESS) {
"failed, ret=%d", ret);
}
}
if (progress & IBD_DRV_CQS_ALLOCD) {
IBT_SUCCESS) {
"failed, ret=%d", ret);
}
"ret=%d", ret);
}
}
if (progress & IBD_DRV_ACACHE_INITIALIZED) {
}
if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
/*
* If we'd created the ipoib broadcast group and had
* successfully joined it, leave it now
*/
if (state->id_bgroup_created) {
}
}
return (DDI_SUCCESS);
}
/*
* These routines set and clear the indication that
* the caller is likely to do something to change the id_mac_state.
* If there's already someone doing either a start or a stop (possibly
* due to the async handler detecting a pkey relocation event, a plumb
* or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
* that's done.
*/
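/*
 * Hypothetical sketch of the serialization described above: block on a
 * condition variable until no other start/stop is in progress, then mark
 * this thread as the one making progress. All names are illustrative,
 * not the driver's actual fields.
 */
static void
ibd_sketch_set_progress(kmutex_t *lock, kcondvar_t *cv, uint_t *flags,
    uint_t bit)
{
	mutex_enter(lock);
	while (*flags & bit)
		cv_wait(cv, lock);
	*flags |= bit;
	mutex_exit(lock);
}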
static void
{
}
static void
{
}
/*
* GLDv3 entry point to start hardware.
*/
/*ARGSUSED*/
static int
{
int ret;
return (EINVAL);
return (EIO);
}
return (ret);
}
static int
{
int err;
int late_hca_init = 0;
return (DDI_SUCCESS);
/*
* We do not increment the running flag when calling ibd_start() as
* a result of some event which moves the state away from late HCA
* initialization viz. MCG_CREATED, PORT_CHANGE or link availability.
*/
return (EINVAL);
}
/*
* Get port details; if we fail here, something bad happened.
* Fail plumb.
*/
goto start_fail;
}
/*
* If state->id_link_state is DOWN, it indicates that either the port
* is down, or the pkey is not available. In both cases, resort to late
* initialization. Register for subnet notices, and return success.
*/
late_hca_init = 1;
goto late_hca_init_return;
}
/*
* Find the IPoIB broadcast group
*/
/* Resort to late initialization */
late_hca_init = 1;
goto reg_snet_notices;
}
/*
* Initialize per-interface caches and lists; if we fail here,
* it is most likely due to a lack of resources
*/
goto start_fail;
}
/*
* Allocate send and receive completion queues
*/
goto start_fail;
}
/*
* Setup a UD channel
*/
goto start_fail;
}
/*
* Allocate and initialize the tx buffer list
*/
goto start_fail;
}
/*
* Create the send cq handler here
*/
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
"failed, ret=%d", ret);
goto start_fail;
}
/*
* Allocate and initialize the rx buffer list
*/
goto start_fail;
}
/*
* Join IPoIB broadcast group
*/
err = ENOTACTIVE;
goto start_fail;
}
/*
* When we did mac_register() in ibd_attach(), we didn't register
* the real macaddr and we didn't have the true port mtu. Now that
* we're almost ready, set the local mac address and broadcast
* addresses and update gldv3 about the real values of these
* parameters.
*/
if (state->id_enable_rc) {
} else {
}
if (!state->id_enable_rc) {
}
/*
* Setup the receive cq handler
*/
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
"failed, ret=%d", ret);
goto start_fail;
}
/*
* In the case of the normal initialization sequence, set up the
* subnet notices handler after we've initialized the acache/
* mcache and started the async thread, both of which are required for
* the trap handler to function properly.
*
* Now that the async thread has been started (and we've already done
* a mac_register() during attach so mac_tx_update() can be called
* if necessary without any problem), we can enable the trap handler
* to queue requests to the async thread.
*
* In the case of late hca initialization, the subnet notices handler
* is registered up front, since the action performed as part
* of handling these events is to start the interface. So, the
* acache/mcache may not have been initialized at the time of
* registering the subnet notices handler. Also, if we are in
* ibd_start() as a result of, say, some event handling after entering
* the late hca initialization phase, there is no need to register again.
*/
}
if (late_hca_init == 1) {
/*
* In case of late initialization, mark the link state as down,
* immaterial of the actual link state as reported in the
* port_info.
*/
return (DDI_SUCCESS);
}
if (state->id_enable_rc) {
if (state->rc_enable_srq) {
if (ibd_rc_repost_srq_free_list(state) !=
IBT_SUCCESS) {
goto start_fail;
}
} else {
/* Allocate SRQ resource */
if (ibd_rc_init_srq_list(state) !=
IBT_SUCCESS) {
goto start_fail;
}
}
}
"failed");
goto start_fail;
}
/* RC: begin to listen only after everything is available */
goto start_fail;
}
}
/*
* Indicate link status to GLDv3 and higher layers. By default,
* we assume we are in up state (which must have been true at
* least at the time the broadcast mcg's were probed); if there
* were any up/down transitions before this point, the
* async handler will have updated the last known state, which we
* use to tell GLDv3. The async handler will not send any
* notifications to GLDv3 till we reach here in the initialization
* sequence.
*/
/* Start timer after everything is ready */
if (state->id_enable_rc) {
}
return (DDI_SUCCESS);
/*
* If we ran into a problem during ibd_start() and ran into
* some other problem during undoing our partial work, we can't
* do anything about it. Ignore any errors we might get from
* ibd_undo_start() and just return the original error we got.
*/
return (err);
}
/*
* GLDv3 entry point to stop hardware from receiving packets.
*/
/*ARGSUSED*/
static void
{
return;
}
/*
* GLDv3 entry point to modify device's mac address. We do not
* allow address modifications.
*/
static int
{
return (EINVAL);
/*
* Don't bother even comparing the macaddr if we haven't
* completed ibd_m_start().
*/
return (0);
return (0);
else
return (EINVAL);
}
/*
* The blocking part of the IBA join/leave operations are done out
* of here on the async thread.
*/
static void
{
if (op == IBD_ASYNC_JOIN) {
}
} else {
/*
* Here, we must search for the proper mcg_info and
* use that to leave the group.
*/
}
}
/*
* This function queues the operation to the async thread and
* return success for a valid multicast address.
*/
static int
{
return (EINVAL);
/*
* If we haven't completed ibd_m_start(), async thread wouldn't
* have been started and id_bcaddr wouldn't be set, so there's
* no point in continuing.
*/
return (0);
/*
* The incoming multicast address might not be aligned properly
* on a 4 byte boundary to be considered an ipoib_mac_t. We force
* it to look like one though, to get the offsets of the mc gid,
* since we know we are not going to dereference any values with
* the ipoib_mac_t pointer.
*/
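/*
 * Hypothetical sketch of the alignment note above: instead of
 * dereferencing a possibly misaligned pointer, copy the bytes of
 * interest into an aligned local buffer. In the IPoIB hardware
 * address the 16-byte GID follows the 4-byte QPN field.
 */
static void
ibd_sketch_copy_mcast_gid(const uint8_t *mcast_addr, uint8_t gid[16])
{
	bcopy(mcast_addr + 4, gid, 16);
}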
/*
* Check validity of MCG address. We could additionally check
* that the operation is not being issued on the broadcast
* mcg, but since this operation is only invokable by privileged
* programs anyway, we allow the flexibility to those dlpi apps.
* Note that we do not validate the "scope" of the IBA mcg.
*/
return (EINVAL);
/*
* fill in multicast pkey and scope
*/
/*
* If someone is trying to JOIN/LEAVE the broadcast group, we do
* nothing (i.e. we stay JOINed to the broadcast group done in
* ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
* requires us to be joined to broadcast groups at all times.
* ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
* depends on this.
*/
return (0);
return (ENOMEM);
if (add) {
} else {
}
return (0);
}
/*
* The blocking part of the IBA promiscuous operations are done
* out of here on the async thread. The dlpireq parameter indicates
* whether this invocation is due to a dlpi request or due to
* a port event.
*/
static void
{
}
}
/*
* The blocking part of the IBA promiscuous operations are done
* out of here on the async thread. The dlpireq parameter indicates
* whether this invocation is due to a dlpi request or due to
* a port event.
*/
static void
{
int i;
/*
* Obtain all active MC groups on the IB fabric with
* specified criteria (scope + Pkey + Qkey + mtu).
*/
IBT_SUCCESS) {
"groups");
goto done;
}
/*
* Iterate over the returned mcg's and join as NonMember
* to the IP mcg's.
*/
for (i = 0; i < numg; i++) {
/*
* Do a NonMember JOIN on the MC group.
*/
"multicast gid %016llx:%016llx",
}
done:
}
/*
* GLDv3 assumes phys state receives more packets than multi state,
* which is not true for IPoIB. Thus, treat the multi and phys
* promiscuous states the same way to work with GLDv3's assumption.
*/
static int
{
return (EINVAL);
/*
* The async thread wouldn't have been started if we haven't
* passed ibd_m_start().
*/
return (0);
return (ENOMEM);
if (on) {
} else {
}
return (0);
}
/*
* GLDv3 entry point for gathering statistics.
*/
static int
{
switch (stat) {
case MAC_STAT_IFSPEED:
break;
case MAC_STAT_MULTIRCV:
break;
case MAC_STAT_BRDCSTRCV:
break;
case MAC_STAT_MULTIXMT:
break;
case MAC_STAT_BRDCSTXMT:
break;
case MAC_STAT_RBYTES:
break;
case MAC_STAT_IPACKETS:
+ state->rc_rcv_copy_pkt;
break;
case MAC_STAT_OBYTES:
break;
case MAC_STAT_OPACKETS:
break;
case MAC_STAT_OERRORS:
break;
case MAC_STAT_IERRORS:
*val = 0;
break;
case MAC_STAT_NOXMTBUF:
break;
case MAC_STAT_NORCVBUF:
default:
return (ENOTSUP);
}
return (0);
}
static void
{
}
static void
{
int flag;
int met_thresh = 0;
int thresh = 0;
if (met_thresh > thresh)
}
if (met_thresh > thresh) {
state->id_sched_cnt++;
ret = 0;
}
if (ret == 0)
}
/*
* Release the send wqe back into free list.
*/
static void
{
/*
* Add back on Tx list for reuse.
*/
}
/*
* Acquire a send wqe from free list.
* Returns error number and send wqe pointer.
*/
static ibd_swqe_t *
{
/* transfer id_tx_rel_list to id_tx_list */
/* clear id_tx_rel_list */
} else { /* no free swqe */
state->id_tx_short++;
}
return (wqe);
}
static int
{
/*
* The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
* we need to adjust it here for lso.
*/
/*
* Calculate the LSO header size and set it in the UD LSO structure.
* Note that the only assumption we make is that each of the IPoIB,
* IP and TCP headers will be contained in a single mblk fragment;
* together, the headers may span multiple mblk fragments.
*/
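/*
 * Hypothetical sketch of the header-size calculation described above,
 * assuming the IP and TCP headers have already been located. Field
 * offsets follow the standard IPv4/TCP layouts, not driver structures.
 */
static uint_t
ibd_sketch_lso_hdr_sz(uint_t ipoib_hdr_sz, const uint8_t *iph,
    const uint8_t *tcph)
{
	uint_t ip_hdr_len = (iph[0] & 0x0f) << 2;	/* IHL, 32-bit words */
	uint_t tcp_hdr_len = (tcph[12] >> 4) << 2;	/* TCP data offset */

	return (ipoib_hdr_sz + ip_hdr_len + tcp_hdr_len);
}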
}
}
/*
* If the lso header fits entirely within a single mblk fragment,
* we'll avoid an additional copy of the lso header here and just
* pass the b_rptr of the mblk directly.
*
* If this isn't true, we'd have to allocate for it explicitly.
*/
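/*
 * Hypothetical sketch of the choice made above: point into the mblk when
 * the whole LSO header is contained in the first fragment, otherwise
 * allocate a private copy (which the caller must fill in and free on
 * completion). The helper may return NULL under KM_NOSLEEP.
 */
static uint8_t *
ibd_sketch_get_lso_hdr(mblk_t *mp, uint_t hdr_sz, int *allocated)
{
	if (MBLKL(mp) >= hdr_sz) {
		*allocated = 0;
		return (mp->b_rptr);		/* no copy needed */
	}
	*allocated = 1;
	return (kmem_zalloc(hdr_sz, KM_NOSLEEP));
}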
} else {
/* On work completion, remember to free this allocated hdr */
lso->lso_hdr_sz = 0;
return (-1);
}
}
/*
* Copy in the lso header only if we need to
*/
} else {
break;
}
}
}
return (0);
}
static void
{
return;
/*
* Free any header space that we might've allocated if we
* did an LSO
*/
lso->lso_hdr_sz = 0;
}
}
}
static void
{
uint_t i;
/* post the one request, then check for more */
if (ibt_status != IBT_SUCCESS) {
"posting one wr failed: ret=%d", ibt_status);
}
for (;;) {
state->id_tx_busy = 0;
return;
}
}
/*
* Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
* at a time if possible, and keep posting them.
*/
}
/*
* If posting fails for some reason, we'll never receive
* completion intimation, so we'll need to cleanup. But
* we need to make sure we don't clean up nodes whose
* wrs have been successfully posted. We assume that the
* hca driver returns on the first failure to post and
* therefore the first 'num_posted' entries don't need
* cleanup here.
*/
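/*
 * Hypothetical sketch of the cleanup rule above, using the
 * ibt_post_send() interface: entries before 'num_posted' were accepted
 * and will see completions; only the remainder is cleaned up here.
 */
static void
ibd_sketch_post_wrs(ibt_channel_hdl_t chan, ibt_send_wr_t *wrs, uint_t n_wrs)
{
	uint_t num_posted = 0;
	uint_t i;

	if (ibt_post_send(chan, wrs, n_wrs, &num_posted) != IBT_SUCCESS) {
		for (i = num_posted; i < n_wrs; i++) {
			/* release swqe/AH resources of unposted entry i */
		}
	}
}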
num_posted = 0;
if (ibt_status != IBT_SUCCESS) {
"posting multiple wrs failed: "
"requested=%d, done=%d, ret=%d",
for (i = num_posted; i < n_wrs; i++)
}
}
}
static int
{
int nmblks;
int i;
/*
* Let's skip ahead to the data if this is LSO
*/
pending_hdr = 0;
if (lsohdr_sz) {
if (frag_len > pending_hdr)
break;
pending_hdr -= frag_len;
}
}
/*
* Calculate the size of message data and number of msg blocks
*/
pktsize = 0;
}
pktsize -= pending_hdr;
/*
* We only do ibt_map_mem_iov() if the pktsize is above the
* "copy-threshold", and if the number of mp fragments is less than
* the maximum acceptable.
*/
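/*
 * Hypothetical sketch of the decision described above: map the caller's
 * buffers with ibt_map_mem_iov() only for large packets with few enough
 * fragments; otherwise fall back to copying into a pre-mapped buffer.
 */
static boolean_t
ibd_sketch_use_iov_map(size_t pktsize, uint_t nmblks, size_t copy_thresh,
    uint_t max_sgl)
{
	return (((pktsize > copy_thresh) && (nmblks < max_sgl)) ?
	    B_TRUE : B_FALSE);
}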
if ((state->id_hca_res_lkey_capab) &&
if (i == 0) {
}
}
if (ibt_status != IBT_SUCCESS) {
goto ibd_copy_path;
}
return (0);
}
/*
* Even though this is the copy path for transfers less than
* id_tx_buf_sz, it could still be an LSO packet. If so, it
* is possible the first data mblk fragment (data_mp) still
* contains part of the LSO header that we need to skip.
*/
pending_hdr = 0;
}
return (0);
}
/*
* Copy path for transfers greater than id_tx_buf_sz
*/
return (-1);
}
/*
* Copy the larger-than-id_tx_buf_sz packet into a set of
* fixed-sized, pre-mapped LSO buffers. Note that we might
* need to skip part of the LSO header in the first fragment
* as before.
*/
skip = pending_hdr;
avail = 0;
} else {
skip = 0;
}
}
}
return (0);
}
/*
* Schedule a completion queue polling to reap the resource we're
* short on. If we implement the change to reap tx completions
* in a separate thread, we'll need to wake up that thread here.
*/
static int
{
/*
* If we are asked to queue a work entry, we need to do it
*/
if (q_flag) {
return (-1);
}
return (0);
}
/*
* The passed in packet has this format:
* IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
*/
static boolean_t
{
/* if (rc_chan == NULL) send by UD; else send by RC; */
int nmblks;
/*
* If we aren't done with the device initialization and start,
* we shouldn't be here.
*/
return (B_FALSE);
/*
* Obtain an address handle for the destination.
*/
state->rc_null_conn++;
} else {
node = WQE_TO_SWQE(
} else {
}
state->rc_swqe_short++;
state->id_sched_needed |=
return (B_FALSE);
}
} else {
}
}
}
} else {
}
/*
* If we don't have an swqe available, schedule a
* transmit completion queue cleanup and hold off on
* sending more packets until we have some free swqes
*/
}
return (B_FALSE);
}
/*
* If a poll cannot be scheduled, we have no choice but
* to drop this packet
*/
}
return (B_TRUE);
}
}
/*
* Initialize the commonly used fields in swqe to NULL to protect
* against ibd_tx_cleanup accidentally misinterpreting these on a
* failure.
*/
/*
* Calculate the size of message data and number of msg blocks
*/
pktsize = 0;
}
} else {
DPRINT(5,
"ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
/*
* Here if ibd_acache_lookup() returns EFAULT, it means ibd
* can not find a path for the specific dest address. We
* should get rid of this kind of packet. We also should get
* rid of the packet if we cannot schedule a poll via the
* async thread. For the normal case, ibd will return the
* packet to upper layer and wait for AH creating.
*
* Note that we always queue a work slot entry for the async
* thread when we fail AH lookup (even in intr mode); this is
* due to the convoluted way the code currently looks for AH.
*/
} else {
}
goto ibd_send_fail;
}
/*
* For ND6 packets, padding is at the front of the source lladdr.
* Insert the padding at front.
*/
sizeof (ib_header_info_t))) {
goto ibd_send_fail;
}
}
sizeof (ib_header_info_t));
"failure ");
goto ibd_send_fail;
}
sizeof (ib_header_info_t));
}
/* LINTED: E_CONSTANT_CONDITION */
}
}
pktsize -= sizeof (ib_addrs_t);
if (rc_chan) { /* send in RC mode */
uint_t i;
/*
* Upper layer does Tx checksum, we don't need do any
* checksum here.
*/
/*
* We only do ibt_map_mem_iov() if the pktsize is above
* the "copy-threshold", and if the number of mp
* fragments is less than the maximum acceptable.
*/
/*
* Only process unicast packet in Reliable Connected
* mode.
*/
}
} else {
if ((state->rc_enable_iov_map) &&
/* do ibt_map_mem_iov() */
iov_attr.iov_lso_hdr_sz = 0;
i = 0;
i++;
}
}
iov_attr.iov_list_len = i;
if (ret != IBT_SUCCESS) {
") failed, nmblks=%d, real_nmblks"
goto ibd_rc_large_copy;
}
} else {
if (state->rc_tx_largebuf_nfree == 0) {
state->id_sched_needed |=
/*
* If we don't have Tx large bufs,
* return failure. node->w_buftype
* should not be IBD_WQE_RC_COPYBUF,
* otherwise it will cause problem
* in ibd_rc_tx_cleanup()
*/
goto ibd_send_fail;
}
/* Update nfree count */
if (blksize != 0) {
blksize);
}
}
}
}
} else {
}
} else {
}
return (B_TRUE);
} /* send by RC */
/*
* The pktsize is too long. The packet size from GLD should be <=
* state->id_mtu + sizeof (ib_addrs_t).
*/
if (ace->tx_too_big_ongoing) {
} else {
"ibd_req_t fail");
/* Drop it. */
} else {
}
}
} else {
"Multicast packet length %d > %d is too long to "
"send packet (%d > %d), drop it",
/* Drop it. */
}
goto ibd_send_fail;
}
/*
* Do LSO and checksum related work here. For LSO send, adjust the
* ud destination, the opcode and the LSO header information to the
* work request.
*/
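/*
 * Hypothetical sketch of how the LSO and checksum metadata attached to
 * the packet can be retrieved before building the work request, using
 * the standard MAC-layer accessors.
 */
static void
ibd_sketch_get_offload_info(mblk_t *mp, uint32_t *mss, boolean_t *do_lso,
    uint32_t *hck_flags)
{
	uint32_t lso_flags = 0;
	uint32_t start, stuff, end, value;

	mac_lso_get(mp, mss, &lso_flags);
	*do_lso = (lso_flags & HW_LSO) ? B_TRUE : B_FALSE;
	mac_hcksum_get(mp, &start, &stuff, &end, &value, hck_flags);
}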
lsohdr_sz = 0;
} else {
/*
* The routine can only fail if there's no memory; we
* can only drop the packet if this happens
*/
"ibd_send: no memory, lso posting failed");
goto ibd_send_fail;
}
}
else
/*
* Prepare the sgl for posting; the routine can only fail if there's
* no lso buf available for posting. If this is the case, we should
* probably resched for lso bufs to become available and then try again.
*/
} else {
}
goto ibd_send_fail;
}
/*
* Queue the wqe to hardware; since we can now simply queue a
* post instead of doing it serially, we cannot assume anything
* about the 'node' after ibd_post_send() returns.
*/
if (state->id_tx_busy) {
if (state->id_tx_head) {
} else {
}
} else {
}
return (B_TRUE);
if (dofree)
if (rc_chan) {
} else {
}
}
return (rc);
}
/*
* GLDv3 entry point for transmitting datagram.
*/
static mblk_t *
{
return (NULL);
}
}
/* Send fail */
break;
}
}
return (mp);
}
/*
* This handles Tx and Rx completions. With separate CQs, it handles
* only Rx completions.
*/
static uint_t
{
return (DDI_INTR_CLAIMED);
}
/*
* Poll and fully drain the send cq
*/
static void
{
int i;
/*
* Channel being torn down.
*/
} else {
"unexpected wc_status %d",
}
/*
* Fallthrough to invoke the Tx handler to
* release held resources, e.g., AH refcount.
*/
}
/*
* Add this swqe to the list to be cleaned up.
*/
if (head)
else
}
/*
* Resume any blocked transmissions if possible
*/
}
}
/*
* Poll and fully drain the receive cq
*/
static void
{
int i;
/*
* Channel being torn down.
*/
"expected flushed rwqe");
} else {
"unexpected wc_status %d",
}
continue;
}
continue;
/*
* Add this mp to the list to send to the nw layer.
*/
if (head)
else
}
if (head)
/*
* Account for #rwqes polled.
* Post more here, if less than one fourth full.
*/
}
}
/*
* Common code for interrupt handling as well as for polling
* for all completed wqe's while detaching.
*/
static void
{
return;
}
/*
* In some cases (eg detaching), this code can be invoked on
* any cpu after disabling cq notification (thus no concurrency
* exists). Apart from that, the following applies normally:
* Transmit completion handling could be from any cpu if
* Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
* is interrupt driven.
*/
/*
* Poll and drain the CQ
*/
/*
* Enable CQ notifications and redrain the cq to catch any
* completions we might have missed after the ibd_drain_scq()
* above and before the ibt_enable_cq_notify() that follows.
* Finally, service any new requests to poll the cq that
* could've come in after the ibt_enable_cq_notify().
*/
do {
IBT_SUCCESS) {
}
else {
redo = 0;
}
} while (redo);
}
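/*
 * Hypothetical sketch of the drain/notify/redrain pattern used above:
 * drain once, enable notification, then drain again so a completion that
 * arrived in the window between the two steps is not lost.
 */
static void
ibd_sketch_poll_cq(ibt_cq_hdl_t cq, void (*drain)(ibt_cq_hdl_t))
{
	drain(cq);
	if (ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION) == IBT_SUCCESS)
		drain(cq);	/* catch completions that raced with enable */
}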
/*
* Common code for interrupt handling as well as for polling
* for all completed wqe's while detaching.
*/
static void
{
return;
}
/*
* Poll and drain the CQ
*/
/*
* Enable CQ notifications and redrain the cq to catch any
* completions we might have missed after the ibd_drain_cq()
* above and before the ibt_enable_cq_notify() that follows.
* Finally, service any new requests to poll the cq that
* could've come in after the ibt_enable_cq_notify().
*/
do {
IBT_SUCCESS) {
}
else {
redo = 0;
}
} while (redo);
}
/*
* Unmap the memory area associated with a given swqe.
*/
void
{
DPRINT(10,
"failed in ibt_unmap_mem_iov, ret=%d\n", stat);
}
}
}
void
{
/*
* The recycling logic can be eliminated from here
* and put into the async thread if we create another
* list to hold ACE's for unjoined mcg's.
*/
if (DEC_REF_DO_CYCLE(ace)) {
/*
* Check with the lock taken: we decremented
* reference count without the lock, and some
* transmitter might already have bumped the
* reference count (possible in case of multicast
* disable when we leave the AH on the active
* list). If not still 0, get out, leaving the
* recycle bit intact.
*
* Atomically transition the AH from active
* to free list, and queue a work request to
* leave the group and destroy the mce. No
* transmitter can be looking at the AH or
* the MCE in between, since we have the
* ac_mutex lock. In the SendOnly reap case,
* it is not necessary to hold the ac_mutex
* and recheck the ref count (since the AH was
* taken off the active list), we just do it
* to have uniform processing with the Full
* reap case.
*/
if (GET_REF_CYCLE(ace) == 0) {
/*
* Identify the case of fullmember reap as
* opposed to mcg trap reap. Also, port up
* might set ac_mce to NULL to indicate Tx
* cleanup should do no more than put the
* AH in the free list (see ibd_async_link).
*/
/*
* mc_req was initialized at mce
* creation time.
*/
}
}
}
}
/*
* Common code that deals with clean ups after a successful or
* erroneous transmission attempt.
*/
static void
{
/*
* If this was a dynamic mapping in ibd_send(), we need to
* unmap here. If this was an lso buffer we'd used for sending,
* we need to release the lso buf to the pool, since the resource
* is scarce. However, if this was simply a normal send using
* the copybuf (present in each swqe), we don't need to release it.
*/
}
}
/*
* Drop the reference count on the AH; it can be reused
* now for a different destination if there are no more
* posted sends that will use it. This can be eliminated
* if we can always associate each Tx buffer with an AH.
* The ace can be null if we are cleaning up from the
* ibd_send() error path.
*/
}
/*
* Release the send wqe for reuse.
*/
}
static void
{
int n = 0;
/*
* If this was a dynamic mapping in ibd_send(), we need to
* unmap here. If this was an lso buffer we'd used for sending,
* we need to release the lso buf to the pool, since the
* resource is scarce. However, if this was simply a normal
* send using the copybuf (present in each swqe), we don't need
* to release it.
*/
}
}
/*
* Drop the reference count on the AH; it can be reused
* now for a different destination if there are no more
* posted sends that will use it. This can be eliminated
* if we can always associate each Tx buffer with an AH.
* The ace can be null if we are cleaning up from the
* ibd_send() error path.
*/
}
n++;
}
/*
* Release the send wqes for reuse.
*/
}
/*
* Processing to be done after receipt of a packet; hand off to GLD
* in the format expected by GLD. The received packet has this
* format: 2b sap :: 00 :: data.
*/
static mblk_t *
{
int len;
/*
* Track the number of buffers handed to the upper layer that need
* to be returned.
*/
/* Never run out of rwqes, use allocb when running low */
if (mp) {
} else { /* no memory */
return (NULL);
}
} else {
}
/*
* Adjust write pointer depending on how much data came in.
*/
/*
* Make sure this is NULL or we're in trouble.
*/
"ibd_process_rx: got duplicate mp from rcq?");
}
/*
* The IB link will deliver one of the IB link layer
* headers, called the Global Routing Header (GRH).
* The ibd driver uses the information in the GRH to build the
* Header_info structure and pass it with the datagram up
* to GLDv3.
* If the GRH is not valid, indicate to GLDv3 by setting
* the VerTcFlow field to 0.
*/
/* if it is a loopback packet, just drop it. */
if (state->id_enable_rc) {
IPOIB_ADDRL) == 0) {
return (NULL);
}
} else {
IPOIB_ADDRL) == 0) {
return (NULL);
}
}
sizeof (ipoib_mac_t));
} else {
}
} else {
/*
* It can not be a IBA multicast packet. Must have been
* unicast for us. Just copy the interface address to dst.
*/
sizeof (ipoib_mac_t));
}
/*
* For ND6 packets, padding is at the front of the source
* lladdr. However, the inet6 layer is not aware of it, hence remove
* the padding from such packets.
*/
/* LINTED: E_CONSTANT_CONDITION */
}
}
/*
* Update statistics
*/
/*
* Set the receive checksum status in mp.
* Hardware checksumming can be considered valid only if:
* 1. CQE.IP_OK bit is set
* 2. CQE.CKSUM = 0xffff
* 3. IPv6 routing header is not present in the packet
* 4. There are no IP options in the IP header
*/
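/*
 * Hypothetical sketch of the rule above: claim a valid hardware checksum
 * only when all four conditions hold. The inputs are illustrative and
 * not fields of the real CQE structure.
 */
static void
ibd_sketch_set_rx_cksum(mblk_t *mp, boolean_t ip_ok, uint16_t cksum,
    boolean_t has_rthdr, boolean_t has_ipopts)
{
	if (ip_ok && cksum == 0xffff && !has_rthdr && !has_ipopts)
		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
}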
}
return (mp);
}
/*
* Callback code invoked from STREAMS when the receive data buffer is
* free for recycling.
*/
static void
{
/*
* If the driver is stopped, just free the rwqe.
*/
return;
}
return;
}
}
static uint_t
{
/*
* Poll for completed entries
*/
return (DDI_INTR_CLAIMED);
}
#ifdef IBD_LOGGING
static void
ibd_log_init(void)
{
ibd_lbuf_ndx = 0;
}
static void
ibd_log_fini(void)
{
if (ibd_lbuf)
ibd_lbuf_ndx = 0;
}
static void
{
return;
if (msglen >= IBD_DMAX_LINE)
ibd_lbuf_ndx = 0;
}
#endif
/* ARGSUSED */
static int
int *rvalp)
{
return (EINVAL);
}
if (port_state == NULL) {
return (EINVAL);
}
/* Limited PKeys not supported */
goto part_create_return;
}
if (cmd->ioc_force_create == 0) {
/*
* Check if the port pkey table contains the pkey for which
* this partition is being created.
*/
goto part_create_return;
}
goto part_create_return;
}
for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
break;
}
}
if (i == pinfop->p_pkey_tbl_sz) {
goto part_create_return;
}
} else {
}
for (p = ibd_objlist_head; p; p = p->id_next) {
goto part_create_return;
}
}
goto fail;
}
goto fail;
}
if (state->id_enable_rc) {
} else {
}
if (err != 0) {
err);
goto fail;
}
if (err != 0) {
"%d", err);
goto fail;
}
/*
* Add the new partition state structure to the list
*/
if (ibd_objlist_head)
if (pinfop) {
}
return (rval);
fail:
if (pinfop) {
}
return (rval);
}
/* ARGSUSED */
static int
int *rvalp)
{
int err;
/* Find the ibd state structure corresponding to the partition */
break;
}
return (ENOENT);
}
"%d", err);
return (err);
}
/*
* Call ibd_part_unattach() only after making sure that the instance has
* not been started yet and is also not in late hca init mode.
*/
err = 0;
crgetzoneid(credp));
}
/* Remove the partition state structure from the linked list */
else
err);
}
return (0);
}
/* ARGSUSED */
static int
int *rvalp)
{
#ifdef _MULTI_DATAMODEL
#endif
int size;
int rval = 0;
size = sizeof (ibd_ioctl_t);
return (EFAULT);
}
cmd.ioc_status = 0;
switch (cmd.ioc_info_cmd) {
case IBD_INFO_CMD_IBPART:
size = sizeof (ibpart_ioctl_t);
return (EFAULT);
}
/* Find the ibd state structure corresponding to the partition */
break;
}
}
return (ENOENT);
}
return (EFAULT);
}
break;
case IBD_INFO_CMD_IBPORT:
size = sizeof (ibd_ioctl_t);
mode)) {
return (EFAULT);
}
return (EINVAL);
}
return (EINVAL);
}
#ifdef _MULTI_DATAMODEL
case DDI_MODEL_ILP32: {
size = sizeof (ibport_ioctl32_t);
goto fail;
}
if (portioc32.ioc_pkey_tbl_sz !=
pinfop->p_pkey_tbl_sz) {
size = sizeof (ibd_ioctl_t);
goto fail;
}
goto fail;
}
mode)) {
goto fail;
}
size = sizeof (ibport_ioctl32_t);
mode)) {
goto fail;
}
break;
}
case DDI_MODEL_NONE:
size = sizeof (ibport_ioctl_t);
goto fail;
}
size = sizeof (ibd_ioctl_t);
goto fail;
}
goto fail;
}
goto fail;
}
size = sizeof (ibport_ioctl_t);
mode)) {
goto fail;
}
break;
}
#else /* ! _MULTI_DATAMODEL */
size = sizeof (ibport_ioctl_t);
goto fail;
}
size = sizeof (ibd_ioctl_t);
goto fail;
}
goto fail;
}
goto fail;
}
size = sizeof (ibport_ioctl_t);
mode)) {
goto fail;
}
#endif /* _MULTI_DATAMODEL */
break;
case IBD_INFO_CMD_PKEYTBLSZ:
size = sizeof (ibd_ioctl_t);
mode)) {
return (EFAULT);
}
return (EINVAL);
}
return (EINVAL);
}
#ifdef _MULTI_DATAMODEL
case DDI_MODEL_ILP32: {
size = sizeof (ibport_ioctl32_t);
goto fail;
}
mode)) {
goto fail;
}
break;
}
case DDI_MODEL_NONE:
size = sizeof (ibport_ioctl_t);
goto fail;
}
mode)) {
goto fail;
}
break;
}
#else /* ! _MULTI_DATAMODEL */
size = sizeof (ibport_ioctl_t);
goto fail;
}
mode)) {
goto fail;
}
#endif /* _MULTI_DATAMODEL */
break;
default:
return (EINVAL);
} /* switch (cmd.ioc_info_cmd) */
fail:
if (pinfop) {
}
return (rval);
}
/* ARGSUSED */
static void
{
switch (code) {
case IBT_EVENT_PORT_UP:
case IBT_ERROR_PORT_DOWN:
break;
}
break;
default:
break;
}
}
static int
{
return (-1);
*lstate = LINK_STATE_UP;
else
return (0);
}
static int
{
int instance;
/*
* Allocate softstate structure
*/
return (DDI_FAILURE);
}
"port-number", 0)) == 0) {
return (DDI_FAILURE);
}
"hca-guid", 0)) == 0) {
state->id_hca_guid);
return (DDI_FAILURE);
}
"port-guid", 0)) == 0) {
return (DDI_FAILURE);
}
/*
* Attach to IBTL
*/
ret);
goto done;
}
ret);
goto done;
}
/* Update link status */
ret);
goto done;
}
/*
* Register ibd interfaces with the Nemo framework
*/
goto done;
}
return (DDI_SUCCESS);
done:
return (DDI_FAILURE);
}
static int
{
int instance;
if (progress & IBD_DRV_MAC_REGISTERED) {
}
if (progress & IBD_DRV_HCA_OPENED) {
IBT_SUCCESS) {
"HCA device, ret=%d", ret);
}
}
if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
"ibt_detach() failed, ret=%d", ret);
}
}
return (DDI_SUCCESS);
}
{
/* Find the ibd state structure corresponding to the partition */
break;
}
}
return (IBT_NO_SUCH_OBJECT);
}
return (IBT_SUCCESS);
}
{
int n = 0;
n++;
*nparts = n;
if (n == 0) {
return (IBT_SUCCESS);
}
#ifdef DEBUG
ASSERT(n > 0);
n--;
#endif
attr++;
}
return (IBT_SUCCESS);
}