ibd.c revision 2b24ab6b3865caeede9eeb9db6b83e1d89dcd1ea
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* An implementation of the IPoIB standard based on PSARC 2001/289.
*/
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>
#include <sys/pattr.h> /* for HCK_FULLCKSUM */
#include <sys/sysmacros.h> /* for offsetof */
#include <sys/disp.h> /* for async thread pri */
#include <sys/atomic.h> /* for atomic_add*() */
#include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */
#include <netinet/in.h> /* for netinet/ip.h below */
#include <netinet/ip.h> /* for struct ip */
#include <netinet/udp.h> /* for struct udphdr */
#include <inet/common.h> /* for inet/ip.h below */
#include <inet/ip.h> /* for ipha_t */
#include <inet/ip6.h> /* for ip6_t */
#include <inet/tcp.h> /* for tcph_t */
#include <netinet/icmp6.h> /* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>
#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>
#include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */
/*
* Per-interface tunables
*
* ibd_tx_copy_thresh
* This sets the threshold (in bytes) at or below which ibd will bcopy the
* outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
* is restricted by various parameters, so this value should be changed
* only after careful consideration. For instance, IB HCAs currently
* impose a relatively small limit (when compared to ethernet NICs) on the
* length of the SGL for transmit. On the other hand, the ip stack could
* send down mp chains that are quite long when LSO is enabled.
*
* ibd_num_swqe
* Number of "send WQE" elements that will be allocated and used by ibd.
* When tuning this parameter, the size of the pre-allocated, pre-mapped
* copy buffer in each of these send wqes must be taken into account. This
* copy buffer size is determined by the value of IBD_TX_BUF_SZ (which is
* currently set to the same value as ibd_tx_copy_thresh, but may be
* changed independently if needed).
*
* ibd_num_rwqe
* Number of "receive WQE" elements that will be allocated and used by
* ibd. This parameter is limited by the maximum channel size of the HCA.
* Each buffer in the receive wqe will be of MTU size.
*
* ibd_num_lso_bufs
* Number of "larger-than-MTU" copy buffers to use for cases when the
* outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
* and too large to be used with regular MTU-sized copy buffers. It is
* not recommended to tune this variable without understanding the
* application environment and/or memory resources. The size of each of
* these lso buffers is determined by the value of IBD_LSO_BUFSZ.
*
* ibd_num_ah
* Number of AH cache entries to allocate
*
* ibd_hash_size
* Hash table size for the active AH list
*
* ibd_separate_cqs
* ibd_txcomp_poll
* These boolean variables (1 or 0) may be used to tune the behavior of
* ibd in managing the send and receive completion queues, and in deciding
* whether transmit completions should be polled or interrupt driven
* (when the completion queues are separate). If both completion queues
* are interrupt driven, it may not be possible for the handlers to be
* invoked concurrently, depending on how the interrupts are tied on
* the PCI intr line. Note that some combinations of these two parameters
* are not meaningful (and therefore not allowed).
*
* ibd_tx_softintr
* ibd_rx_softintr
* The softintr mechanism allows ibd to avoid event queue overflows if
* the receive/completion handlers are expensive. Both are enabled
* by default.
*
* ibd_log_sz
* This specifies the size of the ibd log buffer in bytes. The buffer is
* allocated and logging is enabled only when IBD_LOGGING is defined.
*
*/
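/*
* A hypothetical example of overriding these tunables: since they are
* plain driver globals, they would typically be set at boot time from
* /etc/system, e.g.
*
*	set ibd:ibd_num_swqe=0x2000
*	set ibd:ibd_tx_copy_thresh=0x2000
*
* The values shown are examples only; any change should respect the HCA
* limits described above for each tunable.
*/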
uint_t ibd_tx_copy_thresh = 0x1000;
uint_t ibd_num_swqe = 4000;
uint_t ibd_num_rwqe = 4000;
uint_t ibd_num_lso_bufs = 0x400;
uint_t ibd_num_ah = 64;
uint_t ibd_hash_size = 32;
uint_t ibd_separate_cqs = 1;
uint_t ibd_txcomp_poll = 0;
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;
uint_t ibd_create_broadcast_group = 1;
uint_t ibd_force_lso_disable = 1;
#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif
#define IBD_TX_COPY_THRESH ibd_tx_copy_thresh
#define IBD_TX_BUF_SZ ibd_tx_copy_thresh
#define IBD_NUM_SWQE ibd_num_swqe
#define IBD_NUM_RWQE ibd_num_rwqe
#define IBD_NUM_LSO_BUFS ibd_num_lso_bufs
#define IBD_NUM_AH ibd_num_ah
#define IBD_HASH_SIZE ibd_hash_size
#ifdef IBD_LOGGING
#define IBD_LOG_SZ ibd_log_sz
#endif
/*
* Receive CQ moderation parameters: NOT tunables
*/
static uint_t ibd_rxcomp_count = 4;
static uint_t ibd_rxcomp_usec = 10;
/*
* Send CQ moderation parameters: NOT tunables
*/
#define IBD_TXCOMP_COUNT 10
#define IBD_TXCOMP_USEC 300
/*
* Thresholds
*
* When waiting for resources (swqes or lso buffers) to become available,
* the first two thresholds below determine how many must be free before
* the network layer is told to resume sending packets. The IBD_TX_POLL_THRESH
* determines how low the available swqes should go before we start polling
* the completion queue.
*/
#define IBD_FREE_LSOS_THRESH 8
#define IBD_FREE_SWQES_THRESH 20
#define IBD_TX_POLL_THRESH 80
/*
* When doing multiple-send-wr or multiple-recv-wr posts, this value
* determines how many to do at a time (in a single ibt_post_send/recv).
*/
#define IBD_MAX_POST_MULTIPLE 4
/*
* Maximum length for returning chained mps back to crossbow
*/
#define IBD_MAX_RX_MP_LEN 16
/*
* LSO parameters
*/
#define IBD_LSO_MAXLEN 65536
#define IBD_LSO_BUFSZ 8192
#define IBD_PROP_LSO_POLICY "lso-policy"
/*
* Completion queue polling control
*/
#define IBD_RX_CQ_POLLING 0x1
#define IBD_TX_CQ_POLLING 0x2
#define IBD_REDO_RX_CQ_POLLING 0x4
#define IBD_REDO_TX_CQ_POLLING 0x8
/*
* Flag bits for resources to reap
*/
#define IBD_RSRC_SWQE 0x1
#define IBD_RSRC_LSOBUF 0x2
/*
* Async operation types
*/
#define IBD_ASYNC_GETAH 1
#define IBD_ASYNC_JOIN 2
#define IBD_ASYNC_LEAVE 3
#define IBD_ASYNC_PROMON 4
#define IBD_ASYNC_PROMOFF 5
#define IBD_ASYNC_REAP 6
#define IBD_ASYNC_TRAP 7
#define IBD_ASYNC_SCHED 8
#define IBD_ASYNC_LINK 9
#define IBD_ASYNC_EXIT 10
/*
* Async operation states
*/
#define IBD_OP_NOTSTARTED 0
#define IBD_OP_ONGOING 1
#define IBD_OP_COMPLETED 2
#define IBD_OP_ERRORED 3
#define IBD_OP_ROUTERED 4
/*
* State of IBD driver initialization during attach/m_start
*/
#define IBD_DRV_STATE_INITIALIZED 0x00001
#define IBD_DRV_RXINTR_ADDED 0x00002
#define IBD_DRV_TXINTR_ADDED 0x00004
#define IBD_DRV_IBTL_ATTACH_DONE 0x00008
#define IBD_DRV_HCA_OPENED 0x00010
#define IBD_DRV_PD_ALLOCD 0x00020
#define IBD_DRV_MAC_REGISTERED 0x00040
#define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080
#define IBD_DRV_BCAST_GROUP_FOUND 0x00100
#define IBD_DRV_ACACHE_INITIALIZED 0x00200
#define IBD_DRV_CQS_ALLOCD 0x00400
#define IBD_DRV_UD_CHANNEL_SETUP 0x00800
#define IBD_DRV_TXLIST_ALLOCD 0x01000
#define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000
#define IBD_DRV_RXLIST_ALLOCD 0x04000
#define IBD_DRV_BCAST_GROUP_JOINED 0x08000
#define IBD_DRV_ASYNC_THR_CREATED 0x10000
#define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000
#define IBD_DRV_SM_NOTICES_REGISTERED 0x40000
#define IBD_DRV_STARTED 0x80000
/*
* Start/stop in-progress flags; note that restart must always remain
* the OR of start and stop flag values.
*/
#define IBD_DRV_START_IN_PROGRESS 0x10000000
#define IBD_DRV_STOP_IN_PROGRESS 0x20000000
#define IBD_DRV_RESTART_IN_PROGRESS 0x30000000
/*
* Miscellaneous constants
*/
#define IBD_SEND 0
#define IBD_RECV 1
#define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF
#define IBD_DEF_MAX_SDU 2044
#define IBD_DEFAULT_QKEY 0xB1B
#ifdef IBD_LOGGING
#define IBD_DMAX_LINE 100
#endif
/*
* Enumerations for link states
*/
typedef enum {
IBD_LINK_DOWN,
IBD_LINK_UP,
IBD_LINK_UP_ABSENT
} ibd_link_op_t;
/*
* Driver State Pointer
*/
void *ibd_list;
/*
* Logging
*/
#ifdef IBD_LOGGING
kmutex_t ibd_lbuf_lock;
uint8_t *ibd_lbuf;
uint32_t ibd_lbuf_ndx;
#endif
/*
* Required system entry points
*/
static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
/*
* Required driver entry points for GLDv3
*/
static int ibd_m_stat(void *, uint_t, uint64_t *);
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_promisc(void *, boolean_t);
static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
static int ibd_m_unicst(void *, const uint8_t *);
static mblk_t *ibd_m_tx(void *, mblk_t *);
static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
/*
* Private driver entry points for GLDv3
*/
/*
* Initialization
*/
static int ibd_state_init(ibd_state_t *, dev_info_t *);
static int ibd_init_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static int ibd_acache_init(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_init(void);
#endif
/*
* Termination/cleanup
*/
static void ibd_state_fini(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
static void ibd_acache_fini(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_fini(void);
#endif
/*
* Allocation/acquire/map routines
*/
static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t);
static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
static int ibd_alloc_tx_copybufs(ibd_state_t *);
static int ibd_alloc_tx_lsobufs(ibd_state_t *);
static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **);
static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
uint32_t *);
/*
* Free/release/unmap routines
*/
static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *);
static void ibd_free_tx_copybufs(ibd_state_t *);
static void ibd_free_tx_lsobufs(ibd_state_t *);
static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *);
static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
/*
* Handlers/callback routines
*/
static uint_t ibd_intr(char *);
static uint_t ibd_tx_recycle(char *);
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t);
static void ibd_freemsg_cb(char *);
static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
ibt_async_event_t *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
ibt_subnet_event_code_t, ibt_subnet_event_t *);
/*
* Send/receive routines
*/
static boolean_t ibd_send(ibd_state_t *, mblk_t *);
static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t);
static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
static void ibd_flush_rx(ibd_state_t *, mblk_t *);
/*
* Threads
*/
static void ibd_async_work(ibd_state_t *);
/*
* Async tasks
*/
static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
static void ibd_async_setprom(ibd_state_t *);
static void ibd_async_unsetprom(ibd_state_t *);
static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
static void ibd_async_txsched(ibd_state_t *);
static void ibd_async_link(ibd_state_t *, ibd_req_t *);
/*
* Async task helpers
*/
static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
static boolean_t ibd_get_allroutergroup(ibd_state_t *,
ipoib_mac_t *, ipoib_mac_t *);
static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
static ibt_status_t ibd_find_bgroup(ibd_state_t *);
static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
static uint64_t ibd_get_portspeed(ibd_state_t *);
static boolean_t ibd_async_safe(ibd_state_t *);
static void ibd_async_done(ibd_state_t *);
static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
/*
* Helpers for attach/start routines
*/
static int ibd_register_mac(ibd_state_t *, dev_info_t *);
static int ibd_record_capab(ibd_state_t *, dev_info_t *);
static int ibd_unattach(ibd_state_t *, dev_info_t *);
static int ibd_get_port_details(ibd_state_t *);
static int ibd_alloc_cqs(ibd_state_t *);
static int ibd_setup_ud_channel(ibd_state_t *);
static int ibd_start(ibd_state_t *);
static int ibd_undo_start(ibd_state_t *, link_state_t);
static void ibd_set_mac_progress(ibd_state_t *, uint_t);
static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
/*
* Miscellaneous helpers
*/
static int ibd_sched_poll(ibd_state_t *, int, int);
static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
static int ibd_resume_transmission(ibd_state_t *);
static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
static void *list_get_head(list_t *);
static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
static void ibd_print_warn(ibd_state_t *, char *, ...);
#ifdef IBD_LOGGING
static void ibd_log(const char *, ...);
#endif
DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
/* Module Driver Info */
static struct modldrv ibd_modldrv = {
&mod_driverops, /* This one is a driver */
"InfiniBand GLDv3 Driver", /* short description */
&ibd_dev_ops /* driver specific ops */
};
/* Module Linkage */
static struct modlinkage ibd_modlinkage = {
MODREV_1, (void *)&ibd_modldrv, NULL
};
/*
* Module (static) info passed to IBTL during ibt_attach
*/
static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
IBTI_V_CURR,
IBT_NETWORK,
ibd_async_handler,
NULL,
"IPIB"
};
/*
* GLDv3 entry points
*/
#define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB)
static mac_callbacks_t ibd_m_callbacks = {
IBD_M_CALLBACK_FLAGS,
ibd_m_stat,
ibd_m_start,
ibd_m_stop,
ibd_m_promisc,
ibd_m_multicst,
ibd_m_unicst,
ibd_m_tx,
NULL,
ibd_m_getcapab
};
/*
* Fill/clear <scope> and <p_key> in multicast/broadcast address
*/
#define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \
{ \
*(uint32_t *)((char *)(maddr) + 4) |= \
htonl((uint32_t)(scope) << 16); \
*(uint32_t *)((char *)(maddr) + 8) |= \
htonl((uint32_t)(pkey) << 16); \
}
#define IBD_CLEAR_SCOPE_PKEY(maddr) \
{ \
*(uint32_t *)((char *)(maddr) + 4) &= \
htonl(~((uint32_t)0xF << 16)); \
*(uint32_t *)((char *)(maddr) + 8) &= \
htonl(~((uint32_t)0xFFFF << 16)); \
}
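/*
* As a worked example (assuming link-local scope 0x2 and the default
* full-membership pkey 0xFFFF): an IPv4 broadcast group address that
* comes down with the scope and pkey fields zeroed, i.e. with a GID of
* ff10:401b::ffff:ffff, would read ff12:401b:ffff::ffff:ffff after
* IBD_FILL_SCOPE_PKEY; the scope nibble lands in byte 1 of the GID and
* the pkey in bytes 4-5. IBD_CLEAR_SCOPE_PKEY undoes exactly that.
*/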
/*
* Rudimentary debugging support
*/
#ifdef DEBUG
int ibd_debuglevel = 100;
static void
debug_print(int l, char *fmt, ...)
{
va_list ap;
if (l < ibd_debuglevel)
return;
va_start(ap, fmt);
vcmn_err(CE_CONT, fmt, ap);
va_end(ap);
}
#define DPRINT debug_print
#else
#define DPRINT
#endif
/*
* Common routine to print warning messages; adds in the hca guid, port
* number and pkey so that the IBA interface can be identified.
*/
static void
ibd_print_warn(ibd_state_t *state, char *fmt, ...)
{
ib_guid_t hca_guid;
char ibd_print_buf[256];
int len;
va_list ap;
hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
0, "hca-guid", 0);
len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
"%s%d: HCA GUID %016llx port %d PKEY %02x ",
ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
(u_longlong_t)hca_guid, state->id_port, state->id_pkey);
va_start(ap, fmt);
(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
fmt, ap);
cmn_err(CE_NOTE, "!%s", ibd_print_buf);
va_end(ap);
}
/*
* Warlock directives
*/
/*
* id_lso_lock
*
* state->id_lso->bkt_nfree may be accessed without a lock to
* determine the threshold at which we have to ask the nw layer
* to resume transmission (see ibd_resume_transmission()).
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
ibd_state_t::id_lso))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
/*
* id_cq_poll_lock
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock,
ibd_state_t::id_cq_poll_busy))
/*
* id_txpost_lock
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
ibd_state_t::id_tx_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
ibd_state_t::id_tx_busy))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
ibd_state_t::id_tx_tailp))
/*
* id_rxpost_lock
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
ibd_state_t::id_rx_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
ibd_state_t::id_rx_busy))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
ibd_state_t::id_rx_tailp))
/*
* id_acache_req_lock
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
ibd_state_t::id_acache_req_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
ibd_state_t::id_req_list))
/*
* id_ac_mutex
*
* This mutex is actually supposed to protect id_ah_op as well,
* but this path of the code isn't clean (see update of id_ah_op
* in ibd_async_acache(), immediately after the call to
* ibd_async_mcache()). For now, we'll skip this check by
* declaring that id_ah_op is protected by some internal scheme
* that warlock isn't aware of.
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
ibd_state_t::id_ah_active))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
ibd_state_t::id_ah_free))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
ibd_state_t::id_ah_addr))
_NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
ibd_state_t::id_ah_op))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
ibd_state_t::id_ah_error))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
/*
* id_mc_mutex
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
ibd_state_t::id_mc_full))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
ibd_state_t::id_mc_non))
/*
* id_trap_lock
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
ibd_state_t::id_trap_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
ibd_state_t::id_trap_stop))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
ibd_state_t::id_trap_inprog))
/*
* id_prom_op
*/
_NOTE(SCHEME_PROTECTS_DATA("only by async thread",
ibd_state_t::id_prom_op))
/*
* id_sched_lock
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
ibd_state_t::id_sched_needed))
/*
* id_link_mutex
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
ibd_state_t::id_link_state))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
_NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
ibd_state_t::id_link_speed))
/*
* id_tx_list.dl_mutex
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
ibd_state_t::id_tx_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
ibd_state_t::id_tx_list.dl_tail))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
ibd_state_t::id_tx_list.dl_pending_sends))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
ibd_state_t::id_tx_list.dl_cnt))
/*
* id_rx_list.dl_mutex
*/
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
ibd_state_t::id_rx_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
ibd_state_t::id_rx_list.dl_tail))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
ibd_state_t::id_rx_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
ibd_state_t::id_rx_list.dl_cnt))
/*
* Items protected by atomic updates
*/
_NOTE(SCHEME_PROTECTS_DATA("atomic update only",
ibd_state_s::id_brd_rcv
ibd_state_s::id_brd_xmt
ibd_state_s::id_multi_rcv
ibd_state_s::id_multi_xmt
ibd_state_s::id_num_intrs
ibd_state_s::id_rcv_bytes
ibd_state_s::id_rcv_pkt
ibd_state_s::id_tx_short
ibd_state_s::id_xmt_bytes
ibd_state_s::id_xmt_pkt))
/*
* Non-mutex protection schemes for data elements. Almost all of
* these are non-shared items.
*/
_NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
callb_cpr
ib_gid_s
ib_header_info
ibd_acache_rq
ibd_acache_s::ac_mce
ibd_mcache::mc_fullreap
ibd_mcache::mc_jstate
ibd_mcache::mc_req
ibd_rwqe_s
ibd_swqe_s
ibd_wqe_s
ibt_wr_ds_s::ds_va
ibt_wr_lso_s
ipoib_mac::ipoib_qpn
mac_capab_lso_s
msgb::b_next
msgb::b_rptr
msgb::b_wptr))
int
_init()
{
int status;
/*
* Sanity check some parameter settings. Tx completion polling
* only makes sense with separate CQs for Tx and Rx.
*/
if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
cmn_err(CE_NOTE, "!ibd: %s",
"Setting ibd_txcomp_poll = 0 for combined CQ");
ibd_txcomp_poll = 0;
}
status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
if (status != 0) {
DPRINT(10, "_init:failed in ddi_soft_state_init()");
return (status);
}
mac_init_ops(&ibd_dev_ops, "ibd");
status = mod_install(&ibd_modlinkage);
if (status != 0) {
DPRINT(10, "_init:failed in mod_install()");
ddi_soft_state_fini(&ibd_list);
mac_fini_ops(&ibd_dev_ops);
return (status);
}
#ifdef IBD_LOGGING
ibd_log_init();
#endif
return (0);
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&ibd_modlinkage, modinfop));
}
int
_fini()
{
int status;
status = mod_remove(&ibd_modlinkage);
if (status != 0)
return (status);
mac_fini_ops(&ibd_dev_ops);
ddi_soft_state_fini(&ibd_list);
#ifdef IBD_LOGGING
ibd_log_fini();
#endif
return (0);
}
/*
* Convert the GID part of the mac address from network byte order
* to host order.
*/
static void
ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
{
ib_sn_prefix_t nbopref;
ib_guid_t nboguid;
bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
dgid->gid_prefix = b2h64(nbopref);
dgid->gid_guid = b2h64(nboguid);
}
/*
* Create the IPoIB address in network byte order from host order inputs.
*/
static void
ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
ib_guid_t guid)
{
ib_sn_prefix_t nbopref;
ib_guid_t nboguid;
mac->ipoib_qpn = htonl(qpn);
nbopref = h2b64(prefix);
nboguid = h2b64(guid);
bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
}
/*
* Send to the appropriate all-routers group when the IBA multicast group
* does not exist, based on whether the target group is v4 or v6.
*/
static boolean_t
ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
ipoib_mac_t *rmac)
{
boolean_t retval = B_TRUE;
uint32_t adjscope = state->id_scope << 16;
uint32_t topword;
/*
* Copy the first 4 bytes in without assuming any alignment of
* input mac address; this will have IPoIB signature, flags and
* scope bits.
*/
bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
topword = ntohl(topword);
/*
* Generate proper address for IPv4/v6, adding in the Pkey properly.
*/
if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
(topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
((uint32_t)(state->id_pkey << 16))),
(INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
else
/*
* Does not have proper bits in the mgid address.
*/
retval = B_FALSE;
return (retval);
}
/*
* Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
* the front of the optional src/tgt link layer address. Right now Solaris
* inserts the padding by default at the end; the routine that does this is
* nce_xmit() in ip_ndp.c. It copies the nd_lla_addr right after the
* nd_opt_hdr_t, so when the packet comes down from the IP layer to the IBD
* driver, it is in the following format:
* [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T],
* where OPT_ND_HDR_T is 2 bytes and is followed by [22 bytes of
* ipoib_machdr]. As a result the machdr is not 4 byte aligned and has its
* 2 bytes of padding at the end.
*
* The send routine in the IBD driver rearranges this packet as follows:
* [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
* followed by [22 bytes of ipoib_machdr], resulting in a machdr that is
* 4 byte aligned.
*
* On the receiving side, ibd_process_rx() takes the above packet, removes
* the two bytes of front padding and reinserts them at the end, since the
* IP layer does not understand padding at the front.
*/
#define IBD_PAD_NSNA(ip6h, len, type) { \
uchar_t *nd_lla_ptr; \
icmp6_t *icmp6; \
nd_opt_hdr_t *opt; \
int i; \
\
icmp6 = (icmp6_t *)&ip6h[1]; \
len -= sizeof (nd_neighbor_advert_t); \
if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \
(icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \
(len != 0)) { \
opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \
+ IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
ASSERT(opt != NULL); \
nd_lla_ptr = (uchar_t *)&opt[1]; \
if (type == IBD_SEND) { \
for (i = IPOIB_ADDRL; i > 0; i--) \
*(nd_lla_ptr + i + 1) = \
*(nd_lla_ptr + i - 1); \
} else { \
for (i = 0; i < IPOIB_ADDRL; i++) \
*(nd_lla_ptr + i) = \
*(nd_lla_ptr + i + 2); \
} \
*(nd_lla_ptr + i) = 0; \
*(nd_lla_ptr + i + 1) = 0; \
} \
}
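/*
* A small worked example of the shift above (assuming IPOIB_ADDRL is the
* 20-byte IPoIB hardware address length): on the send side the loop
* copies lla bytes 0..19 up to offsets 2..21 and then zeroes bytes 0 and
* 1, i.e. the two bytes of padding move from the tail of the address to
* its front. On the receive side the loop copies bytes 2..21 back down
* to offsets 0..19 and zeroes bytes 20 and 21, restoring the trailing
* padding that the IP layer expects.
*/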
/*
* Address handle entries maintained by the driver are kept in the
* free and active lists. Each entry starts out in the free list;
* it migrates to the active list when primed using ibt_get_paths()
* and ibt_modify_ud_dest() for transmission to a specific destination.
* In the active list, the entry has a reference count indicating the
* number of ongoing/uncompleted transmits that reference it. The
* entry is left in the active list even after the reference count
* goes to 0, since successive transmits can find it there and do
* not need to set up another entry (ie the path information is
* cached using the active list). Entries on the active list are
* also hashed using the destination link address as a key for faster
* lookups during transmits.
*
* For any destination address (unicast or multicast, whatever the
* join states), there will be at most one entry in the active list.
* Entries with a 0 reference count on the active list can be reused
* for a transmit to a new destination, if the free list is empty.
*
* The AH free list insertion/deletion is protected with the id_ac_mutex,
* since the async thread and Tx callback handlers insert/delete. The
* active list does not need a lock (all operations are done by the
* async thread) but updates to the reference count are atomically
* done (increments done by Tx path, decrements by the Tx callback handler).
*/
#define IBD_ACACHE_INSERT_FREE(state, ce) \
list_insert_head(&state->id_ah_free, ce)
#define IBD_ACACHE_GET_FREE(state) \
list_get_head(&state->id_ah_free)
#define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \
int _ret_; \
list_insert_head(&state->id_ah_active, ce); \
_ret_ = mod_hash_insert(state->id_ah_active_hash, \
(mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \
ASSERT(_ret_ == 0); \
}
#define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \
list_remove(&state->id_ah_active, ce); \
(void) mod_hash_remove(state->id_ah_active_hash, \
(mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \
}
#define IBD_ACACHE_GET_ACTIVE(state) \
list_get_head(&state->id_ah_active)
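/*
* In outline, an entry's lifecycle is: the async thread pulls it with
* IBD_ACACHE_GET_FREE(), primes ac_dest via ibt_get_paths() and
* ibt_modify_ud_dest() (see ibd_async_acache() below), and then does
* IBD_ACACHE_INSERT_ACTIVE(); the Tx fastpath subsequently finds it
* through the mod_hash lookup in ibd_acache_find(). When the free list
* runs dry, ibd_acache_get_unref() reclaims a zero-reference entry from
* the active list for reuse.
*/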
/*
* Membership states for different mcg's are tracked by two lists:
* the "non" list is used for promiscuous mode, when all mcg traffic
* needs to be inspected. This type of membership is never used for
* transmission, so there can not be an AH in the active list
* corresponding to a member in this list. This list does not need
* any protection, since all operations are performed by the async
* thread.
*
* "Full" and "SendOnly" membership is tracked using a single list,
* the "full" list. This is because this single list can then be
* searched during transmit to a multicast group (if an AH for the
* mcg is not found in the active list), since at least one type
* of membership must be present before initiating the transmit.
* This list is also emptied during driver detach, since sendonly
* membership acquired during transmit is dropped at detach time
* along with ipv4 broadcast full membership. Inserts/deletes to
* this list are done only by the async thread, but it is also
* searched in program context (see multicast disable case), thus
* the id_mc_mutex protects the list. The driver detach path also
* deconstructs the "full" list, but it ensures that the async
* thread will not be accessing the list (by blocking out mcg
* trap handling and making sure no more Tx reaping will happen).
*
* Currently, an IBA attach is done in the SendOnly case too,
* although this is not required.
*/
#define IBD_MCACHE_INSERT_FULL(state, mce) \
list_insert_head(&state->id_mc_full, mce)
#define IBD_MCACHE_INSERT_NON(state, mce) \
list_insert_head(&state->id_mc_non, mce)
#define IBD_MCACHE_FIND_FULL(state, mgid) \
ibd_mcache_find(mgid, &state->id_mc_full)
#define IBD_MCACHE_FIND_NON(state, mgid) \
ibd_mcache_find(mgid, &state->id_mc_non)
#define IBD_MCACHE_PULLOUT_FULL(state, mce) \
list_remove(&state->id_mc_full, mce)
#define IBD_MCACHE_PULLOUT_NON(state, mce) \
list_remove(&state->id_mc_non, mce)
/*
* AH and MCE active list manipulation:
*
* Multicast disable requests and MCG delete traps are two cases
* where the active AH entry for the mcg (if any unreferenced one exists)
* will be moved to the free list (to force the next Tx to the mcg to
* join the MCG in SendOnly mode). Port up handling will also move AHs
* from active to free list.
*
* In the case when some transmits are still pending on an entry
* for an mcg, but a multicast disable has already been issued on the
* mcg, there are some options to consider to preserve the join state
* to ensure the emitted packet is properly routed on the IBA fabric.
* For the AH, we can
* 1. take out of active list at multicast disable time.
* 2. take out of active list only when last pending Tx completes.
* For the MCE, we can
* 3. take out of active list at multicast disable time.
* 4. take out of active list only when last pending Tx completes.
* 5. move from active list to stale list at multicast disable time.
* We choose to use 2,4. We use option 4 so that if a multicast enable
* is tried before the pending Tx completes, the enable code finds the
* mce in the active list and just has to make sure it will not be reaped
* (ie the mcg leave done) when the pending Tx does complete. Alternatively,
* a stale list (#5) that would be checked in the enable code would need
* to be implemented. Option 2 is used, because otherwise, a Tx attempt
* after the multicast disable would try to put an AH in the active list,
* and associate the mce it finds in the active list to this new AH,
* whereas the mce is already associated with the previous AH (taken off
* the active list), and will be removed once the pending Tx's complete
* (unless a reference count on mce's is implemented). One implication of
* using 2,4 is that new Tx's posted before the pending Tx's complete will
* grab new references on the AH, further delaying the leave.
*
* In the case of mcg delete (or create) trap when the port is sendonly
* joined, the AH and MCE handling is different: the AH and MCE has to be
* immediately taken off the active lists (forcing a join and path lookup
* at the next Tx is the only guaranteed means of ensuring a proper Tx
* to an mcg as it is repeatedly created and deleted and goes through
* reincarnations).
*
* When a port is already sendonly joined, and a multicast enable is
* attempted, the same mce structure is promoted; this ensures only a
* single mce on the active list tracks the most powerful join state.
*
* In the case of port up event handling, the MCE for sendonly membership
* is freed up, and the ACE is put into the free list as soon as possible
* (depending on whether posted Tx's have completed). For fullmembership
* MCE's though, the ACE is similarly handled; but the MCE is kept around
* (a re-JOIN is attempted) only if the DLPI leave has not already been
* done; else the mce is deconstructed (mc_fullreap case).
*
* MCG creation and deletion trap handling:
*
* These traps are unreliable (meaning sometimes the trap might never
* be delivered to the subscribed nodes) and may arrive out-of-order
* since they use UD transport. An alternative to relying on these
* unreliable traps is to poll for mcg presence every so often, but
* instead of doing that, we try to be as conservative as possible
* while handling the traps, and hope that the traps do arrive at
* the subscribed nodes soon. Note that if a node is fullmember
* joined to an mcg, it can not possibly receive a mcg create/delete
* trap for that mcg (by fullmember definition); if it does, it is
* an old trap from a previous incarnation of the mcg.
*
* Whenever a trap is received, the driver cleans up its sendonly
* membership to the group; we choose to do a sendonly leave even
* on a creation trap to handle the case of a prior deletion of the mcg
* having gone unnoticed. Consider an example scenario:
* T1: MCG M is deleted, and fires off deletion trap D1.
* T2: MCG M is recreated, fires off creation trap C1, which is lost.
* T3: Node N tries to transmit to M, joining in sendonly mode.
* T4: MCG M is deleted, and fires off deletion trap D2.
* T5: N receives a deletion trap, but can not distinguish D1 from D2.
* If the trap is D2, then a LEAVE is not required, since the mcg
* is already deleted; but if it is D1, a LEAVE is required. A safe
* approach is to always LEAVE, but the SM may be confused if it
* receives a LEAVE without a prior JOIN.
*
* Management of the non-membership to an mcg is similar to the above,
* except that if the interface is in promiscuous mode, it is required
* to attempt to re-join the mcg after receiving a trap. Unfortunately,
* if the re-join attempt fails (in which case a warning message needs
* to be printed), it is not clear whether it failed due to the mcg not
* existing, or some fabric/hca issues, due to the delayed nature of
* trap delivery. Querying the SA to establish presence/absence of the
* mcg is also racy at best. Thus, the driver just prints a warning
* message when it can not rejoin after receiving a create trap, although
* this might be (on rare occasions) a mis-warning if the create trap is
* received after the mcg was deleted.
*/
/*
* Implementation of atomic "recycle" bits and reference count
* on address handles. This utilizes the fact that max reference
* count on any handle is limited by number of send wqes, thus
* high bits in the ac_ref field can be used as the recycle bits,
* and only the low bits hold the number of pending Tx requests.
* This atomic AH reference counting allows the Tx completion
* handler not to acquire the id_ac_mutex to process every completion,
* thus reducing lock contention problems between completion and
* the Tx path.
*/
#define CYCLEVAL 0x80000
#define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0
#define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define GET_REF(ace) ((ace)->ac_ref)
#define GET_REF_CYCLE(ace) ( \
/* \
* Make sure "cycle" bit is set. \
*/ \
ASSERT(CYCLE_SET(ace)), \
((ace)->ac_ref & ~(CYCLEVAL)) \
)
#define INC_REF(ace, num) { \
atomic_add_32(&(ace)->ac_ref, num); \
}
#define SET_CYCLE_IF_REF(ace) ( \
CYCLE_SET(ace) ? B_TRUE : \
atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \
CYCLEVAL ? \
/* \
* Clear the "cycle" bit we just set; \
* ref count known to be 0 from above. \
*/ \
CLEAR_REFCYCLE(ace), B_FALSE : \
/* \
* We set "cycle" bit; let caller know. \
*/ \
B_TRUE \
)
#define DEC_REF_DO_CYCLE(ace) ( \
atomic_add_32_nv(&ace->ac_ref, -1) == \
CYCLEVAL ? \
/* \
* Ref count known to be 0 from above. \
*/ \
B_TRUE : \
B_FALSE \
)
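/*
* Roughly how these macros are used (see ibd_send() and ibd_tx_cleanup()
* for the real code): the Tx path grabs a reference with INC_REF()
* through ibd_acache_lookup(), posts the send, and the Tx completion
* path later calls DEC_REF_DO_CYCLE(); a B_TRUE return there means the
* entry had been marked for recycling and the last pending Tx just
* completed, so the cleanup path grabs id_ac_mutex and arranges for the
* entry to end up on the free list. Since CYCLEVAL (0x80000) exceeds any
* possible number of outstanding send wqes, reference counting alone can
* never set the cycle bit.
*/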
static void *
list_get_head(list_t *list)
{
list_node_t *lhead = list_head(list);
if (lhead != NULL)
list_remove(list, lhead);
return (lhead);
}
/*
* This is always guaranteed to be able to queue the work.
*/
static void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
/* Initialize request */
DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
ptr->rq_op = op;
/*
* Queue provided slot onto request pool.
*/
mutex_enter(&state->id_acache_req_lock);
list_insert_tail(&state->id_req_list, ptr);
/* Go, fetch, async thread */
cv_signal(&state->id_acache_req_cv);
mutex_exit(&state->id_acache_req_lock);
}
/*
* Main body of the per interface async thread.
*/
static void
ibd_async_work(ibd_state_t *state)
{
ibd_req_t *ptr;
callb_cpr_t cprinfo;
mutex_enter(&state->id_acache_req_lock);
CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
callb_generic_cpr, "ibd_async_work");
for (;;) {
ptr = list_get_head(&state->id_req_list);
if (ptr != NULL) {
mutex_exit(&state->id_acache_req_lock);
/*
* Once we have done the operation, there is no
* guarantee the request slot is still valid; it
* might have been freed up (as in IBD_ASYNC_LEAVE, REAP,
* TRAP).
*
* Perform the request.
*/
switch (ptr->rq_op) {
case IBD_ASYNC_GETAH:
ibd_async_acache(state, &ptr->rq_mac);
break;
case IBD_ASYNC_JOIN:
case IBD_ASYNC_LEAVE:
ibd_async_multicast(state,
ptr->rq_gid, ptr->rq_op);
break;
case IBD_ASYNC_PROMON:
ibd_async_setprom(state);
break;
case IBD_ASYNC_PROMOFF:
ibd_async_unsetprom(state);
break;
case IBD_ASYNC_REAP:
ibd_async_reap_group(state,
ptr->rq_ptr, ptr->rq_gid,
IB_MC_JSTATE_FULL);
/*
* the req buf is contained in the mce
* structure, so we do not need
* to free it here.
*/
ptr = NULL;
break;
case IBD_ASYNC_TRAP:
ibd_async_trap(state, ptr);
break;
case IBD_ASYNC_SCHED:
ibd_async_txsched(state);
break;
case IBD_ASYNC_LINK:
ibd_async_link(state, ptr);
break;
case IBD_ASYNC_EXIT:
mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
CALLB_CPR_EXIT(&cprinfo);
#else
mutex_exit(&state->id_acache_req_lock);
#endif
return;
}
if (ptr != NULL)
kmem_cache_free(state->id_req_kmc, ptr);
mutex_enter(&state->id_acache_req_lock);
} else {
#ifndef __lock_lint
/*
* Nothing to do: wait till new request arrives.
*/
CALLB_CPR_SAFE_BEGIN(&cprinfo);
cv_wait(&state->id_acache_req_cv,
&state->id_acache_req_lock);
CALLB_CPR_SAFE_END(&cprinfo,
&state->id_acache_req_lock);
#endif
}
}
/*NOTREACHED*/
_NOTE(NOT_REACHED)
}
/*
* Return when it is safe to queue requests to the async daemon; primarily
* for subnet trap and async event handling. Disallow requests before the
* daemon is created, and when interface deinitialization starts.
*/
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
mutex_enter(&state->id_trap_lock);
if (state->id_trap_stop) {
mutex_exit(&state->id_trap_lock);
return (B_FALSE);
}
state->id_trap_inprog++;
mutex_exit(&state->id_trap_lock);
return (B_TRUE);
}
/*
* Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
* trap or event handling to complete to kill the async thread and deconstruct
* the mcg/ace list.
*/
static void
ibd_async_done(ibd_state_t *state)
{
mutex_enter(&state->id_trap_lock);
if (--state->id_trap_inprog == 0)
cv_signal(&state->id_trap_cv);
mutex_exit(&state->id_trap_lock);
}
/*
* Hash functions:
* ibd_hash_by_id: Returns the qpn as the hash value for the bucket.
* ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
* These operate on mac addresses input into ibd_send, but there is no
* guarantee on the alignment of the ipoib_mac_t structure.
*/
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
ulong_t ptraddr = (ulong_t)key;
uint_t hval;
/*
* If the input address is 4 byte aligned, we can just dereference
* it. This is most common, since IP will send in a 4 byte aligned
* IP header, which implies the 24 byte IPoIB pseudo header will be
* 4 byte aligned too.
*/
if ((ptraddr & 3) == 0)
return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
return (hval);
}
static int
ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
return (0);
else
return (1);
}
/*
* Initialize all the per interface caches and lists; AH cache,
* MCG list etc.
*/
static int
ibd_acache_init(ibd_state_t *state)
{
ibd_ace_t *ce;
int i;
mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
list_create(&state->id_ah_free, sizeof (ibd_ace_t),
offsetof(ibd_ace_t, ac_list));
list_create(&state->id_ah_active, sizeof (ibd_ace_t),
offsetof(ibd_ace_t, ac_list));
state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
list_create(&state->id_mc_full, sizeof (ibd_mce_t),
offsetof(ibd_mce_t, mc_list));
list_create(&state->id_mc_non, sizeof (ibd_mce_t),
offsetof(ibd_mce_t, mc_list));
list_create(&state->id_req_list, sizeof (ibd_req_t),
offsetof(ibd_req_t, rq_list));
state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
IBD_NUM_AH, KM_SLEEP);
for (i = 0; i < IBD_NUM_AH; i++, ce++) {
if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
ibd_acache_fini(state);
return (DDI_FAILURE);
} else {
CLEAR_REFCYCLE(ce);
ce->ac_mce = NULL;
IBD_ACACHE_INSERT_FREE(state, ce);
}
}
return (DDI_SUCCESS);
}
static void
ibd_acache_fini(ibd_state_t *state)
{
ibd_ace_t *ptr;
mutex_enter(&state->id_ac_mutex);
while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
ASSERT(GET_REF(ptr) == 0);
(void) ibt_free_ud_dest(ptr->ac_dest);
}
while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
ASSERT(GET_REF(ptr) == 0);
(void) ibt_free_ud_dest(ptr->ac_dest);
}
list_destroy(&state->id_ah_free);
list_destroy(&state->id_ah_active);
list_destroy(&state->id_mc_full);
list_destroy(&state->id_mc_non);
list_destroy(&state->id_req_list);
kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
mutex_exit(&state->id_ac_mutex);
mutex_destroy(&state->id_ac_mutex);
mutex_destroy(&state->id_mc_mutex);
mutex_destroy(&state->id_acache_req_lock);
cv_destroy(&state->id_acache_req_cv);
}
/*
* Search AH active hash list for a cached path to input destination.
* If we are "just looking", hold == F. When we are in the Tx path,
* we set hold == T to grab a reference on the AH so that it can not
* be recycled to a new destination while the Tx request is posted.
*/
static ibd_ace_t *
ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
{
ibd_ace_t *ptr;
ASSERT(mutex_owned(&state->id_ac_mutex));
/*
* Do hash search.
*/
if (mod_hash_find(state->id_ah_active_hash,
(mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
if (hold)
INC_REF(ptr, num);
return (ptr);
}
return (NULL);
}
/*
* This is called by the tx side; if an initialized AH is found in
* the active list, it is locked down and can be used; if no entry
* is found, an async request is queued to do path resolution.
*/
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
ibd_ace_t *ptr;
ibd_req_t *req;
/*
* Only attempt to print when we can; in the mdt pattr case, the
* address is not aligned properly.
*/
if (((ulong_t)mac & 3) == 0) {
DPRINT(4,
"ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
htonl(mac->ipoib_gidsuff[1]));
}
mutex_enter(&state->id_ac_mutex);
if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
mutex_exit(&state->id_ac_mutex);
return (ptr);
}
/*
* Implementation of a single outstanding async request; if
* the operation is not started yet, queue a request and move
* to ongoing state. Remember in id_ah_addr for which address
* we are queueing the request, in case we need to flag an error;
* Any further requests, for the same or a different address, until
* the operation completes, are sent back to GLDv3 to be retried.
* The async thread will update id_ah_op with an error indication
* or will set it to indicate the next look up can start; either
* way, it will mac_tx_update() so that all blocked requests come
* back here.
*/
*err = EAGAIN;
if (state->id_ah_op == IBD_OP_NOTSTARTED) {
req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
if (req != NULL) {
/*
* We did not even find the entry; queue a request
* for it.
*/
bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
state->id_ah_op = IBD_OP_ONGOING;
bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
}
} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
(bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
/*
* Check the status of the pathrecord lookup request
* we had queued before.
*/
if (state->id_ah_op == IBD_OP_ERRORED) {
*err = EFAULT;
state->id_ah_error++;
} else {
/*
* IBD_OP_ROUTERED case: We need to send to the
* all-router MCG. If we can find the AH for
* the mcg, the Tx will be attempted. If we
* do not find the AH, we return NORESOURCES
* to retry.
*/
ipoib_mac_t routermac;
(void) ibd_get_allroutergroup(state, mac, &routermac);
ptr = ibd_acache_find(state, &routermac, B_TRUE,
numwqe);
}
state->id_ah_op = IBD_OP_NOTSTARTED;
} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
(bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
/*
* This case can happen when we get a higher band
* packet. The easiest way is to reset the state machine
* to accommodate the higher priority packet.
*/
state->id_ah_op = IBD_OP_NOTSTARTED;
}
mutex_exit(&state->id_ac_mutex);
return (ptr);
}
/*
* Grab a not-currently-in-use AH/PathRecord from the active
* list to recycle to a new destination. Only the async thread
* executes this code.
*/
static ibd_ace_t *
ibd_acache_get_unref(ibd_state_t *state)
{
ibd_ace_t *ptr = list_head(&state->id_ah_active);
ASSERT(mutex_owned(&state->id_ac_mutex));
/*
* Do plain linear search.
*/
while (ptr != NULL) {
/*
* Note that it is possible that the "cycle" bit
* is set on the AH w/o any reference count. The
* mcg must have been deleted, and the tx cleanup
* just decremented the reference count to 0, but
* hasn't gotten around to grabbing the id_ac_mutex
* to move the AH into the free list.
*/
if (GET_REF(ptr) == 0) {
IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
break;
}
ptr = list_next(&state->id_ah_active, ptr);
}
return (ptr);
}
/*
* Invoked to clean up AH from active list in case of multicast
* disable and to handle sendonly memberships during mcg traps.
* It is also used during port up processing for multicast and unicast AHs.
* Normally, the AH is taken off the active list, and put into
* the free list to be recycled for a new destination. In case
* Tx requests on the AH have not completed yet, the AH is marked
* for reaping (which will put the AH on the free list) once the Tx's
* complete; in this case, depending on the "force" input, we take
* out the AH from the active list right now, or leave it also for
* the reap operation. Returns TRUE if the AH is taken off the active
* list (and either put into the free list right now, or arranged for
* later), FALSE otherwise.
*/
static boolean_t
ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
{
ibd_ace_t *acactive;
boolean_t ret = B_TRUE;
ASSERT(mutex_owned(&state->id_ac_mutex));
if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
/*
* Note that the AH might already have the cycle bit set
* on it; this might happen if sequences of multicast
* enables and disables are coming so fast that posted
* Tx's to the mcg have not completed yet, and the cycle
* bit is set successively by each multicast disable.
*/
if (SET_CYCLE_IF_REF(acactive)) {
if (!force) {
/*
* The ace is kept on the active list, further
* Tx's can still grab a reference on it; the
* ace is reaped when all pending Tx's
* referencing the AH complete.
*/
ret = B_FALSE;
} else {
/*
* In the mcg trap case, we always pull the
* AH from the active list. And also the port
* up multi/unicast case.
*/
IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
acactive->ac_mce = NULL;
}
} else {
/*
* The ref count is known to be 0, so reclaim the
* ace immediately after pulling it out of
* the active list.
*/
IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
acactive->ac_mce = NULL;
IBD_ACACHE_INSERT_FREE(state, acactive);
}
}
return (ret);
}
/*
* Helper function for async path record lookup. If we are trying to
* Tx to a MCG, check our membership, possibly trying to join the
* group if required. If that fails, try to send the packet to the
* all router group (indicated by the redirect output), pointing
* the input mac address to the router mcg address.
*/
static ibd_mce_t *
ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
{
ib_gid_t mgid;
ibd_mce_t *mce;
ipoib_mac_t routermac;
*redirect = B_FALSE;
ibd_n2h_gid(mac, &mgid);
/*
* Check the FullMember+SendOnlyNonMember list.
* Since we are the only one who manipulates the
* id_mc_full list, no locks are needed.
*/
mce = IBD_MCACHE_FIND_FULL(state, mgid);
if (mce != NULL) {
DPRINT(4, "ibd_async_mcache : already joined to group");
return (mce);
}
/*
* Not found; try to join(SendOnlyNonMember) and attach.
*/
DPRINT(4, "ibd_async_mcache : not joined to group");
if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
NULL) {
DPRINT(4, "ibd_async_mcache : nonmem joined to group");
return (mce);
}
/*
* MCGroup not present; try to join the all-router group. If
* any of the following steps succeed, we will be redirecting
* to the all router group.
*/
DPRINT(4, "ibd_async_mcache : nonmem join failed");
if (!ibd_get_allroutergroup(state, mac, &routermac))
return (NULL);
*redirect = B_TRUE;
ibd_n2h_gid(&routermac, &mgid);
bcopy(&routermac, mac, IPOIB_ADDRL);
DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
mgid.gid_prefix, mgid.gid_guid);
/*
* Are we already joined to the router group?
*/
if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
DPRINT(4, "ibd_async_mcache : using already joined router"
"group\n");
return (mce);
}
/*
* Can we join(SendOnlyNonMember) the router group?
*/
DPRINT(4, "ibd_async_mcache : attempting join to router grp");
if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
NULL) {
DPRINT(4, "ibd_async_mcache : joined to router grp");
return (mce);
}
return (NULL);
}
/*
* Async path record lookup code.
*/
static void
ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
{
ibd_ace_t *ce;
ibd_mce_t *mce = NULL;
ibt_path_attr_t path_attr;
ibt_path_info_t path_info;
ib_gid_t destgid;
char ret = IBD_OP_NOTSTARTED;
DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X",
htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
htonl(mac->ipoib_gidsuff[1]));
/*
* Check whether we are trying to transmit to a MCG.
* In that case, we need to make sure we are a member of
* the MCG.
*/
if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
boolean_t redirected;
/*
* If we can not find or join the group or even
* redirect, error out.
*/
if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
NULL) {
state->id_ah_op = IBD_OP_ERRORED;
return;
}
/*
* If we got redirected, we need to determine whether
* the AH for the new mcg is already in the cache, in which
* case we need not pull it in again; otherwise proceed to get the
* path for the new mcg. There is no guarantee that
* if the AH is currently in the cache, it will still be
* there when we look in ibd_acache_lookup(), but that's
* okay, we will come back here.
*/
if (redirected) {
ret = IBD_OP_ROUTERED;
DPRINT(4, "ibd_async_acache : redirected to "
"%08X:%08X:%08X:%08X:%08X",
htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
htonl(mac->ipoib_gidpref[1]),
htonl(mac->ipoib_gidsuff[0]),
htonl(mac->ipoib_gidsuff[1]));
mutex_enter(&state->id_ac_mutex);
if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
state->id_ah_op = IBD_OP_ROUTERED;
mutex_exit(&state->id_ac_mutex);
DPRINT(4, "ibd_async_acache : router AH found");
return;
}
mutex_exit(&state->id_ac_mutex);
}
}
/*
* Get an AH from the free list.
*/
mutex_enter(&state->id_ac_mutex);
if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
/*
* No free ones; try to grab an unreferenced active
* one. Maybe we need to make the active list LRU,
* but that will create more work for Tx callbacks.
* Is there a way of not having to pull out the
* entry from the active list, but just indicate it
* is being recycled? Yes, but that creates one more
* check in the fast lookup path.
*/
if ((ce = ibd_acache_get_unref(state)) == NULL) {
/*
* Pretty serious shortage now.
*/
state->id_ah_op = IBD_OP_NOTSTARTED;
mutex_exit(&state->id_ac_mutex);
DPRINT(10, "ibd_async_acache : failed to find AH "
"slot\n");
return;
}
/*
* We could check whether ac_mce points to a SendOnly
* member and drop that membership now. Or do it lazily
* at detach time.
*/
ce->ac_mce = NULL;
}
mutex_exit(&state->id_ac_mutex);
ASSERT(ce->ac_mce == NULL);
/*
* Update the entry.
*/
bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
bzero(&path_info, sizeof (path_info));
bzero(&path_attr, sizeof (ibt_path_attr_t));
path_attr.pa_sgid = state->id_sgid;
path_attr.pa_num_dgids = 1;
ibd_n2h_gid(&ce->ac_mac, &destgid);
path_attr.pa_dgids = &destgid;
path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
&path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
goto error;
}
if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
ntohl(ce->ac_mac.ipoib_qpn),
&path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
goto error;
}
/*
* mce is set whenever an AH is being associated with a
* MCG; this will come in handy when we leave the MCG. The
* lock protects Tx fastpath from scanning the active list.
*/
if (mce != NULL)
ce->ac_mce = mce;
mutex_enter(&state->id_ac_mutex);
IBD_ACACHE_INSERT_ACTIVE(state, ce);
state->id_ah_op = ret;
mutex_exit(&state->id_ac_mutex);
return;
error:
/*
* We might want to drop SendOnly membership here if we
* joined above. The lock protects Tx callbacks inserting
* into the free list.
*/
mutex_enter(&state->id_ac_mutex);
state->id_ah_op = IBD_OP_ERRORED;
IBD_ACACHE_INSERT_FREE(state, ce);
mutex_exit(&state->id_ac_mutex);
}
/*
* While restoring the port's presence on the subnet after a port up, it is
* possible that the port goes down again.
*/
static void
ibd_async_link(ibd_state_t *state, ibd_req_t *req)
{
ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
LINK_STATE_UP;
ibd_mce_t *mce, *pmce;
ibd_ace_t *ace, *pace;
DPRINT(10, "ibd_async_link(): %d", opcode);
/*
* On a link up, revalidate the link speed/width. No point doing
* this on a link down, since we will be unable to do SA operations,
* defaulting to the lowest speed. Also notice that we update our
* notion of speed before calling mac_link_update(), which will do the
* necessary higher level notifications for speed changes.
*/
if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
state->id_link_speed = ibd_get_portspeed(state);
_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
}
/*
* Do all the work required to establish our presence on
* the subnet.
*/
if (opcode == IBD_LINK_UP_ABSENT) {
/*
* If in promiscuous mode ...
*/
if (state->id_prom_op == IBD_OP_COMPLETED) {
/*
* Drop all nonmembership.
*/
ibd_async_unsetprom(state);
/*
* Then, try to regain nonmembership to all mcg's.
*/
ibd_async_setprom(state);
}
/*
* Drop all sendonly membership (which also gets rid of the
* AHs); try to reacquire all full membership.
*/
mce = list_head(&state->id_mc_full);
while ((pmce = mce) != NULL) {
mce = list_next(&state->id_mc_full, mce);
if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
ibd_leave_group(state,
pmce->mc_info.mc_adds_vect.av_dgid,
IB_MC_JSTATE_SEND_ONLY_NON);
else
ibd_reacquire_group(state, pmce);
}
/*
* Recycle all active AHs to free list (and if there are
* pending posts, make sure they will go into the free list
* once the Tx's complete). Grab the lock to prevent
* concurrent Tx's as well as Tx cleanups.
*/
mutex_enter(&state->id_ac_mutex);
ace = list_head(&state->id_ah_active);
while ((pace = ace) != NULL) {
boolean_t cycled;
ace = list_next(&state->id_ah_active, ace);
mce = pace->ac_mce;
cycled = ibd_acache_recycle(state, &pace->ac_mac,
B_TRUE);
/*
* If this is for an mcg, it must be for a fullmember,
* since we got rid of send-only members above when
* processing the mce list.
*/
ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
IB_MC_JSTATE_FULL)));
/*
* Check if the fullmember mce needs to be torn down,
* ie whether the DLPI disable has already been done.
* If so, do some of the work of tx_cleanup, namely
* causing leave (which will fail), detach and
* mce-freeing. tx_cleanup will put the AH into free
* list. The reason to duplicate some of this
* tx_cleanup work is because we want to delete the
* AH right now instead of waiting for tx_cleanup, to
* force subsequent Tx's to reacquire an AH.
*/
if ((mce != NULL) && (mce->mc_fullreap))
ibd_async_reap_group(state, mce,
mce->mc_info.mc_adds_vect.av_dgid,
mce->mc_jstate);
}
mutex_exit(&state->id_ac_mutex);
}
/*
* mac handle is guaranteed to exist since driver does ibt_close_hca()
* (which stops further events from being delivered) before
* mac_unregister(). At this point, it is guaranteed that mac_register
* has already been done.
*/
mutex_enter(&state->id_link_mutex);
state->id_link_state = lstate;
mac_link_update(state->id_mh, lstate);
mutex_exit(&state->id_link_mutex);
ibd_async_done(state);
}
/*
* Check the pkey table to see if we can find the pkey we're looking for.
* Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
* failure.
*/
static int
ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
uint16_t *pkix)
{
uint16_t ndx;
ASSERT(pkix != NULL);
for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
if (pkey_tbl[ndx] == pkey) {
*pkix = ndx;
return (0);
}
}
return (-1);
}
/*
* When the link is notified up, we need to do a few things, based
* on the port's current p_init_type_reply claiming a reinit has been
* done or not. The reinit steps are:
* 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
* the old Pkey and GID0 are correct.
* 2. Register for mcg traps (already done by ibmf).
* 3. If PreservePresenceReply indicates the SM has restored the port's
*    presence in the subnet, nothing more to do. Else go to the next steps
*    (on the async daemon).
* 4. Give up all sendonly memberships.
* 5. Acquire all full memberships.
* 6. In promiscuous mode, acquire all non memberships.
* 7. Recycle all AHs to free list.
*/
static void
ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
{
ibt_hca_portinfo_t *port_infop = NULL;
ibt_status_t ibt_status;
uint_t psize, port_infosz;
ibd_link_op_t opcode;
ibd_req_t *req;
link_state_t new_link_state = LINK_STATE_UP;
uint8_t itreply;
uint16_t pkix;
int ret;
/*
* Let's not race with a plumb or an unplumb; if we detect a
* pkey relocation event later on here, we may have to restart.
*/
ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
mutex_enter(&state->id_link_mutex);
/*
* If the init code in ibd_m_start hasn't yet set up the
* pkey/gid, nothing to do; that code will set the link state.
*/
if (state->id_link_state == LINK_STATE_UNKNOWN) {
mutex_exit(&state->id_link_mutex);
goto link_mod_return;
}
/*
* If this routine was called in response to a port down event,
* we just need to see whether the link state change should be reported.
*/
if (code == IBT_ERROR_PORT_DOWN) {
new_link_state = LINK_STATE_DOWN;
goto update_link_state;
}
/*
* If it's not a port down event we've received, try to get the port
* attributes first. If we fail here, the port is as good as down.
* Otherwise, if the link went down by the time the handler gets
* here, give up - we cannot even validate the pkey/gid since those
* are not valid and this is as bad as a port down anyway.
*/
ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
&port_infop, &psize, &port_infosz);
if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
(port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
new_link_state = LINK_STATE_DOWN;
goto update_link_state;
}
/*
* Check the SM InitTypeReply flags. If both NoLoadReply and
* PreserveContentReply are 0, we don't know anything about the
* data loaded into the port attributes, so we need to verify
* if gid0 and pkey are still valid.
*/
itreply = port_infop->p_init_type_reply;
if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
/*
* Check to see if the subnet part of GID0 has changed. If
* not, check the simple case first to see if the pkey
* index is the same as before; finally check to see if the
* pkey has been relocated to a different index in the table.
*/
if (bcmp(port_infop->p_sgid_tbl,
&state->id_sgid, sizeof (ib_gid_t)) != 0) {
new_link_state = LINK_STATE_DOWN;
} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
state->id_pkey) {
new_link_state = LINK_STATE_UP;
} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
ibt_free_portinfo(port_infop, port_infosz);
mutex_exit(&state->id_link_mutex);
/*
* Currently a restart is required if our pkey has moved
* in the pkey table. If we get the ibt_recycle_ud() to
* work as documented (expected), we may be able to
* avoid a complete restart. Note that we've already
* marked both the start and stop 'in-progress' flags,
* so it is ok to go ahead and do this restart.
*/
ibd_undo_start(state, LINK_STATE_DOWN);
if ((ret = ibd_start(state)) != 0) {
DPRINT(10, "ibd_restart: cannot restart, "
"ret=%d", ret);
}
goto link_mod_return;
} else {
new_link_state = LINK_STATE_DOWN;
}
}
update_link_state:
if (port_infop) {
ibt_free_portinfo(port_infop, port_infosz);
}
/*
* If the old state is the same as the new state, nothing to do
*/
if (state->id_link_state == new_link_state) {
mutex_exit(&state->id_link_mutex);
goto link_mod_return;
}
/*
* Ok, so there was a link state change; see if it's safe to ask
* the async thread to do the work
*/
if (!ibd_async_safe(state)) {
state->id_link_state = new_link_state;
mutex_exit(&state->id_link_mutex);
goto link_mod_return;
}
mutex_exit(&state->id_link_mutex);
/*
* If we're reporting a link up, check InitTypeReply to see if
* the SM has ensured that the port's presence in mcg, traps,
* etc. is intact.
*/
if (new_link_state == LINK_STATE_DOWN) {
opcode = IBD_LINK_DOWN;
} else {
if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
opcode = IBD_LINK_UP;
} else {
opcode = IBD_LINK_UP_ABSENT;
}
}
/*
* Queue up a request for ibd_async_link() to handle this link
* state change event
*/
req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
req->rq_ptr = (void *)opcode;
ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
link_mod_return:
ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
}
/*
* For the port up/down events, IBTL guarantees there will not be concurrent
* invocations of the handler. IBTL might coalesce link transition events,
* and not invoke the handler for _each_ up/down transition, but it will
* invoke the handler with the last known state.
*/
static void
ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
ibt_async_code_t code, ibt_async_event_t *event)
{
ibd_state_t *state = (ibd_state_t *)clnt_private;
switch (code) {
case IBT_ERROR_CATASTROPHIC_CHAN:
ibd_print_warn(state, "catastrophic channel error");
break;
case IBT_ERROR_CQ:
ibd_print_warn(state, "completion queue error");
break;
case IBT_PORT_CHANGE_EVENT:
/*
* Events will be delivered to all instances that have
* done ibt_open_hca() but not yet done ibt_close_hca().
* Only need to do work for our port; IBTF will deliver
* events for other ports on the hca we have ibt_open_hca'ed
* too. Note that id_port is initialized in ibd_attach() before
* the ibt_open_hca() call is made there.
*/
ASSERT(state->id_hca_hdl == hca_hdl);
if (state->id_port != event->ev_port)
break;
if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
IBT_PORT_CHANGE_PKEY) {
ibd_link_mod(state, code);
}
break;
case IBT_ERROR_PORT_DOWN:
case IBT_CLNT_REREG_EVENT:
case IBT_EVENT_PORT_UP:
/*
* Events will be delivered to all instances that have
* done ibt_open_hca() but not yet done ibt_close_hca().
* Only need to do work for our port; IBTF will deliver
* events for other ports on the hca we have ibt_open_hca'ed
* too. Note that id_port is initialized in ibd_attach() before
* the ibt_open_hca() call is made there.
*/
ASSERT(state->id_hca_hdl == hca_hdl);
if (state->id_port != event->ev_port)
break;
ibd_link_mod(state, code);
break;
case IBT_HCA_ATTACH_EVENT:
case IBT_HCA_DETACH_EVENT:
/*
* When a new card is plugged into the system, the attach_event is
* invoked. Additionally, a cfgadm needs to be run to make the
* card known to the system, and an ifconfig needs to be run to
* plumb up any ibd interfaces on the card. In the case of card
* unplug, a cfgadm is run that will trigger any RCM scripts to
* unplumb the ibd interfaces on the card; when the card is
* actually unplugged, the detach_event is invoked;
* additionally, if any ibd instances are still active on the
* card (eg there were no associated RCM scripts), the driver's
* detach routine is invoked.
*/
break;
default:
break;
}
}
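/*
* Register this instance with the GLDv3 mac layer: allocate a
* mac_register_t, fill in the registration parameters and call
* mac_register().
*/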
static int
ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
{
mac_register_t *macp;
int ret;
if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
DPRINT(10, "ibd_register_mac: mac_alloc() failed");
return (DDI_FAILURE);
}
/*
* Note that when we register with mac during attach, we don't
* have the id_macaddr yet, so we'll simply be registering a
* zero macaddr that we'll overwrite later during plumb (in
* ibd_m_start()). The same holds for id_mtu; we'll update the
* mac layer with the correct mtu during plumb.
*/
macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
macp->m_driver = state;
macp->m_dip = dip;
macp->m_src_addr = (uint8_t *)&state->id_macaddr;
macp->m_callbacks = &ibd_m_callbacks;
macp->m_min_sdu = 0;
macp->m_max_sdu = IBD_DEF_MAX_SDU;
/*
* Register ourselves with the GLDv3 interface
*/
if ((ret = mac_register(macp, &state->id_mh)) != 0) {
mac_free(macp);
DPRINT(10,
"ibd_register_mac: mac_register() failed, ret=%d", ret);
return (DDI_FAILURE);
}
mac_free(macp);
return (DDI_SUCCESS);
}
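/*
* Query the HCA and record the capabilities and limits that ibd cares
* about: checksum offload, LSO policy/capability/maximum length, the
* reserved lkey, the maximum send SGL size, and the number of send and
* receive wqes (bounded by the HCA's maximum channel size).
*/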
static int
ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
{
ibt_hca_attr_t hca_attrs;
ibt_status_t ibt_status;
/*
* Query the HCA and fetch its attributes
*/
ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
ASSERT(ibt_status == IBT_SUCCESS);
/*
* 1. Set the Hardware Checksum capability. Currently we only consider
* full checksum offload.
*/
if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
}
/*
* 2. Set LSO policy, capability and maximum length
*/
if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
state->id_lso_policy = B_TRUE;
} else {
state->id_lso_policy = B_FALSE;
}
/*
* Work-around for Bug 6866957. Ignore policy from ibd.conf.
* Turn off LSO forcibly. Remove it when the work-around is no longer
* needed.
*/
if (ibd_force_lso_disable) {
state->id_lso_policy = B_FALSE;
}
/* End of Workaround */
if (hca_attrs.hca_max_lso_size > 0) {
state->id_lso_capable = B_TRUE;
if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
state->id_lso_maxlen = IBD_LSO_MAXLEN;
else
state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
} else {
state->id_lso_capable = B_FALSE;
state->id_lso_maxlen = 0;
}
/*
* 3. Set Reserved L_Key capability
*/
if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
state->id_hca_res_lkey_capab = 1;
state->id_res_lkey = hca_attrs.hca_reserved_lkey;
}
/*
* 4. Set maximum sqseg value after checking to see if extended sgl
* size information is provided by the hca
*/
if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
} else {
state->id_max_sqseg = hca_attrs.hca_max_sgl;
}
if (state->id_max_sqseg > IBD_MAX_SQSEG) {
state->id_max_sqseg = IBD_MAX_SQSEG;
} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
ibd_print_warn(state, "Set #sgl = %d instead of default %d",
state->id_max_sqseg, IBD_MAX_SQSEG);
}
/*
* 5. Set number of recv and send wqes after checking hca maximum
* channel size
*/
if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
} else {
state->id_num_rwqe = IBD_NUM_RWQE;
}
if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
state->id_num_swqe = hca_attrs.hca_max_chan_sz;
} else {
state->id_num_swqe = IBD_NUM_SWQE;
}
return (DDI_SUCCESS);
}
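/*
* Undo what was done during attach, guided by the progress flags
* recorded in id_mac_state, and free the instance softstate.
*/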
static int
ibd_unattach(ibd_state_t *state, dev_info_t *dip)
{
int instance;
uint32_t progress = state->id_mac_state;
ibt_status_t ret;
if (progress & IBD_DRV_MAC_REGISTERED) {
(void) mac_unregister(state->id_mh);
state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
}
if (progress & IBD_DRV_PD_ALLOCD) {
if ((ret = ibt_free_pd(state->id_hca_hdl,
state->id_pd_hdl)) != IBT_SUCCESS) {
ibd_print_warn(state, "failed to free "
"protection domain, ret=%d", ret);
}
state->id_pd_hdl = NULL;
state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
}
if (progress & IBD_DRV_HCA_OPENED) {
if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
IBT_SUCCESS) {
ibd_print_warn(state, "failed to close "
"HCA device, ret=%d", ret);
}
state->id_hca_hdl = NULL;
state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
}
if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
ibd_print_warn(state,
"ibt_detach() failed, ret=%d", ret);
}
state->id_ibt_hdl = NULL;
state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
}
if (progress & IBD_DRV_TXINTR_ADDED) {
ddi_remove_softintr(state->id_tx);
state->id_tx = NULL;
state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
}
if (progress & IBD_DRV_RXINTR_ADDED) {
ddi_remove_softintr(state->id_rx);
state->id_rx = NULL;
state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
}
if (progress & IBD_DRV_STATE_INITIALIZED) {
ibd_state_fini(state);
state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
}
instance = ddi_get_instance(dip);
ddi_soft_state_free(ibd_list, instance);
return (DDI_SUCCESS);
}
/*
* Attach device to the IO framework.
*/
static int
ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
ibd_state_t *state = NULL;
ib_guid_t hca_guid;
int instance;
ibt_status_t ret;
int rv;
/*
* IBD doesn't support suspend/resume
*/
if (cmd != DDI_ATTACH)
return (DDI_FAILURE);
/*
* Allocate softstate structure
*/
instance = ddi_get_instance(dip);
if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
return (DDI_FAILURE);
state = ddi_get_soft_state(ibd_list, instance);
/*
* Initialize mutexes and condition variables
*/
if (ibd_state_init(state, dip) != DDI_SUCCESS) {
DPRINT(10, "ibd_attach: failed in ibd_state_init()");
goto attach_fail;
}
state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
/*
* Allocate rx,tx softintr
*/
if (ibd_rx_softintr == 1) {
if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
DPRINT(10, "ibd_attach: failed in "
"ddi_add_softintr(id_rx), ret=%d", rv);
goto attach_fail;
}
state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
}
if (ibd_tx_softintr == 1) {
if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
NULL, NULL, ibd_tx_recycle,
(caddr_t)state)) != DDI_SUCCESS) {
DPRINT(10, "ibd_attach: failed in "
"ddi_add_softintr(id_tx), ret=%d", rv);
goto attach_fail;
}
state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
}
/*
* Obtain IBA P_Key, port number and HCA guid and validate
* them (for P_Key, only full members are allowed as per
* IPoIB specification; neither port number nor HCA guid
* can be zero)
*/
if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
"port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
state->id_pkey);
goto attach_fail;
}
if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
"port-number", 0)) == 0) {
DPRINT(10, "ibd_attach: invalid port number (%d)",
state->id_port);
goto attach_fail;
}
if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
"hca-guid", 0)) == 0) {
DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
hca_guid);
goto attach_fail;
}
/*
* Attach to IBTL
*/
if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
&state->id_ibt_hdl)) != IBT_SUCCESS) {
DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
goto attach_fail;
}
state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
/*
* Open the HCA
*/
if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
&state->id_hca_hdl)) != IBT_SUCCESS) {
DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
goto attach_fail;
}
state->id_mac_state |= IBD_DRV_HCA_OPENED;
/*
* Record capabilities
*/
(void) ibd_record_capab(state, dip);
/*
* Allocate a protection domain on the HCA
*/
if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
&state->id_pd_hdl)) != IBT_SUCCESS) {
DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
goto attach_fail;
}
state->id_mac_state |= IBD_DRV_PD_ALLOCD;
/*
* Register ibd interfaces with the Nemo framework
*/
if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
goto attach_fail;
}
state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
/*
* We're done with everything we could to make the attach
* succeed. All the buffer allocations and IPoIB broadcast
* group joins are deferred to when the interface instance
* is actually plumbed to avoid wasting memory.
*/
return (DDI_SUCCESS);
attach_fail:
(void) ibd_unattach(state, dip);
return (DDI_FAILURE);
}
/*
* Detach device from the IO framework.
*/
static int
ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
ibd_state_t *state;
int instance;
/*
* IBD doesn't support suspend/resume
*/
if (cmd != DDI_DETACH)
return (DDI_FAILURE);
/*
* Get the instance softstate
*/
instance = ddi_get_instance(dip);
state = ddi_get_soft_state(ibd_list, instance);
/*
* Release all resources we're holding still. Note that if we'd
* done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
* so far, we should find all the flags we need in id_mac_state.
*/
(void) ibd_unattach(state, dip);
return (DDI_SUCCESS);
}
/*
* Pre ibt_attach() driver initialization
*/
static int
ibd_state_init(ibd_state_t *state, dev_info_t *dip)
{
char buf[64];
mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
state->id_link_state = LINK_STATE_UNKNOWN;
mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
state->id_trap_stop = B_TRUE;
state->id_trap_inprog = 0;
mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL);
state->id_dip = dip;
mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
state->id_tx_list.dl_head = NULL;
state->id_tx_list.dl_tail = NULL;
state->id_tx_list.dl_pending_sends = B_FALSE;
state->id_tx_list.dl_cnt = 0;
mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
state->id_tx_busy = 0;
state->id_rx_list.dl_head = NULL;
state->id_rx_list.dl_tail = NULL;
state->id_rx_list.dl_bufs_outstanding = 0;
state->id_rx_list.dl_cnt = 0;
mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL);
(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
0, NULL, NULL, NULL, NULL, NULL, 0);
mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
return (DDI_SUCCESS);
}
/*
* Post ibt_detach() driver deconstruction
*/
static void
ibd_state_fini(ibd_state_t *state)
{
cv_destroy(&state->id_macst_cv);
mutex_destroy(&state->id_macst_lock);
kmem_cache_destroy(state->id_req_kmc);
mutex_destroy(&state->id_rxpost_lock);
mutex_destroy(&state->id_rx_list.dl_mutex);
mutex_destroy(&state->id_txpost_lock);
mutex_destroy(&state->id_tx_list.dl_mutex);
mutex_destroy(&state->id_sched_lock);
mutex_destroy(&state->id_cq_poll_lock);
cv_destroy(&state->id_trap_cv);
mutex_destroy(&state->id_trap_lock);
mutex_destroy(&state->id_link_mutex);
}
/*
* Fetch link speed from SA for snmp ifspeed reporting.
*/
static uint64_t
ibd_get_portspeed(ibd_state_t *state)
{
int ret;
ibt_path_info_t path;
ibt_path_attr_t path_attr;
uint8_t num_paths;
uint64_t ifspeed;
/*
* Due to serdes 8b/10b encoding on the wire, a 2.5 Gbps signaling rate
* translates to a 2 Gbps data rate. Thus, the 1X single data rate is
* 2000000000 bps. Start with that as the default.
*/
ifspeed = 2000000000;
bzero(&path_attr, sizeof (path_attr));
/*
* Get the port speed from Loopback path information.
*/
path_attr.pa_dgids = &state->id_sgid;
path_attr.pa_num_dgids = 1;
path_attr.pa_sgid = state->id_sgid;
if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
&path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
goto earlydone;
if (num_paths < 1)
goto earlydone;
/*
* In case SA does not return an expected value, report the default
* speed as 1X.
*/
ret = 1;
switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */
ret = 1;
break;
case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */
ret = 4;
break;
case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */
ret = 12;
break;
case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */
ret = 2;
break;
case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */
ret = 8;
break;
case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */
ret = 16;
break;
case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */
ret = 24;
break;
case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */
ret = 32;
break;
case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */
ret = 48;
break;
}
ifspeed *= ret;
earlydone:
return (ifspeed);
}
/*
* Search input mcg list (id_mc_full or id_mc_non) for an entry
* representing the input mcg mgid.
*/
static ibd_mce_t *
ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
{
ibd_mce_t *ptr = list_head(mlist);
/*
* Do plain linear search.
*/
while (ptr != NULL) {
if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
sizeof (ib_gid_t)) == 0)
return (ptr);
ptr = list_next(mlist, ptr);
}
return (NULL);
}
/*
* Execute IBA JOIN.
*/
static ibt_status_t
ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
{
ibt_mcg_attr_t mcg_attr;
bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
mcg_attr.mc_mgid = mgid;
mcg_attr.mc_join_state = mce->mc_jstate;
mcg_attr.mc_scope = state->id_scope;
mcg_attr.mc_pkey = state->id_pkey;
mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
NULL, NULL));
}
/*
* This code JOINs the port in the proper way (depending on the join
* state) so that IBA fabric will forward mcg packets to/from the port.
* It also attaches the QPN to the mcg so it can receive those mcg
* packets. This code makes sure not to attach the mcg to the QP if
* that has been previously done due to the mcg being joined with a
* different join state, even though this is not required by SWG_0216,
* refid 3610.
*/
static ibd_mce_t *
ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
ibt_status_t ibt_status;
ibd_mce_t *mce, *tmce, *omce = NULL;
boolean_t do_attach = B_TRUE;
DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
jstate, mgid.gid_prefix, mgid.gid_guid);
/*
* For enable_multicast Full member joins, we need to do some
* extra work. If there is already an mce on the list that
* indicates full membership, that means the membership has
* not yet been dropped (since the disable_multicast was issued)
* because there are pending Tx's to the mcg; in that case, just
* mark the mce not to be reaped when the Tx completion queues
* an async reap operation.
*
* If there is already an mce on the list indicating sendonly
* membership, try to promote to full membership. Be careful
* not to deallocate the old mce, since there might be an AH
* pointing to it; instead, update the old mce with new data
* that tracks the full membership.
*/
if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
ASSERT(omce->mc_fullreap);
omce->mc_fullreap = B_FALSE;
return (omce);
} else {
ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
}
}
/*
* Allocate the ibd_mce_t to track this JOIN.
*/
mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
mce->mc_fullreap = B_FALSE;
mce->mc_jstate = jstate;
if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
ibt_status);
kmem_free(mce, sizeof (ibd_mce_t));
return (NULL);
}
/*
* Is an IBA attach required? Not if the interface is already joined
* to the mcg in a different appropriate join state.
*/
if (jstate == IB_MC_JSTATE_NON) {
tmce = IBD_MCACHE_FIND_FULL(state, mgid);
if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
do_attach = B_FALSE;
} else if (jstate == IB_MC_JSTATE_FULL) {
if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
do_attach = B_FALSE;
} else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
do_attach = B_FALSE;
}
if (do_attach) {
/*
* Do the IBA attach.
*/
DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
&mce->mc_info)) != IBT_SUCCESS) {
DPRINT(10, "ibd_join_group : failed qp attachment "
"%d\n", ibt_status);
/*
* NOTE that we should probably preserve the join info
* in the list and later try to leave again at detach
* time.
*/
(void) ibt_leave_mcg(state->id_sgid, mgid,
state->id_sgid, jstate);
kmem_free(mce, sizeof (ibd_mce_t));
return (NULL);
}
}
/*
* Insert the ibd_mce_t in the proper list.
*/
if (jstate == IB_MC_JSTATE_NON) {
IBD_MCACHE_INSERT_NON(state, mce);
} else {
/*
* Set up the mc_req fields used for reaping the
* mcg in case of delayed tx completion (see
* ibd_tx_cleanup()). Also done for sendonly join in
* case we are promoted to fullmembership later and
* keep using the same mce.
*/
mce->mc_req.rq_gid = mgid;
mce->mc_req.rq_ptr = mce;
/*
* Check whether this is the case of trying to join
* full member, and we were already joined send only.
* We try to drop our SendOnly membership, but it is
* possible that the mcg does not exist anymore (and
* the subnet trap never reached us), so the leave
* operation might fail.
*/
if (omce != NULL) {
(void) ibt_leave_mcg(state->id_sgid, mgid,
state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
omce->mc_jstate = IB_MC_JSTATE_FULL;
bcopy(&mce->mc_info, &omce->mc_info,
sizeof (ibt_mcg_info_t));
kmem_free(mce, sizeof (ibd_mce_t));
return (omce);
}
mutex_enter(&state->id_mc_mutex);
IBD_MCACHE_INSERT_FULL(state, mce);
mutex_exit(&state->id_mc_mutex);
}
return (mce);
}
/*
* Called during port up event handling to attempt to reacquire full
* membership to an mcg. Stripped down version of ibd_join_group().
* Note that the mcg might have gone away and been recreated
* by this point.
*/
static void
ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
{
ib_gid_t mgid;
/*
* If the mc_fullreap flag is set, or this join fails, a subsequent
* reap/leave is going to try to leave the group. We could prevent
* that by adding a boolean flag into ibd_mce_t, if required.
*/
if (mce->mc_fullreap)
return;
mgid = mce->mc_info.mc_adds_vect.av_dgid;
DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
mgid.gid_guid);
if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
ibd_print_warn(state, "Failure on port up to rejoin "
"multicast gid %016llx:%016llx",
(u_longlong_t)mgid.gid_prefix,
(u_longlong_t)mgid.gid_guid);
}
/*
* This code handles delayed Tx completion cleanups for mcg's to which
* disable_multicast has been issued, regular mcg related cleanups during
* disable_multicast, disable_promiscuous and mcg traps, as well as
* cleanups during driver detach time. Depending on the join state,
* it deletes the mce from the appropriate list and issues the IBA
* leave/detach; except in the disable_multicast case when the mce
* is left on the active list for a subsequent Tx completion cleanup.
*/
static void
ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
uint8_t jstate)
{
ibd_mce_t *tmce;
boolean_t do_detach = B_TRUE;
/*
* Before detaching, we must check whether the other list
* contains the mcg; if we detach blindly, the consumer
* who set up the other list will also stop receiving
* traffic.
*/
if (jstate == IB_MC_JSTATE_FULL) {
/*
* The following check is only relevant while coming
* from the Tx completion path in the reap case.
*/
if (!mce->mc_fullreap)
return;
mutex_enter(&state->id_mc_mutex);
IBD_MCACHE_PULLOUT_FULL(state, mce);
mutex_exit(&state->id_mc_mutex);
if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
do_detach = B_FALSE;
} else if (jstate == IB_MC_JSTATE_NON) {
IBD_MCACHE_PULLOUT_NON(state, mce);
tmce = IBD_MCACHE_FIND_FULL(state, mgid);
if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
do_detach = B_FALSE;
} else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
mutex_enter(&state->id_mc_mutex);
IBD_MCACHE_PULLOUT_FULL(state, mce);
mutex_exit(&state->id_mc_mutex);
do_detach = B_FALSE;
}
/*
* If we are reacting to a mcg trap and leaving our sendonly or
* non membership, the mcg is possibly already gone, so attempting
* to leave might fail. On the other hand, we must try to leave
* anyway, since this might be a trap from long ago, and we could
* have potentially sendonly joined to a recent incarnation of
* the mcg and are about to lose track of this information.
*/
if (do_detach) {
DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
"%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
}
(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
kmem_free(mce, sizeof (ibd_mce_t));
}
/*
* Async code executed due to multicast and promiscuous disable requests
* and mcg trap handling; also executed during driver detach. Mostly, a
* leave and detach is done; except for the fullmember case when Tx
* requests are pending, in which case arrangements are made for subsequent
* cleanup on Tx completion.
*/
static void
ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
ipoib_mac_t mcmac;
boolean_t recycled;
ibd_mce_t *mce;
DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
jstate, mgid.gid_prefix, mgid.gid_guid);
if (jstate == IB_MC_JSTATE_NON) {
recycled = B_TRUE;
mce = IBD_MCACHE_FIND_NON(state, mgid);
/*
* In case we are handling a mcg trap, we might not find
* the mcg in the non list.
*/
if (mce == NULL) {
return;
}
} else {
mce = IBD_MCACHE_FIND_FULL(state, mgid);
/*
* In case we are handling a mcg trap, make sure the trap
* is not arriving late; if we have an mce that indicates
* that we are already a fullmember, that would be a clear
* indication that the trap arrived late (ie, is for a
* previous incarnation of the mcg).
*/
if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
if ((mce == NULL) || (mce->mc_jstate ==
IB_MC_JSTATE_FULL)) {
return;
}
} else {
ASSERT(jstate == IB_MC_JSTATE_FULL);
/*
* If the earlier join group failed, mce will be NULL here;
* this is because the GLDv3 set-multicast entry point
* always returns success.
*/
if (mce == NULL) {
return;
}
mce->mc_fullreap = B_TRUE;
}
/*
* If no pending Tx's remain that reference the AH
* for the mcg, recycle it from active to free list.
* Else in the IB_MC_JSTATE_FULL case, just mark the AH,
* so the last completing Tx will cause an async reap
* operation to be invoked, at which time we will drop our
* membership to the mcg so that the pending Tx's complete
* successfully. Refer to comments on "AH and MCE active
* list manipulation" at top of this file. The lock protects
* against Tx fast path and Tx cleanup code.
*/
mutex_enter(&state->id_ac_mutex);
ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
IB_MC_JSTATE_SEND_ONLY_NON));
mutex_exit(&state->id_ac_mutex);
}
if (recycled) {
DPRINT(2, "ibd_leave_group : leave_group reaping : "
"%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
ibd_async_reap_group(state, mce, mgid, jstate);
}
}
/*
* Find the broadcast address as defined by IPoIB; implicitly
* determines the IBA scope, mtu, tclass etc of the link the
* interface is going to be a member of.
*/
static ibt_status_t
ibd_find_bgroup(ibd_state_t *state)
{
ibt_mcg_attr_t mcg_attr;
uint_t numg;
uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
IB_MC_SCOPE_GLOBAL };
int i, mcgmtu;
boolean_t found = B_FALSE;
int ret;
ibt_mcg_info_t mcg_info;
state->id_bgroup_created = B_FALSE;
query_bcast_grp:
bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
mcg_attr.mc_pkey = state->id_pkey;
state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
state->id_scope = mcg_attr.mc_scope = scopes[i];
/*
* Look for the IPoIB broadcast group.
*/
state->id_mgid.gid_prefix =
(((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
((uint64_t)state->id_scope << 48) |
((uint32_t)(state->id_pkey << 16)));
mcg_attr.mc_mgid = state->id_mgid;
if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
&state->id_mcinfo, &numg) == IBT_SUCCESS) {
found = B_TRUE;
break;
}
}
if (!found) {
if (ibd_create_broadcast_group) {
/*
* If we created the broadcast group, but failed to
* find it, we can't do anything except leave the
* one we created and return failure.
*/
if (state->id_bgroup_created) {
ibd_print_warn(state, "IPoIB broadcast group "
"absent. Unable to query after create.");
goto find_bgroup_fail;
}
/*
* Create the ipoib broadcast group if it didn't exist
*/
bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
mcg_attr.mc_pkey = state->id_pkey;
mcg_attr.mc_flow = 0;
mcg_attr.mc_sl = 0;
mcg_attr.mc_tclass = 0;
state->id_mgid.gid_prefix =
(((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
((uint32_t)(state->id_pkey << 16)));
mcg_attr.mc_mgid = state->id_mgid;
if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
&mcg_info, NULL, NULL)) != IBT_SUCCESS) {
ibd_print_warn(state, "IPoIB broadcast group "
"absent, create failed: ret = %d\n", ret);
state->id_bgroup_created = B_FALSE;
return (IBT_FAILURE);
}
state->id_bgroup_created = B_TRUE;
goto query_bcast_grp;
} else {
ibd_print_warn(state, "IPoIB broadcast group absent");
return (IBT_FAILURE);
}
}
/*
* Verify that the mcg mtu fits within the port MTU (id_mtu), then
* record the mcg mtu as the updated id_mtu.
*/
mcgmtu = (128 << state->id_mcinfo->mc_mtu);
if (state->id_mtu < mcgmtu) {
ibd_print_warn(state, "IPoIB broadcast group MTU %d "
"greater than port's maximum MTU %d", mcgmtu,
state->id_mtu);
ibt_free_mcg_info(state->id_mcinfo, 1);
goto find_bgroup_fail;
}
state->id_mtu = mcgmtu;
return (IBT_SUCCESS);
find_bgroup_fail:
if (state->id_bgroup_created) {
(void) ibt_leave_mcg(state->id_sgid,
mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
IB_MC_JSTATE_FULL);
}
return (IBT_FAILURE);
}
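/*
* Allocate the Tx copy buffers as one contiguous chunk (one buffer per
* send wqe) and register the whole area as a single memory region.
*/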
static int
ibd_alloc_tx_copybufs(ibd_state_t *state)
{
ibt_mr_attr_t mem_attr;
/*
* Allocate one big chunk for all regular tx copy bufs
*/
state->id_tx_buf_sz = state->id_mtu;
if (state->id_lso_policy && state->id_lso_capable &&
(IBD_TX_BUF_SZ > state->id_mtu)) {
state->id_tx_buf_sz = IBD_TX_BUF_SZ;
}
state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
state->id_tx_buf_sz, KM_SLEEP);
/*
* Do one memory registration on the entire txbuf area
*/
mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
mem_attr.mr_as = NULL;
mem_attr.mr_flags = IBT_MR_SLEEP;
if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
&state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
kmem_free(state->id_tx_bufs,
state->id_num_swqe * state->id_tx_buf_sz);
state->id_tx_bufs = NULL;
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
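/*
* Allocate and register the pool of larger-than-MTU LSO copy buffers,
* and set up the bucket/free-list bookkeeping used to hand them out.
*/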
static int
ibd_alloc_tx_lsobufs(ibd_state_t *state)
{
ibt_mr_attr_t mem_attr;
ibd_lsobuf_t *buflist;
ibd_lsobuf_t *lbufp;
ibd_lsobuf_t *tail;
ibd_lsobkt_t *bktp;
uint8_t *membase;
uint8_t *memp;
uint_t memsz;
int i;
/*
* Allocate the lso bucket
*/
bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
/*
* Allocate the entire lso memory and register it
*/
memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
membase = kmem_zalloc(memsz, KM_SLEEP);
mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
mem_attr.mr_len = memsz;
mem_attr.mr_as = NULL;
mem_attr.mr_flags = IBT_MR_SLEEP;
if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
&mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
kmem_free(membase, memsz);
kmem_free(bktp, sizeof (ibd_lsobkt_t));
return (DDI_FAILURE);
}
/*
* Now allocate the buflist. Note that the elements in the buflist and
* the buffers in the lso memory have a permanent 1-1 relation, so we
* can always derive the address of a buflist entry from the address of
* an lso buffer.
*/
buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
KM_SLEEP);
/*
* Set up the lso buf chain
*/
memp = membase;
lbufp = buflist;
for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
lbufp->lb_isfree = 1;
lbufp->lb_buf = memp;
lbufp->lb_next = lbufp + 1;
tail = lbufp;
memp += IBD_LSO_BUFSZ;
lbufp++;
}
tail->lb_next = NULL;
/*
* Set up the LSO buffer information in ibd state
*/
bktp->bkt_bufl = buflist;
bktp->bkt_free_head = buflist;
bktp->bkt_mem = membase;
bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
bktp->bkt_nfree = bktp->bkt_nelem;
state->id_lso = bktp;
return (DDI_SUCCESS);
}
/*
* Statically allocate Tx buffer list(s).
*/
static int
ibd_init_txlist(ibd_state_t *state)
{
ibd_swqe_t *swqe;
ibt_lkey_t lkey;
int i;
if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
return (DDI_FAILURE);
if (state->id_lso_policy && state->id_lso_capable) {
if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
state->id_lso_policy = B_FALSE;
}
/*
* Allocate and setup the swqe list
*/
lkey = state->id_tx_mr_desc.md_lkey;
for (i = 0; i < state->id_num_swqe; i++) {
if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) {
DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed");
ibd_fini_txlist(state);
return (DDI_FAILURE);
}
/* add to list */
state->id_tx_list.dl_cnt++;
if (state->id_tx_list.dl_head == NULL) {
swqe->swqe_prev = NULL;
swqe->swqe_next = NULL;
state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
} else {
swqe->swqe_prev = state->id_tx_list.dl_tail;
swqe->swqe_next = NULL;
state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
}
}
return (DDI_SUCCESS);
}
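/*
* Pull enough buffers off the LSO free list to cover 'req_sz' bytes,
* fill in the caller's SGL (trimming the last entry to the residual
* fragment size) and return the number of entries used via 'nds_p'.
* Returns -1 if the free list cannot satisfy the request.
*/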
static int
ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
uint32_t *nds_p)
{
ibd_lsobkt_t *bktp;
ibd_lsobuf_t *lbufp;
ibd_lsobuf_t *nextp;
ibt_lkey_t lso_lkey;
uint_t frag_sz;
uint_t num_needed;
int i;
ASSERT(sgl_p != NULL);
ASSERT(nds_p != NULL);
ASSERT(req_sz != 0);
/*
* Determine how many bufs we'd need for the size requested
*/
num_needed = req_sz / IBD_LSO_BUFSZ;
if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
num_needed++;
mutex_enter(&state->id_lso_lock);
/*
* If we don't have enough lso bufs, return failure
*/
ASSERT(state->id_lso != NULL);
bktp = state->id_lso;
if (bktp->bkt_nfree < num_needed) {
mutex_exit(&state->id_lso_lock);
return (-1);
}
/*
* Pick the first 'num_needed' bufs from the free list
*/
lso_lkey = bktp->bkt_mr_desc.md_lkey;
lbufp = bktp->bkt_free_head;
for (i = 0; i < num_needed; i++) {
ASSERT(lbufp->lb_isfree != 0);
ASSERT(lbufp->lb_buf != NULL);
nextp = lbufp->lb_next;
sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
sgl_p[i].ds_key = lso_lkey;
sgl_p[i].ds_len = IBD_LSO_BUFSZ;
lbufp->lb_isfree = 0;
lbufp->lb_next = NULL;
lbufp = nextp;
}
bktp->bkt_free_head = lbufp;
/*
* If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
* to adjust the last sgl entry's length. Since we know we need at least
* one, the i-1 use below is ok.
*/
if (frag_sz) {
sgl_p[i-1].ds_len = frag_sz;
}
/*
* Update nfree count and return
*/
bktp->bkt_nfree -= num_needed;
mutex_exit(&state->id_lso_lock);
*nds_p = num_needed;
return (0);
}
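/*
* Return the LSO buffers described by the SGL entries to the free list,
* deriving each buflist element from the buffer's offset in the pool.
*/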
static void
ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
{
ibd_lsobkt_t *bktp;
ibd_lsobuf_t *lbufp;
uint8_t *lso_mem_end;
uint_t ndx;
int i;
mutex_enter(&state->id_lso_lock);
bktp = state->id_lso;
ASSERT(bktp != NULL);
lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
for (i = 0; i < nds; i++) {
uint8_t *va;
va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
/*
* Figure out the buflist element this sgl buffer corresponds
* to and put it back at the head
*/
ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
lbufp = bktp->bkt_bufl + ndx;
ASSERT(lbufp->lb_isfree == 0);
ASSERT(lbufp->lb_buf == va);
lbufp->lb_isfree = 1;
lbufp->lb_next = bktp->bkt_free_head;
bktp->bkt_free_head = lbufp;
}
bktp->bkt_nfree += nds;
mutex_exit(&state->id_lso_lock);
}
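/*
* Deregister and free the Tx copy buffer area allocated in
* ibd_alloc_tx_copybufs().
*/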
static void
ibd_free_tx_copybufs(ibd_state_t *state)
{
/*
* Unregister txbuf mr
*/
if (ibt_deregister_mr(state->id_hca_hdl,
state->id_tx_mr_hdl) != IBT_SUCCESS) {
DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
}
state->id_tx_mr_hdl = NULL;
/*
* Free txbuf memory
*/
kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
state->id_tx_bufs = NULL;
}
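/*
* Tear down the LSO buffer pool: free the buflist, deregister and free
* the buffer memory, and free the bucket itself.
*/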
static void
ibd_free_tx_lsobufs(ibd_state_t *state)
{
ibd_lsobkt_t *bktp;
mutex_enter(&state->id_lso_lock);
if ((bktp = state->id_lso) == NULL) {
mutex_exit(&state->id_lso_lock);
return;
}
/*
* First, free the buflist
*/
ASSERT(bktp->bkt_bufl != NULL);
kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
/*
* Unregister the LSO memory and free it
*/
ASSERT(bktp->bkt_mr_hdl != NULL);
if (ibt_deregister_mr(state->id_hca_hdl,
bktp->bkt_mr_hdl) != IBT_SUCCESS) {
DPRINT(10,
"ibd_free_tx_lsobufs: ibt_deregister_mr failed");
}
ASSERT(bktp->bkt_mem);
kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
/*
* Finally free the bucket
*/
kmem_free(bktp, sizeof (ibd_lsobkt_t));
state->id_lso = NULL;
mutex_exit(&state->id_lso_lock);
}
/*
* Free the statically allocated Tx buffer list.
*/
static void
ibd_fini_txlist(ibd_state_t *state)
{
ibd_swqe_t *node;
/*
* Free the allocated swqes
*/
mutex_enter(&state->id_tx_list.dl_mutex);
while (state->id_tx_list.dl_head != NULL) {
node = WQE_TO_SWQE(state->id_tx_list.dl_head);
state->id_tx_list.dl_head = node->swqe_next;
ASSERT(state->id_tx_list.dl_cnt > 0);
state->id_tx_list.dl_cnt--;
ibd_free_swqe(state, node);
}
mutex_exit(&state->id_tx_list.dl_mutex);
ibd_free_tx_lsobufs(state);
ibd_free_tx_copybufs(state);
}
/*
* Allocate a single send wqe and register it so it is almost
* ready to be posted to the hardware.
*/
static int
ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey)
{
ibd_swqe_t *swqe;
swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP);
*wqe = swqe;
swqe->swqe_type = IBD_WQE_SEND;
swqe->swqe_next = NULL;
swqe->swqe_prev = NULL;
swqe->swqe_im_mblk = NULL;
swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
(state->id_tx_bufs + ndx * state->id_tx_buf_sz);
swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
swqe->w_swr.wr_trans = IBT_UD_SRV;
/* These are set in send */
swqe->w_swr.wr_nds = 0;
swqe->w_swr.wr_sgl = NULL;
swqe->w_swr.wr_opcode = IBT_WRC_SEND;
return (DDI_SUCCESS);
}
/*
* Free an allocated send wqe.
*/
/*ARGSUSED*/
static void
ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
{
kmem_free(swqe, sizeof (ibd_swqe_t));
}
/*
* Post a rwqe to the hardware and add it to the Rx list. The
* "recycle" parameter indicates whether an old rwqe is being
* recycled, or this is a new one.
*/
static int
ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
{
ibt_status_t ibt_status;
if (recycle == B_FALSE) {
mutex_enter(&state->id_rx_list.dl_mutex);
if (state->id_rx_list.dl_head == NULL) {
rwqe->rwqe_prev = NULL;
rwqe->rwqe_next = NULL;
state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
} else {
rwqe->rwqe_prev = state->id_rx_list.dl_tail;
rwqe->rwqe_next = NULL;
state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
}
mutex_exit(&state->id_rx_list.dl_mutex);
}
mutex_enter(&state->id_rxpost_lock);
if (state->id_rx_busy) {
rwqe->w_post_link = NULL;
if (state->id_rx_head)
*(state->id_rx_tailp) = (ibd_wqe_t *)rwqe;
else
state->id_rx_head = rwqe;
state->id_rx_tailp = &(rwqe->w_post_link);
} else {
state->id_rx_busy = 1;
do {
mutex_exit(&state->id_rxpost_lock);
/*
* We must increment dl_cnt before posting the recv, to
* make sure dl_cnt is updated before the corresponding
* ibd_process_rx() is called.
*/
atomic_add_32(&state->id_rx_list.dl_cnt, 1);
ibt_status = ibt_post_recv(state->id_chnl_hdl,
&rwqe->w_rwr, 1, NULL);
if (ibt_status != IBT_SUCCESS) {
(void) atomic_add_32_nv(
&state->id_rx_list.dl_cnt, -1);
ibd_print_warn(state, "ibd_post_recv: "
"posting failed, ret=%d", ibt_status);
return (DDI_FAILURE);
}
mutex_enter(&state->id_rxpost_lock);
rwqe = state->id_rx_head;
if (rwqe) {
state->id_rx_head =
(ibd_rwqe_t *)(rwqe->w_post_link);
}
} while (rwqe);
state->id_rx_busy = 0;
}
mutex_exit(&state->id_rxpost_lock);
return (DDI_SUCCESS);
}
/*
* Allocate the statically allocated Rx buffer list.
*/
static int
ibd_init_rxlist(ibd_state_t *state)
{
ibd_rwqe_t *rwqe;
int i;
for (i = 0; i < state->id_num_rwqe; i++) {
if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
ibd_fini_rxlist(state);
return (DDI_FAILURE);
}
if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) {
ibd_free_rwqe(state, rwqe);
ibd_fini_rxlist(state);
return (DDI_FAILURE);
}
}
return (DDI_SUCCESS);
}
/*
* Free the statically allocated Rx buffer list.
*
*/
static void
ibd_fini_rxlist(ibd_state_t *state)
{
ibd_rwqe_t *node;
mutex_enter(&state->id_rx_list.dl_mutex);
while (state->id_rx_list.dl_head != NULL) {
node = WQE_TO_RWQE(state->id_rx_list.dl_head);
state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
ASSERT(state->id_rx_list.dl_cnt > 0);
state->id_rx_list.dl_cnt--;
ibd_free_rwqe(state, node);
}
mutex_exit(&state->id_rx_list.dl_mutex);
}
/*
* Allocate a single recv wqe and register it so it is almost
* ready to be posted to the hardware.
*/
static int
ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
{
ibt_mr_attr_t mem_attr;
ibd_rwqe_t *rwqe;
if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
return (DDI_FAILURE);
}
*wqe = rwqe;
rwqe->rwqe_type = IBD_WQE_RECV;
rwqe->w_state = state;
rwqe->rwqe_next = NULL;
rwqe->rwqe_prev = NULL;
rwqe->w_freeing_wqe = B_FALSE;
rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
IPOIB_GRH_SIZE, KM_NOSLEEP);
if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) {
DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
kmem_free(rwqe, sizeof (ibd_rwqe_t));
return (DDI_FAILURE);
}
if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
NULL) {
DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
state->id_mtu + IPOIB_GRH_SIZE);
rwqe->rwqe_copybuf.ic_bufaddr = NULL;
kmem_free(rwqe, sizeof (ibd_rwqe_t));
return (DDI_FAILURE);
}
mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
mem_attr.mr_as = NULL;
mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
&rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
rwqe->w_freeing_wqe = B_TRUE;
freemsg(rwqe->rwqe_im_mblk);
kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
state->id_mtu + IPOIB_GRH_SIZE);
rwqe->rwqe_copybuf.ic_bufaddr = NULL;
kmem_free(rwqe, sizeof (ibd_rwqe_t));
return (DDI_FAILURE);
}
rwqe->rwqe_copybuf.ic_sgl.ds_va =
(ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
rwqe->rwqe_copybuf.ic_sgl.ds_key =
rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
rwqe->w_rwr.wr_nds = 1;
rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
return (DDI_SUCCESS);
}
/*
* Free an allocated recv wqe.
*/
static void
ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
if (ibt_deregister_mr(state->id_hca_hdl,
rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()");
return;
}
/*
* Indicate to the callback function that this rwqe/mblk
* should not be recycled. The freemsg() will invoke
* ibd_freemsg_cb().
*/
if (rwqe->rwqe_im_mblk != NULL) {
rwqe->w_freeing_wqe = B_TRUE;
freemsg(rwqe->rwqe_im_mblk);
}
kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
state->id_mtu + IPOIB_GRH_SIZE);
rwqe->rwqe_copybuf.ic_bufaddr = NULL;
kmem_free(rwqe, sizeof (ibd_rwqe_t));
}
/*
* Delete the rwqe being freed from the rx list.
*/
static void
ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
mutex_enter(&state->id_rx_list.dl_mutex);
if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
state->id_rx_list.dl_head = rwqe->rwqe_next;
else
rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
state->id_rx_list.dl_tail = rwqe->rwqe_prev;
else
rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
mutex_exit(&state->id_rx_list.dl_mutex);
}
/*
* IBA Rx/Tx completion queue handler. Guaranteed to be single
* threaded and nonreentrant for this CQ. When using combined CQ,
* this handles Tx and Rx completions. With separate CQs, this handles
* only Rx completions.
*/
/* ARGSUSED */
static void
ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
ibd_state_t *state = (ibd_state_t *)arg;
atomic_add_64(&state->id_num_intrs, 1);
if (ibd_rx_softintr == 1)
ddi_trigger_softintr(state->id_rx);
else
(void) ibd_intr((char *)state);
}
/*
* Separate CQ handler for Tx completions, when the Tx CQ is in
* interrupt driven mode.
*/
/* ARGSUSED */
static void
ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
ibd_state_t *state = (ibd_state_t *)arg;
atomic_add_64(&state->id_num_intrs, 1);
if (ibd_tx_softintr == 1)
ddi_trigger_softintr(state->id_tx);
else
(void) ibd_tx_recycle((char *)state);
}
/*
* Multicast group create/delete trap handler. These will be delivered
* on a kernel thread (handling can thus block) and can be invoked
* concurrently. The handler can be invoked anytime after it is
* registered and before ibt_detach().
*/
/* ARGSUSED */
static void
ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
ibt_subnet_event_t *event)
{
ibd_state_t *state = (ibd_state_t *)arg;
ibd_req_t *req;
/*
* The trap handler will get invoked once for every event for
* every port. The input "gid" is the GID0 of the port the
* trap came in on; we just need to act on traps that came
* to our port, meaning the port on which the ipoib interface
* resides. Since ipoib uses GID0 of the port, we just match
* the gids to check whether we need to handle the trap.
*/
if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
return;
DPRINT(10, "ibd_notices_handler : %d\n", code);
switch (code) {
case IBT_SM_EVENT_UNAVAILABLE:
/*
* If we are in promiscuous mode or have
* sendnonmembers, we need to print a warning
* message right now. Else, just store the
* information, print when we enter promiscuous
* mode or attempt nonmember send. We might
* also want to stop caching sendnonmember.
*/
ibd_print_warn(state, "IBA multicast support "
"degraded due to unavailability of multicast "
"traps");
break;
case IBT_SM_EVENT_AVAILABLE:
/*
* If we printed a warning message above or
* while trying to nonmember send or get into
* promiscuous mode, print an okay message.
*/
ibd_print_warn(state, "IBA multicast support "
"restored due to availability of multicast "
"traps");
break;
case IBT_SM_EVENT_MCG_CREATED:
case IBT_SM_EVENT_MCG_DELETED:
/*
* Common processing of creation/deletion traps.
* First check if the instance is being
* [de]initialized; back off then, without doing
* anything more, since we are not sure if the
* async thread is around, or whether we might
* be racing with the detach code in ibd_m_stop()
* that scans the mcg list.
*/
if (!ibd_async_safe(state))
return;
req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
req->rq_gid = event->sm_notice_gid;
req->rq_ptr = (void *)code;
ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
break;
}
}
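/*
* Async handling of a multicast group creation/deletion trap: drop any
* sendonly membership for the gid and, if in promiscuous mode, also drop
* nonmembership and attempt to join/attach to the (possibly new) mcg.
*/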
static void
ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
{
ib_gid_t mgid = req->rq_gid;
ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
DPRINT(10, "ibd_async_trap : %d\n", code);
/*
* Atomically search the nonmember and sendonlymember lists and
* delete.
*/
ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
if (state->id_prom_op == IBD_OP_COMPLETED) {
ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
/*
* If in promiscuous mode, try to join/attach to the new
* mcg. Given the unreliable out-of-order mode of trap
* delivery, we can never be sure whether it is a problem
* if the join fails. Thus, we warn the admin of a failure
* if this was a creation trap. Note that the trap might
* actually be reporting a long past event, and the mcg
* might already have been deleted, thus we might be warning
* in vain.
*/
if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
ibd_print_warn(state, "IBA promiscuous mode missed "
"new multicast gid %016llx:%016llx",
(u_longlong_t)mgid.gid_prefix,
(u_longlong_t)mgid.gid_guid);
}
/*
* Free the request slot allocated by the subnet event thread.
*/
ibd_async_done(state);
}
/*
* GLDv3 entry point to get capabilities.
*/
static boolean_t
ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
ibd_state_t *state = arg;
switch (cap) {
case MAC_CAPAB_HCKSUM: {
uint32_t *txflags = cap_data;
/*
* We either do full checksum or not do it at all
*/
if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
else
return (B_FALSE);
break;
}
case MAC_CAPAB_LSO: {
mac_capab_lso_t *cap_lso = cap_data;
/*
* In addition to the capability and policy, since LSO
* relies on hw checksum, we'll not enable LSO if we
* don't have hw checksum. Of course, if the HCA doesn't
* provide the reserved lkey capability, enabling LSO will
* actually affect performance adversely, so we'll disable
* LSO even for that case.
*/
if (!state->id_lso_policy || !state->id_lso_capable)
return (B_FALSE);
if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
return (B_FALSE);
if (state->id_hca_res_lkey_capab == 0) {
ibd_print_warn(state, "no reserved-lkey capability, "
"disabling LSO");
return (B_FALSE);
}
cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
break;
}
default:
return (B_FALSE);
}
return (B_TRUE);
}
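/*
* Query the port, verify that it is active, resolve the pkey index and
* record the port MTU, GID0, link state and link speed in the softstate.
*/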
static int
ibd_get_port_details(ibd_state_t *state)
{
ibt_hca_portinfo_t *port_infop;
ibt_status_t ret;
uint_t psize, port_infosz;
mutex_enter(&state->id_link_mutex);
/*
* Query for port information
*/
ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
&port_infop, &psize, &port_infosz);
if ((ret != IBT_SUCCESS) || (psize != 1)) {
mutex_exit(&state->id_link_mutex);
DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
"failed, ret=%d", ret);
return (ENETDOWN);
}
/*
* If the link already went down by the time we get here,
* give up
*/
if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
mutex_exit(&state->id_link_mutex);
ibt_free_portinfo(port_infop, port_infosz);
DPRINT(10, "ibd_get_port_details: port is not active");
return (ENETDOWN);
}
/*
* If the link is active, verify the pkey
*/
if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
mutex_exit(&state->id_link_mutex);
ibt_free_portinfo(port_infop, port_infosz);
DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
"failed, ret=%d", ret);
return (ENONET);
}
state->id_mtu = (128 << port_infop->p_mtu);
state->id_sgid = *port_infop->p_sgid_tbl;
state->id_link_state = LINK_STATE_UP;
mutex_exit(&state->id_link_mutex);
ibt_free_portinfo(port_infop, port_infosz);
/*
* Now that the port is active, record the port speed
*/
state->id_link_speed = ibd_get_portspeed(state);
return (0);
}
static int
ibd_alloc_cqs(ibd_state_t *state)
{
ibt_hca_attr_t hca_attrs;
ibt_cq_attr_t cq_attr;
ibt_status_t ret;
uint32_t real_size;
ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
ASSERT(ret == IBT_SUCCESS);
/*
* Allocate Rx/combined CQ:
* Theoretically, there is no point in having more than #rwqe
* plus #swqe cqe's, except that the CQ will be signalled for
* overflow when the last wqe completes, if none of the previous
* cqe's have been polled. Thus, we size the CQ slightly larger
* than the number of wqe's to make sure such overflow does not
* occur.
*/
cq_attr.cq_sched = NULL;
cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
if (ibd_separate_cqs == 1) {
/*
* Allocate Receive CQ.
*/
if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
cq_attr.cq_size = state->id_num_rwqe + 1;
} else {
cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
state->id_num_rwqe = cq_attr.cq_size - 1;
}
if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
&state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
"failed, ret=%d\n", ret);
return (DDI_FAILURE);
}
if ((ret = ibt_modify_cq(state->id_rcq_hdl,
ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
"moderation failed, ret=%d\n", ret);
}
state->id_rxwcs_size = state->id_num_rwqe + 1;
state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
state->id_rxwcs_size, KM_SLEEP);
/*
* Allocate Send CQ.
*/
if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
cq_attr.cq_size = state->id_num_swqe + 1;
} else {
cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
state->id_num_swqe = cq_attr.cq_size - 1;
}
if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
&state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
"failed, ret=%d\n", ret);
kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
state->id_rxwcs_size);
(void) ibt_free_cq(state->id_rcq_hdl);
return (DDI_FAILURE);
}
if ((ret = ibt_modify_cq(state->id_scq_hdl,
IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
"moderation failed, ret=%d\n", ret);
}
state->id_txwcs_size = state->id_num_swqe + 1;
state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
state->id_txwcs_size, KM_SLEEP);
} else {
/*
* Allocate combined Send/Receive CQ.
*/
if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
state->id_num_swqe + 1)) {
cq_attr.cq_size = state->id_num_rwqe +
state->id_num_swqe + 1;
} else {
cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
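/*
* The combined CQ can't hold all the requested wqe's, so scale
* the rx and tx wqe counts down proportionally to fit; e.g.
* (hypothetical numbers) a request for 4000 rwqe's and 4000
* swqe's against a 4001-entry CQ limit leaves 2000 of each.
*/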
state->id_num_rwqe = ((cq_attr.cq_size - 1) *
state->id_num_rwqe) / (state->id_num_rwqe +
state->id_num_swqe);
state->id_num_swqe = cq_attr.cq_size - 1 -
state->id_num_rwqe;
}
state->id_rxwcs_size = cq_attr.cq_size;
state->id_txwcs_size = state->id_rxwcs_size;
if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
&state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) "
"failed, ret=%d\n", ret);
return (DDI_FAILURE);
}
state->id_scq_hdl = state->id_rcq_hdl;
state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
state->id_rxwcs_size, KM_SLEEP);
state->id_txwcs = state->id_rxwcs;
}
/*
* Print a message if we could not allocate as many wqe's
* as were requested.
*/
if (state->id_num_rwqe != IBD_NUM_RWQE) {
ibd_print_warn(state, "Setting #rwqe = %d instead of default "
"%d", state->id_num_rwqe, IBD_NUM_RWQE);
}
if (state->id_num_swqe != IBD_NUM_SWQE) {
ibd_print_warn(state, "Setting #swqe = %d instead of default "
"%d", state->id_num_swqe, IBD_NUM_SWQE);
}
return (DDI_SUCCESS);
}
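/*
* Allocate the UD channel (QP) used for all IPoIB traffic on this
* port, sized to our wqe counts and bound to the CQs, PD and pkey
* index set up earlier; the resulting QPN becomes part of our
* link-layer address.
*/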
static int
ibd_setup_ud_channel(ibd_state_t *state)
{
ibt_ud_chan_alloc_args_t ud_alloc_attr;
ibt_ud_chan_query_attr_t ud_chan_attr;
ibt_status_t ret;
ud_alloc_attr.ud_flags = IBT_WR_SIGNALED;
if (state->id_hca_res_lkey_capab)
ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
if (state->id_lso_policy && state->id_lso_capable)
ud_alloc_attr.ud_flags |= IBT_USES_LSO;
ud_alloc_attr.ud_hca_port_num = state->id_port;
ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe;
ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe;
ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey;
ud_alloc_attr.ud_scq = state->id_scq_hdl;
ud_alloc_attr.ud_rcq = state->id_rcq_hdl;
ud_alloc_attr.ud_pd = state->id_pd_hdl;
ud_alloc_attr.ud_pkey_ix = state->id_pkix;
ud_alloc_attr.ud_clone_chan = NULL;
if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
&ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
"failed, ret=%d\n", ret);
return (DDI_FAILURE);
}
if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
&ud_chan_attr)) != IBT_SUCCESS) {
DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
"failed, ret=%d\n", ret);
(void) ibt_free_channel(state->id_chnl_hdl);
return (DDI_FAILURE);
}
state->id_qpnum = ud_chan_attr.ud_qpn;
return (DDI_SUCCESS);
}
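/*
* Undo whatever parts of ibd_start() have completed, in roughly
* reverse order, using the progress bits recorded in id_mac_state
* to decide what needs to be torn down.
*/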
static int
ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
{
uint32_t progress = state->id_mac_state;
uint_t attempts;
ibt_status_t ret;
ib_gid_t mgid;
ibd_mce_t *mce;
uint8_t jstate;
/*
* Before we try to stop/undo whatever we did in ibd_start(),
* we need to mark the link state appropriately to prevent the
* ip layer from using this instance for any new transfers. Note
* that if the original state of the link was "up" when we get
* here, we'll set the final link state to "unknown", to behave
* in the same fashion as other ethernet drivers.
*/
mutex_enter(&state->id_link_mutex);
if (cur_link_state == LINK_STATE_DOWN) {
state->id_link_state = cur_link_state;
} else {
state->id_link_state = LINK_STATE_UNKNOWN;
}
mutex_exit(&state->id_link_mutex);
mac_link_update(state->id_mh, state->id_link_state);
state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
if (progress & IBD_DRV_STARTED) {
state->id_mac_state &= (~IBD_DRV_STARTED);
}
/*
* First, stop receive interrupts; this stops the driver from
* handing up buffers to higher layers. Wait for receive buffers
* to be returned and give up after 5 seconds.
*/
if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
attempts = 50;
while (state->id_rx_list.dl_bufs_outstanding > 0) {
delay(drv_usectohz(100000));
if (--attempts == 0) {
/*
* There are still buffers held by the network
* layer and we have no choice but to wait for
* them to be returned. Reap all the Tx/Rx
* completions that were posted since we turned
* off the notification and return failure.
*/
DPRINT(2, "ibd_undo_start: "
"reclaiming failed");
ibd_poll_compq(state, state->id_rcq_hdl);
ibt_set_cq_handler(state->id_rcq_hdl,
ibd_rcq_handler, state);
return (DDI_FAILURE);
}
}
state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
}
if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
mutex_enter(&state->id_trap_lock);
state->id_trap_stop = B_TRUE;
while (state->id_trap_inprog > 0)
cv_wait(&state->id_trap_cv, &state->id_trap_lock);
mutex_exit(&state->id_trap_lock);
state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
}
if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
/*
* Flushing the channel ensures that all pending WQE's
* are marked with flush_error and handed to the CQ. It
* does not guarantee the invocation of the CQ handler.
* This call is guaranteed to return successfully for
* UD QPNs.
*/
if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
IBT_SUCCESS) {
DPRINT(10, "ibd_undo_start: flush_channel "
"failed, ret=%d", ret);
}
/*
* Turn off Tx interrupts and poll. By the time the polling
* returns an empty indicator, we are sure we have seen all
* pending Tx callbacks. Note that after the call to
* ibt_set_cq_handler() returns, the old handler is
* guaranteed not to be invoked anymore.
*/
if (ibd_separate_cqs == 1) {
ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
}
ibd_poll_compq(state, state->id_scq_hdl);
state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
}
if (progress & IBD_DRV_ASYNC_THR_CREATED) {
/*
* No new async requests will be posted since the device
* link state has been marked as unknown; completion handlers
* have been turned off, so the Tx handler will not cause any
* more IBD_ASYNC_REAP requests.
*
* Queue a request for the async thread to exit, which will
* be serviced after any pending ones. This can take a while,
* especially if the SM is unreachable, since IBMF will slowly
* time out each SM request issued by the async thread. Reap
* the thread before continuing on; we do not want it to be
* lingering in modunloaded code (or we could move the reap
* to ibd_detach(), provided we keep track of the current
* id_async_thrid somewhere safe).
*/
ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
thread_join(state->id_async_thrid);
state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
}
if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
/*
* Drop all residual full/non membership. This includes full
* membership to the broadcast group, and any nonmembership
* acquired during transmits. We do this after the Tx completion
* handlers are done, since those might result in some late
* leaves; this also eliminates a potential race with that
* path wrt the mc full list insert/delete. Trap handling
* has also been suppressed at this point. Thus, no locks
* are required while traversing the mc full list.
*/
DPRINT(2, "ibd_undo_start: clear full cache entries");
mce = list_head(&state->id_mc_full);
while (mce != NULL) {
mgid = mce->mc_info.mc_adds_vect.av_dgid;
jstate = mce->mc_jstate;
mce = list_next(&state->id_mc_full, mce);
ibd_leave_group(state, mgid, jstate);
}
state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
}
if (progress & IBD_DRV_RXLIST_ALLOCD) {
ibd_fini_rxlist(state);
state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
}
if (progress & IBD_DRV_TXLIST_ALLOCD) {
ibd_fini_txlist(state);
state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
}
if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
IBT_SUCCESS) {
DPRINT(10, "ibd_undo_start: free_channel "
"failed, ret=%d", ret);
}
state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
}
if (progress & IBD_DRV_CQS_ALLOCD) {
if (ibd_separate_cqs == 1) {
kmem_free(state->id_txwcs,
sizeof (ibt_wc_t) * state->id_txwcs_size);
if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
IBT_SUCCESS) {
DPRINT(10, "ibd_undo_start: free_cq(scq) "
"failed, ret=%d", ret);
}
}
kmem_free(state->id_rxwcs,
sizeof (ibt_wc_t) * state->id_rxwcs_size);
if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
"ret=%d", ret);
}
state->id_txwcs = NULL;
state->id_rxwcs = NULL;
state->id_scq_hdl = NULL;
state->id_rcq_hdl = NULL;
state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
}
if (progress & IBD_DRV_ACACHE_INITIALIZED) {
mod_hash_destroy_hash(state->id_ah_active_hash);
ibd_acache_fini(state);
state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
}
if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
/*
* If we'd created the ipoib broadcast group and had
* successfully joined it, leave it now
*/
if (state->id_bgroup_created) {
mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
jstate = IB_MC_JSTATE_FULL;
(void) ibt_leave_mcg(state->id_sgid, mgid,
state->id_sgid, jstate);
}
ibt_free_mcg_info(state->id_mcinfo, 1);
state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
}
return (DDI_SUCCESS);
}
/*
* This pair of routines is used to set/clear the condition that
* the caller is likely to do something to change id_mac_state.
* If someone is already doing either a start or a stop (possibly
* due to the async handler detecting a pkey relocation event, a
* plumb or dlpi_open, or an unplumb or dlpi_close coming in), we
* wait until that's done.
*/
static void
ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
{
mutex_enter(&state->id_macst_lock);
while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
cv_wait(&state->id_macst_cv, &state->id_macst_lock);
state->id_mac_state |= flag;
mutex_exit(&state->id_macst_lock);
}
static void
ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
{
mutex_enter(&state->id_macst_lock);
state->id_mac_state &= (~flag);
cv_signal(&state->id_macst_cv);
mutex_exit(&state->id_macst_lock);
}
/*
* GLDv3 entry point to start hardware.
*/
/*ARGSUSED*/
static int
ibd_m_start(void *arg)
{
ibd_state_t *state = arg;
int ret;
ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
ret = ibd_start(state);
ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
return (ret);
}
static int
ibd_start(ibd_state_t *state)
{
kthread_t *kht;
int err;
ibt_status_t ret;
if (state->id_mac_state & IBD_DRV_STARTED)
return (DDI_SUCCESS);
/*
* Get port details; if we fail here, very likely the port
* state is inactive or the pkey can't be verified.
*/
if ((err = ibd_get_port_details(state)) != 0) {
DPRINT(10, "ibd_start: ibd_get_port_details() failed");
goto start_fail;
}
state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
/*
* Find the IPoIB broadcast group
*/
if (ibd_find_bgroup(state) != IBT_SUCCESS) {
DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
err = ENOTACTIVE;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
/*
* Initialize per-interface caches and lists; if we fail here,
* it is most likely due to a lack of resources
*/
if (ibd_acache_init(state) != DDI_SUCCESS) {
DPRINT(10, "ibd_start: ibd_acache_init() failed");
err = ENOMEM;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
/*
* Allocate send and receive completion queues
*/
if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
err = ENOMEM;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
/*
* Setup a UD channel
*/
if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
err = ENOMEM;
DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
goto start_fail;
}
state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
/*
* Allocate and initialize the tx buffer list
*/
if (ibd_init_txlist(state) != DDI_SUCCESS) {
DPRINT(10, "ibd_start: ibd_init_txlist() failed");
err = ENOMEM;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
/*
* If we have separate cqs, create the send cq handler here
*/
if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
"failed, ret=%d", ret);
err = EINVAL;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
}
/*
* Allocate and initialize the rx buffer list
*/
if (ibd_init_rxlist(state) != DDI_SUCCESS) {
DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
err = ENOMEM;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
/*
* Join IPoIB broadcast group
*/
if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
DPRINT(10, "ibd_start: ibd_join_group() failed");
err = ENOTACTIVE;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
/*
* Create the async thread; thread_create never fails.
*/
kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
TS_RUN, minclsyspri);
state->id_async_thrid = kht->t_did;
state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
/*
* When we did mac_register() in ibd_attach(), we didn't register
* the real macaddr and we didn't have the true port mtu. Now that
* we're almost ready, set the local mac address and broadcast
* address and update GLDv3 with the real values of these
* parameters.
*/
ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
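/*
* The SDU we report is the port MTU less the IPoIB encapsulation
* header (IPOIB_HDRSIZE, the 4-byte type/reserved header), e.g.
* 2044 bytes for a 2048-byte IB MTU.
*/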
mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
/*
* Setup the receive cq handler
*/
ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
"failed, ret=%d", ret);
err = EINVAL;
goto start_fail;
}
state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
/*
* Setup the subnet notices handler after we've initialized the acache/
* mcache and started the async thread, both of which are required for
* the trap handler to function properly.
*
* Now that the async thread has been started (and we've already done
* a mac_register() during attach so mac_tx_update() can be called
* if necessary without any problem), we can enable the trap handler
* to queue requests to the async thread.
*/
ibt_register_subnet_notices(state->id_ibt_hdl,
ibd_snet_notices_handler, state);
mutex_enter(&state->id_trap_lock);
state->id_trap_stop = B_FALSE;
mutex_exit(&state->id_trap_lock);
state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
/*
* Indicate link status to GLDv3 and higher layers. By default,
* we assume we are in up state (which must have been true at
* least at the time the broadcast mcg's were probed); if there
* were any up/down transitions by the time we get here, the
* async handler will have updated the last known state, which
* we use to tell GLDv3. The async handler will not send any
* notifications to GLDv3 till we reach here in the
* initialization sequence.
*/
state->id_mac_state |= IBD_DRV_STARTED;
mac_link_update(state->id_mh, state->id_link_state);
return (DDI_SUCCESS);
start_fail:
/*
* If we ran into a problem during ibd_start() and then hit some
* other problem while undoing our partial work, there isn't much
* we can do about it. Ignore any errors we might get from
* ibd_undo_start() and just return the original error we got.
*/
(void) ibd_undo_start(state, LINK_STATE_DOWN);
return (err);
}
/*
* GLDv3 entry point to stop hardware from receiving packets.
*/
/*ARGSUSED*/
static void
ibd_m_stop(void *arg)
{
ibd_state_t *state = (ibd_state_t *)arg;
ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
(void) ibd_undo_start(state, state->id_link_state);
ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
}
/*
* GLDv3 entry point to modify device's mac address. We do not
* allow address modifications.
*/
static int
ibd_m_unicst(void *arg, const uint8_t *macaddr)
{
ibd_state_t *state = arg;
/*
* Don't bother even comparing the macaddr if we haven't
* completed ibd_m_start().
*/
if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
return (0);
if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
return (0);
else
return (EINVAL);
}
/*
* The blocking part of the IBA join/leave operations is done out
* of here on the async thread.
*/
static void
ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
{
DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
"%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
if (op == IBD_ASYNC_JOIN) {
if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
ibd_print_warn(state, "Join multicast group failed :"
"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
}
} else {
/*
* Here, we must search for the proper mcg_info and
* use that to leave the group.
*/
ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
}
}
/*
* GLDv3 entry point for multicast enable/disable requests.
* This function queues the operation to the async thread and
* returns success for a valid multicast address.
*/
static int
ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
{
ibd_state_t *state = (ibd_state_t *)arg;
ipoib_mac_t maddr, *mcast;
ib_gid_t mgid;
ibd_req_t *req;
/*
* If we haven't completed ibd_m_start(), the async thread wouldn't
* have been started and id_bcaddr wouldn't be set, so there's
* no point in continuing.
*/
if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
return (0);
/*
* The incoming multicast address might not be aligned properly
* on a 4 byte boundary to be considered an ipoib_mac_t, so copy
* it into a properly aligned local (maddr) and access the mc gid
* fields through that instead.
*/
bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
mcast = &maddr;
/*
* Check the validity of the MCG address. We could additionally
* check that an enable/disable is not being issued on the
* "broadcast" mcg, but since this operation is only invokable
* by privileged programs anyway, we allow the flexibility to
* those dlpi apps. Note that we do not validate the "scope" of
* the IBA mcg.
*/
if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
return (EINVAL);
/*
* fill in multicast pkey and scope
*/
IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
/*
* If someone is trying to JOIN/LEAVE the broadcast group, we do
* nothing (i.e. we stay JOINed to the broadcast group joined in
* ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
* requires us to be joined to broadcast groups at all times.
* ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
* depends on this.
*/
if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
return (0);
ibd_n2h_gid(mcast, &mgid);
req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
if (req == NULL)
return (ENOMEM);
req->rq_gid = mgid;
if (add) {
DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
mgid.gid_prefix, mgid.gid_guid);
ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
} else {
DPRINT(1, "ibd_m_multicst : unset_multicast : "
"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
}
return (0);
}
/*
* The blocking part of the IBA promiscuous operations is done
* out of here on the async thread.
*/
static void
ibd_async_unsetprom(ibd_state_t *state)
{
ibd_mce_t *mce = list_head(&state->id_mc_non);
ib_gid_t mgid;
DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
while (mce != NULL) {
mgid = mce->mc_info.mc_adds_vect.av_dgid;
mce = list_next(&state->id_mc_non, mce);
ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
}
state->id_prom_op = IBD_OP_NOTSTARTED;
}
/*
* The blocking part of the IBA promiscuous operations is done
* out of here on the async thread.
*/
static void
ibd_async_setprom(ibd_state_t *state)
{
ibt_mcg_attr_t mcg_attr;
ibt_mcg_info_t *mcg_info;
ib_gid_t mgid;
uint_t numg;
int i;
char ret = IBD_OP_COMPLETED;
DPRINT(2, "ibd_async_setprom : async_set_promisc");
/*
* Obtain all active MC groups on the IB fabric with
* specified criteria (scope + Pkey + Qkey + mtu).
*/
bzero(&mcg_attr, sizeof (mcg_attr));
mcg_attr.mc_pkey = state->id_pkey;
mcg_attr.mc_scope = state->id_scope;
mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
IBT_SUCCESS) {
ibd_print_warn(state, "Could not get list of IBA multicast "
"groups");
ret = IBD_OP_ERRORED;
goto done;
}
/*
* Iterate over the returned mcg's and join as NonMember
* to the IP mcg's.
*/
for (i = 0; i < numg; i++) {
/*
* Do a NonMember JOIN on the MC group.
*/
mgid = mcg_info[i].mc_adds_vect.av_dgid;
if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
ibd_print_warn(state, "IBA promiscuous mode missed "
"multicast gid %016llx:%016llx",
(u_longlong_t)mgid.gid_prefix,
(u_longlong_t)mgid.gid_guid);
}
ibt_free_mcg_info(mcg_info, numg);
DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
done:
state->id_prom_op = ret;
}
/*
* GLDv3 entry point for multicast promiscuous enable/disable requests.
* GLDv3 assumes phys state receives more packets than multi state,
* which is not true for IPoIB. Thus, treat the multi and phys
* promiscuous states the same way to work with GLDv3's assumption.
*/
static int
ibd_m_promisc(void *arg, boolean_t on)
{
ibd_state_t *state = (ibd_state_t *)arg;
ibd_req_t *req;
/*
* The async thread won't have been started if we haven't
* completed ibd_m_start()
*/
if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
return (0);
req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
if (req == NULL)
return (ENOMEM);
if (on) {
DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
} else {
DPRINT(1, "ibd_m_promisc : unset_promisc");
ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
}
return (0);
}
/*
* GLDv3 entry point for gathering statistics.
*/
static int
ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
{
ibd_state_t *state = (ibd_state_t *)arg;
switch (stat) {
case MAC_STAT_IFSPEED:
*val = state->id_link_speed;
break;
case MAC_STAT_MULTIRCV:
*val = state->id_multi_rcv;
break;
case MAC_STAT_BRDCSTRCV:
*val = state->id_brd_rcv;
break;
case MAC_STAT_MULTIXMT:
*val = state->id_multi_xmt;
break;
case MAC_STAT_BRDCSTXMT:
*val = state->id_brd_xmt;
break;
case MAC_STAT_RBYTES:
*val = state->id_rcv_bytes;
break;
case MAC_STAT_IPACKETS:
*val = state->id_rcv_pkt;
break;
case MAC_STAT_OBYTES:
*val = state->id_xmt_bytes;
break;
case MAC_STAT_OPACKETS:
*val = state->id_xmt_pkt;
break;
case MAC_STAT_OERRORS:
*val = state->id_ah_error; /* failed AH translation */
break;
case MAC_STAT_IERRORS:
*val = 0;
break;
case MAC_STAT_NOXMTBUF:
*val = state->id_tx_short;
break;
case MAC_STAT_NORCVBUF:
default:
return (ENOTSUP);
}
return (0);
}
static void
ibd_async_txsched(ibd_state_t *state)
{
ibd_req_t *req;
int ret;
if (ibd_txcomp_poll)
ibd_poll_compq(state, state->id_scq_hdl);
ret = ibd_resume_transmission(state);
if (ret && ibd_txcomp_poll) {
req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
if (req != NULL) {
ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
} else {
ibd_print_warn(state, "ibd_async_txsched: "
"no memory, can't schedule work slot");
}
}
}
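/*
* Check whether the resource we were short on earlier (send wqe's
* or LSO buffers) has been replenished past its threshold; if so,
* clear the need flag and ask GLDv3 to resume transmission.
*/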
static int
ibd_resume_transmission(ibd_state_t *state)
{
int flag;
int met_thresh = 0;
int ret = -1;
mutex_enter(&state->id_sched_lock);
if (state->id_sched_needed & IBD_RSRC_SWQE) {
met_thresh = (state->id_tx_list.dl_cnt >
IBD_FREE_SWQES_THRESH);
flag = IBD_RSRC_SWQE;
} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
ASSERT(state->id_lso != NULL);
met_thresh = (state->id_lso->bkt_nfree >
IBD_FREE_LSOS_THRESH);
flag = IBD_RSRC_LSOBUF;
}
if (met_thresh) {
state->id_sched_needed &= ~flag;
ret = 0;
}
mutex_exit(&state->id_sched_lock);
if (ret == 0)
mac_tx_update(state->id_mh);
return (ret);
}
/*
* Release the send wqe back into the free list.
*/
static void
ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
{
/*
* Add back on Tx list for reuse.
*/
swqe->swqe_next = NULL;
mutex_enter(&state->id_tx_list.dl_mutex);
if (state->id_tx_list.dl_pending_sends) {
state->id_tx_list.dl_pending_sends = B_FALSE;
}
if (state->id_tx_list.dl_head == NULL) {
state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
} else {
state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
}
state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
state->id_tx_list.dl_cnt++;
mutex_exit(&state->id_tx_list.dl_mutex);
}
/*
* Acquire a send wqe from the free list.
* Returns an error number and the send wqe pointer.
*/
static int
ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe)
{
int rc = 0;
ibd_swqe_t *wqe;
/*
* Check and reclaim some of the completed Tx requests.
* If someone else is already in this code and pulling Tx
* completions, no need to poll, since the current lock holder
* will do the work anyway. Normally, we poll for completions
* every few Tx attempts, but if we are short on Tx descriptors,
* we always try to poll.
*/
if ((ibd_txcomp_poll == 1) &&
(state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) {
ibd_poll_compq(state, state->id_scq_hdl);
}
/*
* Grab a transmit wqe from the free list.
*/
mutex_enter(&state->id_tx_list.dl_mutex);
wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
if (wqe != NULL) {
state->id_tx_list.dl_cnt -= 1;
state->id_tx_list.dl_head = wqe->swqe_next;
if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
state->id_tx_list.dl_tail = NULL;
} else {
/*
* If we did not find a free wqe, flag that we're out of
* resources and note that there are sends pending.
*/
rc = ENOENT;
state->id_tx_list.dl_pending_sends = B_TRUE;
DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
atomic_add_64(&state->id_tx_short, 1);
}
mutex_exit(&state->id_tx_list.dl_mutex);
*swqe = wqe;
return (rc);
}
static int
ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
ibt_ud_dest_hdl_t ud_dest)
{
mblk_t *nmp;
int iph_len, tcph_len;
ibt_wr_lso_t *lso;
uintptr_t ip_start, tcp_start;
uint8_t *dst;
uint_t pending, mblen;
/*
* The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
* we need to adjust it here for lso.
*/
lso = &(node->w_swr.wr.ud_lso);
lso->lso_ud_dest = ud_dest;
lso->lso_mss = mss;
/*
* Calculate the LSO header size and set it in the UD LSO structure.
* Note that the only assumption we make is that each of the IPoIB,
* IP and TCP headers will be contained in a single mblk fragment;
* together, the headers may span multiple mblk fragments.
*/
nmp = mp;
ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
ip_start = (uintptr_t)nmp->b_cont->b_rptr
+ (ip_start - (uintptr_t)(nmp->b_wptr));
nmp = nmp->b_cont;
}
iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
tcp_start = ip_start + iph_len;
if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
tcp_start = (uintptr_t)nmp->b_cont->b_rptr
+ (tcp_start - (uintptr_t)(nmp->b_wptr));
nmp = nmp->b_cont;
}
tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
/*
* If the lso header fits entirely within a single mblk fragment,
* we'll avoid an additional copy of the lso header here and just
* pass the b_rptr of the mblk directly.
*
* If this isn't true, we'd have to allocate for it explicitly.
*/
if (lso->lso_hdr_sz <= MBLKL(mp)) {
lso->lso_hdr = mp->b_rptr;
} else {
/* On work completion, remember to free this allocated hdr */
lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
if (lso->lso_hdr == NULL) {
DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
"sz = %d", lso->lso_hdr_sz);
lso->lso_hdr_sz = 0;
lso->lso_mss = 0;
return (-1);
}
}
/*
* Copy in the lso header only if we need to
*/
if (lso->lso_hdr != mp->b_rptr) {
dst = lso->lso_hdr;
pending = lso->lso_hdr_sz;
for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
mblen = MBLKL(nmp);
if (pending > mblen) {
bcopy(nmp->b_rptr, dst, mblen);
dst += mblen;
pending -= mblen;
} else {
bcopy(nmp->b_rptr, dst, pending);
break;
}
}
}
return (0);
}
static void
ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
{
ibt_wr_lso_t *lso;
if ((!node) || (!mp))
return;
/*
* Free any header space that we might've allocated if we
* did an LSO
*/
if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
lso = &(node->w_swr.wr.ud_lso);
if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
lso->lso_hdr = NULL;
lso->lso_hdr_sz = 0;
}
}
}
static void
ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
{
uint_t i;
uint_t num_posted;
uint_t n_wrs;
ibt_status_t ibt_status;
ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE];
ibd_swqe_t *elem;
ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE];
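/*
* Sends are chained onto id_tx_head; whichever thread finds
* id_tx_busy clear becomes the dispatcher and posts the chain to
* the HCA in batches of up to IBD_MAX_POST_MULTIPLE work requests,
* keeping ibt_post_send() single-threaded without blocking the
* other senders.
*/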
node->swqe_next = NULL;
mutex_enter(&state->id_txpost_lock);
/*
* Enqueue the new node in chain of wqes to send
*/
if (state->id_tx_head) {
*(state->id_tx_tailp) = (ibd_wqe_t *)node;
} else {
state->id_tx_head = node;
}
state->id_tx_tailp = &(node->swqe_next);
/*
* If someone else is helping out with the sends,
* just go back
*/
if (state->id_tx_busy) {
mutex_exit(&state->id_txpost_lock);
return;
}
/*
* Otherwise, mark the flag to indicate that we'll be
* doing the dispatch of what's there in the wqe chain
*/
state->id_tx_busy = 1;
while (state->id_tx_head) {
/*
* Collect pending requests, IBD_MAX_POST_MULTIPLE wrs
* at a time if possible, and keep posting them.
*/
for (n_wrs = 0, elem = state->id_tx_head;
(elem) && (n_wrs < IBD_MAX_POST_MULTIPLE);
elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
nodes[n_wrs] = elem;
wrs[n_wrs] = elem->w_swr;
}
state->id_tx_head = elem;
/*
* Release the txpost lock before posting the
* send request to the hca.
*/
mutex_exit(&state->id_txpost_lock);
ASSERT(n_wrs != 0);
/*
* If posting fails for some reason, we'll never receive
* completion intimation, so we'll need to cleanup. But
* we need to make sure we don't clean up nodes whose
* wrs have been successfully posted. We assume that the
* hca driver returns on the first failure to post and
* therefore the first 'num_posted' entries don't need
* cleanup here.
*/
num_posted = 0;
ibt_status = ibt_post_send(state->id_chnl_hdl,
wrs, n_wrs, &num_posted);
if (ibt_status != IBT_SUCCESS) {
ibd_print_warn(state, "ibd_post_send: "
"posting multiple wrs failed: "
"requested=%d, done=%d, ret=%d",
n_wrs, num_posted, ibt_status);
for (i = num_posted; i < n_wrs; i++)
ibd_tx_cleanup(state, nodes[i]);
}
/*
* Grab the mutex before we go and check the tx Q again
*/
mutex_enter(&state->id_txpost_lock);
}
state->id_tx_busy = 0;
mutex_exit(&state->id_txpost_lock);
}
static int
ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
uint_t lsohdr_sz)
{
ibt_wr_ds_t *sgl;
ibt_status_t ibt_status;
mblk_t *nmp;
mblk_t *data_mp;
uchar_t *bufp;
size_t blksize;
size_t skip;
size_t avail;
uint_t pktsize;
uint_t frag_len;
uint_t pending_hdr;
uint_t hiwm;
int nmblks;
int i;
/*
* Let's skip ahead to the data if this is LSO
*/
data_mp = mp;
pending_hdr = 0;
if (lsohdr_sz) {
pending_hdr = lsohdr_sz;
for (nmp = mp; nmp; nmp = nmp->b_cont) {
frag_len = nmp->b_wptr - nmp->b_rptr;
if (frag_len > pending_hdr)
break;
pending_hdr -= frag_len;
}
data_mp = nmp; /* start of data past lso header */
ASSERT(data_mp != NULL);
}
/*
* Calculate the size of message data and number of msg blocks
*/
pktsize = 0;
for (nmblks = 0, nmp = data_mp; nmp != NULL;
nmp = nmp->b_cont, nmblks++) {
pktsize += MBLKL(nmp);
}
pktsize -= pending_hdr;
/*
* Translating the virtual address regions into physical regions
* for using the Reserved LKey feature results in a wr sgl that
* is a little longer. Since failing ibt_map_mem_iov() is costly,
* we'll fix a high-water mark (65%) for when we should stop.
*/
hiwm = (state->id_max_sqseg * 65) / 100;
/*
* We only do ibt_map_mem_iov() if the pktsize is above the
* "copy-threshold", and if the number of mp fragments is less than
* the maximum acceptable.
*/
if ((state->id_hca_res_lkey_capab) &&
(pktsize > IBD_TX_COPY_THRESH) &&
(nmblks < hiwm)) {
ibt_iov_t iov_arr[IBD_MAX_SQSEG];
ibt_iov_attr_t iov_attr;
iov_attr.iov_as = NULL;
iov_attr.iov = iov_arr;
iov_attr.iov_buf = NULL;
iov_attr.iov_list_len = nmblks;
iov_attr.iov_wr_nds = state->id_max_sqseg;
iov_attr.iov_lso_hdr_sz = lsohdr_sz;
iov_attr.iov_flags = IBT_IOV_SLEEP;
for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
iov_arr[i].iov_len = MBLKL(nmp);
if (i == 0) {
iov_arr[i].iov_addr += pending_hdr;
iov_arr[i].iov_len -= pending_hdr;
}
}
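/*
* Hand the whole fragment list to ibt_map_mem_iov(), which
* translates the fragments and fills in the work request's sgl
* (using the reserved lkey) so the HCA can gather directly from
* the mblk fragments without a copy.
*/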
node->w_buftype = IBD_WQE_MAPPED;
node->w_swr.wr_sgl = node->w_sgl;
ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
(ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
if (ibt_status != IBT_SUCCESS) {
ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
"failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
goto ibd_copy_path;
}
return (0);
}
ibd_copy_path:
if (pktsize <= state->id_tx_buf_sz) {
node->swqe_copybuf.ic_sgl.ds_len = pktsize;
node->w_swr.wr_nds = 1;
node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
node->w_buftype = IBD_WQE_TXBUF;
/*
* Even though this is the copy path for transfers less than
* id_tx_buf_sz, it could still be an LSO packet. If so, it
* is possible the first data mblk fragment (data_mp) still
* contains part of the LSO header that we need to skip.
*/
bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
blksize = MBLKL(nmp) - pending_hdr;
bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
bufp += blksize;
pending_hdr = 0;
}
return (0);
}
/*
* Copy path for transfers greater than id_tx_buf_sz
*/
node->w_swr.wr_sgl = node->w_sgl;
if (ibd_acquire_lsobufs(state, pktsize,
node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
return (-1);
}
node->w_buftype = IBD_WQE_LSOBUF;
/*
* Copy the larger-than-id_tx_buf_sz packet into a set of
* fixed-sized, pre-mapped LSO buffers. Note that we might
* need to skip part of the LSO header in the first fragment
* as before.
*/
nmp = data_mp;
skip = pending_hdr;
for (i = 0; i < node->w_swr.wr_nds; i++) {
sgl = node->w_swr.wr_sgl + i;
bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
avail = IBD_LSO_BUFSZ;
while (nmp && avail) {
blksize = MBLKL(nmp) - skip;
if (blksize > avail) {
bcopy(nmp->b_rptr + skip, bufp, avail);
skip += avail;
avail = 0;
} else {
bcopy(nmp->b_rptr + skip, bufp, blksize);
skip = 0;
avail -= blksize;
bufp += blksize;
nmp = nmp->b_cont;
}
}
}
return (0);
}
/*
* Schedule a completion queue polling to reap the resource we're
* short on. If we implement the change to reap tx completions
* in a separate thread, we'll need to wake up that thread here.
*/
static int
ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
{
ibd_req_t *req;
mutex_enter(&state->id_sched_lock);
state->id_sched_needed |= resource_type;
mutex_exit(&state->id_sched_lock);
/*
* If we are asked to queue a work entry, we need to do it
*/
if (q_flag) {
req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
if (req == NULL)
return (-1);
ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
}
return (0);
}
/*
* The passed in packet has this format:
* IPOIB_ADDRL-byte dest addr :: 2-byte sap :: 2 bytes of 0 :: data
*/
static boolean_t
ibd_send(ibd_state_t *state, mblk_t *mp)
{
ibd_ace_t *ace;
ibd_swqe_t *node;
ipoib_mac_t *dest;
ib_header_info_t *ipibp;
ip6_t *ip6h;
uint_t pktsize;
uint32_t mss;
uint32_t hckflags;
uint32_t lsoflags = 0;
uint_t lsohdr_sz = 0;
int ret, len;
boolean_t dofree = B_FALSE;
boolean_t rc;
/*
* If we aren't done with the device initialization and start,
* we shouldn't be here.
*/
if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
return (B_FALSE);
node = NULL;
if (ibd_acquire_swqe(state, &node) != 0) {
/*
* If we don't have an swqe available, schedule a transmit
* completion queue cleanup and hold off on sending more
* packets until we have some free swqes
*/
if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
return (B_FALSE);
/*
* If a poll cannot be scheduled, we have no choice but
* to drop this packet
*/
ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
return (B_TRUE);
}
/*
* Initialize the commonly used fields in swqe to NULL to protect
* against ibd_tx_cleanup accidentally misinterpreting these on a
* failure.
*/
node->swqe_im_mblk = NULL;
node->w_swr.wr_nds = 0;
node->w_swr.wr_sgl = NULL;
node->w_swr.wr_opcode = IBT_WRC_SEND;
/*
* Obtain an address handle for the destination.
*/
ipibp = (ib_header_info_t *)mp->b_rptr;
dest = (ipoib_mac_t *)&ipibp->ib_dst;
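/*
* A multicast destination handed down by the network layer does
* not carry the IBA scope/pkey bits, so fill those in from the
* interface state before looking up (or creating) the address
* handle.
*/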
if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
pktsize = msgsize(mp);
atomic_add_64(&state->id_xmt_bytes, pktsize);
atomic_inc_64(&state->id_xmt_pkt);
if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
atomic_inc_64(&state->id_brd_xmt);
else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
atomic_inc_64(&state->id_multi_xmt);
if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
node->w_ahandle = ace;
node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
} else {
DPRINT(5,
"ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
((ret == EFAULT) ? "failed" : "queued"),
htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
htonl(dest->ipoib_gidpref[1]),
htonl(dest->ipoib_gidsuff[0]),
htonl(dest->ipoib_gidsuff[1]));
node->w_ahandle = NULL;
/*
* In poll mode, there are probably cqe's pending in the
* cq, so ibd has to poll the cq here; otherwise the acache
* entry may never be recycled.
*/
if (ibd_txcomp_poll == 1)
ibd_poll_compq(state, state->id_scq_hdl);
/*
* If ibd_acache_lookup() returns EFAULT, it means ibd
* cannot find a path for the specific dest address. We
* should get rid of this kind of packet. We also get
* rid of the packet if we cannot schedule a poll via the
* async thread. In the normal case, ibd returns the
* packet to the upper layer and waits for the AH to be
* created.
*
* Note that we always queue a work slot entry for the async
* thread when we fail AH lookup (even in intr mode); this is
* due to the convoluted way the code currently looks for AH.
*/
if (ret == EFAULT) {
dofree = B_TRUE;
rc = B_TRUE;
} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
dofree = B_TRUE;
rc = B_TRUE;
} else {
dofree = B_FALSE;
rc = B_FALSE;
}
goto ibd_send_fail;
}
/*
* For ND6 packets, padding is needed at the front of the
* source lladdr. Insert the padding here.
*/
if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
if (!pullupmsg(mp, IPV6_HDR_LEN +
sizeof (ib_header_info_t))) {
DPRINT(10, "ibd_send: pullupmsg failure ");
dofree = B_TRUE;
rc = B_TRUE;
goto ibd_send_fail;
}
ipibp = (ib_header_info_t *)mp->b_rptr;
}
ip6h = (ip6_t *)((uchar_t *)ipibp +
sizeof (ib_header_info_t));
len = ntohs(ip6h->ip6_plen);
if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
mblk_t *pad;
/* allocb() can fail; drop the packet rather than deref NULL */
if ((pad = allocb(4, 0)) == NULL) {
dofree = B_TRUE;
rc = B_TRUE;
goto ibd_send_fail;
}
pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
linkb(mp, pad);
if (MBLKL(mp) < sizeof (ib_header_info_t) +
IPV6_HDR_LEN + len + 4) {
if (!pullupmsg(mp, sizeof (ib_header_info_t) +
IPV6_HDR_LEN + len + 4)) {
DPRINT(10, "ibd_send: pullupmsg "
"failure ");
dofree = B_TRUE;
rc = B_TRUE;
goto ibd_send_fail;
}
ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
sizeof (ib_header_info_t));
}
/* LINTED: E_CONSTANT_CONDITION */
IBD_PAD_NSNA(ip6h, len, IBD_SEND);
}
}
mp->b_rptr += sizeof (ib_addrs_t);
/*
* Do LSO and checksum related work here. For LSO send, adjust the
* ud destination, the opcode and the LSO header information to the
* work request.
*/
lso_info_get(mp, &mss, &lsoflags);
if ((lsoflags & HW_LSO) != HW_LSO) {
node->w_swr.wr_opcode = IBT_WRC_SEND;
lsohdr_sz = 0;
} else {
if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
/*
* The routine can only fail if there's no memory; we
* can only drop the packet if this happens
*/
ibd_print_warn(state,
"ibd_send: no memory, lso posting failed");
dofree = B_TRUE;
rc = B_TRUE;
goto ibd_send_fail;
}
node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
}
hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
else
node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
/*
* Prepare the sgl for posting; the routine can only fail if
* there's no lso buf available for posting. If this is the case,
* we should probably reschedule for lso bufs to become available
* and then try again.
*/
if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
dofree = B_TRUE;
rc = B_TRUE;
} else {
dofree = B_FALSE;
rc = B_FALSE;
}
goto ibd_send_fail;
}
node->swqe_im_mblk = mp;
/*
* Queue the wqe to hardware; since we can now simply queue a
* post instead of doing it serially, we cannot assume anything
* about the 'node' after ibd_post_send() returns.
*/
ibd_post_send(state, node);
return (B_TRUE);
ibd_send_fail:
if (node && mp)
ibd_free_lsohdr(node, mp);
if (dofree)
freemsg(mp);
if (node != NULL)
ibd_tx_cleanup(state, node);
return (rc);
}
/*
* GLDv3 entry point for transmitting datagram.
*/
static mblk_t *
ibd_m_tx(void *arg, mblk_t *mp)
{
ibd_state_t *state = (ibd_state_t *)arg;
mblk_t *next;
if (state->id_link_state != LINK_STATE_UP) {
freemsgchain(mp);
mp = NULL;
}
while (mp != NULL) {
next = mp->b_next;
mp->b_next = NULL;
if (ibd_send(state, mp) == B_FALSE) {
/* Send fail */
mp->b_next = next;
break;
}
mp = next;
}
return (mp);
}
/*
* This handles Tx and Rx completions. With separate CQs, this
* handles only Rx completions.
*/
static uint_t
ibd_intr(char *arg)
{
ibd_state_t *state = (ibd_state_t *)arg;
ibd_poll_compq(state, state->id_rcq_hdl);
return (DDI_INTR_CLAIMED);
}
/*
* Poll and drain the cq
*/
static uint_t
ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
uint_t numwcs)
{
ibd_wqe_t *wqe;
ibt_wc_t *wc;
uint_t total_polled = 0;
uint_t num_polled;
int i;
while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
total_polled += num_polled;
for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
ASSERT((wqe->w_type == IBD_WQE_SEND) ||
(wqe->w_type == IBD_WQE_RECV));
if (wc->wc_status != IBT_WC_SUCCESS) {
/*
* Channel being torn down.
*/
if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
DPRINT(5, "ibd_drain_cq: flush error");
/*
* Only invoke the Tx handler to
* release possibly held resources
* like AH refcount etc. We cannot
* invoke the Rx handler because it
* might try adding buffers to the
* Rx pool when we are trying to
* deinitialize.
*/
if (wqe->w_type == IBD_WQE_RECV) {
continue;
} else {
DPRINT(10, "ibd_drain_cq: Bad "
"status %d", wc->wc_status);
}
}
}
if (wqe->w_type == IBD_WQE_SEND) {
ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
} else {
ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
}
}
}
return (total_polled);
}
/*
* Common code for interrupt handling as well as for polling
* for all completed wqe's while detaching.
*/
static void
ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
{
ibt_wc_t *wcs;
uint_t numwcs;
int flag, redo_flag;
int redo = 1;
uint_t num_polled = 0;
if (ibd_separate_cqs == 1) {
if (cq_hdl == state->id_rcq_hdl) {
flag = IBD_RX_CQ_POLLING;
redo_flag = IBD_REDO_RX_CQ_POLLING;
} else {
flag = IBD_TX_CQ_POLLING;
redo_flag = IBD_REDO_TX_CQ_POLLING;
}
} else {
flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
}
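/*
* If another thread is already polling this cq, just note that a
* redo is needed and let that thread make another pass; this keeps
* the draining of each cq single-threaded.
*/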
mutex_enter(&state->id_cq_poll_lock);
if (state->id_cq_poll_busy & flag) {
state->id_cq_poll_busy |= redo_flag;
mutex_exit(&state->id_cq_poll_lock);
return;
}
state->id_cq_poll_busy |= flag;
mutex_exit(&state->id_cq_poll_lock);
/*
* In some cases (eg detaching), this code can be invoked on
* any cpu after disabling cq notification (thus no concurrency
* exists). Apart from that, the following applies normally:
* The receive completion handling is always on the Rx interrupt
* cpu. Transmit completion handling could be from any cpu if
* Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
* is interrupt driven. Combined completion handling is always
* on the interrupt cpu. Thus, lock accordingly and use the
* proper completion array.
*/
if (ibd_separate_cqs == 1) {
if (cq_hdl == state->id_rcq_hdl) {
wcs = state->id_rxwcs;
numwcs = state->id_rxwcs_size;
} else {
wcs = state->id_txwcs;
numwcs = state->id_txwcs_size;
}
} else {
wcs = state->id_rxwcs;
numwcs = state->id_rxwcs_size;
}
/*
* Poll and drain the CQ
*/
num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);
/*
* Enable CQ notifications and redrain the cq to catch any
* completions we might have missed after the ibd_drain_cq()
* above and before the ibt_enable_cq_notify() that follows.
* Finally, service any new requests to poll the cq that
* could've come in after the ibt_enable_cq_notify().
*/
do {
if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
IBT_SUCCESS) {
DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
}
num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);
mutex_enter(&state->id_cq_poll_lock);
if (state->id_cq_poll_busy & redo_flag)
state->id_cq_poll_busy &= ~redo_flag;
else {
state->id_cq_poll_busy &= ~flag;
redo = 0;
}
mutex_exit(&state->id_cq_poll_lock);
} while (redo);
/*
* If we polled the receive cq and found anything, we need to flush
* it out to the nw layer here.
*/
if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
ibd_flush_rx(state, NULL);
}
}
/*
* Unmap the memory area associated with a given swqe.
*/
static void
ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
{
ibt_status_t stat;
DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
if (swqe->w_mi_hdl) {
if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
swqe->w_mi_hdl)) != IBT_SUCCESS) {
DPRINT(10,
"failed in ibt_unmap_mem_iov, ret=%d\n", stat);
}
swqe->w_mi_hdl = NULL;
}
swqe->w_swr.wr_nds = 0;
}
/*
* Common code that deals with clean ups after a successful or
* erroneous transmission attempt.
*/
static void
ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
{
ibd_ace_t *ace = swqe->w_ahandle;
DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
/*
* If this was a dynamic mapping in ibd_send(), we need to
* unmap here. If this was an lso buffer we'd used for sending,
* we need to release the lso buf to the pool, since the resource
* is scarce. However, if this was simply a normal send using
* the copybuf (present in each swqe), we don't need to release it.
*/
if (swqe->swqe_im_mblk != NULL) {
if (swqe->w_buftype == IBD_WQE_MAPPED) {
ibd_unmap_mem(state, swqe);
} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
ibd_release_lsobufs(state,
swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
}
ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
freemsg(swqe->swqe_im_mblk);
swqe->swqe_im_mblk = NULL;
}
/*
* Drop the reference count on the AH; it can be reused
* now for a different destination if there are no more
* posted sends that will use it. This can be eliminated
* if we can always associate each Tx buffer with an AH.
* The ace can be null if we are cleaning up from the
* ibd_send() error path.
*/
if (ace != NULL) {
/*
* The recycling logic can be eliminated from here
* and put into the async thread if we create another
* list to hold ACE's for unjoined mcg's.
*/
if (DEC_REF_DO_CYCLE(ace)) {
ibd_mce_t *mce;
/*
* Check with the lock taken: we decremented
* reference count without the lock, and some
* transmitter might already have bumped the
* reference count (possible in case of multicast
* disable when we leave the AH on the active
* list). If not still 0, get out, leaving the
* recycle bit intact.
*
* Atomically transition the AH from active
* to free list, and queue a work request to
* leave the group and destroy the mce. No
* transmitter can be looking at the AH or
* the MCE in between, since we have the
* ac_mutex lock. In the SendOnly reap case,
* it is not necessary to hold the ac_mutex
* and recheck the ref count (since the AH was
* taken off the active list), we just do it
* to have uniform processing with the Full
* reap case.
*/
mutex_enter(&state->id_ac_mutex);
mce = ace->ac_mce;
if (GET_REF_CYCLE(ace) == 0) {
CLEAR_REFCYCLE(ace);
/*
* Identify the case of fullmember reap as
* opposed to mcg trap reap. Also, port up
* might set ac_mce to NULL to indicate Tx
* cleanup should do no more than put the
* AH in the free list (see ibd_async_link).
*/
if (mce != NULL) {
ace->ac_mce = NULL;
IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
/*
* mc_req was initialized at mce
* creation time.
*/
ibd_queue_work_slot(state,
&mce->mc_req, IBD_ASYNC_REAP);
}
IBD_ACACHE_INSERT_FREE(state, ace);
}
mutex_exit(&state->id_ac_mutex);
}
}
/*
* Release the send wqe for reuse.
*/
ibd_release_swqe(state, swqe);
}
/*
* Hand off the processed rx mp chain to mac_rx()
*/
static void
ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
{
if (mpc == NULL) {
mutex_enter(&state->id_rx_lock);
mpc = state->id_rx_mp;
state->id_rx_mp = NULL;
state->id_rx_mp_tail = NULL;
state->id_rx_mp_len = 0;
mutex_exit(&state->id_rx_lock);
}
if (mpc) {
mac_rx(state->id_mh, state->id_rh, mpc);
}
}
/*
* Processing to be done after receipt of a packet; hand the
* packet off to GLDv3 in the format it expects. The received
* packet has this format: 2b sap :: 00 :: data.
*/
static void
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
ib_header_info_t *phdr;
mblk_t *mp;
mblk_t *mpc = NULL;
ipoib_hdr_t *ipibp;
ipha_t *iphap;
ip6_t *ip6h;
int rxcnt, len;
/*
* Track number handed to upper layer, and number still
* available to receive packets.
*/
rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
ASSERT(rxcnt >= 0);
atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
/*
* Adjust write pointer depending on how much data came in.
*/
mp = rwqe->rwqe_im_mblk;
mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
/*
* Make sure this is NULL or we're in trouble.
*/
if (mp->b_next != NULL) {
ibd_print_warn(state,
"ibd_process_rx: got duplicate mp from rcq?");
mp->b_next = NULL;
}
/*
* The IB link will deliver one of the IB link layer
* headers, the Global Routing Header (GRH). The ibd
* driver uses the information in the GRH to build the
* header_info structure and pass it with the datagram up
* to GLDv3.
* If the GRH is not valid, indicate this to GLDv3 by
* setting the VerTcFlow field to 0.
*/
phdr = (ib_header_info_t *)mp->b_rptr;
if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
/* if it is a loopback packet, just drop it. */
if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
IPOIB_ADDRL) == 0) {
freemsg(mp);
return;
}
ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
sizeof (ipoib_mac_t));
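/*
* A destination GID whose top byte is 0xFF is an IBA multicast
* GID; mark the destination with the multicast QPN so it is seen
* as a multicast/broadcast address. Otherwise the packet was
* unicast to us.
*/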
if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
} else {
phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
}
} else {
/*
* It can not be a IBA multicast packet. Must have been
* unicast for us. Just copy the interface address to dst.
*/
phdr->ib_grh.ipoib_vertcflow = 0;
ovbcopy(&state->id_macaddr, &phdr->ib_dst,
sizeof (ipoib_mac_t));
}
/*
* For ND6 packets, padding is at the front of the source/target
* lladdr. However the inet6 layer is not aware of it, hence remove
* the padding from such packets.
*/
ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
if (!pullupmsg(mp, IPV6_HDR_LEN +
sizeof (ipoib_hdr_t))) {
DPRINT(10, "ibd_process_rx: pullupmsg failed");
freemsg(mp);
return;
}
ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
sizeof (ipoib_pgrh_t));
}
ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
len = ntohs(ip6h->ip6_plen);
if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
IPV6_HDR_LEN + len) {
if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
IPV6_HDR_LEN + len)) {
DPRINT(10, "ibd_process_rx: pullupmsg"
" failed");
freemsg(mp);
return;
}
ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
sizeof (ipoib_pgrh_t) +
sizeof (ipoib_hdr_t));
}
/* LINTED: E_CONSTANT_CONDITION */
IBD_PAD_NSNA(ip6h, len, IBD_RECV);
}
}
/*
* Update statistics
*/
atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
atomic_inc_64(&state->id_rcv_pkt);
if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
atomic_inc_64(&state->id_brd_rcv);
else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
atomic_inc_64(&state->id_multi_rcv);
iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
/*
* Set the receive checksum status in mp.
* Hardware checksumming can be considered valid only if:
* 1. CQE.IP_OK bit is set
* 2. CQE.CKSUM = 0xffff
* 3. IPv6 routing header is not present in the packet
* 4. There are no IP options in the IP header
*/
if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
(wc->wc_cksum == 0xFFFF) &&
(iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
}
/*
* Add this mp to the list of processed mp's to send to
* the nw layer
*/
mutex_enter(&state->id_rx_lock);
if (state->id_rx_mp) {
ASSERT(state->id_rx_mp_tail != NULL);
state->id_rx_mp_tail->b_next = mp;
} else {
ASSERT(state->id_rx_mp_tail == NULL);
state->id_rx_mp = mp;
}
state->id_rx_mp_tail = mp;
state->id_rx_mp_len++;
if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
mpc = state->id_rx_mp;
state->id_rx_mp = NULL;
state->id_rx_mp_tail = NULL;
state->id_rx_mp_len = 0;
}
mutex_exit(&state->id_rx_lock);
if (mpc) {
ibd_flush_rx(state, mpc);
}
}
/*
* Callback code invoked from STREAMs when the receive data buffer is
* free for recycling.
*/
static void
ibd_freemsg_cb(char *arg)
{
ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
ibd_state_t *state = rwqe->w_state;
/*
* If the wqe is being destructed, do not attempt recycling.
*/
if (rwqe->w_freeing_wqe == B_TRUE) {
DPRINT(6, "ibd_freemsg: wqe being freed");
return;
} else {
/*
* Upper layer has released held mblk, so we have
* no more use for keeping the old pointer in
* our rwqe.
*/
rwqe->rwqe_im_mblk = NULL;
}
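/*
* Wrap the same pre-mapped copy buffer in a fresh mblk and repost
* the rwqe so it can be used to receive again.
*/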
rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
if (rwqe->rwqe_im_mblk == NULL) {
ibd_delete_rwqe(state, rwqe);
ibd_free_rwqe(state, rwqe);
DPRINT(6, "ibd_freemsg: desballoc failed");
return;
}
if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
ibd_delete_rwqe(state, rwqe);
ibd_free_rwqe(state, rwqe);
return;
}
atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
}
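/*
* Reap Tx completions and resume any transmissions that were
* blocked waiting on send resources.
*/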
static uint_t
ibd_tx_recycle(char *arg)
{
ibd_state_t *state = (ibd_state_t *)arg;
/*
* Poll for completed entries
*/
ibd_poll_compq(state, state->id_scq_hdl);
/*
* Resume any blocked transmissions if possible
*/
(void) ibd_resume_transmission(state);
return (DDI_INTR_CLAIMED);
}
#ifdef IBD_LOGGING
static void
ibd_log_init(void)
{
ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
ibd_lbuf_ndx = 0;
mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
}
static void
ibd_log_fini(void)
{
if (ibd_lbuf)
kmem_free(ibd_lbuf, IBD_LOG_SZ);
ibd_lbuf_ndx = 0;
ibd_lbuf = NULL;
mutex_destroy(&ibd_lbuf_lock);
}
static void
ibd_log(const char *fmt, ...)
{
va_list ap;
uint32_t off;
uint32_t msglen;
char tmpbuf[IBD_DMAX_LINE];
if (ibd_lbuf == NULL)
return;
va_start(ap, fmt);
msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
va_end(ap);
if (msglen >= IBD_DMAX_LINE)
msglen = IBD_DMAX_LINE - 1;
mutex_enter(&ibd_lbuf_lock);
off = ibd_lbuf_ndx; /* current msg should go here */
if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
ibd_lbuf_ndx += msglen; /* place where next msg should start */
ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */
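/*
* Wrap to the start of the buffer once we're within two
* max-length lines of the end, so the next message always fits.
*/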
if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
ibd_lbuf_ndx = 0;
mutex_exit(&ibd_lbuf_lock);
bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */
}
#endif