/* eib_svc.c revision b494511a9cf72b1fc4eb13a0e593f55c624ab829 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/callb.h>
#include <sys/mac_provider.h>
#include <sys/ib/clients/eoib/eib_impl.h>
/*
 * eib_events_handler: service thread that processes asynchronous EoIB
 * events (port state changes, pkey/sgid changes, gateway notifications)
 * queued on ss->ei_event by eib_svc_enqueue_event().  Runs until it
 * dequeues an EIB_EV_SHUTDOWN event.
 */
void
eib_events_handler(eib_t *ss)
{
	eib_event_t *evi;
	eib_event_t *nxt;
	kmutex_t ci_lock;	/* protects the CPR callback info */
	callb_cpr_t ci;

	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR);

wait_for_event:
	mutex_enter(&ss->ei_ev_lock);
	while ((evi = ss->ei_event) == NULL) {
		/*
		 * Mark ourselves CPR-safe while blocked, so a system
		 * suspend does not have to wait on this thread.
		 */
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		cv_wait(&ss->ei_ev_cv, &ss->ei_ev_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	/*
	 * Are we being asked to die ?  A shutdown event is queued at the
	 * head of the list (see eib_svc_enqueue_event()), so seeing one
	 * here means we discard every remaining queued event and exit.
	 *
	 * NOTE(review): this drain frees each eib_event_t but not its
	 * ev_arg; a queued EIB_EV_GW_INFO_UPDATE would leak its
	 * eib_gw_info_t -- confirm shutdown ordering makes that
	 * impossible.
	 */
	if (evi->ev_code == EIB_EV_SHUTDOWN) {
		while (evi) {
			nxt = evi->ev_next;
			kmem_free(evi, sizeof (eib_event_t));
			evi = nxt;
		}
		ss->ei_event = NULL;
		mutex_exit(&ss->ei_ev_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}

	/*
	 * Otherwise, pull out the first entry from our work queue
	 */
	ss->ei_event = evi->ev_next;
	evi->ev_next = NULL;
	mutex_exit(&ss->ei_ev_lock);

	/*
	 * Process this event
	 *
	 * Note that we don't want to race with plumb/unplumb in this
	 * handler, since we may have to restart vnics or do stuff that
	 * may get re-initialized or released if we allowed plumb/unplumb
	 * to happen in parallel.
	 */
	eib_mac_set_nic_state(ss, EIB_NIC_RESTARTING);

	switch (evi->ev_code) {
	case EIB_EV_PORT_DOWN:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin EIB_EV_PORT_DOWN");

		eib_mac_link_down(ss, B_FALSE);

		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: End EIB_EV_PORT_DOWN");
		break;

	case EIB_EV_PORT_UP:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin EIB_EV_PORT_UP");

		eib_ibt_link_mod(ss);

		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: End EIB_EV_PORT_UP");
		break;

	case EIB_EV_PKEY_CHANGE:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin EIB_EV_PKEY_CHANGE");

		eib_ibt_link_mod(ss);

		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: End EIB_EV_PKEY_CHANGE");
		break;

	case EIB_EV_SGID_CHANGE:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin EIB_EV_SGID_CHANGE");

		eib_ibt_link_mod(ss);

		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: End EIB_EV_SGID_CHANGE");
		break;

	case EIB_EV_CLNT_REREG:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin EIB_EV_CLNT_REREG");

		eib_ibt_link_mod(ss);

		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: End EIB_EV_CLNT_REREG");
		break;

	case EIB_EV_GW_UP:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin EIB_EV_GW_UP");

		/*
		 * EoIB nexus has notified us that our gateway is now
		 * reachable. Unless we already think it is reachable,
		 * mark it so in our records and try to resurrect dead
		 * vnics.
		 */
		mutex_enter(&ss->ei_vnic_lock);
		if (ss->ei_gw_unreachable == B_FALSE) {
			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_events_handler: gw reachable");
			mutex_exit(&ss->ei_vnic_lock);

			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_events_handler: End EIB_EV_GW_UP");
			break;
		}
		ss->ei_gw_unreachable = B_FALSE;
		mutex_exit(&ss->ei_vnic_lock);

		/*
		 * If we've not even started yet, we have nothing to do.
		 */
		if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) {
			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_events_handler: End EIB_EV_GW_UP");
			break;
		}

		if (eib_mac_hca_portstate(ss, NULL, NULL) != EIB_E_SUCCESS) {
			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_events_handler: "
			    "HCA portstate failed, marking link down");

			eib_mac_link_down(ss, B_FALSE);
		} else {
			uint8_t vn0_mac[ETHERADDRL];

			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_events_handler: "
			    "HCA portstate ok, resurrecting zombies");

			/*
			 * vn0_mac starts zeroed; eib_vnic_resurrect_zombies()
			 * fills in the new unicast mac (if any) of vnic 0.
			 */
			bcopy(eib_zero_mac, vn0_mac, ETHERADDRL);
			eib_vnic_resurrect_zombies(ss, vn0_mac);

			/*
			 * If we've resurrected the zombies because the gateway
			 * went down and came back, it is possible our unicast
			 * mac address changed from what it was earlier. If
			 * so, we need to update our unicast address with the
			 * mac layer before marking the link up.
			 */
			if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0) {
				EIB_DPRINTF_DEBUG(ss->ei_instance,
				    "eib_events_handler: updating unicast "
				    "addr to %x:%x:%x:%x:%x:%x", vn0_mac[0],
				    vn0_mac[1], vn0_mac[2], vn0_mac[3],
				    vn0_mac[4], vn0_mac[5]);

				mac_unicst_update(ss->ei_mac_hdl, vn0_mac);
			}

			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_events_handler: eib_mac_link_up(B_FALSE)");

			eib_mac_link_up(ss, B_FALSE);
		}

		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: End EIB_EV_GW_UP");
		break;

	case EIB_EV_GW_INFO_UPDATE:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin EIB_EV_GW_INFO_UPDATE");

		/* ev_arg carries an eib_gw_info_t we own and must free */
		if (evi->ev_arg) {
			eib_update_props(ss, (eib_gw_info_t *)(evi->ev_arg));
			kmem_free(evi->ev_arg, sizeof (eib_gw_info_t));
		}

		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: End EIB_EV_GW_INFO_UPDATE");
		break;

	case EIB_EV_MCG_DELETED:
		/* no action currently taken for these notifications */
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin-End EIB_EV_MCG_DELETED");
		break;

	case EIB_EV_MCG_CREATED:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin-End EIB_EV_MCG_CREATED");
		break;

	case EIB_EV_GW_EPORT_DOWN:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin-End EIB_EV_GW_EPORT_DOWN");
		break;

	case EIB_EV_GW_DOWN:
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_events_handler: Begin-End EIB_EV_GW_DOWN");
		break;
	}

	eib_mac_clr_nic_state(ss, EIB_NIC_RESTARTING);

	kmem_free(evi, sizeof (eib_event_t));
	goto wait_for_event;

	/*NOTREACHED*/
}
/*
 * eib_svc_enqueue_event: queue an event for eib_events_handler() and
 * wake it up.  The caller hands over ownership of evi.
 */
void
eib_svc_enqueue_event(eib_t *ss, eib_event_t *evi)
{
	eib_event_t **lastp;

	mutex_enter(&ss->ei_ev_lock);

	/*
	 * A shutdown notice has higher priority than anything else and
	 * is inserted at the head of the list; every other event is
	 * appended at the tail in arrival order.
	 */
	if (evi->ev_code == EIB_EV_SHUTDOWN) {
		evi->ev_next = ss->ei_event;
		ss->ei_event = evi;
	} else {
		for (lastp = &ss->ei_event; *lastp != NULL;
		    lastp = &(*lastp)->ev_next)
			;
		*lastp = evi;
	}

	cv_signal(&ss->ei_ev_cv);
	mutex_exit(&ss->ei_ev_lock);
}
/*
 * Thread to refill channels with rwqes whenever they get low.
 * Channels needing service are queued on ss->ei_rxpost; the thread
 * exits when ei_rxpost_die is set.
 */
void
eib_refill_rwqes(eib_t *ss)
{
	eib_chan_t *chan;
	kmutex_t ci_lock;	/* protects the CPR callback info */
	callb_cpr_t ci;

	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_RWQES_REFILLER);

wait_for_refill_work:
	mutex_enter(&ss->ei_rxpost_lock);

	/* sleep (CPR-safe) until there is work or we are told to die */
	while ((ss->ei_rxpost == NULL) && (ss->ei_rxpost_die == 0)) {
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		cv_wait(&ss->ei_rxpost_cv, &ss->ei_rxpost_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	/*
	 * Discard all requests for refill if we're being asked to die
	 */
	if (ss->ei_rxpost_die) {
		ss->ei_rxpost = NULL;
		mutex_exit(&ss->ei_rxpost_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}
	ASSERT(ss->ei_rxpost != NULL);

	/*
	 * Take the first element out of the queue
	 */
	chan = ss->ei_rxpost;
	ss->ei_rxpost = chan->ch_rxpost_next;
	chan->ch_rxpost_next = NULL;
	mutex_exit(&ss->ei_rxpost_lock);

	/*
	 * Try to post a bunch of recv wqes into this channel. If we
	 * fail, it means that we haven't even been able to post a
	 * single recv wqe.  This is alarming, but there's nothing
	 * we can do. We just move on to the next channel needing
	 * our service.
	 */
	if (eib_chan_post_rx(ss, chan, NULL) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance,
		    "eib_refill_rwqes: eib_chan_post_rx() failed");
	}

	/*
	 * Mark it to indicate that the refilling is done
	 */
	mutex_enter(&chan->ch_rx_lock);
	chan->ch_rx_refilling = B_FALSE;
	mutex_exit(&chan->ch_rx_lock);

	goto wait_for_refill_work;

	/*NOTREACHED*/
}
/*
 * eib_vnic_creator: service thread that creates (or restarts) vnics on
 * request.  Requests are queued on ss->ei_vnic_req via
 * eib_vnic_enqueue_req() and processed one at a time; the thread exits
 * when it dequeues an EIB_CR_REQ_DIE request.
 *
 * Fixes over the previous revision:
 *   - the default (unknown request) case now drops ei_vnic_req_lock
 *     before looping back, instead of re-entering the held mutex;
 *   - the "unknown request" message uses %x to match uint_t vr_req
 *     (%lx mismatched the argument width on LP64);
 *   - the creation-failure warning labels the printed value "err"
 *     (it prints err, not ret).
 */
void
eib_vnic_creator(eib_t *ss)
{
	eib_vnic_req_t *vrq;
	eib_vnic_req_t *elem;
	eib_vnic_req_t *nxt;
	kmutex_t ci_lock;	/* protects the CPR callback info */
	callb_cpr_t ci;
	uint_t vr_req;
	uint8_t *vr_mac;
	int ret;
	int err;

	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_VNIC_CREATOR);

wait_for_vnic_req:
	mutex_enter(&ss->ei_vnic_req_lock);

	/* sleep (CPR-safe) until a request arrives */
	while ((vrq = ss->ei_vnic_req) == NULL) {
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		cv_wait(&ss->ei_vnic_req_cv, &ss->ei_vnic_req_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	/*
	 * Pull out the first request.  ei_vnic_req_lock is still held
	 * here and is released inside each switch arm.
	 */
	ss->ei_vnic_req = vrq->vr_next;
	vrq->vr_next = NULL;

	vr_req = vrq->vr_req;
	vr_mac = vrq->vr_mac;

	switch (vr_req) {
	case EIB_CR_REQ_DIE:
	case EIB_CR_REQ_FLUSH:
		/*
		 * Cleanup all pending reqs and failed reqs
		 */
		for (elem = ss->ei_vnic_req; elem; elem = nxt) {
			nxt = elem->vr_next;
			kmem_free(elem, sizeof (eib_vnic_req_t));
		}
		for (elem = ss->ei_failed_vnic_req; elem; elem = nxt) {
			nxt = elem->vr_next;
			kmem_free(elem, sizeof (eib_vnic_req_t));
		}
		ss->ei_vnic_req = NULL;
		ss->ei_failed_vnic_req = NULL;
		ss->ei_pending_vnic_req = NULL;
		mutex_exit(&ss->ei_vnic_req_lock);

		break;

	case EIB_CR_REQ_NEW_VNIC:
		ss->ei_pending_vnic_req = vrq;
		mutex_exit(&ss->ei_vnic_req_lock);

		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: "
		    "new vnic creation request for %x:%x:%x:%x:%x:%x, 0x%x",
		    vr_mac[0], vr_mac[1], vr_mac[2], vr_mac[3], vr_mac[4],
		    vr_mac[5], vrq->vr_vlan);

		/*
		 * Make sure we don't race with the plumb/unplumb code. If
		 * the eoib instance has been unplumbed already, we ignore any
		 * creation requests that may have been pending.
		 */
		eib_mac_set_nic_state(ss, EIB_NIC_STARTING);

		if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) !=
		    EIB_NIC_STARTED) {
			mutex_enter(&ss->ei_vnic_req_lock);
			ss->ei_pending_vnic_req = NULL;
			mutex_exit(&ss->ei_vnic_req_lock);
			eib_mac_clr_nic_state(ss, EIB_NIC_STARTING);
			break;
		}

		/*
		 * Try to create a new vnic with the supplied parameters.
		 */
		err = 0;
		if ((ret = eib_vnic_create(ss, vrq->vr_mac, vrq->vr_vlan,
		    NULL, &err)) != EIB_E_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_creator: "
			    "eib_vnic_create(mac=%x:%x:%x:%x:%x:%x, vlan=0x%x) "
			    "failed, err=%d", vr_mac[0], vr_mac[1], vr_mac[2],
			    vr_mac[3], vr_mac[4], vr_mac[5], vrq->vr_vlan, err);
		}

		/*
		 * If we failed, add this vnic req to our failed list (unless
		 * it already exists there), so we won't try to create this
		 * vnic again.  Whether we fail or succeed, we're done with
		 * processing this req, so clear the pending req.
		 */
		mutex_enter(&ss->ei_vnic_req_lock);
		if ((ret != EIB_E_SUCCESS) && (err != EEXIST)) {
			vrq->vr_next = ss->ei_failed_vnic_req;
			ss->ei_failed_vnic_req = vrq;
			vrq = NULL;	/* ownership moved to failed list */
		}
		ss->ei_pending_vnic_req = NULL;
		mutex_exit(&ss->ei_vnic_req_lock);

		/*
		 * Notify the mac layer that it should retry its tx again. If we
		 * had created the vnic successfully, we'll be able to send the
		 * packets; if we had not been successful, we'll drop packets on
		 * this vnic.
		 */
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_vnic_creator: calling mac_tx_update()");
		mac_tx_update(ss->ei_mac_hdl);

		eib_mac_clr_nic_state(ss, EIB_NIC_STARTING);
		break;

	default:
		/*
		 * Unknown request: drop the queue lock (the other arms do
		 * this themselves) so we don't re-enter it at the top of
		 * the loop, log and ignore.
		 */
		mutex_exit(&ss->ei_vnic_req_lock);
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: "
		    "unknown request 0x%x, ignoring", vrq->vr_req);
		break;
	}

	/*
	 * Free the current req and quit if we have to
	 */
	if (vrq) {
		kmem_free(vrq, sizeof (eib_vnic_req_t));
	}

	if (vr_req == EIB_CR_REQ_DIE) {
		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}

	goto wait_for_vnic_req;

	/*NOTREACHED*/
}
/*
 * Thread to monitor tx wqes and update the mac layer when needed.
 * Note that this thread can only be started after the tx wqe pool
 * has been allocated and initialized.
 */
void
eib_monitor_tx_wqes(eib_t *ss)
{
	eib_wqe_pool_t *wp = ss->ei_tx;
	kmutex_t ci_lock;	/* protects the CPR callback info */
	callb_cpr_t ci;

	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_TXWQES_MONITOR);

	ASSERT(wp != NULL);

monitor_wqe_status:
	mutex_enter(&wp->wp_lock);

	/*
	 * Wait till someone falls short of wqes (EIB_TXWQE_SHORT) or
	 * asks us to die (EIB_TXWQE_MONITOR_DIE)
	 */
	while (wp->wp_status == 0) {
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		cv_wait(&wp->wp_cv, &wp->wp_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	/*
	 * Have we been asked to die ?
	 */
	if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) {
		mutex_exit(&wp->wp_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}

	ASSERT((wp->wp_status & EIB_TXWQE_SHORT) != 0);

	/*
	 * Start monitoring free wqes till they cross min threshold
	 */
	while ((wp->wp_nfree < EIB_NFREE_SWQES_HWM) &&
	    ((wp->wp_status & EIB_TXWQE_MONITOR_DIE) == 0)) {
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		cv_wait(&wp->wp_cv, &wp->wp_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	/*
	 * Have we been asked to die ?
	 */
	if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) {
		mutex_exit(&wp->wp_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}

	ASSERT(wp->wp_nfree >= EIB_NFREE_SWQES_HWM);
	/* enough wqes are free again; clear the shortage flag */
	wp->wp_status &= (~EIB_TXWQE_SHORT);

	mutex_exit(&wp->wp_lock);

	/*
	 * Inform the mac layer that tx resources are now available
	 * and go back to monitoring
	 */
	if (ss->ei_mac_hdl) {
		mac_tx_update(ss->ei_mac_hdl);
	}
	goto monitor_wqe_status;

	/*NOTREACHED*/
}
/*
 * Thread to monitor lso bufs and update the mac layer as needed.
 * Note that this thread can only be started after the lso buckets
 * have been allocated and initialized.
 */
void
eib_monitor_lso_bufs(eib_t *ss)
{
	eib_lsobkt_t *bkt = ss->ei_lso;
	kmutex_t ci_lock;	/* protects the CPR callback info */
	callb_cpr_t ci;

	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_LSOBUFS_MONITOR);

	ASSERT(bkt != NULL);

monitor_lso_status:
	mutex_enter(&bkt->bk_lock);

	/*
	 * Wait till someone falls short of LSO buffers or we're asked
	 * to die
	 */
	while (bkt->bk_status == 0) {
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		cv_wait(&bkt->bk_cv, &bkt->bk_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	/* have we been asked to die ? */
	if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) {
		mutex_exit(&bkt->bk_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}

	ASSERT((bkt->bk_status & EIB_LBUF_SHORT) != 0);

	/*
	 * Start monitoring free LSO buffers till there are enough
	 * free buffers available
	 */
	while ((bkt->bk_nfree < EIB_LSO_FREE_BUFS_THRESH) &&
	    ((bkt->bk_status & EIB_LBUF_MONITOR_DIE) == 0)) {
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		cv_wait(&bkt->bk_cv, &bkt->bk_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	/* re-check for a die request after the wait */
	if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) {
		mutex_exit(&bkt->bk_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}

	/*
	 * We have enough lso buffers available now
	 */
	ASSERT(bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH);
	bkt->bk_status &= (~EIB_LBUF_SHORT);

	mutex_exit(&bkt->bk_lock);

	/*
	 * Inform the mac layer that tx lso resources are now available
	 * and go back to monitoring
	 */
	if (ss->ei_mac_hdl) {
		mac_tx_update(ss->ei_mac_hdl);
	}
	goto monitor_lso_status;

	/*NOTREACHED*/
}
/*
 * Thread to manage the keepalive requirements for vnics and the gateway.
 * Periodically (every pp_vnic_ka_ticks) sends a FIP heartbeat for each
 * vnic on ss->ei_ka_vnics, after verifying the gateway's own heartbeat
 * has not expired.  Exits when EIB_KA_VNICS_DIE is set.
 */
void
eib_manage_keepalives(eib_t *ss)
{
	eib_ka_vnics_t *elem;
	eib_ka_vnics_t *nxt;
	clock_t deadline;
	int64_t lbolt64;
	int err;		/* heartbeat errors are best-effort, ignored */
	kmutex_t ci_lock;	/* protects the CPR callback info */
	callb_cpr_t ci;

	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
	/*
	 * NOTE(review): the CPR tag reuses EIB_EVENTS_HDLR rather than a
	 * keepalive-specific name; looks like a copy-paste from
	 * eib_events_handler() -- confirm whether a dedicated tag exists.
	 */
	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR);

	/* ei_ka_vnics_lock is held across iterations of the loop below */
	mutex_enter(&ss->ei_ka_vnics_lock);

periodic_keepalive:
	deadline = ddi_get_lbolt() + ss->ei_gw_props->pp_vnic_ka_ticks;

	while ((ss->ei_ka_vnics_event &
	    (EIB_KA_VNICS_DIE | EIB_KA_VNICS_TIMED_OUT)) == 0) {
		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_BEGIN(&ci);
		mutex_exit(&ci_lock);

		/* a timeout (-1) means it's time to send keepalives */
		if (cv_timedwait(&ss->ei_ka_vnics_cv, &ss->ei_ka_vnics_lock,
		    deadline) == -1) {
			ss->ei_ka_vnics_event |= EIB_KA_VNICS_TIMED_OUT;
		}

		mutex_enter(&ci_lock);
		CALLB_CPR_SAFE_END(&ci, &ci_lock);
		mutex_exit(&ci_lock);
	}

	if (ss->ei_ka_vnics_event & EIB_KA_VNICS_DIE) {
		/* free the keepalive list and exit the thread */
		for (elem = ss->ei_ka_vnics; elem; elem = nxt) {
			nxt = elem->ka_next;
			kmem_free(elem, sizeof (eib_ka_vnics_t));
		}
		ss->ei_ka_vnics = NULL;
		mutex_exit(&ss->ei_ka_vnics_lock);

		mutex_enter(&ci_lock);
		CALLB_CPR_EXIT(&ci);	/* also drops ci_lock */
		mutex_destroy(&ci_lock);

		return;
	}

	/*
	 * Are there any vnics that need keepalive management ?
	 */
	ss->ei_ka_vnics_event &= ~EIB_KA_VNICS_TIMED_OUT;
	if (ss->ei_ka_vnics == NULL)
		goto periodic_keepalive;

	/*
	 * Ok, we need to send vnic keepalives to our gateway. But first
	 * check if the gateway heartbeat is good as of this moment. Note
	 * that we need do get the lbolt value after acquiring ei_vnic_lock
	 * to ensure that ei_gw_last_heartbeat does not change before the
	 * comparison (to avoid a negative value in the comparison result
	 * causing us to incorrectly assume that the gateway heartbeat has
	 * stopped).
	 */
	mutex_enter(&ss->ei_vnic_lock);

	lbolt64 = ddi_get_lbolt64();

	if (ss->ei_gw_last_heartbeat != 0) {
		if ((lbolt64 - ss->ei_gw_last_heartbeat) >
		    ss->ei_gw_props->pp_gw_ka_ticks) {
			/*
			 * Gateway heartbeat has expired: turn every vnic
			 * into a zombie, mark the gateway unreachable and
			 * take the link down.
			 */
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_manage_keepalives: no keepalives from gateway "
			    "0x%x for hca_guid=0x%llx, port=0x%x, "
			    "last_gw_ka=0x%llx", ss->ei_gw_props->pp_gw_portid,
			    ss->ei_props->ep_hca_guid,
			    ss->ei_props->ep_port_num,
			    ss->ei_gw_last_heartbeat);

			for (elem = ss->ei_ka_vnics; elem; elem = nxt) {
				nxt = elem->ka_next;
				ss->ei_zombie_vnics |=
				    ((uint64_t)1 << elem->ka_vnic->vn_instance);
				kmem_free(elem, sizeof (eib_ka_vnics_t));
			}
			ss->ei_ka_vnics = NULL;
			ss->ei_gw_unreachable = B_TRUE;
			mutex_exit(&ss->ei_vnic_lock);

			eib_mac_link_down(ss, B_FALSE);

			goto periodic_keepalive;
		}
	}
	mutex_exit(&ss->ei_vnic_lock);

	/* send a keepalive for each vnic; failures are ignored */
	for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next)
		(void) eib_fip_heartbeat(ss, elem->ka_vnic, &err);

	goto periodic_keepalive;

	/*NOTREACHED*/
}
/*
 * eib_stop_events_handler: ask the events handler thread to exit and
 * wait for it to die.
 */
void
eib_stop_events_handler(eib_t *ss)
{
	eib_event_t *evi;

	/*
	 * Queue a shutdown event; kmem_zalloc() already leaves ev_arg
	 * and ev_next zeroed.
	 */
	evi = kmem_zalloc(sizeof (eib_event_t), KM_SLEEP);
	evi->ev_code = EIB_EV_SHUTDOWN;
	eib_svc_enqueue_event(ss, evi);

	thread_join(ss->ei_events_handler);
}
/*
 * eib_stop_refill_rwqes: ask the rwqe refiller thread to exit and wait
 * for it to die.  The die flag is set and the cv signalled under
 * ei_rxpost_lock so the refiller cannot miss the wakeup.
 */
void
eib_stop_refill_rwqes(eib_t *ss)
{
	mutex_enter(&ss->ei_rxpost_lock);

	ss->ei_rxpost_die = 1;

	cv_signal(&ss->ei_rxpost_cv);
	mutex_exit(&ss->ei_rxpost_lock);

	thread_join(ss->ei_rwqes_refiller);
}
/*
 * eib_stop_vnic_creator: queue a "die" request for the vnic creator
 * thread and wait for it to terminate.
 */
void
eib_stop_vnic_creator(eib_t *ss)
{
	eib_vnic_req_t *req;

	/* kmem_zalloc() already leaves vr_next zeroed */
	req = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP);
	req->vr_req = EIB_CR_REQ_DIE;
	eib_vnic_enqueue_req(ss, req);

	thread_join(ss->ei_vnic_creator);
}
/*
 * eib_stop_monitor_tx_wqes: ask the tx wqe monitor thread to exit and
 * wait for it to die.  The die flag is set and the cv signalled under
 * wp_lock so the monitor cannot miss the wakeup.
 */
void
eib_stop_monitor_tx_wqes(eib_t *ss)
{
	eib_wqe_pool_t *wp = ss->ei_tx;

	mutex_enter(&wp->wp_lock);

	wp->wp_status |= EIB_TXWQE_MONITOR_DIE;

	cv_signal(&wp->wp_cv);
	mutex_exit(&wp->wp_lock);

	thread_join(ss->ei_txwqe_monitor);
}
/*
 * eib_stop_monitor_lso_bufs: stop the lso buffer monitor thread.
 * Refuses (returns EIB_E_FAILURE) if some buffers are still
 * outstanding and the caller did not set force; otherwise signals the
 * monitor to die, waits for it and returns EIB_E_SUCCESS.
 */
int
eib_stop_monitor_lso_bufs(eib_t *ss, boolean_t force)
{
	eib_lsobkt_t *bkt = ss->ei_lso;
	boolean_t all_reaped;

	mutex_enter(&bkt->bk_lock);

	/*
	 * Unless the caller forces the issue, don't stop the monitor
	 * while lso buffers are still out with the mac layer.
	 */
	all_reaped = (bkt->bk_nelem == bkt->bk_nfree);
	if (!all_reaped && !force) {
		mutex_exit(&bkt->bk_lock);
		return (EIB_E_FAILURE);
	}

	bkt->bk_status |= EIB_LBUF_MONITOR_DIE;

	cv_signal(&bkt->bk_cv);
	mutex_exit(&bkt->bk_lock);

	thread_join(ss->ei_lsobufs_monitor);
	return (EIB_E_SUCCESS);
}
/*
 * eib_stop_manage_keepalives: ask the keepalives manager thread to exit
 * and wait for it to die.  The die flag is set and the cv signalled
 * under ei_ka_vnics_lock so the manager cannot miss the wakeup.
 */
void
eib_stop_manage_keepalives(eib_t *ss)
{
	mutex_enter(&ss->ei_ka_vnics_lock);

	ss->ei_ka_vnics_event |= EIB_KA_VNICS_DIE;

	cv_signal(&ss->ei_ka_vnics_cv);
	mutex_exit(&ss->ei_ka_vnics_lock);

	thread_join(ss->ei_keepalives_manager);
}
/*
 * eib_flush_vnic_reqs: queue a "flush" request that makes the vnic
 * creator thread discard all pending and failed vnic requests.  Unlike
 * a die request, the creator keeps running afterwards.
 */
void
eib_flush_vnic_reqs(eib_t *ss)
{
	eib_vnic_req_t *req;

	/* kmem_zalloc() already leaves vr_next zeroed */
	req = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP);
	req->vr_req = EIB_CR_REQ_FLUSH;
	eib_vnic_enqueue_req(ss, req);
}
/*ARGSUSED*/
/*
 * eib_gw_alive_cb: DDI event callback invoked when the EoIB nexus
 * reports that our gateway is reachable again; queues an EIB_EV_GW_UP
 * event for the events handler.
 */
void
eib_gw_alive_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg,
    void *impl_data)
{
	eib_t *ss = (eib_t *)arg;
	eib_event_t *evi;

	/*
	 * Callback context, so the allocation must not sleep; if memory
	 * is tight we simply drop this event.
	 */
	evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP);
	if (evi == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_alive_cb: "
		    "no memory, ignoring this gateway alive event");
		return;
	}

	evi->ev_code = EIB_EV_GW_UP;
	evi->ev_arg = NULL;
	eib_svc_enqueue_event(ss, evi);
}
/*ARGSUSED*/
/*
 * eib_login_ack_cb: DDI event callback for raw packets arriving from
 * the gateway via the EoIB nexus (solicitation qpn).  If the packet
 * parses as a vnic login ack, hand it to the vnic login machinery.
 */
void
eib_login_ack_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg,
    void *impl_data)
{
	eib_t *ss = (eib_t *)arg;
	uint8_t *pkt = (uint8_t *)impl_data;
	eib_login_data_t ld;
	int ret;

	/*
	 * The packet is passed to us unparsed; skip the GRH and see
	 * whether the payload is a vnic login ack before acting on it.
	 */
	ret = eib_fip_parse_login_ack(ss, pkt + EIB_GRH_SZ, &ld);
	if (ret == EIB_E_SUCCESS) {
		eib_vnic_login_ack(ss, &ld);
	}
}
/*ARGSUSED*/
/*
 * eib_gw_info_cb: DDI event callback invoked when the EoIB nexus sends
 * updated gateway properties.  Copies the eib_gw_info_t payload and
 * queues an EIB_EV_GW_INFO_UPDATE event; the events handler owns and
 * frees the copy.
 */
void
eib_gw_info_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg,
    void *impl_data)
{
	eib_t *ss = (eib_t *)arg;
	eib_event_t *evi;

	/*
	 * Callback context, so neither allocation may sleep; if either
	 * fails we drop the update after logging a warning.
	 */
	evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP);
	if (evi != NULL)
		evi->ev_arg = kmem_zalloc(sizeof (eib_gw_info_t), KM_NOSLEEP);

	if (evi == NULL || evi->ev_arg == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_info_cb: "
		    "no memory, ignoring this gateway props update event");
		if (evi != NULL)
			kmem_free(evi, sizeof (eib_event_t));
		return;
	}

	bcopy(impl_data, evi->ev_arg, sizeof (eib_gw_info_t));
	evi->ev_code = EIB_EV_GW_INFO_UPDATE;
	eib_svc_enqueue_event(ss, evi);
}