/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/vlan.h>
/* Port add/deletion/etc routines */
static void vsw_port_delete(vsw_port_t *port);
static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static void vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_init_ldcs(vsw_port_t *port);
static void vsw_uninit_ldcs(vsw_port_t *port);
static int vsw_ldc_init(vsw_ldc_t *ldcp);
static void vsw_ldc_uninit(vsw_ldc_t *ldcp);
static void vsw_drain_ldcs(vsw_port_t *port);
static void vsw_drain_port_taskq(vsw_port_t *port);
static void vsw_marker_task(void *);
static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
void vsw_detach_ports(vsw_t *vswp);
int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
int vsw_port_detach(vsw_t *vswp, int p_instance);
int vsw_portsend(vsw_port_t *port, mblk_t *mp);
int vsw_port_attach(vsw_port_t *portp);
vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
void vsw_reset_ports(vsw_t *vswp);
void vsw_port_reset(vsw_port_t *portp);
void vsw_physlink_update_ports(vsw_t *vswp);
static void vsw_port_physlink_update(vsw_port_t *portp);
/* Interrupt routines */
static uint_t vsw_ldc_cb(uint64_t event, caddr_t arg);
/* Handshake routines */
static void vsw_ldc_reinit(vsw_ldc_t *);
static void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
static void vsw_conn_task(void *);
static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);
static void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
static void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
uint32_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
static void vsw_process_pkt_data(void *, void *, uint32_t);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
/* Switching/data transmit routines */
static int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static int vsw_descrsend(vsw_ldc_t *, mblk_t *);
static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);
static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
static int vsw_reclaim_dring(dring_info_t *dp, int start);
static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);
/* Rcv/Tx thread routines */
static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_tx_worker(void *arg);
static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_rx_worker(void *arg);
/* Misc support routines */
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static void vsw_free_ring(dring_info_t *);
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
static int vsw_get_same_dest_list(struct ether_header *ehp,
mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
static mblk_t *vsw_dupmsgchain(mblk_t *mp);
/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);
/*
* Functions imported from other files.
*/
extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
extern void vsw_del_mcst_port(vsw_port_t *port);
extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern void vsw_fdbe_add(vsw_t *vswp, void *port);
extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
extern void vsw_create_vlans(void *arg, int type);
extern void vsw_destroy_vlans(void *arg, int type);
extern void vsw_vlan_add_ids(void *arg, int type);
extern void vsw_vlan_remove_ids(void *arg, int type);
extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
struct ether_header *ehp, uint16_t *vidp);
extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
mblk_t **npt);
extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
extern void vsw_hio_stop_port(vsw_port_t *portp);
extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
extern void vsw_destroy_rxpools(void *arg);
#define VSW_NUM_VMPOOLS 3 /* number of vio mblk pools */
/*
* Tunables used in this file.
*/
extern int vsw_num_handshakes;
extern int vsw_wretries;
extern int vsw_desc_delay;
extern int vsw_read_attempts;
extern int vsw_ldc_tx_delay;
extern int vsw_ldc_tx_retries;
extern int vsw_ldc_retries;
extern int vsw_ldc_delay;
extern boolean_t vsw_ldc_rxthr_enabled;
extern boolean_t vsw_ldc_txthr_enabled;
extern uint32_t vsw_ntxds;
extern uint32_t vsw_max_tx_qcount;
extern uint32_t vsw_chain_len;
extern uint32_t vsw_mblk_size1;
extern uint32_t vsw_mblk_size2;
extern uint32_t vsw_mblk_size3;
extern uint32_t vsw_mblk_size4;
extern uint32_t vsw_num_mblks1;
extern uint32_t vsw_num_mblks2;
extern uint32_t vsw_num_mblks3;
extern uint32_t vsw_num_mblks4;
extern boolean_t vsw_obp_ver_proto_workaround;
extern uint32_t vsw_publish_macaddr_count;
extern boolean_t vsw_jumbo_rxpools;
#define LDC_ENTER_LOCK(ldcp) \
mutex_enter(&((ldcp)->ldc_cblock));\
mutex_enter(&((ldcp)->ldc_rxlock));\
mutex_enter(&((ldcp)->ldc_txlock));
#define LDC_EXIT_LOCK(ldcp) \
mutex_exit(&((ldcp)->ldc_txlock));\
mutex_exit(&((ldcp)->ldc_rxlock));\
mutex_exit(&((ldcp)->ldc_cblock));
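/*
 * Note the lock ordering imposed by these macros: the callback lock is
 * taken first, then the rx lock, then the tx lock; LDC_EXIT_LOCK
 * releases them in the reverse order.
 */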
#define VSW_VER_EQ(ldcp, major, minor) \
((ldcp)->lane_out.ver_major == (major) && \
(ldcp)->lane_out.ver_minor == (minor))
#define VSW_VER_LT(ldcp, major, minor) \
(((ldcp)->lane_out.ver_major < (major)) || \
((ldcp)->lane_out.ver_major == (major) && \
(ldcp)->lane_out.ver_minor < (minor)))
#define VSW_VER_GTEQ(ldcp, major, minor) \
(((ldcp)->lane_out.ver_major > (major)) || \
((ldcp)->lane_out.ver_major == (major) && \
(ldcp)->lane_out.ver_minor >= (minor)))
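/*
 * For example, once version 1.5 has been negotiated on a channel,
 * VSW_VER_GTEQ(ldcp, 1, 3) is true, VSW_VER_LT(ldcp, 1, 3) is false and
 * VSW_VER_EQ(ldcp, 1, 5) is true. All three macros compare against the
 * version negotiated on the outbound lane.
 */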
/*
* VIO Protocol Version Info:
*
* The version specified below represents the version of protocol currently
* supported in the driver. It means the driver can negotiate with peers with
* versions <= this version. Here is a summary of the feature(s) that are
* supported at each version of the protocol:
*
* 1.0 Basic VIO protocol.
* 1.1 vDisk protocol update (no virtual network update).
* 1.2 Support for priority frames (priority-ether-types).
* 1.3 VLAN and HybridIO support.
* 1.4 Jumbo Frame support.
* 1.5 Link State Notification support with optional support
* for Physical Link information.
*/
static ver_sup_t vsw_versions[] = { {1, 5} };
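/*
 * Note: the matching logic in vsw_supported_version() below assumes
 * that entries in this table are sorted in decreasing order of major
 * version.
 */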
/*
* For the moment the state dump routines have their own
* private flag.
*/
#define DUMP_STATE 0
#if DUMP_STATE
#define DUMP_TAG(tag) \
{ \
D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}
#define DUMP_TAG_PTR(tag) \
{ \
D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}
#define DUMP_FLAGS(flags) dump_flags(flags);
#define DISPLAY_STATE() display_state()
#else
#define DUMP_TAG(tag)
#define DUMP_TAG_PTR(tag)
#define DUMP_FLAGS(state)
#define DISPLAY_STATE()
#endif /* DUMP_STATE */
/*
* Attach the specified port.
*
* Returns 0 on success, 1 on failure.
*/
int
vsw_port_attach(vsw_port_t *port)
{
vsw_t *vswp = port->p_vswp;
vsw_port_list_t *plist = &vswp->plist;
vsw_port_t *p, **pp;
int i;
int nids = port->num_ldcs;
uint64_t *ldcids;
int rv;
D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
/* port already exists? */
READ_ENTER(&plist->lockrw);
for (p = plist->head; p != NULL; p = p->p_next) {
if (p->p_instance == port->p_instance) {
DWARN(vswp, "%s: port instance %d already attached",
__func__, p->p_instance);
RW_EXIT(&plist->lockrw);
return (1);
}
}
RW_EXIT(&plist->lockrw);
rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
port->state = VSW_PORT_INIT;
D2(vswp, "%s: %d nids", __func__, nids);
ldcids = port->ldc_ids;
for (i = 0; i < nids; i++) {
D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
DERR(vswp, "%s: ldc_attach failed", __func__);
goto exit_error;
}
}
if (vswp->switching_setup_done == B_TRUE) {
/*
* If the underlying network device has been setup,
* then open a mac client and program the mac address
* for this port.
*/
rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
if (rv != 0) {
goto exit_error;
}
}
/* create the fdb entry for this port/mac address */
vsw_fdbe_add(vswp, port);
vsw_create_vlans(port, VSW_VNETPORT);
WRITE_ENTER(&plist->lockrw);
/* link it into the list of ports for this vsw instance */
pp = (vsw_port_t **)(&plist->head);
port->p_next = *pp;
*pp = port;
plist->num_ports++;
RW_EXIT(&plist->lockrw);
/*
* Initialise the port and any ldc's under it.
*/
(void) vsw_init_ldcs(port);
/* announce macaddr of vnet to the physical switch */
if (vsw_publish_macaddr_count != 0) { /* enabled */
vsw_publish_macaddr(vswp, port);
}
D1(vswp, "%s: exit", __func__);
return (0);
exit_error:
rw_destroy(&port->p_ldclist.lockrw);
cv_destroy(&port->state_cv);
mutex_destroy(&port->state_lock);
rw_destroy(&port->maccl_rwlock);
mutex_destroy(&port->tx_lock);
mutex_destroy(&port->mca_lock);
kmem_free(port, sizeof (vsw_port_t));
return (1);
}
/*
* Detach the specified port.
*
* Returns 0 on success, 1 on failure.
*/
int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
vsw_port_t *port = NULL;
vsw_port_list_t *plist = &vswp->plist;
D1(vswp, "%s: enter: port id %d", __func__, p_instance);
WRITE_ENTER(&plist->lockrw);
if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
RW_EXIT(&plist->lockrw);
return (1);
}
if (vsw_plist_del_node(vswp, port)) {
RW_EXIT(&plist->lockrw);
return (1);
}
/* cleanup any HybridIO for this port */
vsw_hio_stop_port(port);
/*
* No longer need to hold writer lock on port list now
* that we have unlinked the target port from the list.
*/
RW_EXIT(&plist->lockrw);
/* Cleanup and close the mac client */
vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
/* Remove the fdb entry for this port/mac address */
vsw_fdbe_del(vswp, &(port->p_macaddr));
vsw_destroy_vlans(port, VSW_VNETPORT);
/* Remove any multicast addresses.. */
vsw_del_mcst_port(port);
vsw_port_delete(port);
D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
return (0);
}
/*
* Detach all active ports.
*/
void
vsw_detach_ports(vsw_t *vswp)
{
vsw_port_list_t *plist = &vswp->plist;
vsw_port_t *port = NULL;
D1(vswp, "%s: enter", __func__);
WRITE_ENTER(&plist->lockrw);
while ((port = plist->head) != NULL) {
(void) vsw_plist_del_node(vswp, port);
/* cleanup any HybridIO for this port */
vsw_hio_stop_port(port);
/* Cleanup and close the mac client */
vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
/* Remove the fdb entry for this port/mac address */
vsw_fdbe_del(vswp, &(port->p_macaddr));
vsw_destroy_vlans(port, VSW_VNETPORT);
/* Remove any multicast addresses.. */
vsw_del_mcst_port(port);
/*
* No longer need to hold the lock on the port list
* now that we have unlinked the target port from the
* list.
*/
RW_EXIT(&plist->lockrw);
vsw_port_delete(port);
WRITE_ENTER(&plist->lockrw);
}
RW_EXIT(&plist->lockrw);
D1(vswp, "%s: exit", __func__);
}
/*
* Delete the specified port.
*/
static void
vsw_port_delete(vsw_port_t *port)
{
vsw_ldc_list_t *ldcl;
vsw_t *vswp = port->p_vswp;
int num_ldcs;
D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
vsw_uninit_ldcs(port);
/*
* Wait for any pending ctrl msg tasks which reference this
* port to finish.
*/
vsw_drain_port_taskq(port);
/*
* Wait for any active callbacks to finish
*/
vsw_drain_ldcs(port);
ldcl = &port->p_ldclist;
num_ldcs = port->num_ldcs;
WRITE_ENTER(&ldcl->lockrw);
while (num_ldcs > 0) {
vsw_ldc_detach(port, ldcl->head->ldc_id);
num_ldcs--;
}
RW_EXIT(&ldcl->lockrw);
rw_destroy(&port->p_ldclist.lockrw);
rw_destroy(&port->maccl_rwlock);
mutex_destroy(&port->mca_lock);
mutex_destroy(&port->tx_lock);
cv_destroy(&port->state_cv);
mutex_destroy(&port->state_lock);
if (port->num_ldcs != 0) {
kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
port->num_ldcs = 0;
}
if (port->nvids != 0) {
kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
}
kmem_free(port, sizeof (vsw_port_t));
D1(vswp, "%s: exit", __func__);
}
static int
vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
{
size_t data_sz;
int rv;
uint32_t sz1 = 0;
uint32_t sz2 = 0;
uint32_t sz3 = 0;
uint32_t sz4 = 0;
/*
* We round up the mtu specified to be a multiple of 2K to limit the
* number of rx buffer pools created for a given mtu.
*/
data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
data_sz = VNET_ROUNDUP_2K(data_sz);
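/*
 * For example, with a standard ethernet mtu the rounded data_sz here
 * works out to VNET_2K, which selects the default three-pool
 * configuration below; only jumbo mtus reach the 4K and larger cases.
 */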
/*
* If pool sizes are specified, use them. Note that the presence of
* the first tunable will be used as a hint.
*/
if (vsw_mblk_size1 != 0) {
sz1 = vsw_mblk_size1;
sz2 = vsw_mblk_size2;
sz3 = vsw_mblk_size3;
sz4 = vsw_mblk_size4;
if (sz4 == 0) { /* need 3 pools */
ldcp->max_rxpool_size = sz3;
rv = vio_init_multipools(&ldcp->vmp,
VSW_NUM_VMPOOLS, sz1, sz2, sz3,
vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
} else {
ldcp->max_rxpool_size = sz4;
rv = vio_init_multipools(&ldcp->vmp,
VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
vsw_num_mblks4);
}
return (rv);
}
/*
* Pool sizes are not specified. We select the pool sizes based on the
* mtu if vsw_jumbo_rxpools is enabled.
*/
if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
/*
* Receive buffer pool allocation based on mtu is disabled.
* Use the default mechanism of standard size pool allocation.
*/
sz1 = VSW_MBLK_SZ_128;
sz2 = VSW_MBLK_SZ_256;
sz3 = VSW_MBLK_SZ_2048;
ldcp->max_rxpool_size = sz3;
rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
sz1, sz2, sz3,
vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
return (rv);
}
switch (data_sz) {
case VNET_4K:
sz1 = VSW_MBLK_SZ_128;
sz2 = VSW_MBLK_SZ_256;
sz3 = VSW_MBLK_SZ_2048;
sz4 = sz3 << 1; /* 4K */
ldcp->max_rxpool_size = sz4;
rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
sz1, sz2, sz3, sz4,
vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
vsw_num_mblks4);
break;
default: /* data_sz: 4K+ to 16K */
sz1 = VSW_MBLK_SZ_256;
sz2 = VSW_MBLK_SZ_2048;
sz3 = data_sz >> 1; /* Jumbo-size/2 */
sz4 = data_sz; /* Jumbo-size */
ldcp->max_rxpool_size = sz4;
rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
sz1, sz2, sz3, sz4,
vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
vsw_num_mblks4);
break;
}
return (rv);
}
/*
* Attach a logical domain channel (ldc) under a specified port.
*
* Returns 0 on success, 1 on failure.
*/
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
vsw_t *vswp = port->p_vswp;
vsw_ldc_list_t *ldcl = &port->p_ldclist;
vsw_ldc_t *ldcp = NULL;
ldc_attr_t attr;
ldc_status_t istatus;
int status = DDI_FAILURE;
char kname[MAXNAMELEN];
enum { PROG_init = 0x0,
PROG_callback = 0x1, PROG_rx_thread = 0x2,
PROG_tx_thread = 0x4}
progress;
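/*
 * progress is a bitmask recording which resources have been set up so
 * far; the ldc_attach_fail path below uses it to tear down only those.
 */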
progress = PROG_init;
D1(vswp, "%s: enter", __func__);
ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
if (ldcp == NULL) {
DERR(vswp, "%s: kmem_zalloc failed", __func__);
return (1);
}
ldcp->ldc_id = ldc_id;
mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
/* required for handshake with peer */
ldcp->local_session = (uint64_t)ddi_get_lbolt();
ldcp->peer_session = 0;
ldcp->session_status = 0;
ldcp->hss_id = 1; /* Initial handshake session id */
(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
/* only set for outbound lane, inbound set by peer */
vsw_set_lane_attr(vswp, &ldcp->lane_out);
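/*
 * Channel attributes: we are a network-service endpoint using
 * unreliable LDC mode, as the VIO protocol layered above provides its
 * own handshaking and acknowledgments.
 */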
attr.devclass = LDC_DEV_NT_SVC;
attr.instance = ddi_get_instance(vswp->dip);
attr.mode = LDC_MODE_UNRELIABLE;
attr.mtu = VSW_LDC_MTU;
status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
if (status != 0) {
DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
__func__, ldc_id, status);
goto ldc_attach_fail;
}
if (vsw_ldc_rxthr_enabled) {
ldcp->rx_thr_flags = 0;
mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
progress |= PROG_rx_thread;
if (ldcp->rx_thread == NULL) {
DWARN(vswp, "%s(%lld): Failed to create worker thread",
__func__, ldc_id);
goto ldc_attach_fail;
}
}
if (vsw_ldc_txthr_enabled) {
ldcp->tx_thr_flags = 0;
ldcp->tx_mhead = ldcp->tx_mtail = NULL;
mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
progress |= PROG_tx_thread;
if (ldcp->tx_thread == NULL) {
DWARN(vswp, "%s(%lld): Failed to create worker thread",
__func__, ldc_id);
goto ldc_attach_fail;
}
}
status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
if (status != 0) {
DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
__func__, ldc_id, status);
(void) ldc_fini(ldcp->ldc_handle);
goto ldc_attach_fail;
}
/*
* allocate a message for ldc_read()s, big enough to hold ctrl and
* data msgs, including raw data msgs used to recv priority frames.
*/
ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
progress |= PROG_callback;
mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
DERR(vswp, "%s: ldc_status failed", __func__);
mutex_destroy(&ldcp->status_lock);
goto ldc_attach_fail;
}
ldcp->ldc_status = istatus;
ldcp->ldc_port = port;
ldcp->ldc_vswp = vswp;
vsw_reset_vnet_proto_ops(ldcp);
(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
kname, &ldcp->ldc_stats);
if (ldcp->ksp == NULL) {
DERR(vswp, "%s: kstats setup failed", __func__);
goto ldc_attach_fail;
}
/* link it into the list of channels for this port */
WRITE_ENTER(&ldcl->lockrw);
ldcp->ldc_next = ldcl->head;
ldcl->head = ldcp;
RW_EXIT(&ldcl->lockrw);
D1(vswp, "%s: exit", __func__);
return (0);
ldc_attach_fail:
if (progress & PROG_callback) {
(void) ldc_unreg_callback(ldcp->ldc_handle);
kmem_free(ldcp->ldcmsg, ldcp->msglen);
}
if (progress & PROG_rx_thread) {
if (ldcp->rx_thread != NULL) {
vsw_stop_rx_thread(ldcp);
}
mutex_destroy(&ldcp->rx_thr_lock);
cv_destroy(&ldcp->rx_thr_cv);
}
if (progress & PROG_tx_thread) {
if (ldcp->tx_thread != NULL) {
vsw_stop_tx_thread(ldcp);
}
mutex_destroy(&ldcp->tx_thr_lock);
cv_destroy(&ldcp->tx_thr_cv);
}
if (ldcp->ksp != NULL) {
vgen_destroy_kstats(ldcp->ksp);
}
mutex_destroy(&ldcp->ldc_txlock);
mutex_destroy(&ldcp->ldc_rxlock);
mutex_destroy(&ldcp->ldc_cblock);
mutex_destroy(&ldcp->drain_cv_lock);
cv_destroy(&ldcp->drain_cv);
rw_destroy(&ldcp->lane_in.dlistrw);
rw_destroy(&ldcp->lane_out.dlistrw);
kmem_free(ldcp, sizeof (vsw_ldc_t));
return (1);
}
/*
* Detach a logical domain channel (ldc) belonging to a
* particular port.
*/
static void
vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
{
vsw_t *vswp = port->p_vswp;
vsw_ldc_t *ldcp, **prev_ldcp;
vsw_ldc_list_t *ldcl = &port->p_ldclist;
int rv;
int retries = 0;
vio_mblk_pool_t *fvmp = NULL;
prev_ldcp = (vsw_ldc_t **)(&ldcl->head);
for (; (ldcp = *prev_ldcp) != NULL; prev_ldcp = &ldcp->ldc_next) {
if (ldcp->ldc_id == ldc_id) {
break;
}
}
/* specified ldc id not found */
ASSERT(ldcp != NULL);
D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
/* Stop the receive thread */
if (ldcp->rx_thread != NULL) {
vsw_stop_rx_thread(ldcp);
mutex_destroy(&ldcp->rx_thr_lock);
cv_destroy(&ldcp->rx_thr_cv);
}
kmem_free(ldcp->ldcmsg, ldcp->msglen);
/* Stop the tx thread */
if (ldcp->tx_thread != NULL) {
vsw_stop_tx_thread(ldcp);
mutex_destroy(&ldcp->tx_thr_lock);
cv_destroy(&ldcp->tx_thr_cv);
if (ldcp->tx_mhead != NULL) {
freemsgchain(ldcp->tx_mhead);
ldcp->tx_mhead = ldcp->tx_mtail = NULL;
ldcp->tx_cnt = 0;
}
}
/* Destroy kstats */
vgen_destroy_kstats(ldcp->ksp);
/*
* Before we can close the channel we must release any mapped
* resources (e.g. drings).
*/
vsw_free_lane_resources(ldcp, INBOUND);
vsw_free_lane_resources(ldcp, OUTBOUND);
/*
* Close the channel, retrying on EAGAIN.
*/
while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
if (++retries > vsw_ldc_retries) {
break;
}
drv_usecwait(vsw_ldc_delay);
}
if (rv != 0) {
cmn_err(CE_NOTE,
"!vsw%d: Error(%d) closing the channel(0x%lx)\n",
vswp->instance, rv, ldcp->ldc_id);
}
(void) ldc_fini(ldcp->ldc_handle);
ldcp->ldc_status = LDC_INIT;
ldcp->ldc_handle = NULL;
ldcp->ldc_vswp = NULL;
/*
* If we can't destroy all the rx pools for this channel, dispatch
* a task to retry and clean up those rx pools. Note that we don't
* need to wait for the task to complete. If the vsw device itself
* gets detached (vsw_detach()), it will wait for the task to complete
* implicitly in ddi_taskq_destroy().
*/
vio_destroy_multipools(&ldcp->vmp, &fvmp);
if (fvmp != NULL) {
(void) ddi_taskq_dispatch(vswp->rxp_taskq,
vsw_destroy_rxpools, fvmp, DDI_SLEEP);
}
/* unlink it from the list */
*prev_ldcp = ldcp->ldc_next;
mutex_destroy(&ldcp->ldc_txlock);
mutex_destroy(&ldcp->ldc_rxlock);
mutex_destroy(&ldcp->ldc_cblock);
cv_destroy(&ldcp->drain_cv);
mutex_destroy(&ldcp->drain_cv_lock);
mutex_destroy(&ldcp->status_lock);
rw_destroy(&ldcp->lane_in.dlistrw);
rw_destroy(&ldcp->lane_out.dlistrw);
kmem_free(ldcp, sizeof (vsw_ldc_t));
}
/*
* Open and attempt to bring up the channel. Note that the channel
* can only be brought up if the peer has also opened the channel.
*
* Returns 0 if the channel can be opened and brought up, otherwise
* returns 1.
*/
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
vsw_t *vswp = ldcp->ldc_vswp;
ldc_status_t istatus = 0;
int rv;
D1(vswp, "%s: enter", __func__);
LDC_ENTER_LOCK(ldcp);
/* don't start at 0 in case clients don't like that */
ldcp->next_ident = 1;
rv = ldc_open(ldcp->ldc_handle);
if (rv != 0) {
DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
__func__, ldcp->ldc_id, rv);
LDC_EXIT_LOCK(ldcp);
return (1);
}
if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
DERR(vswp, "%s: unable to get status", __func__);
LDC_EXIT_LOCK(ldcp);
return (1);
} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
__func__, ldcp->ldc_id, istatus);
LDC_EXIT_LOCK(ldcp);
return (1);
}
mutex_enter(&ldcp->status_lock);
ldcp->ldc_status = istatus;
mutex_exit(&ldcp->status_lock);
rv = ldc_up(ldcp->ldc_handle);
if (rv != 0) {
/*
* Not a fatal error for ldc_up() to fail, as peer
* end point may simply not be ready yet.
*/
D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
ldcp->ldc_id, rv);
LDC_EXIT_LOCK(ldcp);
return (1);
}
/*
* ldc_up() call is non-blocking so need to explicitly
* check channel status to see if in fact the channel
* is UP.
*/
mutex_enter(&ldcp->status_lock);
if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
DERR(vswp, "%s: unable to get status", __func__);
mutex_exit(&ldcp->status_lock);
LDC_EXIT_LOCK(ldcp);
return (1);
}
if (ldcp->ldc_status == LDC_UP) {
D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
ldcp->ldc_id, istatus);
mutex_exit(&ldcp->status_lock);
LDC_EXIT_LOCK(ldcp);
vsw_process_conn_evt(ldcp, VSW_CONN_UP);
return (0);
}
mutex_exit(&ldcp->status_lock);
LDC_EXIT_LOCK(ldcp);
D1(vswp, "%s: exit", __func__);
return (0);
}
/* disable callbacks on the channel */
static void
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
vsw_t *vswp = ldcp->ldc_vswp;
int rv;
D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
LDC_ENTER_LOCK(ldcp);
rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
if (rv != 0) {
cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
}
mutex_enter(&ldcp->status_lock);
ldcp->ldc_status = LDC_INIT;
mutex_exit(&ldcp->status_lock);
LDC_EXIT_LOCK(ldcp);
D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
}
static int
vsw_init_ldcs(vsw_port_t *port)
{
vsw_ldc_list_t *ldcl = &port->p_ldclist;
vsw_ldc_t *ldcp;
READ_ENTER(&ldcl->lockrw);
ldcp = ldcl->head;
for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
(void) vsw_ldc_init(ldcp);
}
RW_EXIT(&ldcl->lockrw);
return (0);
}
static void
vsw_uninit_ldcs(vsw_port_t *port)
{
vsw_ldc_list_t *ldcl = &port->p_ldclist;
vsw_ldc_t *ldcp;
D1(NULL, "vsw_uninit_ldcs: enter\n");
READ_ENTER(&ldcl->lockrw);
ldcp = ldcl->head;
for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
vsw_ldc_uninit(ldcp);
}
RW_EXIT(&ldcl->lockrw);
D1(NULL, "vsw_uninit_ldcs: exit\n");
}
/*
* Wait until the callback(s) associated with the ldcs under the specified
* port have completed.
*
* Prior to this function being invoked each channel under this port
* should have been quiesced via ldc_set_cb_mode(DISABLE).
*
* A short explanation of what we are doing below...
*
* The simplest approach would be to have a reference counter in
* the ldc structure which is incremented/decremented by the callbacks as
* they use the channel. The drain function could then simply disable any
* further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
* there is a tiny window here - before the callback is able to get the lock
* on the channel it is interrupted and this function gets to execute. It
* sees that the ref count is zero and believes it is free to delete the
* associated data structures.
*
* We get around this by taking advantage of the fact that before the ldc
* framework invokes a callback it sets a flag to indicate that there is a
* callback active (or about to become active). If we attempt to
* unregister a callback while this active flag is set then the unregister
* will fail with EWOULDBLOCK.
*
* If the unregister fails we do a cv_timedwait. We will either be signaled
* by the callback as it is exiting (note we have to wait a short period to
* allow the callback to return fully to the ldc framework and it to clear
* the active flag), or by the timer expiring. In either case we again attempt
* the unregister. We repeat this until we can successfully unregister the
* callback.
*
* The reason we use a cv_timedwait rather than a simple cv_wait is to catch
* the case where the callback has finished but the ldc framework has not yet
* cleared the active flag. In this case we would never get a cv_signal.
*/
static void
vsw_drain_ldcs(vsw_port_t *port)
{
vsw_ldc_list_t *ldcl = &port->p_ldclist;
vsw_ldc_t *ldcp;
vsw_t *vswp = port->p_vswp;
D1(vswp, "%s: enter", __func__);
READ_ENTER(&ldcl->lockrw);
ldcp = ldcl->head;
for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
/*
* If we can unregister the channel callback then we
* know that there is no callback either running or
* scheduled to run for this channel so move on to next
* channel in the list.
*/
mutex_enter(&ldcp->drain_cv_lock);
/* prompt active callbacks to quit */
ldcp->drain_state = VSW_LDC_DRAINING;
if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
D2(vswp, "%s: unreg callback for chan %ld", __func__,
ldcp->ldc_id);
mutex_exit(&ldcp->drain_cv_lock);
continue;
} else {
/*
* If we end up here we know that either 1) a callback
* is currently executing, 2) is about to start (i.e.
* the ldc framework has set the active flag but
* has not actually invoked the callback yet, or 3)
* has finished and has returned to the ldc framework
* but the ldc framework has not yet cleared the
* active bit.
*
* Wait for it to finish.
*/
while (ldc_unreg_callback(ldcp->ldc_handle)
== EWOULDBLOCK)
(void) cv_reltimedwait(&ldcp->drain_cv,
&ldcp->drain_cv_lock, hz, TR_CLOCK_TICK);
mutex_exit(&ldcp->drain_cv_lock);
D2(vswp, "%s: unreg callback for chan %ld after "
"timeout", __func__, ldcp->ldc_id);
}
}
RW_EXIT(&ldcl->lockrw);
D1(vswp, "%s: exit", __func__);
}
/*
* Wait until all tasks which reference this port have completed.
*
* Prior to this function being invoked each channel under this port
* should have been quiesced via ldc_set_cb_mode(DISABLE).
*/
static void
vsw_drain_port_taskq(vsw_port_t *port)
{
vsw_t *vswp = port->p_vswp;
D1(vswp, "%s: enter", __func__);
/*
* Mark the port as in the process of being detached, and
* dispatch a marker task to the queue so we know when all
* relevant tasks have completed.
*/
mutex_enter(&port->state_lock);
port->state = VSW_PORT_DETACHING;
if ((vswp->taskq_p == NULL) ||
(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
port, DDI_NOSLEEP) != DDI_SUCCESS)) {
cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
vswp->instance);
mutex_exit(&port->state_lock);
return;
}
/*
* Wait for the marker task to finish.
*/
while (port->state != VSW_PORT_DETACHABLE)
cv_wait(&port->state_cv, &port->state_lock);
mutex_exit(&port->state_lock);
D1(vswp, "%s: exit", __func__);
}
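/*
 * Marker task, dispatched by vsw_drain_port_taskq() above. By the time
 * the taskq runs this entry, any previously dispatched tasks which
 * reference this port have completed, so the port can safely be marked
 * as detachable.
 */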
static void
vsw_marker_task(void *arg)
{
vsw_port_t *port = arg;
vsw_t *vswp = port->p_vswp;
D1(vswp, "%s: enter", __func__);
mutex_enter(&port->state_lock);
/*
* No further tasks should be dispatched which reference
* this port so ok to mark it as safe to detach.
*/
port->state = VSW_PORT_DETACHABLE;
cv_signal(&port->state_cv);
mutex_exit(&port->state_lock);
D1(vswp, "%s: exit", __func__);
}
vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
vsw_port_list_t *plist = &vswp->plist;
vsw_port_t *port;
for (port = plist->head; port != NULL; port = port->p_next) {
if (port->p_instance == p_instance) {
D2(vswp, "vsw_lookup_port: found p_instance\n");
return (port);
}
}
return (NULL);
}
void
vsw_vlan_unaware_port_reset(vsw_port_t *portp)
{
vsw_ldc_list_t *ldclp;
vsw_ldc_t *ldcp;
ldclp = &portp->p_ldclist;
READ_ENTER(&ldclp->lockrw);
/*
* NOTE: for now, we will assume we have a single channel.
*/
if (ldclp->head == NULL) {
RW_EXIT(&ldclp->lockrw);
return;
}
ldcp = ldclp->head;
mutex_enter(&ldcp->ldc_cblock);
/*
* If the peer is vlan unaware (ver < 1.3), reset the channel and terminate
* the connection. See comments in vsw_set_vnet_proto_ops().
*/
if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
portp->nvids != 0) {
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
}
mutex_exit(&ldcp->ldc_cblock);
RW_EXIT(&ldclp->lockrw);
}
void
vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
{
vsw_ldc_list_t *ldclp;
vsw_ldc_t *ldcp;
ldclp = &portp->p_ldclist;
READ_ENTER(&ldclp->lockrw);
/*
* NOTE: for now, we will assume we have a single channel.
*/
if (ldclp->head == NULL) {
RW_EXIT(&ldclp->lockrw);
return;
}
ldcp = ldclp->head;
mutex_enter(&ldcp->ldc_cblock);
/*
* If the peer is HybridIO capable (ver >= 1.3), reset the channel
* to trigger re-negotiation, which in turn triggers HybridIO
* setup/cleanup.
*/
if ((ldcp->hphase == VSW_MILESTONE4) &&
(portp->p_hio_capable == B_TRUE)) {
if (immediate == B_TRUE) {
(void) ldc_down(ldcp->ldc_handle);
} else {
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
}
}
mutex_exit(&ldcp->ldc_cblock);
RW_EXIT(&ldclp->lockrw);
}
void
vsw_port_reset(vsw_port_t *portp)
{
vsw_ldc_list_t *ldclp;
vsw_ldc_t *ldcp;
ldclp = &portp->p_ldclist;
READ_ENTER(&ldclp->lockrw);
/*
* NOTE: for now, we will assume we have a single channel.
*/
if (ldclp->head == NULL) {
RW_EXIT(&ldclp->lockrw);
return;
}
ldcp = ldclp->head;
mutex_enter(&ldcp->ldc_cblock);
/*
* reset channel and terminate the connection.
*/
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
mutex_exit(&ldcp->ldc_cblock);
RW_EXIT(&ldclp->lockrw);
}
void
vsw_reset_ports(vsw_t *vswp)
{
vsw_port_list_t *plist = &vswp->plist;
vsw_port_t *portp;
READ_ENTER(&plist->lockrw);
for (portp = plist->head; portp != NULL; portp = portp->p_next) {
if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
vsw_hio_stop_port(portp);
}
vsw_port_reset(portp);
}
RW_EXIT(&plist->lockrw);
}
static void
vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
{
vnet_physlink_msg_t msg;
vnet_physlink_msg_t *msgp = &msg;
uint32_t physlink_info = 0;
if (plink_state == LINK_STATE_UP) {
physlink_info |= VNET_PHYSLINK_STATE_UP;
} else {
physlink_info |= VNET_PHYSLINK_STATE_DOWN;
}
msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
msgp->tag.vio_sid = ldcp->local_session;
msgp->physlink_info = physlink_info;
(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
}
static void
vsw_port_physlink_update(vsw_port_t *portp)
{
vsw_ldc_list_t *ldclp;
vsw_ldc_t *ldcp;
vsw_t *vswp;
vswp = portp->p_vswp;
ldclp = &portp->p_ldclist;
READ_ENTER(&ldclp->lockrw);
/*
* NOTE: for now, we will assume we have a single channel.
*/
if (ldclp->head == NULL) {
RW_EXIT(&ldclp->lockrw);
return;
}
ldcp = ldclp->head;
mutex_enter(&ldcp->ldc_cblock);
/*
* If handshake has completed successfully and if the vnet device
* has negotiated to get physical link state updates, send a message
* with the current state.
*/
if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
}
mutex_exit(&ldcp->ldc_cblock);
RW_EXIT(&ldclp->lockrw);
}
void
vsw_physlink_update_ports(vsw_t *vswp)
{
vsw_port_list_t *plist = &vswp->plist;
vsw_port_t *portp;
READ_ENTER(&plist->lockrw);
for (portp = plist->head; portp != NULL; portp = portp->p_next) {
vsw_port_physlink_update(portp);
}
RW_EXIT(&plist->lockrw);
}
/*
* Search for and remove the specified port from the port
* list. Returns 0 if able to locate and remove port, otherwise
* returns 1.
*/
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
vsw_port_list_t *plist = &vswp->plist;
vsw_port_t *curr_p, *prev_p;
if (plist->head == NULL)
return (1);
curr_p = prev_p = plist->head;
while (curr_p != NULL) {
if (curr_p == port) {
if (prev_p == curr_p) {
plist->head = curr_p->p_next;
} else {
prev_p->p_next = curr_p->p_next;
}
plist->num_ports--;
break;
} else {
prev_p = curr_p;
curr_p = curr_p->p_next;
}
}
/* return 1 if the port was not found in the list */
return (curr_p == NULL ? 1 : 0);
}
/*
* Interrupt handler for ldc messages.
*/
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
mutex_enter(&ldcp->ldc_cblock);
ldcp->ldc_stats.callbacks++;
mutex_enter(&ldcp->status_lock);
if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
mutex_exit(&ldcp->status_lock);
mutex_exit(&ldcp->ldc_cblock);
return (LDC_SUCCESS);
}
mutex_exit(&ldcp->status_lock);
if (event & LDC_EVT_UP) {
/*
* Channel has come up.
*/
D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
__func__, ldcp->ldc_id, event, ldcp->ldc_status);
vsw_process_conn_evt(ldcp, VSW_CONN_UP);
ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
}
if (event & LDC_EVT_READ) {
/*
* Data available for reading.
*/
D2(vswp, "%s: id(ld) event(%llx) data READ",
__func__, ldcp->ldc_id, event);
if (ldcp->rx_thread != NULL) {
/*
* If the receive thread is enabled, then
* wakeup the receive thread to process the
* LDC messages.
*/
mutex_exit(&ldcp->ldc_cblock);
mutex_enter(&ldcp->rx_thr_lock);
if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
cv_signal(&ldcp->rx_thr_cv);
}
mutex_exit(&ldcp->rx_thr_lock);
mutex_enter(&ldcp->ldc_cblock);
} else {
vsw_process_pkt(ldcp);
}
ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
goto vsw_cb_exit;
}
if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
__func__, ldcp->ldc_id, event, ldcp->ldc_status);
vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
}
/*
* Catch either LDC_EVT_WRITE which we don't support or any
* unknown event.
*/
if (event &
~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
__func__, ldcp->ldc_id, event, ldcp->ldc_status);
}
vsw_cb_exit:
mutex_exit(&ldcp->ldc_cblock);
/*
* Let the drain function know we are finishing if it
* is waiting.
*/
mutex_enter(&ldcp->drain_cv_lock);
if (ldcp->drain_state == VSW_LDC_DRAINING)
cv_signal(&ldcp->drain_cv);
mutex_exit(&ldcp->drain_cv_lock);
return (LDC_SUCCESS);
}
/*
* Reinitialise data structures associated with the channel.
*/
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
vsw_t *vswp = ldcp->ldc_vswp;
vsw_port_t *port;
vsw_ldc_list_t *ldcl;
vio_mblk_pool_t *fvmp = NULL;
D1(vswp, "%s: enter", __func__);
/*
* If we can't destroy all the rx pools for this channel, dispatch
* a task to retry and clean up those rx pools. Note that we don't
* need to wait for the task to complete. If the vsw device itself
* gets detached (vsw_detach()), it will wait for the task to complete
* implicitly in ddi_taskq_destroy().
*/
vio_destroy_multipools(&ldcp->vmp, &fvmp);
if (fvmp != NULL) {
(void) ddi_taskq_dispatch(vswp->rxp_taskq,
vsw_destroy_rxpools, fvmp, DDI_SLEEP);
}
port = ldcp->ldc_port;
ldcl = &port->p_ldclist;
READ_ENTER(&ldcl->lockrw);
D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
ldcp->lane_in.lstate, ldcp->lane_out.lstate);
vsw_free_lane_resources(ldcp, INBOUND);
vsw_free_lane_resources(ldcp, OUTBOUND);
RW_EXIT(&ldcl->lockrw);
ldcp->lane_in.lstate = 0;
ldcp->lane_out.lstate = 0;
/* Remove the fdb entry for this port/mac address */
vsw_fdbe_del(vswp, &(port->p_macaddr));
/* remove the port from vlans it has been assigned to */
vsw_vlan_remove_ids(port, VSW_VNETPORT);
/*
* Remove parent port from any multicast groups
* it may have registered with. Client must resend
* multicast add command after handshake completes.
*/
vsw_del_mcst_port(port);
ldcp->peer_session = 0;
ldcp->session_status = 0;
ldcp->hcnt = 0;
ldcp->hphase = VSW_MILESTONE0;
vsw_reset_vnet_proto_ops(ldcp);
D1(vswp, "%s: exit", __func__);
}
/*
* Process a connection event.
*
* Note - care must be taken to ensure that this function is
* not called with the dlistrw lock held.
*/
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
vsw_t *vswp = ldcp->ldc_vswp;
vsw_conn_evt_t *conn = NULL;
D1(vswp, "%s: enter", __func__);
/*
* Check if either a reset or restart event is pending
* or in progress. If so just return.
*
* A VSW_CONN_RESET event originates either with an LDC_RESET_EVT
* being received by the callback handler, or with an ECONNRESET error
* code being returned from an ldc_read() or ldc_write() call.
*
* A VSW_CONN_RESTART event occurs when some error checking code
* decides that there is a problem with data from the channel,
* and that the handshake should be restarted.
*/
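/*
 * ldstub() atomically sets the reset_active byte and returns its
 * previous value, so of any concurrent reset/restart requests only the
 * first proceeds; the flag is cleared again once the event has been
 * fully processed.
 */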
if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
(ldstub((uint8_t *)&ldcp->reset_active)))
return;
/*
* If it is an LDC_UP event we first check the recorded
* state of the channel. If this is UP then we know that
* the channel moving to the UP state has already been dealt
* with and don't need to dispatch a new task.
*
* The reason for this check is that when we do an ldc_up(),
* depending on the state of the peer, we may or may not get
* an LDC_UP event. As we can't depend on getting an LDC_UP evt
* every time we do ldc_up() we explicitly check the channel
* status to see whether it has in fact come up (ldc_up() is
* asynchronous and will complete at some undefined time), and
* take the appropriate action.
*
* The flip side of this is that we may get a LDC_UP event
* when we have already seen that the channel is up and have
* dealt with that.
*/
mutex_enter(&ldcp->status_lock);
if (evt == VSW_CONN_UP) {
if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
mutex_exit(&ldcp->status_lock);
return;
}
}
mutex_exit(&ldcp->status_lock);
/*
* The transaction group id allows us to identify and discard
* any tasks which are still pending on the taskq and refer
* to the handshake session we are about to restart or reset.
* These stale messages no longer have any real meaning.
*/
(void) atomic_inc_32(&ldcp->hss_id);
ASSERT(vswp->taskq_p != NULL);
if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
" connection event", vswp->instance);
goto err_exit;
}
conn->evt = evt;
conn->ldcp = ldcp;
if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
DDI_NOSLEEP) != DDI_SUCCESS) {
cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
vswp->instance);
kmem_free(conn, sizeof (vsw_conn_evt_t));
goto err_exit;
}
D1(vswp, "%s: exit", __func__);
return;
err_exit:
/*
* We have most likely failed due to memory shortage. Clear the flag so
* that future requests will at least be attempted and will hopefully
* succeed.
*/
if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
ldcp->reset_active = 0;
}
/*
* Deal with events relating to a connection. Invoked from a taskq.
*/
static void
vsw_conn_task(void *arg)
{
vsw_conn_evt_t *conn = (vsw_conn_evt_t *)arg;
vsw_ldc_t *ldcp = NULL;
vsw_port_t *portp;
vsw_t *vswp = NULL;
uint16_t evt;
ldc_status_t curr_status;
ldcp = conn->ldcp;
evt = conn->evt;
vswp = ldcp->ldc_vswp;
portp = ldcp->ldc_port;
D1(vswp, "%s: enter", __func__);
/* can safely free now have copied out data */
kmem_free(conn, sizeof (vsw_conn_evt_t));
mutex_enter(&ldcp->status_lock);
if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
"channel %ld", vswp->instance, ldcp->ldc_id);
mutex_exit(&ldcp->status_lock);
return;
}
/*
* If we wish to restart the handshake on this channel, then if
* the channel is UP we bring it DOWN to flush the underlying
* ldc queue.
*/
if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
(void) ldc_down(ldcp->ldc_handle);
if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
vsw_hio_stop(vswp, ldcp);
}
/*
* re-init all the associated data structures.
*/
vsw_ldc_reinit(ldcp);
/*
* Bring the channel back up (note it does no harm to
* do this even if the channel is already UP; it just
* becomes effectively a no-op).
*/
(void) ldc_up(ldcp->ldc_handle);
/*
* Check if channel is now UP. This will only happen if
* peer has also done a ldc_up().
*/
if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
"channel %ld", vswp->instance, ldcp->ldc_id);
mutex_exit(&ldcp->status_lock);
return;
}
ldcp->ldc_status = curr_status;
/* channel UP so restart handshake by sending version info */
if (curr_status == LDC_UP) {
if (ldcp->hcnt++ > vsw_num_handshakes) {
cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
" handshake attempts (%d) on channel %ld",
vswp->instance, ldcp->hcnt, ldcp->ldc_id);
mutex_exit(&ldcp->status_lock);
return;
}
if (vsw_obp_ver_proto_workaround == B_FALSE &&
(ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
DDI_NOSLEEP) != DDI_SUCCESS)) {
cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
vswp->instance);
/*
* Don't count this as a valid restart attempt if we
* couldn't send the version msg.
*/
if (ldcp->hcnt > 0)
ldcp->hcnt--;
}
}
/*
* Mark that the process is complete by clearing the flag.
*
* Note it is possible that the taskq dispatch above failed, most
* likely due to memory shortage. We still clear the flag so that
* future attempts will at least be made and will hopefully succeed.
*/
if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
ldcp->reset_active = 0;
mutex_exit(&ldcp->status_lock);
D1(vswp, "%s: exit", __func__);
}
/*
* Returns 0 if it is legal for the event signified by the flag to have
* occurred at the time it did. Otherwise returns 1.
*/
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
vsw_t *vswp = ldcp->ldc_vswp;
uint64_t state;
uint64_t phase;
if (dir == INBOUND)
state = ldcp->lane_in.lstate;
else
state = ldcp->lane_out.lstate;
phase = ldcp->hphase;
switch (flag) {
case VSW_VER_INFO_RECV:
if (phase > VSW_MILESTONE0) {
DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
" when in state %d\n", ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
}
break;
case VSW_VER_ACK_RECV:
case VSW_VER_NACK_RECV:
if (!(state & VSW_VER_INFO_SENT)) {
DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
"VER_NACK when in state %d\n", ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
} else
state &= ~VSW_VER_INFO_SENT;
break;
case VSW_ATTR_INFO_RECV:
if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
" when in state %d\n", ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
}
break;
case VSW_ATTR_ACK_RECV:
case VSW_ATTR_NACK_RECV:
if (!(state & VSW_ATTR_INFO_SENT)) {
DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
" or ATTR_NACK when in state %d\n",
ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
} else
state &= ~VSW_ATTR_INFO_SENT;
break;
case VSW_DRING_INFO_RECV:
if (phase < VSW_MILESTONE1) {
DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
" when in state %d\n", ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
}
break;
case VSW_DRING_ACK_RECV:
case VSW_DRING_NACK_RECV:
if (!(state & VSW_DRING_INFO_SENT)) {
DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
" or DRING_NACK when in state %d\n",
ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
} else
state &= ~VSW_DRING_INFO_SENT;
break;
case VSW_RDX_INFO_RECV:
if (phase < VSW_MILESTONE3) {
DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
" when in state %d\n", ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
}
break;
case VSW_RDX_ACK_RECV:
case VSW_RDX_NACK_RECV:
if (!(state & VSW_RDX_INFO_SENT)) {
DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
"RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
} else
state &= ~VSW_RDX_INFO_SENT;
break;
case VSW_MCST_INFO_RECV:
if (phase < VSW_MILESTONE3) {
DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
" when in state %d\n", ldcp->ldc_id, phase);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return (1);
}
break;
default:
DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
ldcp->ldc_id, flag);
return (1);
}
if (dir == INBOUND)
ldcp->lane_in.lstate = state;
else
ldcp->lane_out.lstate = state;
D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
return (0);
}
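/*
 * Move the handshake on to its next phase if the requirements of the
 * current phase have been met. The milestones are, in order: version
 * exchange (0), attribute exchange (1), dring registration (2), RDX
 * exchange (3) and, finally, an established connection (4) over which
 * data may flow.
 */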
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
vsw_t *vswp = ldcp->ldc_vswp;
vsw_port_t *portp = ldcp->ldc_port;
D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
ldcp->ldc_id, ldcp->hphase);
DUMP_FLAGS(ldcp->lane_in.lstate);
DUMP_FLAGS(ldcp->lane_out.lstate);
switch (ldcp->hphase) {
case VSW_MILESTONE0:
/*
* If we haven't started to handshake with our peer,
* start to do so now.
*/
if (ldcp->lane_out.lstate == 0) {
D2(vswp, "%s: (chan %lld) starting handshake "
"with peer", __func__, ldcp->ldc_id);
vsw_process_conn_evt(ldcp, VSW_CONN_UP);
}
/*
* Only way to pass this milestone is to have successfully
* negotiated version info.
*/
if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
D2(vswp, "%s: (chan %lld) leaving milestone 0",
__func__, ldcp->ldc_id);
vsw_set_vnet_proto_ops(ldcp);
/*
* Next milestone is passed when attribute
* information has been successfully exchanged.
*/
ldcp->hphase = VSW_MILESTONE1;
vsw_send_attr(ldcp);
}
break;
case VSW_MILESTONE1:
/*
* Only way to pass this milestone is to have successfully
* negotiated attribute information.
*/
if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
ldcp->hphase = VSW_MILESTONE2;
/*
* If the peer device has said it wishes to
* use descriptor rings then we send it our ring
* info, otherwise we just set up a private ring
* which uses an internal buffer.
*/
if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
(ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
(VSW_VER_LT(ldcp, 1, 2) &&
(ldcp->lane_in.xfer_mode ==
VIO_DRING_MODE_V1_0))) {
vsw_send_dring_info(ldcp);
}
}
break;
case VSW_MILESTONE2:
/*
* If peer has indicated in its attribute message that
* it wishes to use descriptor rings then the only way
* to pass this milestone is for us to have received
* valid dring info.
*
* If peer is not using descriptor rings then just fall
* through.
*/
if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
(ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
(VSW_VER_LT(ldcp, 1, 2) &&
(ldcp->lane_in.xfer_mode ==
VIO_DRING_MODE_V1_0))) {
if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
break;
}
D2(vswp, "%s: (chan %lld) leaving milestone 2",
__func__, ldcp->ldc_id);
ldcp->hphase = VSW_MILESTONE3;
vsw_send_rdx(ldcp);
break;
case VSW_MILESTONE3:
/*
* Pass this milestone when all parameters have been
* successfully exchanged and RDX sent in both directions.
*
* Mark outbound lane as available to transmit data.
*/
if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
(ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
D2(vswp, "%s: (chan %lld) leaving milestone 3",
__func__, ldcp->ldc_id);
D2(vswp, "%s: ** handshake complete (0x%llx : "
"0x%llx) **", __func__, ldcp->lane_in.lstate,
ldcp->lane_out.lstate);
ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
ldcp->hphase = VSW_MILESTONE4;
ldcp->hcnt = 0;
DISPLAY_STATE();
/* Start HIO if enabled and capable */
if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
D2(vswp, "%s: start HybridIO setup", __func__);
vsw_hio_start(vswp, ldcp);
}
if (ldcp->pls_negotiated == B_TRUE) {
/*
* The vnet device has negotiated to get phys
* link updates. Now that the handshake with
* the vnet device is complete, send an initial
* update with the current physical link state.
*/
vsw_send_physlink_msg(ldcp,
vswp->phys_link_state);
}
} else {
D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
__func__, ldcp->lane_in.lstate,
ldcp->lane_out.lstate);
}
break;
case VSW_MILESTONE4:
D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
ldcp->ldc_id);
break;
default:
DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
ldcp->ldc_id, ldcp->hphase);
}
D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
ldcp->hphase);
}
/*
* Check if major version is supported.
*
* Returns 0 if it finds a supported major number, adjusting the
* minor field if necessary.
*
* Returns 1 if it can't match the major number exactly. Sets major/minor
* to the next lowest supported values, or to zero if no other values are
* possible.
*/
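/*
 * For example, with vsw_versions[] = { {1, 5} }: a peer proposing 1.7
 * has its minor field adjusted down to 5 and 0 is returned; a peer
 * proposing 2.0 has both fields rewritten to 1.5 and 1 is returned so
 * that negotiation is retried at that version; a peer proposing 0.9
 * matches nothing, so both fields are zeroed and 1 is returned.
 */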
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
int i;
D1(NULL, "vsw_supported_version: enter");
for (i = 0; i < VSW_NUM_VER; i++) {
if (vsw_versions[i].ver_major == vp->ver_major) {
/*
* Matching or lower major version found. Update
* minor number if necessary.
*/
if (vp->ver_minor > vsw_versions[i].ver_minor) {
D2(NULL, "%s: adjusting minor value from %d "
"to %d", __func__, vp->ver_minor,
vsw_versions[i].ver_minor);
vp->ver_minor = vsw_versions[i].ver_minor;
}
return (0);
}
/*
* If the message contains a higher major version number, set
* the message's major/minor versions to the current values
* and return 1, so this message will get resent with
* these values.
*/
if (vsw_versions[i].ver_major < vp->ver_major) {
D2(NULL, "%s: adjusting major and minor "
"values to %d, %d\n",
__func__, vsw_versions[i].ver_major,
vsw_versions[i].ver_minor);
vp->ver_major = vsw_versions[i].ver_major;
vp->ver_minor = vsw_versions[i].ver_minor;
return (1);
}
}
/* No match was possible, zero out fields */
vp->ver_major = 0;
vp->ver_minor = 0;
D1(NULL, "vsw_supported_version: exit");
return (1);
}
/*
* Set vnet-protocol-version dependent functions based on version.
*/
static void
vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
{
vsw_t *vswp = ldcp->ldc_vswp;
lane_t *lp = &ldcp->lane_out;
if (VSW_VER_GTEQ(ldcp, 1, 4)) {
/*
* If the version negotiated with the peer is >= 1.4 (Jumbo Frame
* Support), set the mtu in our attributes to max_frame_size.
*/
lp->mtu = vswp->max_frame_size;
} else if (VSW_VER_EQ(ldcp, 1, 3)) {
/*
* If the version negotiated with the peer is == 1.3 (Vlan Tag
* Support), set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
*/
lp->mtu = ETHERMAX + VLAN_TAGSZ;
} else {
vsw_port_t *portp = ldcp->ldc_port;
/*
* Pre-1.3 peers expect max frame size of ETHERMAX.
* We can negotiate that size with those peers provided only
* pvid is defined for our peer and there are no vids. Then we
* can send/recv only untagged frames of max size ETHERMAX.
* Note that pvid of the peer can be different, as vsw has to
* serve the vnet in that vlan even if it is not itself assigned
* to that vlan.
*/
if (portp->nvids == 0) {
lp->mtu = ETHERMAX;
}
}
if (VSW_VER_GTEQ(ldcp, 1, 2)) {
/* Versions >= 1.2 */
if (VSW_PRI_ETH_DEFINED(vswp)) {
/*
* enable priority routines and pkt mode only if
* at least one pri-eth-type is specified in MD.
*/
ldcp->tx = vsw_ldctx_pri;
ldcp->rx_pktdata = vsw_process_pkt_data;
/* set xfer mode for vsw_send_attr() */
lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
} else {
/* no priority eth types defined in MD */
ldcp->tx = vsw_ldctx;
ldcp->rx_pktdata = vsw_process_pkt_data_nop;
/* set xfer mode for vsw_send_attr() */
lp->xfer_mode = VIO_DRING_MODE_V1_2;
}
} else {
/* Versions prior to 1.2 */
vsw_reset_vnet_proto_ops(ldcp);
}
}
/*
* Reset vnet-protocol-version dependent functions to v1.0.
*/
static void
vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
{
lane_t *lp = &ldcp->lane_out;
ldcp->tx = vsw_ldctx;
ldcp->rx_pktdata = vsw_process_pkt_data_nop;
/* set xfer mode for vsw_send_attr() */
lp->xfer_mode = VIO_DRING_MODE_V1_0;
}
/*
* Main routine for processing messages received over LDC.
*/
static void
vsw_process_pkt(void *arg)
{
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
vsw_t *vswp = ldcp->ldc_vswp;
size_t msglen;
vio_msg_tag_t *tagp;
uint64_t *ldcmsg;
int rv = 0;
D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
ldcmsg = ldcp->ldcmsg;
/*
* If the channel is up, read messages until the channel is empty.
*/
do {
msglen = ldcp->msglen;
rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
if (rv != 0) {
DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
__func__, ldcp->ldc_id, rv, msglen);
}
/* channel has been reset */
if (rv == ECONNRESET) {
vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
break;
}
if (msglen == 0) {
D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
ldcp->ldc_id);
break;
}
D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
ldcp->ldc_id, msglen);
/*
* Figure out what sort of packet we have gotten by
* examining the msg tag, and then switch it appropriately.
*/
tagp = (vio_msg_tag_t *)ldcmsg;
switch (tagp->vio_msgtype) {
case VIO_TYPE_CTRL:
vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
break;
case VIO_TYPE_DATA:
vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
break;
case VIO_TYPE_ERR:
vsw_process_err_pkt(ldcp, ldcmsg, tagp);
break;
default:
DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
"id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
break;
}
} while (msglen);
D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
}
/*
* Dispatch a task to process a VIO control message.
*/
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
{
vsw_ctrl_task_t *ctaskp = NULL;
vsw_port_t *port = ldcp->ldc_port;
vsw_t *vswp = port->p_vswp;
D1(vswp, "%s: enter", __func__);
/*
* We need to handle RDX ACK messages in-band as once they
* are exchanged it is possible that we will get an
* immediate (legitimate) data packet.
*/
if ((tagp->vio_subtype_env == VIO_RDX) &&
(tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
return;
ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
D2(vswp, "%s (%ld) handling RDX_ACK in place "
"(ostate 0x%llx : hphase %d)", __func__,
ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
vsw_next_milestone(ldcp);
return;
}
ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
if (ctaskp == NULL) {
DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
ctaskp->ldcp = ldcp;
bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
ctaskp->hss_id = ldcp->hss_id;
/*
* Dispatch task to processing taskq if port is not in
* the process of being detached.
*/
mutex_enter(&port->state_lock);
if (port->state == VSW_PORT_INIT) {
if ((vswp->taskq_p == NULL) ||
(ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
mutex_exit(&port->state_lock);
DERR(vswp, "%s: unable to dispatch task to taskq",
__func__);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
return;
}
} else {
kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
DWARN(vswp, "%s: port %d detaching, not dispatching "
"task", __func__, port->p_instance);
}
mutex_exit(&port->state_lock);
D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
ldcp->ldc_id);
D1(vswp, "%s: exit", __func__);
}
/*
* Process a VIO ctrl message. Invoked from taskq.
*/
static void
vsw_process_ctrl_pkt(void *arg)
{
vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg;
vsw_ldc_t *ldcp = ctaskp->ldcp;
vsw_t *vswp = ldcp->ldc_vswp;
vio_msg_tag_t tag;
uint16_t env;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
env = tag.vio_subtype_env;
/* stale pkt check */
if (ctaskp->hss_id < ldcp->hss_id) {
DWARN(vswp, "%s: discarding stale packet belonging to earlier"
" (%ld) handshake session", __func__, ctaskp->hss_id);
kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
return;
}
/* session id check */
if (ldcp->session_status & VSW_PEER_SESSION) {
if (ldcp->peer_session != tag.vio_sid) {
DERR(vswp, "%s (chan %d): invalid session id (%llx)",
__func__, ldcp->ldc_id, tag.vio_sid);
kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
}
/*
* Switch on vio_subtype envelope, then let lower routines
* decide if it's an INFO, ACK or NACK packet.
*/
switch (env) {
case VIO_VER_INFO:
vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_DRING_REG:
vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_DRING_UNREG:
vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_ATTR_INFO:
vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
break;
case VNET_MCAST_INFO:
vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_RDX:
vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_DDS_INFO:
vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
break;
case VNET_PHYSLINK_INFO:
vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
break;
default:
DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
}
kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
* Version negotiation. We can end up here either because our peer
* has responded to a handshake message we have sent it, or our peer
* has initiated a handshake with us. If it's the former then it can
* only be an ACK or NACK; if it's the latter it can only be an INFO.
*
* If it's an ACK we move to the next stage of the handshake, namely
* attribute exchange. If it's a NACK we see if we can specify another
* version; if we can't, we stop.
*
* If it is an INFO we reset all params associated with communication
* in that direction over this channel (remember connection is
* essentially 2 independent simplex channels).
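*
* An illustrative exchange (assuming the highest version we support
* is 1.5): peer sends INFO 1.7 -> we NACK with 1.5; peer resends
* INFO 1.5 -> we ACK and move on to attribute exchange.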
*/
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vio_ver_msg_t *ver_pkt;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
/*
* We know this is a ctrl/version packet so
* cast it into the correct structure.
*/
ver_pkt = (vio_ver_msg_t *)pkt;
switch (ver_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
/*
* Record the session id, which we will use from now
* until we see another VER_INFO msg. Even then the
* session id in most cases will be unchanged, except
* if the channel was reset.
*/
if ((ldcp->session_status & VSW_PEER_SESSION) &&
(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
DERR(vswp, "%s: updating session id for chan %lld "
"from %llx to %llx", __func__, ldcp->ldc_id,
ldcp->peer_session, ver_pkt->tag.vio_sid);
}
ldcp->peer_session = ver_pkt->tag.vio_sid;
ldcp->session_status |= VSW_PEER_SESSION;
/* Legal message at this time ? */
if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
return;
/*
* First check the device class. Currently only expect
* to be talking to a network device. In the future may
* also talk to another switch.
*/
if (ver_pkt->dev_class != VDEV_NETWORK) {
DERR(vswp, "%s: illegal device class %d", __func__,
ver_pkt->dev_class);
ver_pkt->tag.vio_sid = ldcp->local_session;
ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
(void) vsw_send_msg(ldcp, (void *)ver_pkt,
sizeof (vio_ver_msg_t), B_TRUE);
ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
vsw_next_milestone(ldcp);
return;
} else {
ldcp->dev_class = ver_pkt->dev_class;
}
/*
* Now check the version.
*/
if (vsw_supported_version(ver_pkt) == 0) {
/*
* Support this major version and possibly
* adjusted minor version.
*/
D2(vswp, "%s: accepted ver %d:%d", __func__,
ver_pkt->ver_major, ver_pkt->ver_minor);
/* Store accepted values */
ldcp->lane_in.ver_major = ver_pkt->ver_major;
ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
if (vsw_obp_ver_proto_workaround == B_TRUE) {
/*
* Send a version info message
* using the accepted version that
* we are about to ack. Also note that
* we send our ver info before we ack.
* Otherwise, as soon as it receives
* the ack, obp sends an attr info msg, which
* breaks vsw_check_flag() invoked
* from vsw_process_ctrl_attr_pkt();
* as we also need VSW_VER_ACK_RECV to
* be set in lane_out.lstate, before
* we can receive attr info.
*/
vsw_send_ver(ldcp);
}
} else {
/*
* NACK back with the next lower major/minor
* pairing we support (if we don't support any more
* versions then they will be set to zero).
*/
D2(vswp, "%s: replying with ver %d:%d", __func__,
ver_pkt->ver_major, ver_pkt->ver_minor);
/* Store updated values */
ldcp->lane_in.ver_major = ver_pkt->ver_major;
ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
}
DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
ver_pkt->tag.vio_sid = ldcp->local_session;
(void) vsw_send_msg(ldcp, (void *)ver_pkt,
sizeof (vio_ver_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_ACK:
D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
return;
/* Store updated values */
ldcp->lane_out.ver_major = ver_pkt->ver_major;
ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_NACK:
D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
return;
/*
* If our peer sent us a NACK with the ver fields set to
* zero then there is nothing more we can do. Otherwise see
* if we support either the version suggested, or a lesser
* one.
*/
if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
DERR(vswp, "%s: peer unable to negotiate any "
"further.", __func__);
ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
vsw_next_milestone(ldcp);
return;
}
/*
* Check to see if we support this major version or
* a lower one. If we don't then maj/min will be set
* to zero.
*/
(void) vsw_supported_version(ver_pkt);
if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
/* Nothing more we can do */
DERR(vswp, "%s: version negotiation failed.\n",
__func__);
ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
vsw_next_milestone(ldcp);
} else {
/* found a supported major version */
ldcp->lane_out.ver_major = ver_pkt->ver_major;
ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
D2(vswp, "%s: resending with updated values (%x, %x)",
__func__, ver_pkt->ver_major, ver_pkt->ver_minor);
ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
ver_pkt->tag.vio_sid = ldcp->local_session;
ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
(void) vsw_send_msg(ldcp, (void *)ver_pkt,
sizeof (vio_ver_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
}
break;
default:
DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
ver_pkt->tag.vio_subtype);
}
D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}
/*
* Process an attribute packet. We can end up here either because our peer
* has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
* peer has sent us an attribute INFO message.
*
* If it's an ACK we then move to the next stage of the handshake which
* is to send our descriptor ring info to our peer. If it's a NACK then
* there is nothing more we can (currently) do.
*
* If we get a valid/acceptable INFO packet (and we have already negotiated
* a version) we ACK back and set the channel state to ATTR_RECV, otherwise
* we NACK back and reset the channel state to INACTIVE.
*
* FUTURE: in time we will probably negotiate over attributes, but for
* the moment unacceptable attributes are regarded as a fatal error.
*
*/
void
vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vnet_attr_msg_t *attr_pkt;
vsw_t *vswp = ldcp->ldc_vswp;
vsw_port_t *port = ldcp->ldc_port;
uint64_t macaddr = 0;
lane_t *lane_out = &ldcp->lane_out;
lane_t *lane_in = &ldcp->lane_in;
uint32_t mtu;
boolean_t ack = B_TRUE;
int i;
D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
/*
* We know this is a ctrl/attr packet so
* cast it into the correct structure.
*/
attr_pkt = (vnet_attr_msg_t *)pkt;
switch (attr_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
return;
/*
* If the attributes are unacceptable then we NACK back.
*/
if (vsw_check_attr(attr_pkt, ldcp)) {
ack = B_FALSE;
DERR(vswp, "%s (chan %d): invalid attributes",
__func__, ldcp->ldc_id);
} else {
if (VSW_VER_GTEQ(ldcp, 1, 4)) {
/*
* Versions >= 1.4:
* The mtu is negotiated down to the
* minimum of our mtu and peer's mtu.
*/
mtu = MIN(attr_pkt->mtu, vswp->max_frame_size);
/*
* If we have received an ack for the attr info
* that we sent, then check if the mtu computed
* above matches the mtu that the peer had ack'd
* (saved in local hparams). If they don't
* match, we fail the handshake.
*/
if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
if (mtu != lane_out->mtu) {
/* send NACK */
ack = B_FALSE;
}
} else {
/*
* Save the mtu computed above in our
* attr parameters, so it gets sent in
* the attr info from us to the peer.
*/
lane_out->mtu = mtu;
}
}
}
if (ack == B_FALSE) {
vsw_free_lane_resources(ldcp, INBOUND);
attr_pkt->tag.vio_sid = ldcp->local_session;
attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
(void) vsw_send_msg(ldcp, (void *)attr_pkt,
sizeof (vnet_attr_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
return;
}
/*
* Otherwise store attributes for this lane and update
* lane state.
*/
lane_in->mtu = attr_pkt->mtu;
lane_in->addr = attr_pkt->addr;
lane_in->addr_type = attr_pkt->addr_type;
lane_in->xfer_mode = attr_pkt->xfer_mode;
lane_in->ack_freq = attr_pkt->ack_freq;
lane_in->physlink_update = attr_pkt->physlink_update;
/*
* Check if the client has requested physlink state updates.
* If there is a physical device bound to this vswitch (L2
* mode), set the ack bits to indicate it is supported.
* Otherwise, set the nack bits.
*/
if (VSW_VER_GTEQ(ldcp, 1, 5)) { /* Protocol ver >= 1.5 */
/* Does the vnet need phys link state updates ? */
if ((lane_in->physlink_update &
PHYSLINK_UPDATE_STATE_MASK) ==
PHYSLINK_UPDATE_STATE) {
if (vswp->smode & VSW_LAYER2) {
/* is a net-dev assigned to us ? */
attr_pkt->physlink_update =
PHYSLINK_UPDATE_STATE_ACK;
ldcp->pls_negotiated = B_TRUE;
} else {
/* not in L2 mode */
attr_pkt->physlink_update =
PHYSLINK_UPDATE_STATE_NACK;
ldcp->pls_negotiated = B_FALSE;
}
} else {
attr_pkt->physlink_update =
PHYSLINK_UPDATE_NONE;
ldcp->pls_negotiated = B_FALSE;
}
} else {
/*
* physlink_update bits are ignored
* if set by clients < v1.5 protocol.
*/
attr_pkt->physlink_update = PHYSLINK_UPDATE_NONE;
ldcp->pls_negotiated = B_FALSE;
}
if (VSW_VER_GTEQ(ldcp, 1, 4)) {
/* save the negotiated MIN mtu in the msg we send in reply */
attr_pkt->mtu = mtu;
}
macaddr = lane_in->addr;
for (i = ETHERADDRL - 1; i >= 0; i--) {
port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
macaddr >>= 8;
}
/* create the fdb entry for this port/mac address */
vsw_fdbe_add(vswp, port);
/* add the port to the specified vlans */
vsw_vlan_add_ids(port, VSW_VNETPORT);
/* set up device-specific xmit routines */
mutex_enter(&port->tx_lock);
if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
(lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
(VSW_VER_LT(ldcp, 1, 2) &&
(lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
port->transmit = vsw_dringsend;
} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
vsw_create_privring(ldcp);
port->transmit = vsw_descrsend;
lane_out->xfer_mode = VIO_DESC_MODE;
}
/*
* HybridIO is supported only by vnet, not by OBP.
* So, set hio_capable to true only when in DRING mode.
*/
if (VSW_VER_GTEQ(ldcp, 1, 3) &&
(lane_in->xfer_mode != VIO_DESC_MODE)) {
(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
} else {
(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
}
mutex_exit(&port->tx_lock);
attr_pkt->tag.vio_sid = ldcp->local_session;
attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
lane_in->lstate |= VSW_ATTR_ACK_SENT;
(void) vsw_send_msg(ldcp, (void *)attr_pkt,
sizeof (vnet_attr_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_ACK:
D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
return;
if (VSW_VER_GTEQ(ldcp, 1, 4)) {
/*
* Versions >= 1.4:
* The ack msg sent by the peer contains the minimum of
* our mtu (that we had sent in our attr info) and the
* peer's mtu.
*
* If we have sent an ack for the attr info msg from
* the peer, check if the mtu that was computed then
* (saved in lane_out params) matches the mtu that the
* peer has ack'd. If they don't match, we fail the
* handshake.
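*
* For example (illustrative): if our mtu is 9216 and the
* peer's is 1500, both ends should converge on
* MIN(9216, 1500) = 1500.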
*/
if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
if (lane_out->mtu != attr_pkt->mtu) {
return;
}
} else {
/*
* If the mtu ack'd by the peer is > our mtu
* fail handshake. Otherwise, save the mtu, so
* we can validate it when we receive attr info
* from our peer.
*/
if (attr_pkt->mtu > lane_out->mtu) {
return;
}
lane_out->mtu = attr_pkt->mtu;
}
}
lane_out->lstate |= VSW_ATTR_ACK_RECV;
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_NACK:
D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
return;
lane_out->lstate |= VSW_ATTR_NACK_RECV;
vsw_next_milestone(ldcp);
break;
default:
DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
attr_pkt->tag.vio_subtype);
}
D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
/*
* Process a dring info packet. We can end up here either because our peer
* has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
* peer has sent us a dring INFO message.
*
* If we get a valid/acceptable INFO packet (and we have already negotiated
* a version) we ACK back and update the lane state, otherwise we NACK back.
*
* FUTURE: nothing to stop the client from sending us info on multiple
* drings, but for the moment we will just use the first one we are given.
*
*/
void
vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vio_dring_reg_msg_t *dring_pkt;
vsw_t *vswp = ldcp->ldc_vswp;
ldc_mem_info_t minfo;
dring_info_t *dp, *dbp;
int dring_found = 0;
/*
* We know this is a ctrl/dring packet so
* cast it into the correct structure.
*/
dring_pkt = (vio_dring_reg_msg_t *)pkt;
D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
switch (dring_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
return;
/*
* If the dring params are unacceptable then we NACK back.
*/
if (vsw_check_dring_info(dring_pkt)) {
DERR(vswp, "%s (%lld): invalid dring info",
__func__, ldcp->ldc_id);
vsw_free_lane_resources(ldcp, INBOUND);
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
return;
}
/*
* Otherwise, attempt to map in the dring using the
* cookie. If that succeeds we send back a unique dring
* identifier that the sending side will use in future
* to refer to this descriptor ring.
*/
dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
dp->num_descriptors = dring_pkt->num_descriptors;
dp->descriptor_size = dring_pkt->descriptor_size;
dp->options = dring_pkt->options;
dp->ncookies = dring_pkt->ncookies;
/*
* Note: should only get one cookie. Enforced in
* the ldc layer.
*/
bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
sizeof (ldc_mem_cookie_t));
D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
dp->num_descriptors, dp->descriptor_size);
D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
dp->options, dp->ncookies);
if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
dp->ncookies, dp->num_descriptors, dp->descriptor_size,
LDC_DIRECT_MAP, &(dp->handle))) != 0) {
DERR(vswp, "%s: dring_map failed\n", __func__);
kmem_free(dp, sizeof (dring_info_t));
vsw_free_lane_resources(ldcp, INBOUND);
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
return;
}
if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
DERR(vswp, "%s: dring_addr failed\n", __func__);
kmem_free(dp, sizeof (dring_info_t));
vsw_free_lane_resources(ldcp, INBOUND);
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
return;
} else {
/* store the address of the pub part of ring */
dp->pub_addr = minfo.vaddr;
/* cache the dring mtype */
dp->dring_mtype = minfo.mtype;
}
/* no private section as we are importing */
dp->priv_addr = NULL;
/*
* Using a simple monotonically increasing int for the
* ident at the moment.
*/
dp->ident = ldcp->next_ident;
ldcp->next_ident++;
dp->end_idx = 0;
dp->next = NULL;
/*
* Link it onto the end of the list of drings
* for this lane.
*/
if (ldcp->lane_in.dringp == NULL) {
D2(vswp, "%s: adding first INBOUND dring", __func__);
ldcp->lane_in.dringp = dp;
} else {
dbp = ldcp->lane_in.dringp;
while (dbp->next != NULL)
dbp = dbp->next;
dbp->next = dp;
}
/* acknowledge it */
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
dring_pkt->dring_ident = dp->ident;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_ACK:
D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
return;
/*
* Peer is acknowledging our dring info and will have
* sent us a dring identifier which we will use to
* refer to this ring w.r.t. our peer.
*/
dp = ldcp->lane_out.dringp;
if (dp != NULL) {
/*
* Find the ring this ident should be associated
* with.
*/
while (dp != NULL) {
if (vsw_dring_match(dp, dring_pkt)) {
dring_found = 1;
break;
}
dp = dp->next;
}
if (dring_found == 0) {
DERR(NULL, "%s: unrecognised ring cookie",
__func__);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
} else {
DERR(vswp, "%s: DRING ACK received but no drings "
"allocated", __func__);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
/* store ident */
dp->ident = dring_pkt->dring_ident;
ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_NACK:
D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
return;
ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
vsw_next_milestone(ldcp);
break;
default:
DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
dring_pkt->tag.vio_subtype);
}
D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
/*
* Process a request from peer to unregister a dring.
*
* For the moment we just restart the handshake if our
* peer endpoint attempts to unregister a dring.
*/
void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vsw_t *vswp = ldcp->ldc_vswp;
vio_dring_unreg_msg_t *dring_pkt;
/*
* We know this is a ctrl/dring packet so
* cast it into the correct structure.
*/
dring_pkt = (vio_dring_unreg_msg_t *)pkt;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
switch (dring_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
DWARN(vswp, "%s: restarting handshake..", __func__);
break;
case VIO_SUBTYPE_ACK:
D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
DWARN(vswp, "%s: restarting handshake..", __func__);
break;
case VIO_SUBTYPE_NACK:
D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
DWARN(vswp, "%s: restarting handshake..", __func__);
break;
default:
DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
dring_pkt->tag.vio_subtype);
}
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
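/*
* NACK a multicast request back to the peer, stamping the reply with
* our local session id before sending it.
*/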
#define SND_MCST_NACK(ldcp, pkt) \
pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
pkt->tag.vio_sid = ldcp->local_session; \
(void) vsw_send_msg(ldcp, (void *)pkt, \
sizeof (vnet_mcast_msg_t), B_TRUE);
/*
* Process a multicast request from a vnet.
*
* Vnets specify a multicast address that they are interested in. This
* address is used as a key into the hash table which forms the multicast
* forwarding database (mFDB).
*
* The table keys are the multicast addresses, while the table entries
* are pointers to lists of ports which wish to receive packets for the
* specified multicast address.
*
* When a multicast packet is being switched we use the address as a key
* into the hash table, and then walk the appropriate port list forwarding
* the pkt to each port in turn.
*
* If a vnet is no longer interested in a particular multicast grouping
* we simply find the correct location in the hash table and then delete
* the relevant port from the port list.
*
* To deal with the case whereby a port is being deleted without first
* removing itself from the lists in the hash table, we maintain a list
* of multicast addresses the port has registered an interest in, within
* the port structure itself. We then simply walk that list of addresses
* using them as keys into the hash table and remove the port from the
* appropriate lists.
*/
static void
vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vnet_mcast_msg_t *mcst_pkt;
vsw_port_t *port = ldcp->ldc_port;
vsw_t *vswp = ldcp->ldc_vswp;
int i;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
/*
* We know this is a ctrl/mcast packet so
* cast it into the correct structure.
*/
mcst_pkt = (vnet_mcast_msg_t *)pkt;
switch (mcst_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
/*
* Check if in correct state to receive a multicast
* message (i.e. handshake complete). If not reset
* the handshake.
*/
if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
return;
/*
* Before attempting to add or remove the addresses, check
* that they are valid multicast addresses.
* If not, then NACK back.
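* (A valid multicast address has the low-order bit of its
* first octet set, which is what the check below tests.)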
*/
for (i = 0; i < mcst_pkt->count; i++) {
if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
DERR(vswp, "%s: invalid multicast address",
__func__);
SND_MCST_NACK(ldcp, mcst_pkt);
return;
}
}
/*
* Now add/remove the addresses. If this fails we
* NACK back.
*/
if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
SND_MCST_NACK(ldcp, mcst_pkt);
return;
}
mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
mcst_pkt->tag.vio_sid = ldcp->local_session;
DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
sizeof (vnet_mcast_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
/*
* We shouldn't ever get a multicast ACK message as
* at the moment we never request multicast addresses
* to be set on some other device. This may change in
* the future if we have cascading switches.
*/
if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
return;
/* Do nothing */
break;
case VIO_SUBTYPE_NACK:
DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
/*
* We shouldn't get a multicast NACK packet for the
* same reasons as we shouldn't get an ACK packet.
*/
if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
return;
/* Do nothing */
break;
default:
DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
mcst_pkt->tag.vio_subtype);
}
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
static void
vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vio_rdx_msg_t *rdx_pkt;
vsw_t *vswp = ldcp->ldc_vswp;
/*
* We know this is a ctrl/rdx packet so
* cast it into the correct structure.
*/
rdx_pkt = (vio_rdx_msg_t *)pkt;
D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
switch (rdx_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
return;
rdx_pkt->tag.vio_sid = ldcp->local_session;
rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
sizeof (vio_rdx_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_ACK:
/*
* Should be handled in-band by callback handler.
*/
DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
break;
case VIO_SUBTYPE_NACK:
D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
return;
ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
vsw_next_milestone(ldcp);
break;
default:
DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
rdx_pkt->tag.vio_subtype);
}
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
static void
vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
{
vnet_physlink_msg_t *msgp;
vsw_t *vswp = ldcp->ldc_vswp;
msgp = (vnet_physlink_msg_t *)pkt;
D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
switch (msgp->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
/* vsw shouldn't recv physlink info */
DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
break;
case VIO_SUBTYPE_ACK:
D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
break;
case VIO_SUBTYPE_NACK:
D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
break;
default:
DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
msgp->tag.vio_subtype);
}
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
static void
vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
uint32_t msglen)
{
uint16_t env = tagp->vio_subtype_env;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
/* session id check */
if (ldcp->session_status & VSW_PEER_SESSION) {
if (ldcp->peer_session != tagp->vio_sid) {
DERR(vswp, "%s (chan %d): invalid session id (%llx)",
__func__, ldcp->ldc_id, tagp->vio_sid);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
}
/*
* It is an error for us to be getting data packets
* before the handshake has completed.
*/
if (ldcp->hphase != VSW_MILESTONE4) {
DERR(vswp, "%s: got data packet before handshake complete "
"hphase %d (%x: %x)", __func__, ldcp->hphase,
ldcp->lane_in.lstate, ldcp->lane_out.lstate);
DUMP_FLAGS(ldcp->lane_in.lstate);
DUMP_FLAGS(ldcp->lane_out.lstate);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
/*
* To reduce the locking contention, release the
* ldc_cblock here and re-acquire it once we are done
* receiving packets.
*/
mutex_exit(&ldcp->ldc_cblock);
mutex_enter(&ldcp->ldc_rxlock);
/*
* Switch on vio_subtype envelope, then let lower routines
* decide if it's an INFO, ACK or NACK packet.
*/
if (env == VIO_DRING_DATA) {
vsw_process_data_dring_pkt(ldcp, dpkt);
} else if (env == VIO_PKT_DATA) {
ldcp->rx_pktdata(ldcp, dpkt, msglen);
} else if (env == VIO_DESC_DATA) {
vsw_process_data_ibnd_pkt(ldcp, dpkt);
} else {
DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
}
mutex_exit(&ldcp->ldc_rxlock);
mutex_enter(&ldcp->ldc_cblock);
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
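/*
* NACK a dring data message back to the peer, stamping the reply with
* our local session id before sending it.
*/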
#define SND_DRING_NACK(ldcp, pkt) \
pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
pkt->tag.vio_sid = ldcp->local_session; \
(void) vsw_send_msg(ldcp, (void *)pkt, \
sizeof (vio_dring_msg_t), B_TRUE);
static void
vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
{
vio_dring_msg_t *dring_pkt;
vnet_public_desc_t desc, *pub_addr = NULL;
vsw_private_desc_t *priv_addr = NULL;
dring_info_t *dp = NULL;
vsw_t *vswp = ldcp->ldc_vswp;
mblk_t *mp = NULL;
mblk_t *bp = NULL;
mblk_t *bpt = NULL;
size_t nbytes = 0;
uint64_t chain = 0;
uint64_t len;
uint32_t pos, start;
uint32_t range_start, range_end;
int32_t end, num, cnt = 0;
int i, rv, rng_rv = 0, msg_rv = 0;
boolean_t prev_desc_ack = B_FALSE;
int read_attempts = 0;
struct ether_header *ehp;
lane_t *lp = &ldcp->lane_out;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
/*
* We know this is a data/dring packet so
* cast it into the correct structure.
*/
dring_pkt = (vio_dring_msg_t *)dpkt;
/*
* Switch on the vio_subtype. If it's INFO then we need to
* process the data. If it's an ACK we need to make sure
* it makes sense (i.e. did we send an earlier data/info),
* and if it's a NACK then we may attempt a retry.
*/
switch (dring_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
READ_ENTER(&ldcp->lane_in.dlistrw);
if ((dp = vsw_ident2dring(&ldcp->lane_in,
dring_pkt->dring_ident)) == NULL) {
RW_EXIT(&ldcp->lane_in.dlistrw);
DERR(vswp, "%s(%lld): unable to find dring from "
"ident 0x%llx", __func__, ldcp->ldc_id,
dring_pkt->dring_ident);
SND_DRING_NACK(ldcp, dring_pkt);
return;
}
start = pos = dring_pkt->start_idx;
end = dring_pkt->end_idx;
len = dp->num_descriptors;
range_start = range_end = pos;
D2(vswp, "%s(%lld): start index %ld : end %ld\n",
__func__, ldcp->ldc_id, start, end);
if (end == -1) {
num = -1;
} else if (end >= 0) {
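/*
* Count of descriptors to process, allowing for ring
* wrap; e.g. (illustrative) len 512, pos 510, end 1
* gives (512 - 510 + 1) + 1 = 4 descriptors.
*/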
num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
/* basic sanity check */
if (end > len) {
RW_EXIT(&ldcp->lane_in.dlistrw);
DERR(vswp, "%s(%lld): endpoint %lld outside "
"ring length %lld", __func__,
ldcp->ldc_id, end, len);
SND_DRING_NACK(ldcp, dring_pkt);
return;
}
} else {
RW_EXIT(&ldcp->lane_in.dlistrw);
DERR(vswp, "%s(%lld): invalid endpoint %lld",
__func__, ldcp->ldc_id, end);
SND_DRING_NACK(ldcp, dring_pkt);
return;
}
while (cnt != num) {
vsw_recheck_desc:
pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
if ((rng_rv = vnet_dring_entry_copy(pub_addr,
&desc, dp->dring_mtype, dp->handle,
pos, pos)) != 0) {
DERR(vswp, "%s(%lld): unable to copy "
"descriptor at pos %d: err %d",
__func__, pos, ldcp->ldc_id, rng_rv);
ldcp->ldc_stats.ierrors++;
break;
}
/*
* When given a bounded range of descriptors
* to process, it's an error to hit a descriptor
* which is not ready. In the non-bounded case
* (end_idx == -1) this simply indicates we have
* reached the end of the current active range.
*/
if (desc.hdr.dstate != VIO_DESC_READY) {
/* unbound - no error */
if (end == -1) {
if (read_attempts == vsw_read_attempts)
break;
delay(drv_usectohz(vsw_desc_delay));
read_attempts++;
goto vsw_recheck_desc;
}
/* bounded - error - so NACK back */
RW_EXIT(&ldcp->lane_in.dlistrw);
DERR(vswp, "%s(%lld): descriptor not READY "
"(%d)", __func__, ldcp->ldc_id,
desc.hdr.dstate);
SND_DRING_NACK(ldcp, dring_pkt);
return;
}
DTRACE_PROBE1(read_attempts, int, read_attempts);
range_end = pos;
/*
* If we ACK'd the previous descriptor then now
* record the new range start position for later
* ACK's.
*/
if (prev_desc_ack) {
range_start = pos;
D2(vswp, "%s(%lld): updating range start to be "
"%d", __func__, ldcp->ldc_id, range_start);
prev_desc_ack = B_FALSE;
}
D2(vswp, "%s(%lld): processing desc %lld at pos"
" 0x%llx : dstate 0x%lx : datalen 0x%lx",
__func__, ldcp->ldc_id, pos, &desc,
desc.hdr.dstate, desc.nbytes);
if ((desc.nbytes < ETHERMIN) ||
(desc.nbytes > lp->mtu)) {
/* invalid size; drop the packet */
ldcp->ldc_stats.ierrors++;
goto vsw_process_desc_done;
}
/*
* Ensure that we ask ldc for an aligned
* number of bytes. Data is padded to align on 8
* byte boundary, desc.nbytes is actual data length,
* i.e. minus that padding.
*/
nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
if (nbytes > ldcp->max_rxpool_size) {
mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
BPRI_MED);
} else {
mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
if (mp == NULL) {
ldcp->ldc_stats.rx_vio_allocb_fail++;
/*
* No free receive buffers available,
* so fallback onto allocb(9F). Make
* sure that we get a data buffer which
* is a multiple of 8 as this is
* required by ldc_mem_copy.
*/
DTRACE_PROBE(allocb);
mp = allocb(desc.nbytes +
VNET_IPALIGN + 8, BPRI_MED);
}
}
if (mp == NULL) {
DERR(vswp, "%s(%ld): allocb failed",
__func__, ldcp->ldc_id);
rng_rv = vnet_dring_entry_set_dstate(pub_addr,
dp->dring_mtype, dp->handle, pos, pos,
VIO_DESC_DONE);
ldcp->ldc_stats.ierrors++;
ldcp->ldc_stats.rx_allocb_fail++;
break;
}
rv = ldc_mem_copy(ldcp->ldc_handle,
(caddr_t)mp->b_rptr, 0, &nbytes,
desc.memcookie, desc.ncookies, LDC_COPY_IN);
if (rv != 0) {
DERR(vswp, "%s(%d): unable to copy in data "
"from %d cookies in desc %d (rv %d)",
__func__, ldcp->ldc_id, desc.ncookies,
pos, rv);
freemsg(mp);
rng_rv = vnet_dring_entry_set_dstate(pub_addr,
dp->dring_mtype, dp->handle, pos, pos,
VIO_DESC_DONE);
ldcp->ldc_stats.ierrors++;
break;
} else {
D2(vswp, "%s(%d): copied in %ld bytes"
" using %d cookies", __func__,
ldcp->ldc_id, nbytes, desc.ncookies);
}
/* adjust the read pointer to skip over the padding */
mp->b_rptr += VNET_IPALIGN;
/* point to the actual end of data */
mp->b_wptr = mp->b_rptr + desc.nbytes;
/* update statistics */
ehp = (struct ether_header *)mp->b_rptr;
if (IS_BROADCAST(ehp))
ldcp->ldc_stats.brdcstrcv++;
else if (IS_MULTICAST(ehp))
ldcp->ldc_stats.multircv++;
ldcp->ldc_stats.ipackets++;
ldcp->ldc_stats.rbytes += desc.nbytes;
/*
* IPALIGN space can be used for VLAN_TAG
*/
(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
VSW_VNETPORT, mp);
/* build a chain of received packets */
if (bp == NULL) {
/* first pkt */
bp = mp;
bp->b_next = bp->b_prev = NULL;
bpt = bp;
chain = 1;
} else {
mp->b_next = mp->b_prev = NULL;
bpt->b_next = mp;
bpt = mp;
chain++;
}
vsw_process_desc_done:
/* mark we are finished with this descriptor */
if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
dp->dring_mtype, dp->handle, pos, pos,
VIO_DESC_DONE)) != 0) {
DERR(vswp, "%s(%lld): unable to update "
"dstate at pos %d: err %d",
__func__, pos, ldcp->ldc_id, rng_rv);
ldcp->ldc_stats.ierrors++;
break;
}
/*
* Send an ACK back to peer if requested.
*/
if (desc.hdr.ack) {
dring_pkt->start_idx = range_start;
dring_pkt->end_idx = range_end;
DERR(vswp, "%s(%lld): processed %d %d, ACK"
" requested", __func__, ldcp->ldc_id,
dring_pkt->start_idx, dring_pkt->end_idx);
dring_pkt->dring_process_state = VIO_DP_ACTIVE;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
dring_pkt->tag.vio_sid = ldcp->local_session;
msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_msg_t), B_FALSE);
/*
* Check if ACK was successfully sent. If not
* we break and deal with that below.
*/
if (msg_rv != 0)
break;
prev_desc_ack = B_TRUE;
range_start = pos;
}
/* next descriptor */
pos = (pos + 1) % len;
cnt++;
/*
* Break out of loop here and stop processing to
* allow some other network device (or disk) to
* get access to the cpu.
*/
if (chain > vsw_chain_len) {
D3(vswp, "%s(%lld): switching chain of %d "
"msgs", __func__, ldcp->ldc_id, chain);
break;
}
}
RW_EXIT(&ldcp->lane_in.dlistrw);
/* send the chain of packets to be switched */
if (bp != NULL) {
DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
D3(vswp, "%s(%lld): switching chain of %d msgs",
__func__, ldcp->ldc_id, chain);
vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
ldcp->ldc_port, NULL);
}
/*
* If we encountered an error when attempting to
* access an imported dring, initiate a connection reset.
*/
if (rng_rv != 0) {
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
break;
}
/*
* If, when we attempted to send the ACK, we found that the
* channel had been reset, then handle this now. We deal with
* it here as we cannot reset the channel while holding the
* dlistrw lock, and we don't want to acquire/release it
* continuously in the above loop, as a channel reset should
* be a rare event.
*/
if (msg_rv == ECONNRESET) {
vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
break;
}
DTRACE_PROBE1(msg_cnt, int, cnt);
/*
* We are now finished, so ACK back with the state
* set to STOPPED so our peer knows we are finished.
*/
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->dring_process_state = VIO_DP_STOPPED;
DTRACE_PROBE(stop_process_sent);
/*
* We have not processed any more descriptors beyond
* the last one we ACK'd.
*/
if (prev_desc_ack)
range_start = range_end;
dring_pkt->start_idx = range_start;
dring_pkt->end_idx = range_end;
D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
__func__, ldcp->ldc_id, dring_pkt->start_idx,
dring_pkt->end_idx);
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
/*
* Verify that the relevant descriptors are all
* marked as DONE
*/
READ_ENTER(&ldcp->lane_out.dlistrw);
if ((dp = vsw_ident2dring(&ldcp->lane_out,
dring_pkt->dring_ident)) == NULL) {
RW_EXIT(&ldcp->lane_out.dlistrw);
DERR(vswp, "%s: unknown ident in ACK", __func__);
return;
}
start = end = 0;
start = dring_pkt->start_idx;
end = dring_pkt->end_idx;
len = dp->num_descriptors;
mutex_enter(&dp->dlock);
dp->last_ack_recv = end;
ldcp->ldc_stats.dring_data_acks++;
mutex_exit(&dp->dlock);
(void) vsw_reclaim_dring(dp, start);
/*
* If our peer is stopping processing descriptors then
* we check to make sure it has processed all the descriptors
* we have updated. If not then we send it a new message
* to prompt it to restart.
*/
if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
DTRACE_PROBE(stop_process_recv);
D2(vswp, "%s(%lld): got stopping msg : %d : %d",
__func__, ldcp->ldc_id, dring_pkt->start_idx,
dring_pkt->end_idx);
/*
* Check the next descriptor in the public section of the
* ring. If it's marked as READY then we need to prompt
* our peer to start processing the ring again.
*/
i = (end + 1) % len;
pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
/*
* Hold the restart lock across all of this to
* make sure it is not possible for us to decide
* that a msg needs to be sent in the future while
* the sending code, having already checked, is
* about to exit.
*/
mutex_enter(&dp->restart_lock);
ldcp->ldc_stats.dring_stopped_acks++;
mutex_enter(&priv_addr->dstate_lock);
if (pub_addr->hdr.dstate == VIO_DESC_READY) {
mutex_exit(&priv_addr->dstate_lock);
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->start_idx = (end + 1) % len;
dring_pkt->end_idx = -1;
D2(vswp, "%s(%lld) : sending restart msg:"
" %d : %d", __func__, ldcp->ldc_id,
dring_pkt->start_idx, dring_pkt->end_idx);
msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_msg_t), B_FALSE);
ldcp->ldc_stats.dring_data_msgs++;
} else {
mutex_exit(&priv_addr->dstate_lock);
dp->restart_reqd = B_TRUE;
}
mutex_exit(&dp->restart_lock);
}
RW_EXIT(&ldcp->lane_out.dlistrw);
/* only do channel reset after dropping dlistrw lock */
if (msg_rv == ECONNRESET)
vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
break;
case VIO_SUBTYPE_NACK:
DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
__func__, ldcp->ldc_id);
/*
* Something is badly wrong if we are getting NACK's
* for our data pkts. So reset the channel.
*/
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
break;
default:
DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
ldcp->ldc_id, dring_pkt->tag.vio_subtype);
}
D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
/*
* dummy pkt data handler function for vnet protocol version 1.0
*/
static void
vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
{
_NOTE(ARGUNUSED(arg1, arg2, msglen))
}
/*
* This function handles raw pkt data messages received over the channel.
* Currently, only priority-eth-type frames are received through this mechanism.
* In this case, the frame (data) is present within the message itself and
* is copied into an mblk before switching it.
*/
static void
vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
{
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg1;
vio_raw_data_msg_t *dpkt = (vio_raw_data_msg_t *)arg2;
uint32_t size;
mblk_t *mp;
vsw_t *vswp = ldcp->ldc_vswp;
vgen_stats_t *statsp = &ldcp->ldc_stats;
lane_t *lp = &ldcp->lane_out;
size = msglen - VIO_PKT_DATA_HDRSIZE;
if (size < ETHERMIN || size > lp->mtu) {
(void) atomic_inc_32(&statsp->rx_pri_fail);
DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
ldcp->ldc_id, size);
return;
}
mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
if (mp == NULL) {
mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
if (mp == NULL) {
(void) atomic_inc_32(&statsp->rx_pri_fail);
DWARN(vswp, "%s(%lld) allocb failure, "
"unable to process priority frame\n", __func__,
ldcp->ldc_id);
return;
}
}
/* skip over the extra space for vlan tag */
mp->b_rptr += VLAN_TAGSZ;
/* copy the frame from the payload of raw data msg into the mblk */
bcopy(dpkt->data, mp->b_rptr, size);
mp->b_wptr = mp->b_rptr + size;
/* update stats */
(void) atomic_inc_64(&statsp->rx_pri_packets);
(void) atomic_add_64(&statsp->rx_pri_bytes, size);
/*
* VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
*/
(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
/* switch the frame to destination */
vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
}
/*
* Process an in-band descriptor message (most likely from
* OBP).
*/
static void
vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vnet_ibnd_desc_t *ibnd_desc;
dring_info_t *dp = NULL;
vsw_private_desc_t *priv_addr = NULL;
vsw_t *vswp = ldcp->ldc_vswp;
mblk_t *mp = NULL;
size_t nbytes = 0;
size_t off = 0;
uint64_t idx = 0;
uint32_t num = 1, len, datalen = 0;
uint64_t ncookies = 0;
int i, rv;
int j = 0;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
ibnd_desc = (vnet_ibnd_desc_t *)pkt;
switch (ibnd_desc->hdr.tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
return;
/*
* Data is padded to align on a 8 byte boundary,
* nbytes is actual data length, i.e. minus that
* padding.
*/
datalen = ibnd_desc->nbytes;
D2(vswp, "%s(%lld): processing inband desc : "
": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
ncookies = ibnd_desc->ncookies;
/*
* allocb(9F) returns an aligned data block. We
* need to ensure that we ask ldc for an aligned
* number of bytes also.
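* For example (illustrative), datalen 61 gives
* off = 3 and nbytes = 64.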
*/
nbytes = datalen;
if (nbytes & 0x7) {
off = 8 - (nbytes & 0x7);
nbytes += off;
}
/* alloc extra space for VLAN_TAG */
mp = allocb(datalen + 8, BPRI_MED);
if (mp == NULL) {
DERR(vswp, "%s(%lld): allocb failed",
__func__, ldcp->ldc_id);
ldcp->ldc_stats.rx_allocb_fail++;
return;
}
/* skip over the extra space for VLAN_TAG */
mp->b_rptr += 8;
rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
LDC_COPY_IN);
if (rv != 0) {
DERR(vswp, "%s(%d): unable to copy in data from "
"%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
freemsg(mp);
ldcp->ldc_stats.ierrors++;
return;
}
D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
__func__, ldcp->ldc_id, nbytes, ncookies);
/* point to the actual end of data */
mp->b_wptr = mp->b_rptr + datalen;
ldcp->ldc_stats.ipackets++;
ldcp->ldc_stats.rbytes += datalen;
/*
* We ACK back every in-band descriptor message we process
*/
ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
sizeof (vnet_ibnd_desc_t), B_TRUE);
/*
* there is extra space alloc'd for VLAN_TAG
*/
(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
/* send the packet to be switched */
vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
ldcp->ldc_port, NULL);
break;
case VIO_SUBTYPE_ACK:
D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
/* Verify the ACK is valid */
idx = ibnd_desc->hdr.desc_handle;
if (idx >= vsw_ntxds) {
cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
"(idx %ld)", vswp->instance, idx);
return;
}
if ((dp = ldcp->lane_out.dringp) == NULL) {
DERR(vswp, "%s: no dring found", __func__);
return;
}
len = dp->num_descriptors;
/*
* If the descriptor we are being ACK'ed for is not the
* one we expected, then pkts were lost somewhere, either
* when we tried to send a msg, or a previous ACK msg from
* our peer. In either case we now reclaim the descriptors
* in the range from the last ACK we received up to the
* current ACK.
*/
if (idx != dp->last_ack_recv) {
DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
__func__, dp->last_ack_recv, idx);
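/*
* e.g. (illustrative) len 512, last_ack_recv 510,
* idx 1: reclaim (512 - 510 + 1) + 1 = 4 descriptors,
* wrapping around the end of the ring.
*/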
num = idx >= dp->last_ack_recv ?
idx - dp->last_ack_recv + 1:
(len - dp->last_ack_recv + 1) + idx;
}
/*
* When we sent the in-band message to our peer we
* marked the copy in our private ring as READY. We now
* check that the descriptor we are being ACK'ed for is in
* fact READY, i.e. it is one we have shared with our peer.
*
* If it's not, we flag an error but still reset the
* descriptor back to FREE.
*/
for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
mutex_enter(&priv_addr->dstate_lock);
if (priv_addr->dstate != VIO_DESC_READY) {
DERR(vswp, "%s: (%ld) desc at index %ld not "
"READY (0x%lx)", __func__,
ldcp->ldc_id, idx, priv_addr->dstate);
DERR(vswp, "%s: bound %d: ncookies %ld : "
"datalen %ld", __func__,
priv_addr->bound, priv_addr->ncookies,
priv_addr->datalen);
}
D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
ldcp->ldc_id, idx);
/* release resources associated with sent msg */
priv_addr->datalen = 0;
priv_addr->dstate = VIO_DESC_FREE;
mutex_exit(&priv_addr->dstate_lock);
}
/* update to next expected value */
dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
break;
case VIO_SUBTYPE_NACK:
DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
/*
* We should only get a NACK if our peer doesn't like
* something about a message we have sent it. If this
* happens we just release the resources associated with
* the message. (We are relying on higher layers to decide
* whether or not to resend.)
*/
/* limit check */
idx = ibnd_desc->hdr.desc_handle;
if (idx >= vsw_ntxds) {
DERR(vswp, "%s: corrupted NACK received (idx %lld)",
__func__, idx);
return;
}
if ((dp = ldcp->lane_out.dringp) == NULL) {
DERR(vswp, "%s: no dring found", __func__);
return;
}
priv_addr = (vsw_private_desc_t *)dp->priv_addr;
/* move to correct location in ring */
priv_addr += idx;
/* release resources associated with sent msg */
mutex_enter(&priv_addr->dstate_lock);
priv_addr->datalen = 0;
priv_addr->dstate = VIO_DESC_FREE;
mutex_exit(&priv_addr->dstate_lock);
break;
default:
DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
}
D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
static void
vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
{
_NOTE(ARGUNUSED(epkt))
vsw_t *vswp = ldcp->ldc_vswp;
uint16_t env = tagp->vio_subtype_env;
D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
/*
* Error vio_subtypes have yet to be defined. So for
* the moment we can't do anything.
*/
D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
}
/* transmit the packet over the given port */
int
vsw_portsend(vsw_port_t *port, mblk_t *mp)
{
vsw_ldc_list_t *ldcl = &port->p_ldclist;
vsw_ldc_t *ldcp;
mblk_t *mpt;
int count;
int status = 0;
READ_ENTER(&ldcl->lockrw);
/*
* Note for now, we have a single channel.
*/
ldcp = ldcl->head;
if (ldcp == NULL) {
DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
freemsgchain(mp);
RW_EXIT(&ldcl->lockrw);
return (1);
}
count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
if (count != 0) {
status = ldcp->tx(ldcp, mp, mpt, count);
}
RW_EXIT(&ldcl->lockrw);
return (status);
}
/*
* Break up frames into 2 separate chains: normal and
* priority, based on the frame type. The number of
* priority frames is also counted and returned.
*
* Params:
* vswp: pointer to the instance of vsw
* np: head of packet chain to be broken
* npt: tail of packet chain to be broken
*
* Returns:
* np: head of normal data packets
* npt: tail of normal data packets
* hp: head of high priority packets
* hpt: tail of high priority packets
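*
* For instance (illustrative), if the MD specified a single
* priority ether type, frames of that type would be moved to
* the hp/hpt chain and all other frames left on np/npt.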
*/
static uint32_t
vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
mblk_t **hp, mblk_t **hpt)
{
mblk_t *tmp = NULL;
mblk_t *smp = NULL;
mblk_t *hmp = NULL; /* high prio pkts head */
mblk_t *hmpt = NULL; /* high prio pkts tail */
mblk_t *nmp = NULL; /* normal pkts head */
mblk_t *nmpt = NULL; /* normal pkts tail */
uint32_t count = 0;
int i;
struct ether_header *ehp;
uint32_t num_types;
uint16_t *types;
tmp = *np;
while (tmp != NULL) {
smp = tmp;
tmp = tmp->b_next;
smp->b_next = NULL;
smp->b_prev = NULL;
ehp = (struct ether_header *)smp->b_rptr;
num_types = vswp->pri_num_types;
types = vswp->pri_types;
for (i = 0; i < num_types; i++) {
if (ehp->ether_type == types[i]) {
/* high priority frame */
if (hmp != NULL) {
hmpt->b_next = smp;
hmpt = smp;
} else {
hmp = hmpt = smp;
}
count++;
break;
}
}
if (i == num_types) {
/* normal data frame */
if (nmp != NULL) {
nmpt->b_next = smp;
nmpt = smp;
} else {
nmp = nmpt = smp;
}
}
}
*hp = hmp;
*hpt = hmpt;
*np = nmp;
*npt = nmpt;
return (count);
}
/*
* Wrapper function to transmit normal and/or priority frames over the channel.
*/
static int
vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
{
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
mblk_t *tmp;
mblk_t *smp;
mblk_t *hmp; /* high prio pkts head */
mblk_t *hmpt; /* high prio pkts tail */
mblk_t *nmp; /* normal pkts head */
mblk_t *nmpt; /* normal pkts tail */
uint32_t n = 0;
vsw_t *vswp = ldcp->ldc_vswp;
ASSERT(VSW_PRI_ETH_DEFINED(vswp));
ASSERT(count != 0);
nmp = mp;
nmpt = mpt;
/* gather any priority frames from the chain of packets */
n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
/* transmit priority frames */
tmp = hmp;
while (tmp != NULL) {
smp = tmp;
tmp = tmp->b_next;
smp->b_next = NULL;
vsw_ldcsend_pkt(ldcp, smp);
}
count -= n;
if (count == 0) {
/* no normal data frames to process */
return (0);
}
return (vsw_ldctx(ldcp, nmp, nmpt, count));
}
/*
* Wrapper function to transmit normal frames over the channel.
*/
static int
vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
{
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
mblk_t *tmp = NULL;
ASSERT(count != 0);
/*
* If the TX thread is enabled, then queue the
* ordinary frames and signal the tx thread.
*/
if (ldcp->tx_thread != NULL) {
mutex_enter(&ldcp->tx_thr_lock);
if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
/*
* If we have reached the queue limit,
* do not queue new packets;
* drop them instead.
*/
ldcp->ldc_stats.tx_qfull += count;
mutex_exit(&ldcp->tx_thr_lock);
freemsgchain(mp);
goto exit;
}
if (ldcp->tx_mhead == NULL) {
ldcp->tx_mhead = mp;
ldcp->tx_mtail = mpt;
cv_signal(&ldcp->tx_thr_cv);
} else {
ldcp->tx_mtail->b_next = mp;
ldcp->tx_mtail = mpt;
}
ldcp->tx_cnt += count;
mutex_exit(&ldcp->tx_thr_lock);
} else {
while (mp != NULL) {
tmp = mp->b_next;
mp->b_next = mp->b_prev = NULL;
(void) vsw_ldcsend(ldcp, mp, 1);
mp = tmp;
}
}
exit:
return (0);
}
/*
* This function transmits the frame in the payload of a raw data
* (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
* send special frames with high priorities, without going through
* the normal data path which uses descriptor ring mechanism.
*/
static void
vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
{
vio_raw_data_msg_t *pkt;
mblk_t *bp;
mblk_t *nmp = NULL;
caddr_t dst;
uint32_t mblksz;
uint32_t size;
uint32_t nbytes;
int rv;
vsw_t *vswp = ldcp->ldc_vswp;
vgen_stats_t *statsp = &ldcp->ldc_stats;
if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
(void) atomic_inc_32(&statsp->tx_pri_fail);
DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
ldcp->lane_out.lstate);
goto send_pkt_exit;
}
size = msgsize(mp);
/* frame size bigger than available payload len of raw data msg ? */
if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
(void) atomic_inc_32(&statsp->tx_pri_fail);
DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
ldcp->ldc_id, size);
goto send_pkt_exit;
}
if (size < ETHERMIN)
size = ETHERMIN;
/* alloc space for a raw data message */
nmp = vio_allocb(vswp->pri_tx_vmp);
if (nmp == NULL) {
(void) atomic_inc_32(&statsp->tx_pri_fail);
DWARN(vswp, "vio_allocb failed\n");
goto send_pkt_exit;
}
pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
/* copy frame into the payload of raw data message */
dst = (caddr_t)pkt->data;
for (bp = mp; bp != NULL; bp = bp->b_cont) {
mblksz = MBLKL(bp);
bcopy(bp->b_rptr, dst, mblksz);
dst += mblksz;
}
/* setup the raw data msg */
pkt->tag.vio_msgtype = VIO_TYPE_DATA;
pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
pkt->tag.vio_subtype_env = VIO_PKT_DATA;
pkt->tag.vio_sid = ldcp->local_session;
nbytes = VIO_PKT_DATA_HDRSIZE + size;
/* send the msg over ldc */
rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
if (rv != 0) {
(void) atomic_inc_32(&statsp->tx_pri_fail);
DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
ldcp->ldc_id);
goto send_pkt_exit;
}
/* update stats */
(void) atomic_inc_64(&statsp->tx_pri_packets);
(void) atomic_add_64(&statsp->tx_pri_bytes, size);
send_pkt_exit:
if (nmp != NULL)
freemsg(nmp);
freemsg(mp);
}
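/*
* The copy loop above flattens an mblk chain into a contiguous
* buffer; the same pattern appears in vsw_dringsend(). A standalone
* sketch (hypothetical helper name; the caller must have verified
* that msgsize(mp) fits the destination buffer):
*
*	static void
*	flatten_mblk_chain(mblk_t *mp, caddr_t dst)
*	{
*		mblk_t *bp;
*
*		for (bp = mp; bp != NULL; bp = bp->b_cont) {
*			size_t n = MBLKL(bp);
*			bcopy(bp->b_rptr, dst, n);
*			dst += n;
*		}
*	}
*/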
/*
* Transmit the packet over the given LDC channel.
*
* The 'retries' argument indicates how many times a packet
* is retried before it is dropped. Note that the retry is done
* only for a resource-related failure; for all other failures
* the packet is dropped immediately.
*/
static int
vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
{
int i;
int rc;
int status = 0;
vsw_port_t *port = ldcp->ldc_port;
dring_info_t *dp = NULL;
for (i = 0; i < retries; ) {
/*
* Send the message out using the appropriate
* transmit function, which will free the mblk when it
* is finished with it.
*/
mutex_enter(&port->tx_lock);
if (port->transmit != NULL) {
status = (*port->transmit)(ldcp, mp);
}
if (status == LDC_TX_SUCCESS) {
mutex_exit(&port->tx_lock);
break;
}
i++; /* increment the counter here */
/* If it's the last retry, update oerrors */
if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
ldcp->ldc_stats.oerrors++;
}
mutex_exit(&port->tx_lock);
if (status != LDC_TX_NORESOURCES) {
/*
* No retrying required for errors unrelated
* to resources.
*/
break;
}
READ_ENTER(&ldcp->lane_out.dlistrw);
if (((dp = ldcp->lane_out.dringp) != NULL) &&
((VSW_VER_GTEQ(ldcp, 1, 2) &&
(ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
((VSW_VER_LT(ldcp, 1, 2) &&
(ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
rc = vsw_reclaim_dring(dp, dp->end_idx);
} else {
/*
* If there is no dring, or the xfer_mode is
* set to DESC_MODE (i.e., OBP), then simply break here.
*/
RW_EXIT(&ldcp->lane_out.dlistrw);
break;
}
RW_EXIT(&ldcp->lane_out.dlistrw);
/*
* Delay only if none were reclaimed
* and it's not the last retry.
*/
if ((rc == 0) && (i < retries)) {
delay(drv_usectohz(vsw_ldc_tx_delay));
}
}
freemsg(mp);
return (status);
}
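/*
* Usage note: the direct transmit path in vsw_ldctx() calls
* vsw_ldcsend(ldcp, mp, 1), i.e. no retries, while the dedicated tx
* thread allows a bounded retry budget:
*
*	(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
*
* Only LDC_TX_NORESOURCES triggers the reclaim/delay/retry cycle
* above; any other failure drops the frame on the first attempt.
*/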
/*
* Send packet out via descriptor ring to a logical device.
*/
static int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
vio_dring_msg_t dring_pkt;
dring_info_t *dp = NULL;
vsw_private_desc_t *priv_desc = NULL;
vnet_public_desc_t *pub = NULL;
vsw_t *vswp = ldcp->ldc_vswp;
mblk_t *bp;
size_t n, size;
caddr_t bufp;
int idx;
int status = LDC_TX_SUCCESS;
struct ether_header *ehp = (struct ether_header *)mp->b_rptr;
lane_t *lp = &ldcp->lane_out;
D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
/* TODO: make test a macro */
if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
ldcp->lane_out.lstate);
ldcp->ldc_stats.oerrors++;
return (LDC_TX_FAILURE);
}
/*
* Note - using first ring only, this may change
* in the future.
*/
READ_ENTER(&ldcp->lane_out.dlistrw);
if ((dp = ldcp->lane_out.dringp) == NULL) {
RW_EXIT(&ldcp->lane_out.dlistrw);
DERR(vswp, "%s(%lld): no dring for outbound lane on"
" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
ldcp->ldc_stats.oerrors++;
return (LDC_TX_FAILURE);
}
size = msgsize(mp);
if (size > (size_t)lp->mtu) {
RW_EXIT(&ldcp->lane_out.dlistrw);
DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
ldcp->ldc_id, size);
ldcp->ldc_stats.oerrors++;
return (LDC_TX_FAILURE);
}
/*
* Find a free descriptor
*
* Note: for the moment we are assuming that we will only
* have one dring going from the switch to each of its
* peers. This may change in the future.
*/
if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
D2(vswp, "%s(%lld): no descriptor available for ring "
"at 0x%llx", __func__, ldcp->ldc_id, dp);
/* nothing more we can do */
status = LDC_TX_NORESOURCES;
ldcp->ldc_stats.tx_no_desc++;
goto vsw_dringsend_free_exit;
} else {
D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
"addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
}
/* copy data into the descriptor */
bufp = priv_desc->datap;
bufp += VNET_IPALIGN;
for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
n = MBLKL(bp);
bcopy(bp->b_rptr, bufp, n);
bufp += n;
}
priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
pub = priv_desc->descp;
pub->nbytes = priv_desc->datalen;
/* update statistics */
if (IS_BROADCAST(ehp))
ldcp->ldc_stats.brdcstxmt++;
else if (IS_MULTICAST(ehp))
ldcp->ldc_stats.multixmt++;
ldcp->ldc_stats.opackets++;
ldcp->ldc_stats.obytes += priv_desc->datalen;
mutex_enter(&priv_desc->dstate_lock);
pub->hdr.dstate = VIO_DESC_READY;
mutex_exit(&priv_desc->dstate_lock);
/*
* Determine whether or not we need to send a message to our
* peer prompting them to read our newly updated descriptor(s).
*/
mutex_enter(&dp->restart_lock);
if (dp->restart_reqd) {
dp->restart_reqd = B_FALSE;
ldcp->ldc_stats.dring_data_msgs++;
mutex_exit(&dp->restart_lock);
/*
* Send a vio_dring_msg to peer to prompt them to read
* the updated descriptor ring.
*/
dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
dring_pkt.tag.vio_sid = ldcp->local_session;
/* Note - for now using first ring */
dring_pkt.dring_ident = dp->ident;
/*
* If last_ack_recv is -1 then we know we've not
* received any acks yet, so this must be the first
* msg sent; set the start to the beginning of the ring.
*/
mutex_enter(&dp->dlock);
if (dp->last_ack_recv == -1) {
dring_pkt.start_idx = 0;
} else {
dring_pkt.start_idx =
(dp->last_ack_recv + 1) % dp->num_descriptors;
}
dring_pkt.end_idx = -1;
mutex_exit(&dp->dlock);
D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
ldcp->ldc_id, dp, dring_pkt.dring_ident);
D3(vswp, "%s(%lld): start %lld : end %lld :\n",
__func__, ldcp->ldc_id, dring_pkt.start_idx,
dring_pkt.end_idx);
RW_EXIT(&ldcp->lane_out.dlistrw);
(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
sizeof (vio_dring_msg_t), B_TRUE);
return (status);
} else {
mutex_exit(&dp->restart_lock);
D2(vswp, "%s(%lld): updating descp %d", __func__,
ldcp->ldc_id, idx);
}
vsw_dringsend_free_exit:
RW_EXIT(&ldcp->lane_out.dlistrw);
D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
return (status);
}
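/*
* Worked example (illustrative): with num_descriptors == 8 and
* last_ack_recv == 5, the restart message asks the peer to resume at
* start_idx == (5 + 1) % 8 == 6. end_idx of -1 marks the range as
* open-ended, so the peer keeps processing descriptors until it finds
* one that is not READY.
*/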
/*
* Send an in-band descriptor message over ldc.
*/
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
vsw_t *vswp = ldcp->ldc_vswp;
vnet_ibnd_desc_t ibnd_msg;
vsw_private_desc_t *priv_desc = NULL;
dring_info_t *dp = NULL;
size_t n, size = 0;
caddr_t bufp;
mblk_t *bp;
int idx, i;
int status = LDC_TX_SUCCESS;
static int warn_msg = 1;
lane_t *lp = &ldcp->lane_out;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
ASSERT(mp != NULL);
if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
__func__, ldcp->ldc_id, ldcp->ldc_status,
ldcp->lane_out.lstate);
ldcp->ldc_stats.oerrors++;
return (LDC_TX_FAILURE);
}
/*
* We only expect a single dring to exist, which we use
* as an internal buffer rather than a transfer channel.
*/
READ_ENTER(&ldcp->lane_out.dlistrw);
if ((dp = ldcp->lane_out.dringp) == NULL) {
DERR(vswp, "%s(%lld): no dring for outbound lane",
__func__, ldcp->ldc_id);
DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
RW_EXIT(&ldcp->lane_out.dlistrw);
ldcp->ldc_stats.oerrors++;
return (LDC_TX_FAILURE);
}
size = msgsize(mp);
if (size > (size_t)lp->mtu) {
RW_EXIT(&ldcp->lane_out.dlistrw);
DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
ldcp->ldc_id, size);
ldcp->ldc_stats.oerrors++;
return (LDC_TX_FAILURE);
}
/*
* Find a free descriptor in our buffer ring
*/
if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
RW_EXIT(&ldcp->lane_out.dlistrw);
if (warn_msg) {
DERR(vswp, "%s(%lld): no descriptor available for ring "
"at 0x%llx", __func__, ldcp->ldc_id, dp);
warn_msg = 0;
}
/* nothing more we can do */
status = LDC_TX_NORESOURCES;
goto vsw_descrsend_free_exit;
} else {
D2(vswp, "%s(%lld): free private descriptor found at pos "
"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
warn_msg = 1;
}
/* copy data into the descriptor */
bufp = priv_desc->datap;
for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
n = MBLKL(bp);
bcopy(bp->b_rptr, bufp, n);
bufp += n;
}
priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
/* create and send the in-band descp msg */
ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
/*
* Copy the mem cookies describing the data from the
* private region of the descriptor ring into the inband
* descriptor.
*/
for (i = 0; i < priv_desc->ncookies; i++) {
bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
sizeof (ldc_mem_cookie_t));
}
ibnd_msg.hdr.desc_handle = idx;
ibnd_msg.ncookies = priv_desc->ncookies;
ibnd_msg.nbytes = size;
ldcp->ldc_stats.opackets++;
ldcp->ldc_stats.obytes += size;
RW_EXIT(&ldcp->lane_out.dlistrw);
(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
sizeof (vnet_ibnd_desc_t), B_TRUE);
vsw_descrsend_free_exit:
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
return (status);
}
static void
vsw_send_ver(void *arg)
{
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
vsw_t *vswp = ldcp->ldc_vswp;
lane_t *lp = &ldcp->lane_out;
vio_ver_msg_t ver_msg;
D1(vswp, "%s enter", __func__);
ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
ver_msg.tag.vio_sid = ldcp->local_session;
if (vsw_obp_ver_proto_workaround == B_FALSE) {
ver_msg.ver_major = vsw_versions[0].ver_major;
ver_msg.ver_minor = vsw_versions[0].ver_minor;
} else {
/* use the major,minor that we've ack'd */
lane_t *lpi = &ldcp->lane_in;
ver_msg.ver_major = lpi->ver_major;
ver_msg.ver_minor = lpi->ver_minor;
}
ver_msg.dev_class = VDEV_NETWORK_SWITCH;
lp->lstate |= VSW_VER_INFO_SENT;
lp->ver_major = ver_msg.ver_major;
lp->ver_minor = ver_msg.ver_minor;
DUMP_TAG(ver_msg.tag);
(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
}
static void
vsw_send_attr(vsw_ldc_t *ldcp)
{
vsw_t *vswp = ldcp->ldc_vswp;
lane_t *lp = &ldcp->lane_out;
vnet_attr_msg_t attr_msg;
D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
/*
* Subtype is set to INFO by default
*/
attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
attr_msg.tag.vio_sid = ldcp->local_session;
/* payload copied from default settings for lane */
attr_msg.mtu = lp->mtu;
attr_msg.addr_type = lp->addr_type;
attr_msg.xfer_mode = lp->xfer_mode;
attr_msg.ack_freq = lp->ack_freq;
READ_ENTER(&vswp->if_lockrw);
attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
RW_EXIT(&vswp->if_lockrw);
ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
DUMP_TAG(attr_msg.tag);
(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}
/*
* Create dring info msg (which also results in the creation of
* a dring).
*/
static vio_dring_reg_msg_t *
vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
{
vio_dring_reg_msg_t *mp;
dring_info_t *dp;
vsw_t *vswp = ldcp->ldc_vswp;
int rv;
D1(vswp, "vsw_create_dring_info_pkt enter\n");
/*
* If we can't create a dring, there is obviously no
* point in sending a message.
*/
if ((dp = vsw_create_dring(ldcp)) == NULL)
return (NULL);
/* Allocate pools of receive mblks */
rv = vsw_init_multipools(ldcp, vswp);
if (rv) {
/*
* We do not return failure if receive mblk pools can't be
* allocated, instead allocb(9F) will be used to dynamically
* allocate buffers during receive.
*/
DWARN(vswp, "%s: unable to create free mblk pools for"
" channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
}
mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
mp->tag.vio_msgtype = VIO_TYPE_CTRL;
mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
mp->tag.vio_subtype_env = VIO_DRING_REG;
mp->tag.vio_sid = ldcp->local_session;
/* payload */
mp->num_descriptors = dp->num_descriptors;
mp->descriptor_size = dp->descriptor_size;
mp->options = dp->options;
mp->ncookies = dp->ncookies;
bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
mp->dring_ident = 0;
D1(vswp, "vsw_create_dring_info_pkt exit\n");
return (mp);
}
static void
vsw_send_dring_info(vsw_ldc_t *ldcp)
{
vio_dring_reg_msg_t *dring_msg;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
dring_msg = vsw_create_dring_info_pkt(ldcp);
if (dring_msg == NULL) {
cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
vswp->instance, __func__);
return;
}
ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
(void) vsw_send_msg(ldcp, dring_msg,
sizeof (vio_dring_reg_msg_t), B_TRUE);
kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
}
static void
vsw_send_rdx(vsw_ldc_t *ldcp)
{
vsw_t *vswp = ldcp->ldc_vswp;
vio_rdx_msg_t rdx_msg;
D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
rdx_msg.tag.vio_subtype_env = VIO_RDX;
rdx_msg.tag.vio_sid = ldcp->local_session;
ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
DUMP_TAG(rdx_msg.tag);
(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}
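/*
* Refactoring sketch (hypothetical helper, not in this driver): the
* four handshake senders above all initialize the common control tag
* the same way, which could be factored as:
*
*	static void
*	vsw_init_ctrl_tag(vsw_ldc_t *ldcp, vio_msg_tag_t *tag,
*	    uint16_t env)
*	{
*		tag->vio_msgtype = VIO_TYPE_CTRL;
*		tag->vio_subtype = VIO_SUBTYPE_INFO;
*		tag->vio_subtype_env = env;
*		tag->vio_sid = ldcp->local_session;
*	}
*/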
/*
* Generic routine to send message out over ldc channel.
*
* It is possible that when we attempt to write over the ldc channel
* that we get notified that it has been reset. Depending on the value
* of the handle_reset flag we either handle that event here or simply
* notify the caller that the channel was reset.
*/
int
vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
{
int rv;
int wretries = vsw_wretries;	/* per-call retry budget */
size_t msglen = size;
vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp;
vsw_t *vswp = ldcp->ldc_vswp;
vio_dring_msg_t *dmsg;
vio_raw_data_msg_t *rmsg;
vnet_ibnd_desc_t *imsg;
boolean_t data_msg = B_FALSE;
D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
ldcp->ldc_id, size);
D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
mutex_enter(&ldcp->ldc_txlock);
if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
if (tag->vio_subtype_env == VIO_DRING_DATA) {
dmsg = (vio_dring_msg_t *)tag;
dmsg->seq_num = ldcp->lane_out.seq_num;
data_msg = B_TRUE;
} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
rmsg = (vio_raw_data_msg_t *)tag;
rmsg->seq_num = ldcp->lane_out.seq_num;
data_msg = B_TRUE;
} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
imsg = (vnet_ibnd_desc_t *)tag;
imsg->hdr.seq_num = ldcp->lane_out.seq_num;
data_msg = B_TRUE;
}
}
do {
msglen = size;
rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
} while (rv == EWOULDBLOCK && --wretries > 0);
if (rv == 0 && data_msg == B_TRUE) {
ldcp->lane_out.seq_num++;
}
if ((rv != 0) || (msglen != size)) {
DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
"size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
ldcp->ldc_stats.oerrors++;
}
mutex_exit(&ldcp->ldc_txlock);
/*
* If channel has been reset we either handle it here or
* simply report back that it has been reset and let caller
* decide what to do.
*/
if (rv == ECONNRESET) {
DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
/*
* N.B - must never be holding the dlistrw lock when
* we do a reset of the channel.
*/
if (handle_reset) {
vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
}
}
return (rv);
}
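/*
* Caller-side sketch (illustrative): a caller that cannot have the
* reset processed while holding its own locks (e.g. dlistrw, per the
* note above) passes handle_reset == B_FALSE and deals with
* ECONNRESET once the locks are dropped:
*
*	rv = vsw_send_msg(ldcp, &msg, sizeof (msg), B_FALSE);
*	RW_EXIT(&ldcp->lane_out.dlistrw);
*	if (rv == ECONNRESET)
*		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
*/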
/*
* Remove the specified address from the list of address maintained
* in this port node.
*/
mcst_addr_t *
vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
{
vsw_t *vswp = NULL;
vsw_port_t *port = NULL;
mcst_addr_t *prev_p = NULL;
mcst_addr_t *curr_p = NULL;
D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
__func__, devtype, addr);
if (devtype == VSW_VNETPORT) {
port = (vsw_port_t *)arg;
mutex_enter(&port->mca_lock);
prev_p = curr_p = port->mcap;
} else {
vswp = (vsw_t *)arg;
mutex_enter(&vswp->mca_lock);
prev_p = curr_p = vswp->mcap;
}
while (curr_p != NULL) {
if (curr_p->addr == addr) {
D2(NULL, "%s: address found", __func__);
/* match found */
if (prev_p == curr_p) {
/* list head */
if (devtype == VSW_VNETPORT)
port->mcap = curr_p->nextp;
else
vswp->mcap = curr_p->nextp;
} else {
prev_p->nextp = curr_p->nextp;
}
break;
} else {
prev_p = curr_p;
curr_p = curr_p->nextp;
}
}
if (devtype == VSW_VNETPORT)
mutex_exit(&port->mca_lock);
else
mutex_exit(&vswp->mca_lock);
D1(NULL, "%s: exit", __func__);
return (curr_p);
}
/*
* Creates a descriptor ring (dring) and links it into the
* list of outbound drings for this channel.
*
* Returns NULL if creation failed.
*/
static dring_info_t *
vsw_create_dring(vsw_ldc_t *ldcp)
{
vsw_private_desc_t *priv_addr = NULL;
vsw_t *vswp = ldcp->ldc_vswp;
ldc_mem_info_t minfo;
dring_info_t *dp, *tp;
int i;
dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
/* create public section of ring */
if ((ldc_mem_dring_create(vsw_ntxds,
VSW_PUB_SIZE, &dp->handle)) != 0) {
DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
"failed", ldcp->ldc_id);
goto create_fail_exit;
}
ASSERT(dp->handle != NULL);
/*
* Get the base address of the public section of the ring.
*/
if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
ldcp->ldc_id);
goto dring_fail_exit;
} else {
ASSERT(minfo.vaddr != 0);
dp->pub_addr = minfo.vaddr;
}
dp->num_descriptors = vsw_ntxds;
dp->descriptor_size = VSW_PUB_SIZE;
dp->options = VIO_TX_DRING;
dp->ncookies = 1; /* guaranteed by ldc */
/*
* create private portion of ring
*/
dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
(sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
if (vsw_setup_ring(ldcp, dp)) {
DERR(vswp, "%s: unable to setup ring", __func__);
goto dring_fail_exit;
}
/* haven't used any descriptors yet */
dp->end_idx = 0;
dp->last_ack_recv = -1;
/* bind dring to the channel */
if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
&dp->cookie[0], &dp->ncookies)) != 0) {
DERR(vswp, "vsw_create_dring: unable to bind to channel "
"%lld", ldcp->ldc_id);
goto dring_fail_exit;
}
mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
dp->restart_reqd = B_TRUE;
/*
* Only ever create rings for outgoing lane. Link it onto
* end of list.
*/
WRITE_ENTER(&ldcp->lane_out.dlistrw);
if (ldcp->lane_out.dringp == NULL) {
D2(vswp, "vsw_create_dring: adding first outbound ring");
ldcp->lane_out.dringp = dp;
} else {
tp = ldcp->lane_out.dringp;
while (tp->next != NULL)
tp = tp->next;
tp->next = dp;
}
RW_EXIT(&ldcp->lane_out.dlistrw);
return (dp);
dring_fail_exit:
(void) ldc_mem_dring_destroy(dp->handle);
create_fail_exit:
if (dp->priv_addr != NULL) {
priv_addr = dp->priv_addr;
for (i = 0; i < vsw_ntxds; i++) {
if (priv_addr->memhandle != NULL)
(void) ldc_mem_free_handle(
priv_addr->memhandle);
priv_addr++;
}
kmem_free(dp->priv_addr,
(sizeof (vsw_private_desc_t) * vsw_ntxds));
}
mutex_destroy(&dp->dlock);
kmem_free(dp, sizeof (dring_info_t));
return (NULL);
}
/*
* Create a ring consisting of just a private portion and link
* it into the list of rings for the outbound lane.
*
* This type of ring is used primarily for temporary data
* storage (i.e. as a data buffer).
*/
void
vsw_create_privring(vsw_ldc_t *ldcp)
{
dring_info_t *dp, *tp;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
/* no public section */
dp->pub_addr = NULL;
dp->priv_addr = kmem_zalloc(
(sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
dp->num_descriptors = vsw_ntxds;
if (vsw_setup_ring(ldcp, dp)) {
DERR(vswp, "%s: setup of ring failed", __func__);
kmem_free(dp->priv_addr,
(sizeof (vsw_private_desc_t) * vsw_ntxds));
mutex_destroy(&dp->dlock);
kmem_free(dp, sizeof (dring_info_t));
return;
}
/* haven't used any descriptors yet */
dp->end_idx = 0;
mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
dp->restart_reqd = B_TRUE;
/*
* Only ever create rings for outgoing lane. Link it onto
* end of list.
*/
WRITE_ENTER(&ldcp->lane_out.dlistrw);
if (ldcp->lane_out.dringp == NULL) {
D2(vswp, "%s: adding first outbound privring", __func__);
ldcp->lane_out.dringp = dp;
} else {
tp = ldcp->lane_out.dringp;
while (tp->next != NULL)
tp = tp->next;
tp->next = dp;
}
RW_EXIT(&ldcp->lane_out.dlistrw);
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
* Setup the descriptors in the dring. Returns 0 on success, 1 on
* failure.
*/
int
vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
{
vnet_public_desc_t *pub_addr = NULL;
vsw_private_desc_t *priv_addr = NULL;
vsw_t *vswp = ldcp->ldc_vswp;
uint64_t *tmpp;
uint64_t offset = 0;
uint32_t ncookies = 0;
static char *name = "vsw_setup_ring";
int i, j, nc, rv;
size_t data_sz;
void *data_addr;
priv_addr = dp->priv_addr;
pub_addr = dp->pub_addr;
/* public section may be null but private should never be */
ASSERT(priv_addr != NULL);
/*
* Allocate the region of memory which will be used to hold
* the data the descriptors will refer to.
*/
data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
/*
* In order to ensure that the number of ldc cookies per descriptor is
* limited to be within the default MAX_COOKIES (2), we take the steps
* outlined below:
*
* Align the entire data buffer area to 8K and carve out per descriptor
* data buffers starting from this 8K aligned base address.
*
* We round up the mtu specified to be a multiple of 2K or 4K.
* For sizes up to 12K we round up the size to the next 2K.
* For sizes > 12K we round up to the next 4K (otherwise sizes such as
* 14K could end up needing 3 cookies, with the buffer spread across
* 3 8K pages: 8K+6K, 2K+8K+4K, 6K+8K, ...).
*/
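/*
* Worked example (illustrative): with an 8K-aligned base, rounding a
* 14000-byte buffer only to the next 2K (14336) would leave the
* second buffer starting 6K into a page, spanning 2K + 8K + 4K, i.e.
* 3 cookies. Rounding to the next 4K (16384) keeps every buffer
* 8K-aligned and on at most two 8K pages, i.e. at most 2 cookies.
*/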
if (data_sz <= VNET_12K) {
data_sz = VNET_ROUNDUP_2K(data_sz);
} else {
data_sz = VNET_ROUNDUP_4K(data_sz);
}
dp->desc_data_sz = data_sz;
/* allocate extra 8K bytes for alignment */
dp->data_sz = (vsw_ntxds * data_sz) + VNET_8K;
data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
dp->data_addr = data_addr;
D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
dp->data_sz, dp->data_addr);
/* align the starting address of the data area to 8K */
data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);
tmpp = (uint64_t *)data_addr;
offset = dp->desc_data_sz/sizeof (tmpp);
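/*
* Note: 'offset' is in units of uint64_t words (sizeof (tmpp) equals
* sizeof (uint64_t) here), so the 'tmpp += offset' at the bottom of
* the loop below advances exactly desc_data_sz bytes to the start of
* the next descriptor's data buffer.
*/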
/*
* Initialise some of the private and public (if they exist)
* descriptor fields.
*/
for (i = 0; i < vsw_ntxds; i++) {
mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
&priv_addr->memhandle)) != 0) {
DERR(vswp, "%s: alloc mem handle failed", name);
goto setup_ring_cleanup;
}
priv_addr->datap = (void *)tmpp;
rv = ldc_mem_bind_handle(priv_addr->memhandle,
(caddr_t)priv_addr->datap, dp->desc_data_sz,
LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
&(priv_addr->memcookie[0]), &ncookies);
if (rv != 0) {
DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
"(rv %d)", name, ldcp->ldc_id, rv);
goto setup_ring_cleanup;
}
priv_addr->bound = 1;
D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
name, i, priv_addr->memcookie[0].addr,
priv_addr->memcookie[0].size);
if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
"invalid num of cookies (%d) for size 0x%llx",
name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
goto setup_ring_cleanup;
} else {
for (j = 1; j < ncookies; j++) {
rv = ldc_mem_nextcookie(priv_addr->memhandle,
&(priv_addr->memcookie[j]));
if (rv != 0) {
DERR(vswp, "%s: ldc_mem_nextcookie "
"failed rv (%d)", name, rv);
goto setup_ring_cleanup;
}
D3(vswp, "%s: memcookie %d : addr 0x%llx : "
"size 0x%llx", name, j,
priv_addr->memcookie[j].addr,
priv_addr->memcookie[j].size);
}
}
priv_addr->ncookies = ncookies;
priv_addr->dstate = VIO_DESC_FREE;
if (pub_addr != NULL) {
/* link pub and private sides */
priv_addr->descp = pub_addr;
pub_addr->ncookies = priv_addr->ncookies;
for (nc = 0; nc < pub_addr->ncookies; nc++) {
bcopy(&priv_addr->memcookie[nc],
&pub_addr->memcookie[nc],
sizeof (ldc_mem_cookie_t));
}
pub_addr->hdr.dstate = VIO_DESC_FREE;
pub_addr++;
}
/*
* move to next element in the dring and the next
* position in the data buffer.
*/
priv_addr++;
tmpp += offset;
}
return (0);
setup_ring_cleanup:
priv_addr = dp->priv_addr;
for (j = 0; j < i; j++) {
(void) ldc_mem_unbind_handle(priv_addr->memhandle);
(void) ldc_mem_free_handle(priv_addr->memhandle);
mutex_destroy(&priv_addr->dstate_lock);
priv_addr++;
}
kmem_free(dp->data_addr, dp->data_sz);
return (1);
}
/*
* Searches the private section of a ring for a free descriptor,
* starting at the location of the last free descriptor found
* previously.
*
* Returns 0 if free descriptor is available, and updates state
* of private descriptor to VIO_DESC_READY, otherwise returns 1.
*
* FUTURE: might need to return contiguous range of descriptors
* as dring info msg assumes all will be contiguous.
*/
static int
vsw_dring_find_free_desc(dring_info_t *dringp,
vsw_private_desc_t **priv_p, int *idx)
{
vsw_private_desc_t *addr = NULL;
int num = vsw_ntxds;
int ret = 1;
D1(NULL, "%s enter\n", __func__);
ASSERT(dringp->priv_addr != NULL);
D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
__func__, dringp, dringp->end_idx);
addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
mutex_enter(&addr->dstate_lock);
if (addr->dstate == VIO_DESC_FREE) {
addr->dstate = VIO_DESC_READY;
*priv_p = addr;
*idx = dringp->end_idx;
dringp->end_idx = (dringp->end_idx + 1) % num;
ret = 0;
}
mutex_exit(&addr->dstate_lock);
/* ring full */
if (ret == 1) {
D2(NULL, "%s: no desp free: started at %d", __func__,
dringp->end_idx);
}
D1(NULL, "%s: exit\n", __func__);
return (ret);
}
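/*
* Descriptor lifecycle, as used in this file: the transmit path
* claims a FREE descriptor here and marks it READY (vsw_dringsend()),
* the peer marks it DONE once consumed, and vsw_reclaim_dring()
* returns DONE descriptors to FREE. The producer's side, in sketch
* form:
*
*	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0)
*		return (LDC_TX_NORESOURCES);
*	... copy the frame into priv_desc->datap ...
*	mutex_enter(&priv_desc->dstate_lock);
*	priv_desc->descp->hdr.dstate = VIO_DESC_READY;
*	mutex_exit(&priv_desc->dstate_lock);
*/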
/*
* Map from a dring identifier to the ring itself. Returns
* pointer to ring or NULL if no match found.
*
* Should be called with dlistrw rwlock held as reader.
*/
static dring_info_t *
vsw_ident2dring(lane_t *lane, uint64_t ident)
{
dring_info_t *dp = NULL;
if ((dp = lane->dringp) == NULL) {
return (NULL);
} else {
if (dp->ident == ident)
return (dp);
while (dp != NULL) {
if (dp->ident == ident)
break;
dp = dp->next;
}
}
return (dp);
}
/*
* Set the default lane attributes. These are copied into
* the attr msg we send to our peer. If they are not acceptable
* then (currently) the handshake ends.
*/
static void
vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
{
bzero(lp, sizeof (lane_t));
READ_ENTER(&vswp->if_lockrw);
ether_copy(&(vswp->if_addr), &(lp->addr));
RW_EXIT(&vswp->if_lockrw);
lp->mtu = vswp->max_frame_size;
lp->addr_type = ADDR_TYPE_MAC;
lp->xfer_mode = VIO_DRING_MODE_V1_0;
lp->ack_freq = 0; /* for shared mode */
lp->seq_num = VNET_ISS;
}
/*
* Verify that the attributes are acceptable.
*
* FUTURE: If some attributes are not acceptable, change them
* to our desired values.
*/
static int
vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
{
int ret = 0;
struct ether_addr ea;
vsw_port_t *port = ldcp->ldc_port;
lane_t *lp = &ldcp->lane_out;
D1(NULL, "vsw_check_attr enter\n");
if ((pkt->xfer_mode != VIO_DESC_MODE) &&
(pkt->xfer_mode != lp->xfer_mode)) {
D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
ret = 1;
}
/* Only support MAC addresses at the moment. */
if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
D2(NULL, "vsw_check_attr: invalid addr_type %x, "
"or address 0x%llx\n", pkt->addr_type, pkt->addr);
ret = 1;
}
/*
* MAC address supplied by device should match that stored
* in the vsw-port OBP node. Need to decide what to do if they
* don't match, for the moment just warn but don't fail.
*/
vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
if (ether_cmp(&ea, &port->p_macaddr) != 0) {
DERR(NULL, "vsw_check_attr: device supplied address "
"0x%llx doesn't match node address 0x%llx\n",
pkt->addr, port->p_macaddr);
}
/*
* Ack freq only makes sense in pkt mode; in shared
* mode the ring descriptors say whether or not to
* send back an ACK.
*/
if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
(pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
(VSW_VER_LT(ldcp, 1, 2) &&
(pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
if (pkt->ack_freq > 0) {
D2(NULL, "vsw_check_attr: non zero ack freq "
" in SHM mode\n");
ret = 1;
}
}
if (VSW_VER_LT(ldcp, 1, 4)) {
/* versions < 1.4, mtu must match */
if (pkt->mtu != lp->mtu) {
D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
pkt->mtu);
ret = 1;
}
} else {
/* Ver >= 1.4, validate mtu of the peer is at least ETHERMAX */
if (pkt->mtu < ETHERMAX) {
ret = 1;
}
}
D1(NULL, "vsw_check_attr exit\n");
return (ret);
}
/*
* Returns 1 if there is a problem, 0 otherwise.
*/
static int
vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
{
int ret = 0;
D1(NULL, "vsw_check_dring_info enter\n");
if ((pkt->num_descriptors == 0) ||
(pkt->descriptor_size == 0) ||
(pkt->ncookies != 1)) {
DERR(NULL, "vsw_check_dring_info: invalid dring msg");
ret = 1;
}
D1(NULL, "vsw_check_dring_info exit\n");
return (ret);
}
/*
* Returns 1 if two memory cookies match. Otherwise returns 0.
*/
static int
vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
{
if ((m1->addr != m2->addr) ||
(m1->size != m2->size)) {
return (0);
} else {
return (1);
}
}
/*
* Returns 1 if ring described in reg message matches that
* described by dring_info structure. Otherwise returns 0.
*/
static int
vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
{
if ((msg->descriptor_size != dp->descriptor_size) ||
(msg->num_descriptors != dp->num_descriptors) ||
(msg->ncookies != dp->ncookies) ||
!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
return (0);
} else {
return (1);
}
}
/*
* Reset and free all the resources associated with
* the channel.
*/
static void
vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
{
dring_info_t *dp, *dpp;
lane_t *lp = NULL;
ASSERT(ldcp != NULL);
D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
if (dir == INBOUND) {
D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
" of channel %lld", __func__, ldcp->ldc_id);
lp = &ldcp->lane_in;
} else {
D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
" of channel %lld", __func__, ldcp->ldc_id);
lp = &ldcp->lane_out;
}
lp->lstate = VSW_LANE_INACTIV;
lp->seq_num = VNET_ISS;
if (lp->dringp) {
if (dir == INBOUND) {
WRITE_ENTER(&lp->dlistrw);
dp = lp->dringp;
while (dp != NULL) {
dpp = dp->next;
if (dp->handle != NULL)
(void) ldc_mem_dring_unmap(dp->handle);
kmem_free(dp, sizeof (dring_info_t));
dp = dpp;
}
RW_EXIT(&lp->dlistrw);
} else {
/*
* unbind, destroy exported dring, free dring struct
*/
WRITE_ENTER(&lp->dlistrw);
dp = lp->dringp;
vsw_free_ring(dp);
RW_EXIT(&lp->dlistrw);
}
lp->dringp = NULL;
}
D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
}
/*
* Free ring and all associated resources.
*
* Should be called with dlistrw rwlock held as writer.
*/
static void
vsw_free_ring(dring_info_t *dp)
{
vsw_private_desc_t *paddr = NULL;
dring_info_t *dpp;
int i;
while (dp != NULL) {
mutex_enter(&dp->dlock);
dpp = dp->next;
if (dp->priv_addr != NULL) {
/*
* First unbind and free the memory handles
* stored in each descriptor within the ring.
*/
for (i = 0; i < vsw_ntxds; i++) {
paddr = (vsw_private_desc_t *)
dp->priv_addr + i;
if (paddr->memhandle != NULL) {
if (paddr->bound == 1) {
if (ldc_mem_unbind_handle(
paddr->memhandle) != 0) {
DERR(NULL, "error "
"unbinding handle for "
"ring 0x%llx at pos %d",
dp, i);
continue;
}
paddr->bound = 0;
}
if (ldc_mem_free_handle(
paddr->memhandle) != 0) {
DERR(NULL, "error freeing "
"handle for ring 0x%llx "
"at pos %d", dp, i);
continue;
}
paddr->memhandle = NULL;
}
mutex_destroy(&paddr->dstate_lock);
}
kmem_free(dp->priv_addr,
(sizeof (vsw_private_desc_t) * vsw_ntxds));
}
/*
* Now unbind and destroy the ring itself.
*/
if (dp->handle != NULL) {
(void) ldc_mem_dring_unbind(dp->handle);
(void) ldc_mem_dring_destroy(dp->handle);
}
if (dp->data_addr != NULL) {
kmem_free(dp->data_addr, dp->data_sz);
}
mutex_exit(&dp->dlock);
mutex_destroy(&dp->dlock);
mutex_destroy(&dp->restart_lock);
kmem_free(dp, sizeof (dring_info_t));
dp = dpp;
}
}
/*
* vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
* This thread is woken up by the LDC interrupt handler to process
* LDC packets and receive data.
*/
static void
vsw_ldc_rx_worker(void *arg)
{
callb_cpr_t cprinfo;
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
"vsw_rx_thread");
mutex_enter(&ldcp->rx_thr_lock);
while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
CALLB_CPR_SAFE_BEGIN(&cprinfo);
/*
* Wait until the data is received or a stop
* request is received.
*/
while (!(ldcp->rx_thr_flags &
(VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
}
CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
/*
* First process the stop request.
*/
if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
D2(vswp, "%s(%lld):Rx thread stopped\n",
__func__, ldcp->ldc_id);
break;
}
ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
mutex_exit(&ldcp->rx_thr_lock);
D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
__func__, ldcp->ldc_id);
mutex_enter(&ldcp->ldc_cblock);
vsw_process_pkt(ldcp);
mutex_exit(&ldcp->ldc_cblock);
mutex_enter(&ldcp->rx_thr_lock);
}
/*
* Update the run status and wakeup the thread that
* has sent the stop request.
*/
ldcp->rx_thr_flags &= ~VSW_WTHR_STOP;
ldcp->rx_thread = NULL;
CALLB_CPR_EXIT(&cprinfo);
D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
thread_exit();
}
/* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
static void
vsw_stop_rx_thread(vsw_ldc_t *ldcp)
{
kt_did_t tid = 0;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
/*
* Send a stop request by setting the stop flag and
* wait until the receive thread stops.
*/
mutex_enter(&ldcp->rx_thr_lock);
if (ldcp->rx_thread != NULL) {
tid = ldcp->rx_thread->t_did;
ldcp->rx_thr_flags |= VSW_WTHR_STOP;
cv_signal(&ldcp->rx_thr_cv);
}
mutex_exit(&ldcp->rx_thr_lock);
if (tid != 0) {
thread_join(tid);
}
D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}
/*
* vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
* This thread is woken up by vsw_portsend() to transmit
* packets.
*/
static void
vsw_ldc_tx_worker(void *arg)
{
callb_cpr_t cprinfo;
vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
vsw_t *vswp = ldcp->ldc_vswp;
mblk_t *mp;
mblk_t *tmp;
D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
"vnet_tx_thread");
mutex_enter(&ldcp->tx_thr_lock);
while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
CALLB_CPR_SAFE_BEGIN(&cprinfo);
/*
* Wait until there is data to transmit or a stop
* request is received.
*/
while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
(ldcp->tx_mhead == NULL)) {
cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
}
CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
/*
* First process the stop request.
*/
if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
D2(vswp, "%s(%lld):tx thread stopped\n",
__func__, ldcp->ldc_id);
break;
}
mp = ldcp->tx_mhead;
ldcp->tx_mhead = ldcp->tx_mtail = NULL;
ldcp->tx_cnt = 0;
mutex_exit(&ldcp->tx_thr_lock);
D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
__func__, ldcp->ldc_id);
while (mp != NULL) {
tmp = mp->b_next;
mp->b_next = mp->b_prev = NULL;
(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
mp = tmp;
}
mutex_enter(&ldcp->tx_thr_lock);
}
/*
* Update the run status and wakeup the thread that
* has sent the stop request.
*/
ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
ldcp->tx_thread = NULL;
CALLB_CPR_EXIT(&cprinfo);
D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
thread_exit();
}
/* vsw_stop_tx_thread -- Co-ordinate with transmit thread to stop it */
static void
vsw_stop_tx_thread(vsw_ldc_t *ldcp)
{
kt_did_t tid = 0;
vsw_t *vswp = ldcp->ldc_vswp;
D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
/*
* Send a stop request by setting the stop flag and
* wait until the transmit thread stops.
*/
mutex_enter(&ldcp->tx_thr_lock);
if (ldcp->tx_thread != NULL) {
tid = ldcp->tx_thread->t_did;
ldcp->tx_thr_flags |= VSW_WTHR_STOP;
cv_signal(&ldcp->tx_thr_cv);
}
mutex_exit(&ldcp->tx_thr_lock);
if (tid != 0) {
thread_join(tid);
}
D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}
/* vsw_reclaim_dring -- reclaim descriptors */
static int
vsw_reclaim_dring(dring_info_t *dp, int start)
{
int i, j, len;
vsw_private_desc_t *priv_addr;
vnet_public_desc_t *pub_addr;
pub_addr = (vnet_public_desc_t *)dp->pub_addr;
priv_addr = (vsw_private_desc_t *)dp->priv_addr;
len = dp->num_descriptors;
D2(NULL, "%s: start index %ld\n", __func__, start);
j = 0;
for (i = start; j < len; i = (i + 1) % len, j++) {
pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
mutex_enter(&priv_addr->dstate_lock);
if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
mutex_exit(&priv_addr->dstate_lock);
break;
}
pub_addr->hdr.dstate = VIO_DESC_FREE;
priv_addr->dstate = VIO_DESC_FREE;
/* clear all the fields */
priv_addr->datalen = 0;
pub_addr->hdr.ack = 0;
mutex_exit(&priv_addr->dstate_lock);
D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
i, pub_addr->hdr.dstate, priv_addr->dstate);
}
return (j);
}
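/*
* Worked example (illustrative): with len == 8 and start == 6, if
* descriptors 6 and 7 are DONE but descriptor 0 is not, the loop
* frees 6 and 7, stops at 0, and returns 2.
*/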
/*
* Debugging routines
*/
static void
display_state(void)
{
vsw_t *vswp;
vsw_port_list_t *plist;
vsw_port_t *port;
vsw_ldc_list_t *ldcl;
vsw_ldc_t *ldcp;
extern vsw_t *vsw_head;
cmn_err(CE_NOTE, "***** system state *****");
for (vswp = vsw_head; vswp; vswp = vswp->next) {
plist = &vswp->plist;
READ_ENTER(&plist->lockrw);
cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
vswp->instance, plist->num_ports);
for (port = plist->head; port != NULL; port = port->p_next) {
ldcl = &port->p_ldclist;
cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
port->p_instance, port->num_ldcs);
READ_ENTER(&ldcl->lockrw);
ldcp = ldcl->head;
for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
cmn_err(CE_CONT, "chan %lu : dev %d : "
"status %d : phase %u\n",
ldcp->ldc_id, ldcp->dev_class,
ldcp->ldc_status, ldcp->hphase);
cmn_err(CE_CONT, "chan %lu : lsession %lu : "
"psession %lu\n", ldcp->ldc_id,
ldcp->local_session, ldcp->peer_session);
cmn_err(CE_CONT, "Inbound lane:\n");
display_lane(&ldcp->lane_in);
cmn_err(CE_CONT, "Outbound lane:\n");
display_lane(&ldcp->lane_out);
}
RW_EXIT(&ldcl->lockrw);
}
RW_EXIT(&plist->lockrw);
}
cmn_err(CE_NOTE, "***** system state *****");
}
static void
display_lane(lane_t *lp)
{
dring_info_t *drp;
cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
lp->addr_type, lp->addr, lp->xfer_mode);
cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
cmn_err(CE_CONT, "Dring info:\n");
for (drp = lp->dringp; drp != NULL; drp = drp->next) {
cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
drp->num_descriptors, drp->descriptor_size);
cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
drp->ident, drp->end_idx);
display_ring(drp);
}
}
static void
display_ring(dring_info_t *dringp)
{
uint64_t i;
uint64_t priv_count = 0;
uint64_t pub_count = 0;
vnet_public_desc_t *pub_addr = NULL;
vsw_private_desc_t *priv_addr = NULL;
for (i = 0; i < vsw_ntxds; i++) {
if (dringp->pub_addr != NULL) {
pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
if (pub_addr->hdr.dstate == VIO_DESC_FREE)
pub_count++;
}
if (dringp->priv_addr != NULL) {
priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
if (priv_addr->dstate == VIO_DESC_FREE)
priv_count++;
}
}
cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
i, priv_count, pub_count);
}
static void
dump_flags(uint64_t state)
{
int i;
typedef struct flag_name {
int flag_val;
char *flag_name;
} flag_name_t;
flag_name_t flags[] = {
VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
DERR(NULL, "DUMP_FLAGS: %llx\n", state);
for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
if (state & flags[i].flag_val)
DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
}
}