/* vsw.c revision 023505bcce788e9ac958a334707e49cddbf18d1d */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/ethernet.h>
#include <sys/machsystm.h>
#include <sys/mac_ether.h>
#include <sys/mach_descrip.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
/*
* Function prototypes.
*/
static int vsw_get_physaddr(vsw_t *);
static int vsw_setup_switching(vsw_t *);
static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);
/* MAC Ring table functions. */
static vsw_queue_t *vsw_queue_create();
/* MAC layer routines */
static int vsw_get_hw_maddr(vsw_t *);
static int vsw_unset_hw_addr(vsw_t *, int);
static void vsw_reconfig_hw(vsw_t *);
static int vsw_prog_if(vsw_t *);
static int vsw_prog_ports(vsw_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
/* MDEG routines */
static void vsw_marker_task(void *);
/* Interrupt routines */
/* Handshake routines */
static void vsw_ldc_reinit(vsw_ldc_t *);
static void vsw_conn_task(void *);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);
/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
vsw_port_t *port);
vsw_port_t *port);
/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);
/* Forwarding database (FDB) routines */
static void vsw_del_mcst_port(vsw_port_t *);
static void vsw_del_mcst_vsw(vsw_t *);
/* Dring routines */
static void vsw_create_privring(vsw_ldc_t *);
int *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);
/* Misc support routines */
static int vsw_free_ring(dring_info_t *);
/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);
/*
 * Tunable delay, in microseconds.  NOTE(review): the code that consumes
 * this value is not visible in this portion of the file -- confirm its
 * use before changing the default.
 */
int vsw_desc_delay = 0; /* delay in us */
/*
 * MAC layer callback vector for the vsw device.
 * NOTE(review): this initializer looks truncated relative to the usual
 * mac_callbacks_t layout (flags word followed by entry points such as
 * vsw_m_stat/vsw_m_start/vsw_m_stop, which are prototyped above) --
 * confirm against the original source.
 */
static mac_callbacks_t vsw_m_callbacks = {
0,
NULL,
NULL,
};
/*
 * Character/block device entry points.  The vsw device exposes no real
 * cb entry points, so every routine slot is nulldev/nodev.
 */
static struct cb_ops vsw_cb_ops = {
nulldev, /* cb_open */
nulldev, /* cb_close */
nodev, /* cb_strategy */
nodev, /* cb_print */
nodev, /* cb_dump */
nodev, /* cb_read */
nodev, /* cb_write */
nodev, /* cb_ioctl */
nodev, /* cb_devmap */
nodev, /* cb_mmap */
nodev, /* cb_segmap */
nochpoll, /* cb_chpoll */
ddi_prop_op, /* cb_prop_op */
NULL, /* cb_stream */
D_MP, /* cb_flag */
CB_REV, /* rev */
nodev, /* int (*cb_aread)() */
nodev /* int (*cb_awrite)() */
};
DEVO_REV, /* devo_rev */
0, /* devo_refcnt */
vsw_getinfo, /* devo_getinfo */
nulldev, /* devo_identify */
nulldev, /* devo_probe */
vsw_attach, /* devo_attach */
vsw_detach, /* devo_detach */
nodev, /* devo_reset */
&vsw_cb_ops, /* devo_cb_ops */
ddi_power /* devo_power */
};
extern struct mod_ops mod_driverops;
/*
 * Loadable-module linkage for the driver: description string (with SCCS
 * keyword expansion) and the driver's dev_ops.
 * NOTE(review): a struct modldrv conventionally begins with
 * &mod_driverops (declared extern just above); that field appears to be
 * missing from this initializer -- confirm against the original source.
 */
static struct modldrv vswmodldrv = {
"sun4v Virtual Switch %I%",
&vsw_ops,
};
#define LDC_ENTER_LOCK(ldcp) \
#define LDC_EXIT_LOCK(ldcp) \
/*
 * Driver soft state ptr -- opaque handle from which per-instance state
 * is allocated/looked up by instance number.
 */
static void *vsw_state;
/*
* Linked list of "vsw_t" structures - one per instance.
*/
/*
* Property names
*/
/*
 * Machine-description (MD) / device-tree property names consulted by
 * the driver (e.g. "vsw-phys-dev" names the physical device the virtual
 * switch should use -- see the vsw-phys-dev discussion later in the
 * file).
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";
/* supported versions */
/*
* Matching criteria passed to the MDEG to register interest
* in changes to 'virtual-device-port' nodes identified by their
* 'id' property.
*/
static md_prop_match_t vport_prop_match[] = {
{ MDET_PROP_VAL, "id" },	/* match port nodes on their numeric 'id' */
{ MDET_LIST_END, NULL }		/* list terminator */
};
/*
* Matching criteria passed to the MDEG to register interest
* in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
* by their 'name' and 'cfg-handle' properties.
*/
static md_prop_match_t vdev_prop_match[] = {
{ MDET_PROP_STR, "name" },	/* string-valued 'name' property */
{ MDET_PROP_VAL, "cfg-handle" },	/* numeric 'cfg-handle' property */
{ MDET_LIST_END, NULL }		/* list terminator */
};
/*
* Specification of an MD node passed to the MDEG to filter any
* 'vport' nodes that do not belong to the specified node. This
* template is copied for each vsw instance and filled in with
* the appropriate 'cfg-handle' value before being passed to the MDEG.
*/
/*
 * NOTE(review): this initializer is empty here, yet the comment above
 * says the template is copied per instance and filled in with a
 * 'cfg-handle' value; the property-spec entries appear to be missing
 * from this copy of the file -- confirm against the original source.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
};
/*
* selection that is done a vsw driver attach time.
*/
/*
 * Number of MAC receive rings -- presumably used to size the MAC ring
 * table when vsw_multi_ring_enable is set (see the ring-table code
 * below); TODO(review) confirm against the full source.
 */
int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
/*
* Print debug messages - set to 0x1f to enable all msgs
* or 0x0 to turn all off.
*/
int vswdbg = 0x0;	/* debug-message bitmask; bit meanings listed below */
/*
* debug levels:
* 0x02: Internal function messages
* 0x04: Verbose internal messages
* 0x08: Warning messages
* 0x10: Error messages
*/
static void
{
char buf[512];
else
}
/*
* For the moment the state dump routines have their own
* private flag.
*/
#define DUMP_STATE 0
#if DUMP_STATE
{ \
}
#define DUMP_TAG_PTR(tag) \
{ \
}
#define DISPLAY_STATE() display_state()
#else
#define DUMP_TAG_PTR(tag)
#define DUMP_FLAGS(state)
#define DISPLAY_STATE()
#endif /* DUMP_STATE */
#ifdef DEBUG
#define D1 \
if (vswdbg & 0x01) \
#define D2 \
if (vswdbg & 0x02) \
#define D3 \
if (vswdbg & 0x04) \
#define DWARN \
if (vswdbg & 0x08) \
#define DERR \
if (vswdbg & 0x10) \
#else
#endif /* DEBUG */
static struct modlinkage modlinkage = {
};
int
_init(void)
{
int status;
if (status != 0) {
return (status);
}
if (status != 0) {
}
return (status);
}
int
_fini(void)
{
int status;
if (status != 0)
return (status);
rw_destroy(&vsw_rw);
return (status);
}
int
{
}
static int
{
int instance;
char hashname[MAXNAMELEN];
char qname[TASKQ_NAMELEN];
enum { PROG_init = 0x00,
PROG_if_lock = 0x01,
PROG_fdb = 0x02,
PROG_mfdb = 0x04,
PROG_report_dev = 0x08,
PROG_plist = 0x10,
PROG_taskq = 0x20}
switch (cmd) {
case DDI_ATTACH:
break;
case DDI_RESUME:
/* nothing to do for this non-device */
return (DDI_SUCCESS);
case DDI_PM_RESUME:
default:
return (DDI_FAILURE);
}
return (DDI_FAILURE);
}
goto vsw_attach_fail;
}
progress |= PROG_if_lock;
/* setup the unicast forwarding database */
mod_hash_null_valdtor, sizeof (void *));
/* setup the multicast forwarding database */
mod_hash_null_valdtor, sizeof (void *));
/*
* create lock protecting list of multicast addresses
* which could come via m_multicst() entry point when plumbed.
*/
/* setup the port list */
progress |= PROG_plist;
/*
* Create the taskq which will process all the VIO
* control messages.
*/
TASKQ_DEFAULTPRI, 0)) == NULL) {
goto vsw_attach_fail;
}
progress |= PROG_taskq;
/* prevent auto-detaching */
}
/*
* Now we have everything setup, register an interest in
* specific MD nodes.
*
* The callback is invoked in 2 cases, firstly if upon mdeg
* registration there are existing nodes which match our specified
* criteria, and secondly if the MD is changed (and again, there
* are nodes which we are interested in present within it. Note
* that our callback will be invoked even if our specified nodes
* have not actually changed).
*
* Until the callback is invoked we cannot switch any pkts as
* we don't know basic information such as what mode we are
* operating in. However we expect the callback to be invoked
* immediately upon registration as this driver should only
* be attaching if there are vsw nodes in the MD.
*/
if (vsw_mdeg_register(vswp))
goto vsw_attach_fail;
return (DDI_SUCCESS);
if (progress & PROG_taskq)
if (progress & PROG_plist)
if (progress & PROG_report_dev) {
}
}
}
if (progress & PROG_if_lock) {
}
return (DDI_FAILURE);
}
static int
{
int instance;
return (DDI_FAILURE);
}
switch (cmd) {
case DDI_DETACH:
break;
case DDI_SUSPEND:
case DDI_PM_SUSPEND:
default:
return (DDI_FAILURE);
}
if (vsw_mac_unregister(vswp) != 0) {
return (DDI_FAILURE);
}
}
/* remove mac layer callback */
}
if (vsw_detach_ports(vswp) != 0) {
return (DDI_FAILURE);
}
/*
* Now that the ports have been deleted, stop and close
* the physical device.
*/
if (vswp->mresources)
}
/*
* Destroy any free pools that may still exist.
*/
if (vio_destroy_mblks(poolp) != 0) {
return (DDI_FAILURE);
}
}
/*
* Remove this instance from any entries it may be on in
* the hash table by using the list of addresses maintained
* in the vsw_t structure.
*/
/*
* By now any pending tasks have finished and the underlying
* ldc's have been destroyed, so its safe to delete the control
* message taskq.
*/
/*
* At this stage all the data pointers in the hash table
* should be NULL, as all the ports have been removed and will
* have deleted themselves from the port lists which the data
* pointers point to. Hence we can destroy the table using the
* default destructors.
*/
break;
}
}
return (DDI_SUCCESS);
}
static int
{
int instance;
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
case DDI_INFO_DEVT2INSTANCE:
return (DDI_SUCCESS);
default:
return (DDI_FAILURE);
}
}
/*
* Get the value of the "vsw-phys-dev" property in the specified
* node. This property is the name of the physical device that
* the virtual switch will use to talk to the outside world.
*
* Note it is valid for this property to be NULL (but the property
* itself must exist). Callers of this routine should verify that
* the value returned is what they expected (i.e. either NULL or non NULL).
*
* On success returns value of the property in region pointed to by
* the 'name' argument, and with return value of 0. Otherwise returns 1.
*/
static int
{
int len = 0;
char *dev;
return (1);
return (1);
} else {
}
#ifdef DEBUG
/*
* As a temporary measure to aid testing we check to see if there
* is a vsw.conf file present. If there is we use the value of the
* vsw_physname property in the file as the name of the physical
* device, overriding the value from the MD.
*
* There may be multiple devices listed, but for the moment
* we just use the first one.
*/
return (1);
} else {
}
}
#endif
return (0);
}
/*
* Read the 'vsw-switch-mode' property from the specified MD node.
*
* Returns 0 on success and the number of modes found in 'found',
* otherwise returns 1.
*/
static int
{
int len = 0;
int smode_num = 0;
/*
* Get the switch-mode property. The modes are listed in
* decreasing order of preference, i.e. preferred mode is
* first item in list.
*/
len = 0;
smode_num = 0;
/*
* Unable to get switch-mode property from MD, nothing
* more we can do.
*/
*found = 0;
return (1);
}
/*
* Modes of operation:
* 'switched' - layer 2 switching, underlying HW in
* programmed mode.
* 'promiscuous' - layer 2 switching, underlying HW in
* promiscuous mode.
* 'routed' - layer 3 (i.e. IP) routing, underlying HW
* in non-promiscuous mode.
*/
} else {
"setting to default switched mode",
}
}
return (0);
}
/*
* Get the mac address of the physical device.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
return (1);
return (1);
}
return (0);
}
/*
* Check to see if the card supports the setting of multiple unicst
* addresses.
*
* Returns 0 if card supports the programming of multiple unicast addresses,
* otherwise returns 1.
*/
static int
{
return (1);
}
return (1);
}
return (0);
}
/*
* Setup the required switching mode.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
int i, rv = 1;
/* select best switching mode */
case VSW_LAYER2:
case VSW_LAYER2_PROMISC:
break;
case VSW_LAYER3:
break;
default:
rv = 1;
break;
}
if (rv == 0)
break;
}
if (rv == 1) {
return (rv);
}
return (0);
}
/*
* Setup for layer 2 switching.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
/*
* Attempt to link into the MAC layer so we can get
* and send packets out over the physical adapter.
*/
if (vsw_mac_attach(vswp) != 0) {
/*
* Registration with the MAC layer has failed,
* so return 1 so that can fall back to next
* prefered switching method.
*/
return (1);
}
/*
* Verify that underlying device can support multiple
* unicast mac addresses.
*/
if (vsw_get_hw_maddr(vswp) != 0) {
return (1);
}
}
} else {
/*
* No physical device name found in MD which is
* required for layer 2.
*/
return (1);
}
return (0);
}
static int
{
return (0);
}
/*
* Link into the MAC layer to gain access to the services provided by
* the underlying physical device driver (which should also have
* registered with the MAC layer).
*
* Only when in layer 2 mode.
*/
static int
{
goto mac_fail_exit;
}
goto mac_fail_exit;
}
if (vsw_multi_ring_enable) {
/*
* Initialize the ring table.
*/
/*
* Register our rx callback function.
*/
vsw_rx_queue_cb, (void *)vswp);
/*
* Register our mac resource callback.
*/
/*
* Get the ring resources available to us from
* the mac below us.
*/
} else {
/*
* Just register our rx callback function
*/
}
/* Get the MAC tx fn */
/* start the interface */
goto mac_fail_exit;
}
return (0);
return (1);
}
static void
{
if (vsw_multi_ring_enable) {
}
if (vswp->mresources)
}
}
/*
* Depending on the mode specified, the capabilites and capacity
* of the underlying device setup the physical device.
*
* If in layer 3 mode, then do nothing.
*
* If in layer 2 programmed mode attempt to program the unicast address
* associated with the port into the physical device. If this is not
* possible due to resource exhaustion or simply because the device does
* not support multiple unicast addresses then if required fallback onto
* putting the card into promisc mode.
*
* If in promisc mode then simply set the card into promisc mode.
*
* Returns 0 success, 1 on failure.
*/
static int
{
int err;
return (0);
}
/*
* Attempt to program the unicast address into the HW.
*/
if (type == VSW_VNETPORT) {
} else {
/*
* Don't program if the interface is not UP. This
* is possible if the address has just been changed
* in the MD node, but the interface has not yet been
* plumbed.
*/
return (0);
}
}
if (err != 0) {
/*
* Mark that attempt should be made to re-config sometime
* in future if a port is deleted.
*/
/*
* Only 1 mode specified, nothing more to do.
*/
return (err);
/*
* If promiscuous was next mode specified try to
* set the card into that mode.
*/
== VSW_LAYER2_PROMISC)) {
}
return (err);
}
if (type == VSW_VNETPORT) {
} else {
}
"of device %s",
return (0);
}
/*
* If in layer 3 mode do nothing.
*
* If in layer 2 switched mode remove the address from the physical
* device.
*
* If in layer 2 promiscuous mode disable promisc mode.
*
* Returns 0 on success.
*/
static int
{
int rv;
return (0);
switch (type) {
case VSW_VNETPORT:
}
break;
case VSW_LOCALDEV:
}
break;
default:
/* should never happen */
ASSERT(0);
return (1);
}
return (rv);
}
/*
* Attempt to program a unicast address into HW.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
void *mah;
int rv;
return (1);
if (rv == 0)
return (0);
/*
* It's okay for the add to fail because we have exhausted
* all the resources in the hardware device. Any other error
* we want to flag.
*/
"address %x:%x:%x:%x:%x:%x into HW "
}
return (1);
}
/*
* Remove a unicast mac address which has previously been programmed
* into HW.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
void *mah;
int rv;
return (1);
if (rv != 0) {
"from slot %d in device %s (err %d)",
return (1);
}
return (0);
}
/*
* Set network card into promisc mode.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
return (1);
}
if (vswp->promisc_cnt++ == 0) {
vswp->promisc_cnt--;
return (1);
}
}
if (type == VSW_VNETPORT) {
} else {
}
return (0);
}
/*
* Turn off promiscuous mode on network card.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
return (1);
}
if (--vswp->promisc_cnt == 0) {
vswp->promisc_cnt++;
return (1);
}
/*
* We are exiting promisc mode either because we were
* only in promisc mode because we had failed over from
* switched mode due to HW resource issues, or the user
* wanted the card in promisc mode for all the ports and
* the last port is now being deleted. Tweak the message
* accordingly.
*/
} else {
}
}
if (type == VSW_VNETPORT) {
} else {
}
return (0);
}
/*
* Determine whether or not we are operating in our prefered
* mode and if not whether the physical resources now allow us
* to operate in it.
*
* If a port is being removed should only be invoked after port has been
* removed from the port list.
*/
static void
{
int s_idx;
return;
}
/*
* If we are in layer 2 (i.e. switched) or would like to be
* in layer 2 then check if any ports or the vswitch itself
* need to be programmed into the HW.
*
* This can happen in two cases - switched was specified as
* the prefered mode of operation but we exhausted the HW
* resources and so failed over to the next specifed mode,
* or switched was the only mode specified so after HW
* resources were exhausted there was nothing more we
* could do.
*/
else
return;
}
/*
* First, attempt to set the vswitch mac address into HW,
* if required.
*/
if (vsw_prog_if(vswp)) {
return;
}
/*
* Next, attempt to set any ports which have not yet been
* programmed into HW.
*/
if (vsw_prog_ports(vswp)) {
return;
}
/*
* By now we know that have programmed all desired ports etc
* into HW, so safe to mark reconfiguration as complete.
*/
}
/*
* Check to see if vsw itself is plumbed, and if so whether or not
* its mac address should be written into HW.
*
* Returns 0 if could set address, or didn't have to set it.
* Returns 1 if failed to set address.
*/
static int
{
return (1);
}
/*
* If previously when plumbed had had to place
* interface into promisc mode, now reverse that.
*
* Note that interface will only actually be set into
* programmed into HW.
*/
}
return (0);
}
/*
* Scan the port list for any ports which have not yet been set
* into HW. For those found attempt to program their mac addresses
* into the physical device.
*
* Returns 0 if able to program all required ports (can be 0) into HW.
* Returns 1 if failed to set at least one mac address.
*/
static int
{
vsw_port_t *tp;
int rv = 0;
rv = 1;
break;
}
/*
* If when this port had first attached we had
* had to place the interface into promisc mode,
* then now reverse that.
*
* Note that the interface will not actually
* change to non-promisc mode until all ports
* have been programmed.
*/
(void) vsw_unset_hw_promisc(vswp,
tp, VSW_VNETPORT);
}
}
return (rv);
}
static void
{
}
static void
{
int i;
vswp->mac_ring_tbl =
KM_SLEEP);
for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
}
static void
{
int i;
for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
/*
* Destroy the queue.
*/
/*
* Re-initialize the structure.
*/
}
}
vswp->mac_ring_tbl_sz = 0;
}
/*
* Handle resource add callbacks from the driver below.
*/
static mac_resource_handle_t
{
int i;
/*
* Check to make sure we have the correct resource type.
*/
return (NULL);
/*
* Find a open entry in the ring table.
*/
for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
/*
* Check for an empty slot, if found, then setup queue
* and thread.
*/
/*
* Create the queue for this ring.
*/
vqp = vsw_queue_create();
/*
* Initialize the ring data structure.
*/
/*
* Create the worker thread.
*/
}
/*
* Make sure thread get's running state for
* this ring.
*/
}
/*
* If the thread is not running, cleanup.
*/
ringp);
}
}
return ((mac_resource_handle_t)ringp);
}
}
/*
* No slots in the ring table available.
*/
return (NULL);
}
static void
{
}
}
static vsw_queue_t *
{
return (vqp);
}
static void
{
}
static void
{
/*
* Set the state to running, since the thread is now active.
*/
/*
* Wait for work to do or the state has changed
* to not running.
*/
}
/*
* Process packets that we received from the interface.
*/
/* switch the chain of packets received */
}
}
/*
* We are drained and signal we are done.
*/
/*
* Exit lock and drain the remaining packets.
*/
/*
* Exit the thread
*/
thread_exit();
}
/*
* static void
* vsw_rx_queue_cb() - Receive callback routine when
* vsw_multi_ring_enable is non-zero. Queue the packets
* to a packet queue for a worker thread to process.
*/
static void
{
/*
* Find the last element in the mblk chain.
*/
do {
/* Get the queue for the packets */
/*
* Grab the lock such we can queue the packets.
*/
goto vsw_rx_queue_cb_exit;
}
/*
* Add the mblk chain to the queue. If there
* is some mblks in the queue, then add the new
* chain to the end.
*/
else
/*
* Signal the worker thread that there is work to
* do.
*/
/*
* Let go of the lock and exit.
*/
}
/*
* receive callback routine. Invoked by MAC layer when there
* are pkts being passed up from physical device.
*
* PERF: It may be more efficient when the card is in promisc
* mode to check the dest address of the pkts here (against
* the FDB) rather than checking later. Needs to be investigated.
*/
static void
{
/* switch the chain of packets received */
}
/*
* Send a message out over the physical device via the MAC layer.
*
* Returns any mblks that it was unable to transmit.
*/
static mblk_t *
{
const mac_txinfo_t *mtp;
return (mp);
} else {
for (;;) {
break;
}
break;
}
}
return (mp);
}
/*
* Register with the MAC layer as a network device, so we
* can be plumbed if necessary.
*/
static int
{
int rv;
return (EINVAL);
if (rv == 0)
return (rv);
}
static int
{
int rv = 0;
if (rv != 0) {
"framework", __func__);
return (rv);
}
/* mark i/f as down and unregistered */
}
return (rv);
}
static int
{
return (EINVAL);
}
/* return stats from underlying device */
return (0);
}
static void
vsw_m_stop(void *arg)
{
if (vswp->recfg_reqd)
}
/*
 * m_start entry point (see the MAC layer routine prototypes above).
 *
 * NOTE(review): as it appears here this simply reports success without
 * touching 'arg'; the body may have been truncated in this copy of the
 * file -- confirm against the original source before relying on this.
 */
static int
vsw_m_start(void *arg)
{
return (0);
}
/*
* Change the local interface address.
*
* Note: we don't support this entry point. The local
* mac address of the switch can only be changed via its
* MD node properties.
*/
static int
{
return (DDI_FAILURE);
}
static int
{
int i, ret = 0;
/*
* Convert address into form that can be used
* as hash table key.
*/
for (i = 0; i < ETHERADDRL; i++) {
}
if (add) {
/*
* Update the list of multicast addresses
* contained within the vsw_t structure to
* include this new one.
*/
return (1);
}
/*
* Call into the underlying driver to program the
* address into HW.
*/
if (ret != 0) {
"add multicast address",
goto vsw_remove_addr;
}
}
} else {
}
return (ret);
}
/*
* Remove the address from the hash table..
*/
/*
* ..and then from the list maintained in the
* vsw_t structure.
*/
}
return (0);
}
static int
{
if (on)
else
return (0);
}
static mblk_t *
{
return (NULL);
}
/*
* Register for machine description (MD) updates.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
/*
* In each 'virtual-device' node in the MD there is a
* 'cfg-handle' property which is the MD's concept of
* an instance number (this may be completely different from
* the device drivers instance #). OBP reads that value and
* stores it in the 'reg' property of the appropriate node in
* the device tree. So we use the 'reg' value when registering
* with the mdeg framework, to ensure we get events for the
* correct nodes.
*/
if (inst == -1) {
return (1);
}
/*
* Allocate and initialize a per-instance copy
* of the global property spec array that will
* uniquely identify this vsw instance.
*/
templatesz = sizeof (vsw_prop_template);
/* initialize the complete prop spec structure */
/*
* Register an interest in 'virtual-device' nodes with a
* 'name' property of 'virtual-network-switch'
*/
if (rv != MDEG_SUCCESS) {
goto mdeg_reg_fail;
}
/*
* Register an interest in 'vsw-port' nodes.
*/
(void *)vswp, &mdeg_port_hdl);
if (rv != MDEG_SUCCESS) {
(void) mdeg_unregister(mdeg_hdl);
goto mdeg_reg_fail;
}
/* save off data that will be needed later */
return (0);
return (1);
}
static void
{
sizeof (vsw_prop_template));
}
sizeof (mdeg_node_spec_t));
}
}
/*
* Mdeg callback invoked for the vsw node itself.
*/
static int
{
int idx;
return (MDEG_FAILURE);
/*
* Expect 'added' to be non-zero if virtual-network-switch
* nodes exist in the MD when the driver attaches.
*/
continue;
}
continue;
}
}
/*
* A non-zero 'match' value indicates that the MD has been
* updated and that a virtual-network-switch node is present
* which may or may not have been updated. It is up to the clients
* to examine their own nodes and determine if they have changed.
*/
continue;
}
continue;
}
}
return (MDEG_SUCCESS);
}
/*
* Mdeg callback invoked for changes to the vsw-port nodes
* under the vsw node.
*/
static int
{
int idx;
return (MDEG_FAILURE);
/* process added ports */
}
}
/* process removed ports */
continue;
}
}
}
/*
* Currently no support for updating already active ports.
* So, ignore the match_curr and match_priv arrays for now.
*/
return (MDEG_SUCCESS);
}
/*
* Read the initial start-of-day values from the specified MD node.
*/
static void
{
int i;
/*
* Note it is valid for the physname property to
* be NULL so check actual name length to determine
* if we have a actual device name.
*/
} else {
return;
}
/* mac address for vswitch device itself */
/*
* Fallback to using the mac address of the physical
* device.
*/
if (vsw_get_physaddr(vswp) == 0) {
} else {
}
} else {
for (i = ETHERADDRL - 1; i >= 0; i--) {
macaddr >>= 8;
}
}
for (i = 0; i < NUM_SMODES; i++)
} else {
}
/*
* Unable to setup any switching mode, nothing more
* we can do.
*/
if (vsw_setup_switching(vswp))
return;
if (vsw_mac_register(vswp) != 0) {
/*
* Treat this as a non-fatal error as we may be
* able to operate in some other mode.
*/
}
}
}
/*
* Check to see if the relevant properties in the specified node have
* changed, and if so take the appropriate action.
*
* If any of the properties are missing or invalid we don't take
* any action, as this function should only be invoked when modifications
* have been made to what we assume is a working configuration, which
* we leave active.
*
* Note it is legal for this routine to be invoked even if none of the
* properties in the port node within the MD have actually changed.
*/
static void
{
int i, smode_num = 0;
enum {MD_init = 0x1,
MD_physname = 0x2,
MD_macaddr = 0x4,
/*
* Check if name of physical device in MD has changed.
*/
/*
* if its non NULL. It is valid for the device name to
* have changed from a non NULL to a NULL value, i.e.
* the vsw is being changed to 'routed' mode.
*/
&ddi_instance) != DDI_SUCCESS)) {
goto fail_reconf;
}
updated |= MD_physname;
} else {
}
} else {
goto fail_reconf;
}
/*
* Check if MAC address has changed.
*/
goto fail_reconf;
} else {
for (i = ETHERADDRL - 1; i >= 0; i--) {
!= (macaddr & 0xFF)) {
__func__, i,
(macaddr & 0xFF));
updated |= MD_macaddr;
break;
}
macaddr >>= 8;
}
}
/*
* Check if switching modes have changed.
*/
goto fail_reconf;
} else {
}
for (i = 0; i < smode_num; i++) {
break;
}
}
}
/*
* Now make any changes which are needed...
*/
/*
* Disconnect all ports from the current card
*/
/* Remove address if was programmed into HW. */
goto fail_update;
}
}
/*
* Stop, detach the old device..
*/
/*
* Update phys name.
*/
if (updated & MD_physname) {
}
/*
* Update array with the new switch mode values.
*/
for (i = 0; i < smode_num; i++)
}
/*
* ..and attach, start the new device.
*/
if (vsw_setup_switching(vswp))
goto fail_update;
/*
* Connect ports to new card.
*/
goto fail_update;
}
}
}
if (updated & MD_macaddr) {
for (i = ETHERADDRL - 1; i >= 0; i--) {
macaddr >>= 8;
}
/*
* Remove old address from HW (if programmed) and set
* new address.
*/
/*
* Notify the MAC layer of the changed address.
*/
}
return;
return;
}
/*
* Add a new port to the system.
*
* Returns 0 on success, 1 on failure.
*/
int
{
int i, addrsz;
int listsz = 0;
struct ether_addr ea;
return (1);
}
/*
* Find the channel endpoint node(s) (which should be under this
* port node) which contain the channel id(s).
*/
return (1);
}
/* allocate enough space for node list */
if (nchan <= 0) {
return (1);
}
/* use property from first node found */
return (1);
}
/* don't need list any more */
/* read mac-address property */
return (1);
}
if (addrsz < ETHERADDRL) {
return (1);
}
for (i = ETHERADDRL - 1; i >= 0; i--) {
macaddr >>= 8;
}
return (1);
}
/* just successfuly created the port, so it should exist */
return (0);
}
/*
* Attach the specified port.
*
* Returns 0 on success, 1 on failure.
*/
static int
struct ether_addr *macaddr)
{
int i;
/* port already exists? */
return (1);
}
}
if (nids > VSW_PORT_MAX_LDCS) {
}
for (i = 0; i < nids; i++) {
return (1);
}
}
/* link it into the list of ports for this vsw instance */
/*
* Initialise the port and any ldc's under it.
*/
(void) vsw_init_ldcs(port);
return (0);
}
/*
* Detach the specified port.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
return (1);
}
return (1);
}
/* Remove any multicast addresses.. */
/*
* No longer need to hold writer lock on port list now
* that we have unlinked the target port from the list.
*/
/* Remove address if was programmed into HW. */
if (vswp->recfg_reqd)
if (vsw_port_delete(port)) {
return (1);
}
return (0);
}
/*
* Detach all active ports.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
" from port list", __func__,
port->p_instance);
return (1);
}
/* Remove address if was programmed into HW. */
/* Remove any multicast addresses.. */
/*
* No longer need to hold the lock on the port list
* now that we have unlinked the target port from the
* list.
*/
if (vsw_port_delete(port)) {
return (1);
}
}
return (0);
}
/*
* Delete the specified port.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
(void) vsw_uninit_ldcs(port);
/*
* Wait for any pending ctrl msg tasks which reference this
* port to finish.
*/
if (vsw_drain_port_taskq(port))
return (1);
/*
* Wait for port reference count to hit zero.
*/
/*
* Wait for any active callbacks to finish
*/
if (vsw_drain_ldcs(port))
return (1);
return (1);
}
}
return (0);
}
/*
* Attach a logical domain channel (ldc) under a specified port.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
int status = DDI_FAILURE;
int rv;
PROG_callback = 0x2}
return (1);
}
/* allocate pool of receive mblks */
if (rv) {
return (1);
}
progress |= PROG_mblks;
/* required for handshake with peer */
ldcp->peer_session = 0;
ldcp->session_status = 0;
/* only set for outbound lane, inbound set by peer */
if (status != 0) {
goto ldc_attach_fail;
}
if (status != 0) {
goto ldc_attach_fail;
}
goto ldc_attach_fail;
}
/* link it into the list of channels for this port */
return (0);
if (progress & PROG_callback) {
}
/*
* Something odd has happened, as the destroy
* will only fail if some mblks have been allocated
* from the pool already (which shouldn't happen)
* and have not been returned.
*
* Add the pool pointer to a list maintained in
* the device instance. Another attempt will be made
* to free the pool when the device itself detaches.
*/
"failed and cannot destroy associated mblk "
}
}
return (1);
}
/*
* Detach a logical domain channel (ldc) belonging to a
* particular port.
*
* Returns 0 on success, 1 on failure.
*/
static int
{
int rv;
break;
}
}
/* specified ldc id not found */
return (1);
}
/*
* Before we can close the channel we must release any mapped
* resources (e.g. drings).
*/
/*
* If the close fails we are in serious trouble, as won't
* be able to delete the parent port.
*/
return (1);
}
/*
* Most likely some mblks are still in use and
* have not been returned to the pool. Add the pool
* to the list maintained in the device instance.
* Another attempt will be made to destroy the pool
* when the device detaches.
*/
}
}
/* unlink it from the list */
return (0);
}
/*
* Open and attempt to bring up the channel. Note that channel
* can only be brought up if peer has also opened channel.
*
* Returns 0 if can open and bring up channel, otherwise
* returns 1.
*/
static int
{
ldc_status_t istatus = 0;
int rv;
/* don't start at 0 in case clients don't like that */
if (rv != 0) {
return (1);
}
return (1);
return (1);
}
if (rv != 0) {
/*
* Not a fatal error for ldc_up() to fail, as peer
* end point may simply not be ready yet.
*/
return (1);
}
/*
* ldc_up() call is non-blocking so need to explicitly
* check channel status to see if in fact the channel
* is UP.
*/
return (1);
}
return (0);
}
return (0);
}
/* disable callbacks on the channel */
static int
{
int rv;
if (rv != 0) {
return (1);
}
return (0);
}
static int
{
(void) vsw_ldc_init(ldcp);
}
return (0);
}
static int
{
(void) vsw_ldc_uninit(ldcp);
}
return (0);
}
/*
* Wait until the callback(s) associated with the ldcs under the specified
* port have completed.
*
* Prior to this function being invoked each channel under this port
* should have been quiesced via ldc_set_cb_mode(DISABLE).
*
* A short explanation of what we are doing below.
*
* The simplest approach would be to have a reference counter in
* the ldc structure which is increment/decremented by the callbacks as
* they use the channel. The drain function could then simply disable any
* further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
* there is a tiny window here - before the callback is able to get the lock
* on the channel it is interrupted and this function gets to execute. It
* sees that the ref count is zero and believes its free to delete the
* associated data structures.
*
* We get around this by taking advantage of the fact that before the ldc
* framework invokes a callback it sets a flag to indicate that there is a
* callback active (or about to become active). If when we attempt to
* unregister a callback when this active flag is set then the unregister
* will fail with EWOULDBLOCK.
*
* If the unregister fails we do a cv_timedwait. We will either be signaled
* by the callback as it is exiting (note we have to wait a short period to
* allow the callback to return fully to the ldc framework and it to clear
* the active flag), or by the timer expiring. In either case we again attempt
* the unregister. We repeat this until we can successfully unregister the
* callback.
*
* The reason we use a cv_timedwait rather than a simple cv_wait is to catch
* the case where the callback has finished but the ldc framework has not yet
* cleared the active flag. In this case we would never get a cv_signal.
*/
static int
{
/*
* If we can unregister the channel callback then we
* know that there is no callback either running or
* scheduled to run for this channel so move on to next
* channel in the list.
*/
/* prompt active callbacks to quit */
continue;
} else {
/*
* If we end up here we know that either 1) a callback
* is currently executing, 2) is about to start (i.e.
* the ldc framework has set the active flag but
* has not actually invoked the callback yet, or 3)
* has finished and has returned to the ldc framework
* but the ldc framework has not yet cleared the
* active bit.
*
* Wait for it to finish.
*/
== EWOULDBLOCK)
}
}
return (0);
}
/*
* Wait until all tasks which reference this port have completed.
*
* Prior to this function being invoked each channel under this port
* should have been quiesced via ldc_set_cb_mode(DISABLE).
*/
static int
{
/*
* Mark the port as in the process of being detached, and
* dispatch a marker task to the queue so we know when all
* relevant tasks have completed.
*/
__func__);
return (1);
}
/*
* Wait for the marker task to finish.
*/
return (0);
}
/*
 * Taskq marker used by vsw_drain_port_taskq(): because the taskq runs
 * entries in dispatch order, this task executing means every task queued
 * before it (all of which may reference the port) has completed.
 *
 * NOTE(review): the body of this function appears to have been truncated
 * in this copy -- the original presumably marks the port state as safe
 * to detach and signals the waiting detach thread; confirm against the
 * upstream vsw.c before relying on this file.
 */
static void
vsw_marker_task(void *arg)
{
/*
 * No further tasks should be dispatched which reference
 * this port so ok to mark it as safe to detach.
 */
}
static vsw_port_t *
{
return (port);
}
}
return (NULL);
}
/*
* Search for and remove the specified port from the port
* list. Returns 0 if able to locate and remove port, otherwise
* returns 1.
*/
static int
{
return (1);
} else {
}
break;
} else {
}
}
return (0);
}
/*
* Interrupt handler for ldc messages.
*/
static uint_t
{
return (LDC_SUCCESS);
}
if (event & LDC_EVT_UP) {
/*
* Channel has come up.
*/
}
if (event & LDC_EVT_READ) {
/*
* Data available for reading.
*/
goto vsw_cb_exit;
}
}
/*
* Catch either LDC_EVT_WRITE which we don't support or any
* unknown event.
*/
| LDC_EVT_DOWN | LDC_EVT_READ)) {
}
/*
* Let the drain function know we are finishing if it
* is waiting.
*/
return (LDC_SUCCESS);
}
/*
* Reinitialise data structures associated with the channel.
*/
static void
{
/*
* Remove parent port from any multicast groups
* it may have registered with. Client must resend
* multicast add command after handshake completes.
*/
ldcp->peer_session = 0;
ldcp->session_status = 0;
}
/*
* Process a connection event.
*
* Note - care must be taken to ensure that this function is
* not called with the dlistrw lock held.
*/
static void
{
/*
* Check if either a reset or restart event is pending
* or in progress. If so just return.
*
* A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
* being received by the callback handler, or a ECONNRESET error
* code being returned from a ldc_read() or ldc_write() call.
*
* A VSW_CONN_RESTART event occurs when some error checking code
* decides that there is a problem with data from the channel,
* and that the handshake should be restarted.
*/
return;
/*
* If it is an LDC_UP event we first check the recorded
* state of the channel. If this is UP then we know that
* the channel moving to the UP state has already been dealt
* with and don't need to dispatch a new task.
*
* The reason for this check is that when we do a ldc_up(),
* depending on the state of the peer, we may or may not get
* a LDC_UP event. As we can't depend on getting a LDC_UP evt
* every time we do ldc_up() we explicitly check the channel
* status to see has it come up (ldc_up() is asynch and will
* complete at some undefined time), and take the appropriate
* action.
*
* The flip side of this is that we may get a LDC_UP event
* when we have already seen that the channel is up and have
* dealt with that.
*/
if (evt == VSW_CONN_UP) {
(ldcp->reset_active != 0)) {
return;
}
}
/*
* The transaction group id allows us to identify and discard
* any tasks which are still pending on the taskq and refer
* to the handshake session we are about to restart or reset.
* These stale messages no longer have any real meaning.
*/
goto err_exit;
}
DDI_NOSLEEP) != DDI_SUCCESS) {
goto err_exit;
}
return;
/*
* Has most likely failed due to memory shortage. Clear the flag so
* that future requests will at least be attempted and will hopefully
* succeed.
*/
ldcp->reset_active = 0;
}
/*
* Deal with events relating to a connection. Invoked from a taskq.
*/
/*
 * Deal with events relating to a connection. Invoked from a taskq
 * (dispatched by the connection-event code above).
 *
 * NOTE(review): this function body is corrupted -- multiple source lines
 * are missing (the argument unpacking, the ldc_down()/re-init calls, the
 * status checks the fragments below refer to) and the braces do not
 * balance as shown. Do not edit in place; recover the full body from the
 * upstream vsw.c. The comments retained below describe the intended flow.
 */
static void
vsw_conn_task(void *arg)
{
/* can safely free now have copied out data */
return;
}
/*
 * If we wish to restart the handshake on this channel, then if
 * the channel is UP we bring it DOWN to flush the underlying
 * ldc queue.
 */
/*
 * re-init all the associated data structures.
 */
/*
 * Bring the channel back up (note it does no harm to
 * do this even if the channel is already UP, Just
 * becomes effectively a no-op).
 */
/*
 * Check if channel is now UP. This will only happen if
 * peer has also done a ldc_up().
 */
return;
}
/* channel UP so restart handshake by sending version info */
if (curr_status == LDC_UP) {
" handshake attempts (%d) on channel %ld",
return;
}
DDI_NOSLEEP) != DDI_SUCCESS) {
/*
 * Don't count as valid restart attempt if couldn't
 * send version msg.
 */
}
}
/*
 * Mark that the process is complete by clearing the flag.
 *
 * Note is it possible that the taskq dispatch above may have failed,
 * most likely due to memory shortage. We still clear the flag so
 * future attempts will at least be attempted and will hopefully
 * succeed.
 */
ldcp->reset_active = 0;
}
/*
* returns 0 if legal for event signified by flag to have
* occurred at the time it did. Otherwise returns 1.
*/
int
{
else
switch (flag) {
case VSW_VER_INFO_RECV:
if (phase > VSW_MILESTONE0) {
return (1);
}
break;
case VSW_VER_ACK_RECV:
case VSW_VER_NACK_RECV:
if (!(state & VSW_VER_INFO_SENT)) {
" or VER_NACK when in state %d\n",
return (1);
} else
state &= ~VSW_VER_INFO_SENT;
break;
case VSW_ATTR_INFO_RECV:
return (1);
}
break;
case VSW_ATTR_ACK_RECV:
case VSW_ATTR_NACK_RECV:
if (!(state & VSW_ATTR_INFO_SENT)) {
" or ATTR_NACK when in state %d\n",
return (1);
} else
state &= ~VSW_ATTR_INFO_SENT;
break;
case VSW_DRING_INFO_RECV:
if (phase < VSW_MILESTONE1) {
return (1);
}
break;
case VSW_DRING_ACK_RECV:
case VSW_DRING_NACK_RECV:
if (!(state & VSW_DRING_INFO_SENT)) {
" or DRING_NACK when in state %d\n",
return (1);
} else
state &= ~VSW_DRING_INFO_SENT;
break;
case VSW_RDX_INFO_RECV:
if (phase < VSW_MILESTONE3) {
return (1);
}
break;
case VSW_RDX_ACK_RECV:
case VSW_RDX_NACK_RECV:
if (!(state & VSW_RDX_INFO_SENT)) {
" or RDX_NACK when in state %d\n",
return (1);
} else
state &= ~VSW_RDX_INFO_SENT;
break;
case VSW_MCST_INFO_RECV:
if (phase < VSW_MILESTONE3) {
return (1);
}
break;
default:
return (1);
}
else
return (0);
}
void
{
case VSW_MILESTONE0:
/*
* If we haven't started to handshake with our peer,
* start to do so now.
*/
}
/*
* Only way to pass this milestone is to have successfully
* negotiated version info.
*/
/*
* Next milestone is passed when attribute
* information has been successfully exchanged.
*/
}
break;
case VSW_MILESTONE1:
/*
* Only way to pass this milestone is to have successfully
* negotiated attribute information.
*/
/*
* If the peer device has said it wishes to
* use descriptor rings then we send it our ring
* info, otherwise we just set up a private ring
* which we use an internal buffer
*/
}
break;
case VSW_MILESTONE2:
/*
* If peer has indicated in its attribute message that
* it wishes to use descriptor rings then the only way
* to pass this milestone is for us to have received
* valid dring info.
*
* If peer is not using descriptor rings then just fall
* through.
*/
break;
break;
case VSW_MILESTONE3:
/*
* Pass this milestone when all parameters have been
* successfully exchanged and RDX sent in both directions.
*
* Mark outbound lane as available to transmit data.
*/
} else {
}
break;
case VSW_MILESTONE4:
break;
default:
}
}
/*
* Check if major version is supported.
*
* Returns 0 if finds supported major number, and if necessary
* adjusts the minor field.
*
* to the next lowest supported values, or to zero if no other values are
* possible. (NOTE: a line of this comment appears to be missing above --
* it presumably described adjusting ver_major/ver_minor on mismatch.)
*/
static int
{
int i;
for (i = 0; i < VSW_NUM_VER; i++) {
/*
* Matching or lower major version found. Update
* minor number if necessary.
*/
" from %d to %d", __func__,
vsw_versions[i].ver_minor);
}
return (0);
}
" from %d to %d", __func__,
vsw_versions[i].ver_minor);
}
return (1);
}
}
/* No match was possible, zero out fields */
return (1);
}
/*
* Main routine for processing messages received over LDC.
*/
/*
 * Main routine for processing messages received over LDC: drain the
 * channel, tagging each message by its VIO type and handing it to the
 * matching ctrl/data/err sub-handler.
 *
 * NOTE(review): several lines are missing from this copy -- the ldc_read()
 * call and the declarations of `msglen` and `tag` used below, plus the
 * per-case dispatch calls inside the switch. Recover the full body from
 * the upstream vsw.c; as it stands this will not compile.
 */
static void
vsw_process_pkt(void *arg)
{
int rv = 0;
/*
 * If channel is up read messages until channel is empty.
 */
do {
if (rv != 0) {
}
/* channel has been reset; stop reading */
if (rv == ECONNRESET) {
break;
}
/* zero-length read means the channel queue is empty */
if (msglen == 0) {
break;
}
/*
 * Figure out what sort of packet we have gotten by
 * examining the msg tag, and then switch it appropriately.
 */
switch (tag.vio_msgtype) {
case VIO_TYPE_CTRL:
break;
case VIO_TYPE_DATA:
break;
case VIO_TYPE_ERR:
break;
default:
break;
}
} while (msglen);
}
/*
* Dispatch a task to process a VIO control message.
*/
static void
{
/*
* We need to handle RDX ACK messages in-band as once they
* are exchanged it is possible that we will get an
* immediate (legitimate) data packet.
*/
return;
"(ostate 0x%llx : hphase %d)", __func__,
return;
}
" msg", __func__);
return;
}
/*
* Dispatch task to processing taskq if port is not in
* the process of being detached.
*/
!= DDI_SUCCESS)) {
__func__);
return;
}
} else {
}
}
/*
* Process a VIO ctrl message. Invoked from taskq.
*/
/*
 * Process a VIO ctrl message. Invoked from taskq.
 *
 * Validates the message against the current handshake session (stale
 * transaction-group and session-id checks) and then dispatches on the
 * vio_subtype_env to the appropriate version/attr/dring/mcast/RDX handler.
 *
 * NOTE(review): this body is truncated -- the argument unpacking, the
 * conditional heads for the two checks below (only a string fragment of
 * the first error message remains), the declaration of `env`, and the
 * per-case handler calls are all missing. Braces do not balance as shown;
 * recover the full body from the upstream vsw.c.
 */
static void
vsw_process_ctrl_pkt(void *arg)
{
/* stale pkt check */
" earlier (%ld) handshake session", __func__,
return;
}
/* session id check */
return;
}
}
/*
 * Switch on vio_subtype envelope, then let lower routines
 * decide if its an INFO, ACK or NACK packet.
 */
switch (env) {
case VIO_VER_INFO:
break;
case VIO_DRING_REG:
break;
case VIO_DRING_UNREG:
break;
case VIO_ATTR_INFO:
break;
case VNET_MCAST_INFO:
break;
case VIO_RDX:
break;
default:
}
}
/*
* Version negotiation. We can end up here either because our peer
* has responded to a handshake message we have sent it, or our peer
* has initiated a handshake with us. If it's the former it can only
* be ACK or NACK; if it's the latter it can only be INFO.
*
* If its an ACK we move to the next stage of the handshake, namely
* attribute exchange. If its a NACK we see if we can specify another
* version, if we can't we stop.
*
* If it is an INFO we reset all params associated with communication
* in that direction over this channel (remember connection is
* essentially 2 independent simplex channels).
*/
void
{
/*
* cast it into the correct structure.
*/
case VIO_SUBTYPE_INFO:
/*
* Record the session id, which we will use from now
* until we see another VER_INFO msg. Even then the
* session id in most cases will be unchanged, execpt
* if channel was reset.
*/
}
/* Legal message at this time ? */
return;
/*
* First check the device class. Currently only expect
* to be talking to a network device. In the future may
* also talk to another switch.
*/
sizeof (vio_ver_msg_t), B_TRUE);
return;
} else {
}
/*
* Now check the version.
*/
if (vsw_supported_version(ver_pkt) == 0) {
/*
* Support this major version and possibly
* adjusted minor version.
*/
/* Store accepted values */
} else {
/*
* pairing we support (if don't suuport any more
* versions then they will be set to zero.
*/
/* Store updated values */
}
sizeof (vio_ver_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
return;
/* Store updated values */
break;
case VIO_SUBTYPE_NACK:
return;
/*
* If our peer sent us a NACK with the ver fields set to
* zero then there is nothing more we can do. Otherwise see
* if we support either the version suggested, or a lesser
* one.
*/
"further.", __func__);
return;
}
/*
* Check to see if we support this major version or
* to zero.
*/
(void) vsw_supported_version(ver_pkt);
/* Nothing more we can do */
__func__);
} else {
/* found a supported major version */
sizeof (vio_ver_msg_t), B_TRUE);
}
break;
default:
}
}
/*
* Process an attribute packet. We can end up here either because our
* peer has sent us an attribute INFO message, or has responded to one
* we sent it.
*
* If its an ACK we then move to the next stage of the handshake which
* is to send our descriptor ring info to our peer. If its a NACK then
* there is nothing more we can (currently) do.
*
* If we get a valid/acceptable INFO packet (and we have already negotiated
* a version) we ACK back and set channel state to ATTR_RECV, otherwise we
* NACK back and reset channel state to INACTIV.
*
* FUTURE: in time we will probably negotiate over attributes, but for
* the moment unacceptable attributes are regarded as a fatal error.
*
*/
void
{
int i;
/*
* cast it into the correct structure.
*/
case VIO_SUBTYPE_INFO:
return;
/*
* If the attributes are unacceptable then we NACK back.
*/
sizeof (vnet_attr_msg_t), B_TRUE);
return;
}
/*
* Otherwise store attributes for this lane and update
* lane state.
*/
for (i = ETHERADDRL - 1; i >= 0; i--) {
macaddr >>= 8;
}
/* setup device specifc xmit routines */
}
sizeof (vnet_attr_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
return;
break;
case VIO_SUBTYPE_NACK:
return;
break;
default:
}
}
/*
* Process a dring info packet. We can end up here either because our
* peer has sent us a dring INFO message, or has responded to one we
* sent it.
*
* If we get a valid/acceptable INFO packet (and we have already negotiated
* a version) we ACK back and update the lane state, otherwise we NACK back.
*
* FUTURE: nothing to stop client from sending us info on multiple dring's
* but for the moment we will just use the first one we are given.
*
*/
void
{
int dring_found = 0;
/*
* cast it into the correct structure.
*/
case VIO_SUBTYPE_INFO:
return;
/*
* If the dring params are unacceptable then we NACK back.
*/
if (vsw_check_dring_info(dring_pkt)) {
sizeof (vio_dring_reg_msg_t), B_TRUE);
return;
}
/*
* Otherwise, attempt to map in the dring using the
* cookie. If that succeeds we send back a unique dring
* identifier that the sending side will use in future
* to refer to this descriptor ring.
*/
/*
* Note: should only get one cookie. Enforced in
* the ldc layer.
*/
sizeof (ldc_mem_cookie_t));
sizeof (vio_dring_reg_msg_t), B_TRUE);
return;
}
sizeof (vio_dring_reg_msg_t), B_TRUE);
return;
} else {
/* store the address of the pub part of ring */
}
/* no private section as we are importing */
/*
* Using simple mono increasing int for ident at
* the moment.
*/
ldcp->next_ident++;
/*
* Link it onto the end of the list of drings
* for this lane.
*/
} else {
}
/* acknowledge it */
sizeof (vio_dring_reg_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
return;
/*
* Peer is acknowledging our dring info and will have
* sent us a dring identifier which we will use to
* refer to this ring w.r.t. our peer.
*/
/*
* Find the ring this ident should be associated
* with.
*/
dring_found = 1;
dring_found = 1;
break;
}
}
if (dring_found == 0) {
__func__);
return;
}
} else {
"allocated", __func__);
return;
}
/* store ident */
break;
case VIO_SUBTYPE_NACK:
return;
break;
default:
}
}
/*
* Process a request from peer to unregister a dring.
*
* For the moment we just restart the handshake if our
* peer endpoint attempts to unregister a dring.
*/
void
{
/*
* cast it into the correct structure.
*/
case VIO_SUBTYPE_INFO:
break;
case VIO_SUBTYPE_ACK:
break;
case VIO_SUBTYPE_NACK:
break;
default:
}
}
sizeof (vnet_mcast_msg_t), B_TRUE);
/*
* Process a multicast request from a vnet.
*
* Vnet's specify a multicast address that they are interested in. This
* address is used as a key into the hash table which forms the multicast
* forwarding database (mFDB).
*
* The table keys are the multicast addresses, while the table entries
* are pointers to lists of ports which wish to receive packets for the
* specified multicast address.
*
* When a multicast packet is being switched we use the address as a key
* into the hash table, and then walk the appropriate port list forwarding
* the pkt to each port in turn.
*
* If a vnet is no longer interested in a particular multicast grouping
* we simply find the correct location in the hash table and then delete
* the relevant port from the port list.
*
* To deal with the case whereby a port is being deleted without first
* removing itself from the lists in the hash table, we maintain a list
* of multicast addresses the port has registered an interest in, within
* the port structure itself. We then simply walk that list of addresses
* using them as keys into the hash table and remove the port from the
* appropriate lists.
*/
static void
{
int i;
/*
* cast it into the correct structure.
*/
case VIO_SUBTYPE_INFO:
/*
* Check if in correct state to receive a multicast
* message (i.e. handshake complete). If not reset
* the handshake.
*/
return;
/*
* Before attempting to add or remove address check
* that they are valid multicast addresses.
* If not, then NACK back.
*/
__func__);
return;
}
}
/*
* NACK back.
*/
return;
}
sizeof (vnet_mcast_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
/*
* We shouldn't ever get a multicast ACK message as
* at the moment we never request multicast addresses
* to be set on some other device. This may change in
* the future if we have cascading switches.
*/
return;
/* Do nothing */
break;
case VIO_SUBTYPE_NACK:
/*
* We shouldn't get a multicast NACK packet for the
* same reasons as we shouldn't get a ACK packet.
*/
return;
/* Do nothing */
break;
default:
}
}
static void
{
/*
* cast it into the correct structure.
*/
case VIO_SUBTYPE_INFO:
return;
sizeof (vio_rdx_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
/*
* Should be handled in-band by callback handler.
*/
break;
case VIO_SUBTYPE_NACK:
return;
break;
default:
}
}
static void
{
/* session id check */
return;
}
}
/*
* It is an error for us to be getting data packets
* before the handshake has completed.
*/
return;
}
/*
* Switch on vio_subtype envelope, then let lower routines
* decide if its an INFO, ACK or NACK packet.
*/
if (env == VIO_DRING_DATA) {
} else if (env == VIO_PKT_DATA) {
} else if (env == VIO_DESC_DATA) {
} else {
}
}
sizeof (vio_dring_msg_t), B_TRUE);
static void
{
int read_attempts = 0;
/*
* cast it into the correct structure.
*/
/*
* Switch on the vio_subtype. If its INFO then we need to
* process the data. If its an ACK we need to make sure
* and if its a NACK then we maybe attempt a retry.
*/
case VIO_SUBTYPE_INFO:
return;
}
if (end == -1) {
num = -1;
} else if (end >= 0) {
/* basic sanity check */
"ring length %lld", __func__,
return;
}
} else {
return;
}
"descriptor at pos %d: err %d",
return;
}
/*
* When given a bounded range of descriptors
* to process, its an error to hit a descriptor
* which is not ready. In the non-bounded case
* (end_idx == -1) this simply indicates we have
* reached the end of the current active range.
*/
/* unbound - no error */
if (end == -1) {
if (read_attempts == vsw_read_attempts)
break;
goto vsw_recheck_desc;
}
/* bounded - error - so NACK back */
return;
}
/*
* If we ACK'd the previous descriptor then now
* record the new range start position for later
* ACK's.
*/
if (prev_desc_ack) {
range_start = pos;
}
/*
* Data is padded to align on 8 byte boundary,
* datalen is actual data length, i.e. minus that
* padding.
*/
/*
* Does peer wish us to ACK when we have finished
* with this descriptor ?
*/
ack_needed = B_TRUE;
" 0x%llx : dstate 0x%lx : datalen 0x%lx",
/*
* Mark that we are starting to process descriptor.
*/
/*
* No free receive buffers available, so
* fallback onto allocb(9F). Make sure that
* we get a data buffer which is a multiple
* of 8 as this is required by ldc_mem_copy.
*/
BPRI_MED);
}
/*
* Ensure that we ask ldc for an aligned
* number of bytes.
*/
if (nbytes & 0x7) {
}
if (rv != 0) {
"data from %d cookies in desc %d"
break;
} else {
" using %d cookies", __func__,
}
/* adjust the read pointer to skip over the padding */
/* point to the actual end of data */
/* build a chain of received packets */
/* first pkt */
chain = 1;
} else {
chain++;
}
/* mark we are finished with this descriptor */
/*
* Send an ACK back to peer if requested.
*/
if (ack_needed) {
sizeof (vio_dring_msg_t),
B_FALSE);
/*
* Check if ACK was successfully sent. If not
* we break and deal with that below.
*/
if (msg_rv != 0)
break;
range_start = pos;
}
/* next descriptor */
cnt++;
/*
* Break out of loop here and stop processing to
* allow some other network device (or disk) to
* get access to the cpu.
*/
if (chain > vsw_chain_len) {
break;
}
}
/*
* If when we attempted to send the ACK we found that the
* channel had been reset then now handle this. We deal with
* it here as we cannot reset the channel while holding the
* continuously in the above loop, as a channel reset should
* be a rare event.
*/
if (msg_rv == ECONNRESET) {
break;
}
/* send the chain of packets to be switched */
}
/*
* We are now finished so ACK back with the state
* set to STOPPING so our peer knows we are finished
*/
/*
* We have not processed any more descriptors beyond
* the last one we ACK'd.
*/
if (prev_desc_ack)
sizeof (vio_dring_msg_t), B_TRUE);
break;
case VIO_SUBTYPE_ACK:
/*
* Verify that the relevant descriptors are all
* marked as DONE
*/
return;
}
j = num = 0;
/* calculate # descriptors taking into a/c wrap around */
/*
* If the last descriptor in a range has the ACK
* bit set then we will get two messages from our
* peer relating to it. The normal ACK msg and then
* a subsequent STOP msg. The first message will have
* resulted in the descriptor being reclaimed and
* its state set to FREE so when we encounter a non
* DONE descriptor we need to check to see if its
* because we have just reclaimed it.
*/
/* clear all the fields */
"0x%llx : priv state 0x%llx", i,
} else {
if (dring_pkt->dring_process_state !=
" 0x%llx not DONE (0x%lx)\n",
return;
}
}
}
/*
* If our peer is stopping processing descriptors then
* we check to make sure it has processed all the descriptors
* we have updated. If not then we send it a new message
* to prompt it to restart.
*/
/*
* Check next descriptor in public section of ring.
* If its marked as READY then we need to prompt our
* peer to start processing the ring again.
*/
/*
* Hold the restart lock across all of this to
* make sure that its not possible for us to
* decide that a msg needs to be sent in the future
* but the sending code having already checked is
* about to exit.
*/
sizeof (vio_dring_msg_t), B_FALSE);
} else {
}
}
/* only do channel reset after dropping dlistrw lock */
if (msg_rv == ECONNRESET)
break;
case VIO_SUBTYPE_NACK:
/*
* Something is badly wrong if we are getting NACK's
* for our data pkts. So reset the channel.
*/
break;
default:
}
}
/*
* VIO_PKT_DATA (a.k.a raw data mode )
*
* Note - currently not supported. Do nothing.
*/
static void
{
}
/*
* Process an in-band descriptor message (most likely from
* OBP).
*/
static void
{
int i, rv;
int j = 0;
case VIO_SUBTYPE_INFO:
return;
/*
* Data is padded to align on a 8 byte boundary,
* nbytes is actual data length, i.e. minus that
* padding.
*/
/*
* allocb(9F) returns an aligned data block. We
* need to ensure that we ask ldc for an aligned
* number of bytes also.
*/
if (nbytes & 0x7) {
}
return;
}
if (rv != 0) {
"%d cookie(s)", __func__,
return;
}
ncookies);
/*
* Upper layer is expecting the IP header in the packet to
* be 4-bytes aligned, but the OBP is sending packets that
* are not aligned. So, copy the data to another message
* such that the alignment requirement is met.
*/
return;
}
/* point to the actual end of data */
/*
* We ACK back every in-band descriptor message we process
*/
sizeof (vnet_ibnd_desc_t), B_TRUE);
/* send the packet to be switched */
break;
case VIO_SUBTYPE_ACK:
/* Verify the ACK is valid */
if (idx >= VSW_RING_NUM_EL) {
return;
}
return;
}
/*
* If the descriptor we are being ACK'ed for is not the
* one we expected, then pkts were lost somwhere, either
* when we tried to send a msg, or a previous ACK msg from
* our peer. In either case we now reclaim the descriptors
* in the range from the last ACK we received up to the
* current ACK.
*/
}
/*
* When we sent the in-band message to our peer we
* marked the copy in our private ring as READY. We now
* check that the descriptor we are being ACK'ed for is in
* fact READY, i.e. it is one we have shared with our peer.
*
* If its not we flag an error, but still reset the descr
* back to FREE.
*/
"READY (0x%lx)", __func__,
"datalen %ld", __func__,
}
/* release resources associated with sent msg */
}
/* update to next expected value */
break;
case VIO_SUBTYPE_NACK:
/*
* We should only get a NACK if our peer doesn't like
* something about a message we have sent it. If this
* happens we just release the resources associated with
* the message. (We are relying on higher layers to decide
* whether or not to resend.
*/
/* limit check */
if (idx >= VSW_RING_NUM_EL) {
return;
}
return;
}
/* move to correct location in ring */
/* release resources associated with sent msg */
break;
default:
}
}
static void
{
/*
* Error vio_subtypes have yet to be defined. So for
* the moment we can't do anything.
*/
}
/*
* Switch the given ethernet frame when operating in layer 2 mode.
*
* vswp: pointer to the vsw instance
* mp: pointer to chain of ethernet frame(s) to be switched
* caller: identifies the source of this frame as:
* 1. VSW_VNETPORT - a vsw port (connected to a vnet).
* 2. VSW_PHYSDEV - the physical ethernet device
* 3. VSW_LOCALDEV - vsw configured as a virtual interface
* arg: argument provided by the caller.
* 1. for VNETPORT - pointer to the corresponding vsw_port_t.
* 2. for PHYSDEV - NULL
* 3. for LOCALDEV - pointer to this vsw_t (self)
*/
void
{
struct ether_header *ehp;
/*
* PERF: rather than breaking up the chain here, scan it
* to find all mblks heading to same destination and then
* pass that sub-chain to the lower transmit functions.
*/
/* process the chain of packets */
while (bp) {
/*
* If destination is VSW_LOCALDEV (vsw as an eth
* interface) and if the device is up & running,
* send the packet up the stack on this host.
* If the virtual interface is down, drop the packet.
*/
if (caller != VSW_LOCALDEV) {
} else {
/* Interface down, drop pkt */
}
} else {
}
continue;
}
if (port) {
/*
* Mark the port as in-use.
*/
/*
* If plumbed and in promisc mode then copy msg
* and send up the stack.
*/
if (nmp)
} else {
}
/*
* If the destination is in FDB, the packet
* should be forwarded to the correponding
* vsw_port (connected to a vnet device -
* VSW_VNETPORT)
*/
/*
* Decrement use count in port and check if
* should wake delete thread.
*/
} else {
/*
* Destination not in FDB.
*
* If the destination is broadcast or
* multicast forward the packet to all
* (VNETPORTs, PHYSDEV, LOCALDEV),
* except the caller.
*/
if (IS_BROADCAST(ehp)) {
} else if (IS_MULTICAST(ehp)) {
} else {
/*
* If the destination is unicast, and came
* from either a logical network device or
* the switch itself when it is plumbed, then
* send it out on the physical device and also
* up the stack if the logical interface is
* in promiscious mode.
*
* NOTE: The assumption here is that if we
* cannot find the destination in our fdb, its
* a unicast address, and came from either a
* vnet or down the stack (when plumbed) it
* must be destinded for an ethernet device
* outside our ldoms.
*/
if (caller == VSW_VNETPORT) {
if (nmp)
} else {
}
!= NULL) {
"phys dev", __func__);
}
} else if (caller == VSW_PHYSDEV) {
/*
* Pkt seen because card in promisc
* mode. Send up stack if plumbed in
* promisc mode, else drop it.
*/
} else {
}
} else if (caller == VSW_LOCALDEV) {
/*
* Pkt came down the stack, send out
* over physical device.
*/
!= NULL) {
"phys dev", __func__);
}
}
}
}
}
}
/*
* Switch ethernet frame when in layer 3 mode (i.e. using IP
* layer to do the routing).
*
* There is a large amount of overlap between this function and
* vsw_switch_l2_frame. At some stage we need to revisit and refactor
* both these functions.
*/
void
{
struct ether_header *ehp;
/*
* In layer 3 mode should only ever be switching packets
* between IP layer and vnet devices. So make sure thats
* who is invoking us.
*/
return;
}
/* process the chain of packets */
while (bp) {
if (port) {
/*
* Mark port as in-use.
*/
/*
* Finished with port so decrement ref count and
* check if should wake delete thread.
*/
} else {
/*
* Destination not in FDB
*
* If the destination is broadcast or
* multicast forward the packet to all
* (VNETPORTs, PHYSDEV, LOCALDEV),
* except the caller.
*/
if (IS_BROADCAST(ehp)) {
} else if (IS_MULTICAST(ehp)) {
} else {
/*
* Unicast pkt from vnet that we don't have
* an FDB entry for, so must be destinded for
* the outside world. Attempt to send up to the
* IP layer to allow it to deal with it.
*/
if (caller == VSW_VNETPORT) {
__func__);
} else {
/* Interface down, drop pkt */
__func__);
}
}
}
}
}
}
/*
* Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
* except the caller (port on which frame arrived).
*/
static int
{
int skip_port = 0;
/*
* Broadcast message from inside ldoms so send to outside
* world if in either of layer 2 modes.
*/
if (nmp) {
"consisting of %ld bytes of data for"
}
}
}
if (caller == VSW_VNETPORT)
skip_port = 1;
/*
* Broadcast message from other vnet (layer 2 or 3) or outside
* world (layer 2 only), send up stack if plumbed.
*/
if (nmp)
} else {
}
}
/* send it to all VNETPORTs */
/*
* Caution ! - don't reorder these two checks as arg
* will be NULL if the caller is PHYSDEV. skip_port is
* only set if caller is VNETPORT.
*/
continue;
else {
if (nmp) {
} else {
}
}
}
return (0);
}
/*
* Forward pkts to any devices or interfaces which have registered
* an interest in them (i.e. multicast groups).
*/
static int
{
/*
* Convert address to hash table key
*/
/*
* If pkt came from either a vnet or down the stack (if we are
* plumbed) and we are in layer 2 mode, then we send the pkt out
* over the physical adapter, and then check to see if any other
* vnets are interested in it.
*/
if (nmp) {
"consisting of %ld bytes of "
"data for physical device",
}
}
}
(mod_hash_val_t *)&entp) != 0) {
} else {
/*
* Send to list of devices associated with this address...
*/
/* dont send to ourselves */
if ((caller == VSW_VNETPORT) &&
" : port %d", __func__,
port->p_instance);
continue;
} else if ((caller == VSW_LOCALDEV) &&
__func__);
continue;
}
" addr 0x%llx", __func__,
if (nmp)
} else {
if (nmp)
" for addr 0x%llx", __func__,
key);
}
}
}
}
/*
* If the pkt came from either a vnet or from physical device,
* and if we havent already sent the pkt up the stack then we
* and in promisc mode).
*/
if ((check_if) &&
if (nmp)
} else {
}
}
return (0);
}
/* transmit the packet over the given port */
static int
{
int status = 0;
/*
* Note for now, we have a single channel.
*/
return (1);
}
/*
* Send the message out using the appropriate
* transmit function which will free mblock when it
* is finished with it.
*/
else {
}
return (status);
}
/*
* Send packet out via descriptor ring to a logical device.
*/
static int
{
int idx;
int status = LDC_TX_SUCCESS;
/* TODO: make test a macro */
return (LDC_TX_FAILURE);
}
/*
* Note - using first ring only, this may change
* in the future.
*/
return (LDC_TX_FAILURE);
}
return (LDC_TX_FAILURE);
}
/*
* Find a free descriptor
*
* Note: for the moment we are assuming that we will only
* have one dring going from the switch to each of its
* peers. This may change in the future.
*/
/* nothing more we can do */
goto vsw_dringsend_free_exit;
} else {
}
/* copy data into the descriptor */
bufp += VNET_IPALIGN;
bufp += n;
}
/*
* Determine whether or not we need to send a message to our
* peer prompting them to read our newly updated descriptor(s).
*/
if (dp->restart_reqd) {
/*
* Send a vio_dring_msg to peer to prompt them to read
* the updated descriptor ring.
*/
/* Note - for now using first ring */
/*
* If last_ack_recv is -1 then we know we've not
* received any ack's yet, so this must be the first
* msg sent, so set the start to the begining of the ring.
*/
} else {
}
sizeof (vio_dring_msg_t), B_TRUE);
/* free the message block */
return (status);
} else {
}
/* free the message block */
return (status);
}
/*
* Send an in-band descriptor message over ldc.
*/
static int
{
int idx, i;
int status = LDC_TX_SUCCESS;
static int warn_msg = 1;
return (LDC_TX_FAILURE);
}
/*
* only expect single dring to exist, which we use
* as an internal buffer, rather than a transfer channel.
*/
return (LDC_TX_FAILURE);
}
return (LDC_TX_FAILURE);
}
/*
* Find a free descriptor in our buffer ring
*/
if (warn_msg) {
warn_msg = 0;
}
/* nothing more we can do */
goto vsw_descrsend_free_exit;
} else {
warn_msg = 1;
}
/* copy data into the descriptor */
bufp += n;
}
/* create and send the in-band descp msg */
/*
* Copy the mem cookies describing the data from the
* private region of the descriptor ring into the inband
* descriptor.
*/
sizeof (ldc_mem_cookie_t));
}
sizeof (vnet_ibnd_desc_t), B_TRUE);
/* free the allocated message blocks */
return (status);
}
static void
vsw_send_ver(void *arg)
{
}
static void
{
/*
* Subtype is set to INFO by default
*/
/* payload copied from default settings for lane */
}
/*
* Create dring info msg (which also results in the creation of
* a dring).
*/
static vio_dring_reg_msg_t *
{
/*
* If we can't create a dring, obviously no point sending
* a message.
*/
return (NULL);
/* payload */
mp->dring_ident = 0;
return (mp);
}
static void
{
return;
}
sizeof (vio_dring_reg_msg_t), B_TRUE);
}
static void
{
}
/*
* Generic routine to send message out over ldc channel.
*
* It is possible that when we attempt to write over the ldc channel
* that we get notified that it has been reset. Depending on the value
* of the handle_reset flag we either handle that event here or simply
* notify the caller that the channel was reset.
*/
static int
{
int rv;
do {
}
/*
* If channel has been reset we either handle it here or
* simply report back that it has been reset and let caller
* decide what to do.
*/
if (rv == ECONNRESET) {
/*
* N.B - must never be holding the dlistrw lock when
* we do a reset of the channel.
*/
if (handle_reset) {
}
}
return (rv);
}
/*
* Add an entry into FDB, for the given mac address and port_id.
* Returns 0 on success, 1 on failure.
*
* Lock protecting FDB must be held by calling process.
*/
static int
{
/*
* Note: duplicate keys will be rejected by mod_hash.
*/
(mod_hash_val_t)port) != 0) {
return (1);
}
return (0);
}
/*
* Remove an entry from FDB.
* Returns 0 on success, 1 on failure.
*/
static int
{
return (0);
}
/*
* Search fdb for a given mac address.
* Returns pointer to the entry if found, else returns NULL.
*/
static vsw_port_t *
{
(mod_hash_val_t *)&port) != 0) {
return (NULL);
}
return (port);
}
/*
* Add or remove multicast address(es).
*
* Returns 0 on success, 1 on failure.
*/
static int
{
int i;
return (1);
}
/*
* Convert address into form that can be used
* as hash table key.
*/
/*
*/
/*
* Update the list of multicast
* addresses contained within the
* port structure to include this new
* one.
*/
__func__);
return (1);
}
/*
* Program the address into HW. If the addr
* has already been programmed then the MAC
* just increments a ref counter (which is
* used when the address is being deleted)
*/
"add multicast address",
return (1);
}
} else {
"address 0x%llx for port %ld",
return (1);
}
} else {
/*
* Delete an entry from the multicast hash
* table and update the address list
* appropriately.
*/
port->p_instance);
/*
* Remove the address from HW. The address
* will actually only be removed once the ref
* count within the MAC layer has dropped to
* zero. I.e. we can safely call this fn even
* if other ports are interested in this
* address.
*/
"remove multicast address",
return (1);
}
} else {
"addr 0x%llx for port %ld",
return (1);
}
}
}
return (0);
}
/*
* Add a new multicast entry.
*
* Search hash table based on address. If match found then
* update associated val (which is chain of ports), otherwise
*/
static int
{
int dup = 0;
int rv = 0;
if (devtype == VSW_VNETPORT) {
/*
* Being invoked from a vnet.
*/
} else {
/*
* We are being invoked via the m_multicst mac entry
* point.
*/
}
(mod_hash_val_t *)&ment) != 0) {
/* address not currently in table */
(mod_hash_val_t)ment) != 0) {
rv = 1;
} else {
}
} else {
/*
* Address in table. Check to see if specified port
* is already associated with the address. If not add
* it now.
*/
if (devtype == VSW_VNETPORT) {
"found for portid %ld and key "
"0x%llx", __func__,
addr);
} else {
"for key 0x%llx",
}
rv = 1;
dup = 1;
break;
}
}
/*
* Port not on list so add it to end now.
*/
if (0 == dup) {
}
}
return (rv);
}
/*
* Remove a multicast entry from the hashtable.
*
* Search hash table based on address. If match found, scan
* list of ports associated with address. If specified port
* found remove it from list.
*/
static int
{
if (devtype == VSW_VNETPORT) {
addr);
} else {
}
(mod_hash_val_t *)&ment) != 0) {
return (1);
}
if (devtype == VSW_VNETPORT) {
} else {
}
/*
* head of list, if no other element is in
* list then destroy this entry, otherwise
* just replace it with updated value.
*/
} else {
}
} else {
/*
* Not head of list, no need to do
* replacement, just adjust list pointers.
*/
}
break;
}
}
return (0);
}
/*
* Port is being deleted, but has registered an interest in one
* or more multicast groups. Using the list of addresses maintained
* within the port structure find the appropriate entry in the hash
* table and remove this port from the list of interested ports.
*/
static void
{
}
}
/*
* This vsw instance is detaching, but has registered an interest in one
* or more multicast groups. Using the list of addresses maintained
* within the vsw structure find the appropriate entry in the hash
* table and remove this instance from the list of interested ports.
*/
static void
{
}
}
/*
* Remove the specified address from the list of address maintained
* in this port node.
*/
static void
{
if (devtype == VSW_VNETPORT) {
} else {
}
/* match found */
/* list head */
if (devtype == VSW_VNETPORT)
else
} else {
}
break;
} else {
}
}
if (devtype == VSW_VNETPORT)
else
}
/*
* Creates a descriptor ring (dring) and links it into the
* link of outbound drings for this channel.
*
* Returns NULL if creation failed.
*/
static dring_info_t *
{
int i;
/* create public section of ring */
goto create_fail_exit;
}
/*
* Get the base address of the public section of the ring.
*/
goto dring_fail_exit;
} else {
}
/*
* create private portion of ring
*/
goto dring_fail_exit;
}
/* haven't used any descriptors yet */
/* bind dring to the channel */
goto dring_fail_exit;
}
/*
* Only ever create rings for outgoing lane. Link it onto
* end of list.
*/
} else {
}
return (dp);
for (i = 0; i < VSW_RING_NUM_EL; i++) {
(void) ldc_mem_free_handle(
priv_addr++;
}
(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
}
return (NULL);
}
/*
* Create a ring consisting of just a private portion and link
* it into the list of rings for the outbound lane.
*
* These type of rings are used primarily for temporary data
* storage (i.e. as data buffers).
*/
void
{
/* no public section */
(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
return;
}
/* haven't used any descriptors yet */
/*
* Only ever create rings for outgoing lane. Link it onto
* end of list.
*/
} else {
}
}
/*
* Setup the descriptors in the dring. Returns 0 on success, 1 on
* failure.
*/
int
{
static char *name = "vsw_setup_ring";
/* public section may be null but private should never be */
/*
* Allocate the region of memory which will be used to hold
* the data the descriptors will refer to.
*/
/*
* Initialise some of the private and public (if they exist)
* descriptor fields.
*/
for (i = 0; i < VSW_RING_NUM_EL; i++) {
goto setup_ring_cleanup;
}
if (rv != 0) {
goto setup_ring_cleanup;
}
"invalid num of cookies (%d) for size 0x%llx",
goto setup_ring_cleanup;
} else {
for (j = 1; j < ncookies; j++) {
if (rv != 0) {
goto setup_ring_cleanup;
}
"size 0x%llx", name, j,
}
}
/* link pub and private sides */
sizeof (ldc_mem_cookie_t));
}
pub_addr++;
}
/*
* move to next element in the dring and the next
* position in the data buffer.
*/
priv_addr++;
}
return (0);
for (j = 0; j < i; j++) {
priv_addr++;
}
return (1);
}
/*
* Searches the private section of a ring for a free descriptor,
* starting at the location of the last free descriptor found
* previously.
*
* Returns 0 if free descriptor is available, and updates state
* of private descriptor to VIO_DESC_READY, otherwise returns 1.
*
* FUTURE: might need to return contiguous range of descriptors
* as dring info msg assumes all will be contiguous.
*/
static int
{
int num = VSW_RING_NUM_EL;
int ret = 1;
ret = 0;
}
/* ring full */
if (ret == 1) {
}
return (ret);
}
/*
* Map from a dring identifier to the ring itself. Returns
* pointer to ring or NULL if no match found.
*
* Should be called with dlistrw rwlock held as reader.
*/
static dring_info_t *
{
return (NULL);
} else {
if (dp->ident == ident)
return (dp);
if (dp->ident == ident)
break;
}
}
return (dp);
}
/*
* Set the default lane attributes. These are copied into
* the attr msg we send to our peer. If they are not acceptable
* then (currently) the handshake ends.
*/
static void
{
}
/*
* Verify that the attributes are acceptable.
*
* FUTURE: If some attributes are not acceptable, change them
* our desired values.
*/
static int
{
int ret = 0;
/*
* Note we currently only support in-band descriptors
* and descriptor rings, not packet based transfer (VIO_PKT_MODE)
*/
ret = 1;
}
/* Only support MAC addresses at moment. */
ret = 1;
}
/*
* MAC address supplied by device should match that stored
* in the vsw-port OBP node. Need to decide what to do if they
* don't match, for the moment just warn but don't fail.
*/
"0x%llx doesn't match node address 0x%llx\n",
}
/*
* Ack freq only makes sense in pkt mode, in shared
* mode the ring descriptors say whether or not to
* send back an ACK.
*/
" in SHM mode\n");
ret = 1;
}
/*
* Note: for the moment we only support ETHER
* frames. This may change in the future.
*/
ret = 1;
}
return (ret);
}
/*
* Returns 1 if there is a problem, 0 otherwise.
*/
static int
{
int ret = 0;
if ((pkt->num_descriptors == 0) ||
(pkt->descriptor_size == 0) ||
ret = 1;
}
return (ret);
}
/*
* Returns 1 if two memory cookies match. Otherwise returns 0.
*/
static int
{
return (0);
} else {
return (1);
}
}
/*
* Returns 1 if ring described in reg message matches that
* described by dring_info structure. Otherwise returns 0.
*/
static int
{
return (0);
} else {
return (1);
}
}
static caddr_t
{
a[0], a[1], a[2], a[3], a[4], a[5]);
return (ebuf);
}
/*
* Reset and free all the resources associated with
* the channel.
*/
static void
{
int rv = 0;
} else {
}
}
} else {
/*
* unbind, destroy exported dring, free dring struct
*/
}
if (rv == 0) {
}
}
}
/*
* Free ring and all associated resources.
*
* Should be called with dlistrw rwlock held as writer.
*/
static int
{
int i, rv = 1;
/*
* First unbind and free the memory handles
* stored in each descriptor within the ring.
*/
for (i = 0; i < VSW_RING_NUM_EL; i++) {
paddr = (vsw_private_desc_t *)
if (rv != 0) {
"unbinding handle for "
"ring 0x%llx at pos %d",
dp, i);
return (rv);
}
}
if (rv != 0) {
"handle for ring "
"0x%llx at pos %d",
dp, i);
return (rv);
}
}
}
* VSW_RING_NUM_EL));
}
/*
* Now unbind and destroy the ring itself.
*/
}
}
}
return (0);
}
/*
* Debugging routines
*/
static void
display_state(void)
{
"status %d : phase %u\n",
"psession %lu\n",
ldcp->peer_session);
}
}
}
}
static void
{
}
}
static void
{
uint64_t i;
uint64_t priv_count = 0;
for (i = 0; i < VSW_RING_NUM_EL; i++) {
pub_count++;
}
priv_count++;
}
}
i, priv_count, pub_count);
}
static void
{
int i;
typedef struct flag_name {
int flag_val;
char *flag_name;
} flag_name_t;
flag_name_t flags[] = {
VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
}
}