/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/callb.h>
#include <sys/stream.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/debug.h>
#include <sys/ethernet.h>
#include <sys/dlpi.h>
#include <net/if.h>
#include <sys/mac_provider.h>
#include <sys/mac_client.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/strsun.h>
#include <sys/note.h>
#include <sys/atomic.h>
#include <sys/vnet.h>
#include <sys/vlan.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/dds.h>
#include <sys/strsubr.h>
#include <sys/taskq.h>
/*
* Function prototypes.
*/
/* DDI entrypoints */
static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int vnetattach(dev_info_t *, ddi_attach_cmd_t);
static int vnetdetach(dev_info_t *, ddi_detach_cmd_t);
/* MAC entrypoints */
static int vnet_m_stat(void *, uint_t, uint64_t *);
static int vnet_m_start(void *);
static void vnet_m_stop(void *);
static int vnet_m_promisc(void *, boolean_t);
static int vnet_m_multicst(void *, boolean_t, const uint8_t *);
static int vnet_m_unicst(void *, const uint8_t *);
mblk_t *vnet_m_tx(void *, mblk_t *);
static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
#ifdef VNET_IOC_DEBUG
static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp);
#endif
static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data);
static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle);
static void vnet_get_group(void *arg, mac_ring_type_t type, const int index,
mac_group_info_t *infop, mac_group_handle_t handle);
static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
static void vnet_rx_ring_stop(mac_ring_driver_t rdriver);
static int vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
uint64_t *val);
static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
static void vnet_tx_ring_stop(mac_ring_driver_t rdriver);
static int vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
uint64_t *val);
static int vnet_ring_enable_intr(void *arg);
static int vnet_ring_disable_intr(void *arg);
static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup);
static int vnet_addmac(void *arg, const uint8_t *mac_addr);
static int vnet_remmac(void *arg, const uint8_t *mac_addr);
/* vnet internal functions */
static int vnet_unattach(vnet_t *vnetp);
static void vnet_ring_grp_init(vnet_t *vnetp);
static void vnet_ring_grp_uninit(vnet_t *vnetp);
static int vnet_mac_register(vnet_t *);
static int vnet_read_mac_address(vnet_t *vnetp);
static int vnet_bind_vgenring(vnet_res_t *vresp);
static void vnet_unbind_vgenring(vnet_res_t *vresp);
static int vnet_bind_hwrings(vnet_t *vnetp);
static void vnet_unbind_hwrings(vnet_t *vnetp);
static int vnet_bind_rings(vnet_res_t *vresp);
static void vnet_unbind_rings(vnet_res_t *vresp);
static int vnet_hio_stat(void *, uint_t, uint64_t *);
static int vnet_hio_start(void *);
static void vnet_hio_stop(void *);
mblk_t *vnet_hio_tx(void *, mblk_t *);
/* Forwarding database (FDB) routines */
static void vnet_fdb_create(vnet_t *vnetp);
static void vnet_fdb_destroy(vnet_t *vnetp);
static vnet_res_t *vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp);
static void vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
void vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp);
static void vnet_rx(vio_net_handle_t vrh, mblk_t *mp);
static void vnet_tx_update(vio_net_handle_t vrh);
static void vnet_res_start_task(void *arg);
static void vnet_start_resources(vnet_t *vnetp);
static void vnet_stop_resources(vnet_t *vnetp);
static void vnet_dispatch_res_task(vnet_t *vnetp);
static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err);
static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp);
static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_tx_notify_thread(void *);
/* Exported to vnet_gen */
int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu);
void vnet_link_update(vnet_t *vnetp, link_state_t link_state);
void vnet_dds_cleanup_hio(vnet_t *vnetp);
static kstat_t *vnet_hio_setup_kstats(char *ks_mod, char *ks_name,
vnet_res_t *vresp);
static int vnet_hio_update_kstats(kstat_t *ksp, int rw);
static void vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp);
static void vnet_hio_destroy_kstats(kstat_t *ksp);
/* Exported to vnet_dds */
int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
void vnet_hio_mac_cleanup(vnet_t *vnetp);
/* Externs that are imported from vnet_gen */
extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
const uint8_t *macaddr, void **vgenhdl);
extern int vgen_init_mdeg(void *arg);
extern void vgen_uninit(void *arg);
extern int vgen_dds_tx(void *arg, void *dmsg);
extern int vgen_enable_intr(void *arg);
extern int vgen_disable_intr(void *arg);
extern mblk_t *vgen_rx_poll(void *arg, int bytes_to_pickup);
/* Externs that are imported from vnet_dds */
extern void vdds_mod_init(void);
extern void vdds_mod_fini(void);
extern int vdds_init(vnet_t *vnetp);
extern void vdds_cleanup(vnet_t *vnetp);
extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg);
extern void vdds_cleanup_hybrid_res(void *arg);
extern void vdds_cleanup_hio(vnet_t *vnetp);
extern pri_t minclsyspri;
#define DRV_NAME "vnet"
#define VNET_FDBE_REFHOLD(p) \
{ \
atomic_inc_32(&(p)->refcnt); \
ASSERT((p)->refcnt != 0); \
}
#define VNET_FDBE_REFRELE(p) \
{ \
ASSERT((p)->refcnt != 0); \
atomic_dec_32(&(p)->refcnt); \
}
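/*
 * Typical usage of these refcount macros (an illustrative sketch; see
 * vnet_m_multicst() and vnet_tx_ring_send() for the actual call sites):
 *
 *	READ_ENTER(&vnetp->vsw_fp_rw);
 *	if ((vresp = vnetp->vsw_fp) != NULL)
 *		VNET_FDBE_REFHOLD(vresp);
 *	RW_EXIT(&vnetp->vsw_fp_rw);
 *	...use the resource...
 *	VNET_FDBE_REFRELE(vresp);
 */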
#ifdef VNET_IOC_DEBUG
#define VNET_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
#else
#define VNET_M_CALLBACK_FLAGS (MC_GETCAPAB)
#endif
static mac_callbacks_t vnet_m_callbacks = {
VNET_M_CALLBACK_FLAGS,
vnet_m_stat,
vnet_m_start,
vnet_m_stop,
vnet_m_promisc,
vnet_m_multicst,
NULL, /* m_unicst entry must be NULL while rx rings are exposed */
NULL, /* m_tx entry must be NULL while tx rings are exposed */
NULL,
vnet_m_ioctl,
vnet_m_capab,
NULL
};
static mac_callbacks_t vnet_hio_res_callbacks = {
0,
vnet_hio_stat,
vnet_hio_start,
vnet_hio_stop,
NULL,
NULL,
NULL,
vnet_hio_tx,
NULL,
NULL,
NULL
};
/*
* Linked list of "vnet_t" structures - one per instance.
*/
static vnet_t *vnet_headp = NULL;
static krwlock_t vnet_rw;
/* Tunables */
uint32_t vnet_num_descriptors = VNET_NUM_DESCRIPTORS;
/*
* Configure tx serialization in mac layer for the vnet device. This tunable
* should be enabled to improve performance only if HybridIO is configured for
* the vnet device.
*/
boolean_t vnet_mac_tx_serialize = B_FALSE;
/* Configure enqueuing at Rx soft rings in mac layer for the vnet device */
boolean_t vnet_mac_rx_queuing = B_TRUE;
/*
* Set this to B_TRUE to enable additional internal receive buffer pools
* based on the MTU of the device for better performance at the cost of more
* memory consumption. This is turned off by default, to use allocb(9F) for
* receive buffer allocations of sizes > 2K.
*/
boolean_t vnet_jumbo_rxpools = B_FALSE;
/* # of chains in fdb hash table */
uint32_t vnet_fdb_nchains = VNET_NFDB_HASH;
/* Internal tunables */
uint32_t vnet_ethermtu = 1500; /* mtu of the device */
/*
* Default vlan id. This is only used internally when the "default-vlan-id"
* property is not present in the MD device node. Therefore, this should not be
* used as a tunable; if this value is changed, the corresponding variable
* should be updated to the same value in vsw and also other vnets connected to
* the same vsw.
*/
uint16_t vnet_default_vlan_id = 1;
/* delay in usec to wait for all references on a fdb entry to be dropped */
uint32_t vnet_fdbe_refcnt_delay = 10;
static struct ether_addr etherbroadcastaddr = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
/* mac_open() retry delay in usec */
uint32_t vnet_mac_open_delay = 100; /* 0.1 ms */
/* max # of mac_open() retries */
uint32_t vnet_mac_open_retries = 100;
/*
* Property names
*/
static char macaddr_propname[] = "local-mac-address";
/*
* This is the string displayed by modinfo(1m).
*/
static char vnet_ident[] = "vnet driver";
extern struct mod_ops mod_driverops;
static struct cb_ops cb_vnetops = {
nulldev, /* cb_open */
nulldev, /* cb_close */
nodev, /* cb_strategy */
nodev, /* cb_print */
nodev, /* cb_dump */
nodev, /* cb_read */
nodev, /* cb_write */
nodev, /* cb_ioctl */
nodev, /* cb_devmap */
nodev, /* cb_mmap */
nodev, /* cb_segmap */
nochpoll, /* cb_chpoll */
ddi_prop_op, /* cb_prop_op */
NULL, /* cb_stream */
(int)(D_MP) /* cb_flag */
};
static struct dev_ops vnetops = {
DEVO_REV, /* devo_rev */
0, /* devo_refcnt */
NULL, /* devo_getinfo */
nulldev, /* devo_identify */
nulldev, /* devo_probe */
vnetattach, /* devo_attach */
vnetdetach, /* devo_detach */
nodev, /* devo_reset */
&cb_vnetops, /* devo_cb_ops */
(struct bus_ops *)NULL, /* devo_bus_ops */
NULL, /* devo_power */
ddi_quiesce_not_supported, /* devo_quiesce */
};
static struct modldrv modldrv = {
&mod_driverops, /* Type of module. This one is a driver */
vnet_ident, /* ID string */
&vnetops /* driver specific ops */
};
static struct modlinkage modlinkage = {
MODREV_1, (void *)&modldrv, NULL
};
#ifdef DEBUG
#define DEBUG_PRINTF debug_printf
/*
* Print debug messages - set to 0xf to enable all msgs
*/
int vnet_dbglevel = 0x8;
static void
debug_printf(const char *fname, void *arg, const char *fmt, ...)
{
char buf[512];
va_list ap;
vnet_t *vnetp = (vnet_t *)arg;
char *bufp = buf;
if (vnetp == NULL) {
(void) snprintf(bufp, sizeof (buf), "%s: ", fname);
bufp += strlen(bufp);
} else {
(void) snprintf(bufp, sizeof (buf), "vnet%d:%s: ", vnetp->instance, fname);
bufp += strlen(bufp);
}
va_start(ap, fmt);
(void) vsnprintf(bufp, sizeof (buf) - (bufp - buf), fmt, ap);
va_end(ap);
cmn_err(CE_CONT, "%s\n", buf);
}
#endif
/* _init(9E): initialize the loadable module */
int
_init(void)
{
int status;
DBG1(NULL, "enter\n");
mac_init_ops(&vnetops, "vnet");
status = mod_install(&modlinkage);
if (status != 0) {
mac_fini_ops(&vnetops);
}
vdds_mod_init();
DBG1(NULL, "exit(%d)\n", status);
return (status);
}
/* _fini(9E): prepare the module for unloading. */
int
_fini(void)
{
int status;
DBG1(NULL, "enter\n");
status = mod_remove(&modlinkage);
if (status != 0)
return (status);
mac_fini_ops(&vnetops);
vdds_mod_fini();
DBG1(NULL, "exit(%d)\n", status);
return (status);
}
/* _info(9E): return information about the loadable module */
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
/*
* attach(9E): attach a device to the system.
* called once for each instance of the device on the system.
*/
static int
vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
vnet_t *vnetp;
int status;
int instance;
uint64_t reg;
char qname[TASKQ_NAMELEN];
vnet_attach_progress_t attach_progress;
attach_progress = AST_init;
switch (cmd) {
case DDI_ATTACH:
break;
case DDI_RESUME:
case DDI_PM_RESUME:
default:
/* nothing has been allocated yet; fail without going through unattach */
return (DDI_FAILURE);
}
instance = ddi_get_instance(dip);
DBG1(NULL, "instance(%d) enter\n", instance);
/* allocate vnet_t and mac_t structures */
vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP);
vnetp->dip = dip;
vnetp->instance = instance;
rw_init(&vnetp->vrwlock, NULL, RW_DRIVER, NULL);
rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL);
attach_progress |= AST_vnet_alloc;
vnet_ring_grp_init(vnetp);
attach_progress |= AST_ring_init;
status = vdds_init(vnetp);
if (status != 0) {
goto vnet_attach_fail;
}
attach_progress |= AST_vdds_init;
/* setup links to vnet_t from both devinfo and mac_t */
ddi_set_driver_private(dip, (caddr_t)vnetp);
/* read the mac address */
status = vnet_read_mac_address(vnetp);
if (status != DDI_SUCCESS) {
goto vnet_attach_fail;
}
attach_progress |= AST_read_macaddr;
reg = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
DDI_PROP_DONTPASS, "reg", -1);
if (reg == -1) {
goto vnet_attach_fail;
}
vnetp->reg = reg;
vnet_fdb_create(vnetp);
attach_progress |= AST_fdbh_alloc;
(void) snprintf(qname, TASKQ_NAMELEN, "vres_taskq%d", instance);
if ((vnetp->taskqp = ddi_taskq_create(dip, qname, 1,
TASKQ_DEFAULTPRI, 0)) == NULL) {
cmn_err(CE_WARN, "!vnet%d: Unable to create task queue",
instance);
goto vnet_attach_fail;
}
attach_progress |= AST_taskq_create;
/* add to the list of vnet devices */
WRITE_ENTER(&vnet_rw);
vnetp->nextp = vnet_headp;
vnet_headp = vnetp;
RW_EXIT(&vnet_rw);
attach_progress |= AST_vnet_list;
/*
* Initialize the generic vnet plugin which provides communication via
* sun4v LDC (logical domain channel) based resources. This involves 2
* steps; first, vgen_init() is invoked to read the various properties
* of the vnet device from its MD node (including its mtu which is
* needed to mac_register()) and obtain a handle to the vgen layer.
* After mac_register() is done and we have a mac handle, we then
* invoke vgen_init_mdeg() which registers with the MD event
* generator (mdeg) framework to allow LDC resource notifications.
* Note: this sequence also allows us to report the correct default #
* of pseudo rings (2 TX and 3 RX) in vnet_m_capab() which gets invoked
* in the context of mac_register(); and avoids conflicting with
* dynamic pseudo rx rings which get added/removed as a result of mdeg
* events in vgen.
*/
status = vgen_init(vnetp, reg, vnetp->dip,
(uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl);
if (status != DDI_SUCCESS) {
DERR(vnetp, "vgen_init() failed\n");
goto vnet_attach_fail;
}
attach_progress |= AST_vgen_init;
status = vnet_mac_register(vnetp);
if (status != DDI_SUCCESS) {
goto vnet_attach_fail;
}
vnetp->link_state = LINK_STATE_UNKNOWN;
attach_progress |= AST_macreg;
status = vgen_init_mdeg(vnetp->vgenhdl);
if (status != DDI_SUCCESS) {
goto vnet_attach_fail;
}
attach_progress |= AST_init_mdeg;
vnetp->attach_progress = attach_progress;
DBG1(NULL, "instance(%d) exit\n", instance);
return (DDI_SUCCESS);
vnet_attach_fail:
vnetp->attach_progress = attach_progress;
status = vnet_unattach(vnetp);
ASSERT(status == 0);
return (DDI_FAILURE);
}
/*
* detach(9E): detach a device from the system.
*/
static int
vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
vnet_t *vnetp;
int instance;
instance = ddi_get_instance(dip);
DBG1(NULL, "instance(%d) enter\n", instance);
vnetp = ddi_get_driver_private(dip);
if (vnetp == NULL) {
goto vnet_detach_fail;
}
switch (cmd) {
case DDI_DETACH:
break;
case DDI_SUSPEND:
case DDI_PM_SUSPEND:
default:
goto vnet_detach_fail;
}
if (vnet_unattach(vnetp) != 0) {
goto vnet_detach_fail;
}
return (DDI_SUCCESS);
vnet_detach_fail:
return (DDI_FAILURE);
}
/*
* Common routine to handle vnetattach() failure and vnetdetach(). Note that
* the only reason this function could fail is if mac_disable() fails.
* Otherwise, this function must ensure that all resources are freed and return
* success.
*/
static int
vnet_unattach(vnet_t *vnetp)
{
vnet_attach_progress_t attach_progress;
attach_progress = vnetp->attach_progress;
/*
* Disable the mac device in the gldv3 subsystem. This can fail, in
* particular if there are still any open references to this mac
* device; in which case we just return failure without continuing to
* detach further.
* If it succeeds, we then invoke vgen_uninit() which should unregister
* any pseudo rings registered with the mac layer. Note we keep the
* AST_macreg flag on, so we can unregister with the mac layer at
* the end of this routine.
*/
if (attach_progress & AST_macreg) {
if (mac_disable(vnetp->mh) != 0) {
return (1);
}
}
/*
* Now that we have disabled the device, we must finish all other steps
* and successfully return from this function; otherwise we will end up
* leaving the device in a broken/unusable state.
*
* First, release any hybrid resources assigned to this vnet device.
*/
if (attach_progress & AST_vdds_init) {
vdds_cleanup(vnetp);
attach_progress &= ~AST_vdds_init;
}
/*
* Uninit vgen. This stops further mdeg callbacks to this vnet
* device and/or its ports; and detaches any existing ports.
*/
if (attach_progress & (AST_vgen_init|AST_init_mdeg)) {
vgen_uninit(vnetp->vgenhdl);
attach_progress &= ~AST_vgen_init;
attach_progress &= ~AST_init_mdeg;
}
/* Destroy the taskq. */
if (attach_progress & AST_taskq_create) {
ddi_taskq_destroy(vnetp->taskqp);
attach_progress &= ~AST_taskq_create;
}
/* Destroy fdb. */
if (attach_progress & AST_fdbh_alloc) {
vnet_fdb_destroy(vnetp);
attach_progress &= ~AST_fdbh_alloc;
}
/* Remove from the device list */
if (attach_progress & AST_vnet_list) {
vnet_t **vnetpp;
/* unlink from instance(vnet_t) list */
WRITE_ENTER(&vnet_rw);
for (vnetpp = &vnet_headp; *vnetpp;
vnetpp = &(*vnetpp)->nextp) {
if (*vnetpp == vnetp) {
*vnetpp = vnetp->nextp;
break;
}
}
RW_EXIT(&vnet_rw);
attach_progress &= ~AST_vnet_list;
}
if (attach_progress & AST_ring_init) {
vnet_ring_grp_uninit(vnetp);
attach_progress &= ~AST_ring_init;
}
if (attach_progress & AST_macreg) {
VERIFY(mac_unregister(vnetp->mh) == 0);
vnetp->mh = NULL;
attach_progress &= ~AST_macreg;
}
if (attach_progress & AST_vnet_alloc) {
rw_destroy(&vnetp->vrwlock);
rw_destroy(&vnetp->vsw_fp_rw);
attach_progress &= ~AST_vnet_alloc;
KMEM_FREE(vnetp);
}
return (0);
}
/* enable the device for transmit/receive */
static int
vnet_m_start(void *arg)
{
vnet_t *vnetp = arg;
DBG1(vnetp, "enter\n");
WRITE_ENTER(&vnetp->vrwlock);
vnetp->flags |= VNET_STARTED;
vnet_start_resources(vnetp);
RW_EXIT(&vnetp->vrwlock);
DBG1(vnetp, "exit\n");
return (VNET_SUCCESS);
}
/* stop transmit/receive for the device */
static void
vnet_m_stop(void *arg)
{
vnet_t *vnetp = arg;
DBG1(vnetp, "enter\n");
WRITE_ENTER(&vnetp->vrwlock);
if (vnetp->flags & VNET_STARTED) {
/*
* Set the flags appropriately; this should prevent starting of
* any new resources that are added (see vnet_res_start_task()),
* while we release the vrwlock in vnet_stop_resources() before
* stopping each resource.
*/
vnetp->flags &= ~VNET_STARTED;
vnetp->flags |= VNET_STOPPING;
vnet_stop_resources(vnetp);
vnetp->flags &= ~VNET_STOPPING;
}
RW_EXIT(&vnetp->vrwlock);
DBG1(vnetp, "exit\n");
}
/* set the unicast mac address of the device */
static int
vnet_m_unicst(void *arg, const uint8_t *macaddr)
{
_NOTE(ARGUNUSED(macaddr))
vnet_t *vnetp = arg;
DBG1(vnetp, "enter\n");
/*
* NOTE: setting mac address dynamically is not supported.
*/
DBG1(vnetp, "exit\n");
return (VNET_FAILURE);
}
/* enable/disable a multicast address */
static int
vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
_NOTE(ARGUNUSED(add, mca))
vnet_t *vnetp = arg;
vnet_res_t *vresp;
mac_register_t *macp;
mac_callbacks_t *cbp;
int rv = VNET_SUCCESS;
DBG1(vnetp, "enter\n");
READ_ENTER(&vnetp->vsw_fp_rw);
if (vnetp->vsw_fp == NULL) {
RW_EXIT(&vnetp->vsw_fp_rw);
return (EAGAIN);
}
VNET_FDBE_REFHOLD(vnetp->vsw_fp);
RW_EXIT(&vnetp->vsw_fp_rw);
vresp = vnetp->vsw_fp;
macp = &vresp->macreg;
cbp = macp->m_callbacks;
rv = cbp->mc_multicst(macp->m_driver, add, mca);
VNET_FDBE_REFRELE(vnetp->vsw_fp);
DBG1(vnetp, "exit(%d)\n", rv);
return (rv);
}
/* set or clear promiscuous mode on the device */
static int
vnet_m_promisc(void *arg, boolean_t on)
{
_NOTE(ARGUNUSED(on))
vnet_t *vnetp = arg;
DBG1(vnetp, "enter\n");
/*
* NOTE: setting promiscuous mode is not supported, just return success.
*/
DBG1(vnetp, "exit\n");
return (VNET_SUCCESS);
}
/*
* Transmit a chain of packets. This function provides switching functionality
* based on the destination mac address to reach other guests (within ldoms) or
* external hosts.
*/
mblk_t *
vnet_tx_ring_send(void *arg, mblk_t *mp)
{
vnet_pseudo_tx_ring_t *tx_ringp;
vnet_tx_ring_stats_t *statsp;
vnet_t *vnetp;
vnet_res_t *vresp;
mblk_t *next;
mblk_t *resid_mp;
mac_register_t *macp;
struct ether_header *ehp;
boolean_t is_unicast;
boolean_t is_pvid; /* non-default pvid ? */
boolean_t hres; /* Hybrid resource ? */
void *tx_arg;
size_t size;
tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
statsp = &tx_ringp->tx_ring_stats;
vnetp = (vnet_t *)tx_ringp->vnetp;
DBG1(vnetp, "enter\n");
ASSERT(mp != NULL);
is_pvid = (vnetp->pvid != vnetp->default_vlan_id) ? B_TRUE : B_FALSE;
while (mp != NULL) {
next = mp->b_next;
mp->b_next = NULL;
/* update stats */
size = msgsize(mp);
/*
* Find fdb entry for the destination
* and hold a reference to it.
*/
ehp = (struct ether_header *)mp->b_rptr;
vresp = vnet_fdbe_find(vnetp, &ehp->ether_dhost);
if (vresp != NULL) {
/*
* Destination found in FDB.
* The destination is a vnet device within ldoms
* and directly reachable, invoke the tx function
* in the fdb entry.
*/
macp = &vresp->macreg;
resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp);
/* tx done; now release ref on fdb entry */
VNET_FDBE_REFRELE(vresp);
if (resid_mp != NULL) {
/* m_tx failed */
mp->b_next = next;
break;
}
} else {
is_unicast = !(IS_BROADCAST(ehp) ||
(IS_MULTICAST(ehp)));
/*
* Destination is not in FDB.
* If the destination is broadcast or multicast,
* then forward the packet to vswitch.
* If a Hybrid resource is available, then send the
* unicast packet via hybrid resource, otherwise
* forward it to vswitch.
*/
READ_ENTER(&vnetp->vsw_fp_rw);
if ((is_unicast) && (vnetp->hio_fp != NULL)) {
vresp = vnetp->hio_fp;
hres = B_TRUE;
} else {
vresp = vnetp->vsw_fp;
hres = B_FALSE;
}
if (vresp == NULL) {
/*
* no fdb entry to vsw? drop the packet.
*/
RW_EXIT(&vnetp->vsw_fp_rw);
freemsg(mp);
mp = next;
continue;
}
/* ref hold the fdb entry to vsw */
VNET_FDBE_REFHOLD(vresp);
RW_EXIT(&vnetp->vsw_fp_rw);
/*
* In the case of a hybrid resource we need to insert
* the tag for the pvid case here; unlike packets that
* are destined to a vnet/vsw in which case the vgen
* layer does the tagging before sending it over ldc.
*/
if (hres == B_TRUE) {
/*
* Determine if the frame being transmitted
* over the hybrid resource is untagged. If so,
* insert the tag before transmitting.
*/
if (is_pvid == B_TRUE &&
ehp->ether_type != htons(ETHERTYPE_VLAN)) {
mp = vnet_vlan_insert_tag(mp,
vnetp->pvid);
if (mp == NULL) {
VNET_FDBE_REFRELE(vresp);
mp = next;
continue;
}
}
macp = &vresp->macreg;
tx_arg = tx_ringp;
} else {
macp = &vresp->macreg;
tx_arg = macp->m_driver;
}
resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp);
/* tx done; now release ref on fdb entry */
VNET_FDBE_REFRELE(vresp);
if (resid_mp != NULL) {
/* m_tx failed */
mp->b_next = next;
break;
}
}
statsp->obytes += size;
statsp->opackets++;
mp = next;
}
DBG1(vnetp, "exit\n");
return (mp);
}
/* get statistics from the device */
int
vnet_m_stat(void *arg, uint_t stat, uint64_t *val)
{
vnet_t *vnetp = arg;
vnet_res_t *vresp;
mac_register_t *macp;
mac_callbacks_t *cbp;
uint64_t val_total = 0;
DBG1(vnetp, "enter\n");
/*
* get the specified statistic from each transport and return the
* aggregate value. This obviously only works for counters.
*/
if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) ||
(IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) {
return (ENOTSUP);
}
READ_ENTER(&vnetp->vrwlock);
for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
macp = &vresp->macreg;
cbp = macp->m_callbacks;
if (cbp->mc_getstat(macp->m_driver, stat, val) == 0)
val_total += *val;
}
RW_EXIT(&vnetp->vrwlock);
*val = val_total;
DBG1(vnetp, "exit\n");
return (0);
}
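/*
 * Initialize the pseudo TX and RX ring groups for this vnet instance. A
 * single TX group with VNET_NUM_PSEUDO_TXRINGS shared rings is created,
 * along with its flow control notify thread. A single RX group is created
 * with the first VNET_NUM_PSEUDO_RXRINGS_DEFAULT rings reserved (vswitch
 * LDC service + Hybrid resource) and the remaining rings marked free for
 * resources that are added dynamically.
 */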
static void
vnet_ring_grp_init(vnet_t *vnetp)
{
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_rx_ring_t *rx_ringp;
vnet_pseudo_tx_group_t *tx_grp;
vnet_pseudo_tx_ring_t *tx_ringp;
int i;
tx_grp = &vnetp->tx_grp[0];
tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) *
VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP);
for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) {
tx_ringp[i].state |= VNET_TXRING_SHARED;
}
tx_grp->rings = tx_ringp;
tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS;
mutex_init(&tx_grp->flowctl_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&tx_grp->flowctl_cv, NULL, CV_DRIVER, NULL);
tx_grp->flowctl_thread = thread_create(NULL, 0,
vnet_tx_notify_thread, tx_grp, 0, &p0, TS_RUN, minclsyspri);
rx_grp = &vnetp->rx_grp[0];
rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP;
rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL);
rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) *
rx_grp->max_ring_cnt, KM_SLEEP);
/*
* Setup the first 3 Pseudo RX Rings that are reserved;
* 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource.
*/
rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE;
rx_ringp[0].index = 0;
rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
rx_ringp[1].index = 1;
rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
rx_ringp[2].index = 2;
rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
rx_grp->rings = rx_ringp;
for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
i < rx_grp->max_ring_cnt; i++) {
rx_ringp = &rx_grp->rings[i];
rx_ringp->state = VNET_RXRING_FREE;
rx_ringp->index = i;
}
}
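/*
 * Teardown counterpart of vnet_ring_grp_init(): stop the tx flow control
 * notify thread and free the pseudo TX and RX ring arrays.
 */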
static void
vnet_ring_grp_uninit(vnet_t *vnetp)
{
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_tx_group_t *tx_grp;
kt_did_t tid = 0;
tx_grp = &vnetp->tx_grp[0];
/* Inform tx_notify_thread to exit */
mutex_enter(&tx_grp->flowctl_lock);
if (tx_grp->flowctl_thread != NULL) {
tid = tx_grp->flowctl_thread->t_did;
tx_grp->flowctl_done = B_TRUE;
cv_signal(&tx_grp->flowctl_cv);
}
mutex_exit(&tx_grp->flowctl_lock);
if (tid != 0)
thread_join(tid);
if (tx_grp->rings != NULL) {
ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS);
kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) *
tx_grp->ring_cnt);
tx_grp->rings = NULL;
}
rx_grp = &vnetp->rx_grp[0];
if (rx_grp->rings != NULL) {
ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP);
ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) *
rx_grp->max_ring_cnt);
rx_grp->rings = NULL;
}
}
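/*
 * Allocate a free pseudo rx ring from the rx group; used when a resource
 * that connects to a peer vnet is added. Returns NULL if no ring is
 * available.
 */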
static vnet_pseudo_rx_ring_t *
vnet_alloc_pseudo_rx_ring(vnet_t *vnetp)
{
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_rx_ring_t *rx_ringp;
int index;
rx_grp = &vnetp->rx_grp[0];
WRITE_ENTER(&rx_grp->lock);
if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) {
/* no rings available */
RW_EXIT(&rx_grp->lock);
return (NULL);
}
for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
index < rx_grp->max_ring_cnt; index++) {
rx_ringp = &rx_grp->rings[index];
if (rx_ringp->state == VNET_RXRING_FREE) {
rx_ringp->state |= VNET_RXRING_INUSE;
rx_grp->ring_cnt++;
break;
}
}
RW_EXIT(&rx_grp->lock);
return (rx_ringp);
}
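/*
 * Return a dynamically allocated pseudo rx ring to the free pool. The
 * reserved rings (index < VNET_NUM_PSEUDO_RXRINGS_DEFAULT) are never freed.
 */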
static void
vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp)
{
vnet_pseudo_rx_group_t *rx_grp;
ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
rx_grp = &vnetp->rx_grp[0];
WRITE_ENTER(&rx_grp->lock);
if (ringp->state != VNET_RXRING_FREE) {
ringp->state = VNET_RXRING_FREE;
ringp->handle = NULL;
rx_grp->ring_cnt--;
}
RW_EXIT(&rx_grp->lock);
}
/* wrapper function for mac_register() */
static int
vnet_mac_register(vnet_t *vnetp)
{
mac_register_t *macp;
int err;
if ((macp = mac_alloc(MAC_VERSION)) == NULL)
return (DDI_FAILURE);
macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
macp->m_driver = vnetp;
macp->m_dip = vnetp->dip;
macp->m_src_addr = vnetp->curr_macaddr;
macp->m_callbacks = &vnet_m_callbacks;
macp->m_min_sdu = 0;
macp->m_max_sdu = vnetp->mtu;
macp->m_margin = VLAN_TAGSZ;
macp->m_v12n = MAC_VIRT_LEVEL1;
/*
* Finally, we're ready to register ourselves with the MAC layer
* interface; if this succeeds, we're all ready to start()
*/
err = mac_register(macp, &vnetp->mh);
mac_free(macp);
return (err == 0 ? DDI_SUCCESS : DDI_FAILURE);
}
/* read the mac address of the device */
static int
vnet_read_mac_address(vnet_t *vnetp)
{
uchar_t *macaddr;
uint32_t size;
int rv;
rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip,
DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size);
if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) {
DWARN(vnetp, "prop_lookup failed(%s) err(%d)\n",
macaddr_propname, rv);
return (DDI_FAILURE);
}
bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL);
bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL);
ddi_prop_free(macaddr);
return (DDI_SUCCESS);
}
static void
vnet_fdb_create(vnet_t *vnetp)
{
char hashname[MAXNAMELEN];
(void) snprintf(hashname, MAXNAMELEN, "vnet%d-fdbhash",
vnetp->instance);
vnetp->fdb_nchains = vnet_fdb_nchains;
vnetp->fdb_hashp = mod_hash_create_ptrhash(hashname, vnetp->fdb_nchains,
mod_hash_null_valdtor, sizeof (void *));
}
static void
vnet_fdb_destroy(vnet_t *vnetp)
{
/* destroy fdb-hash-table */
if (vnetp->fdb_hashp != NULL) {
mod_hash_destroy_hash(vnetp->fdb_hashp);
vnetp->fdb_hashp = NULL;
vnetp->fdb_nchains = 0;
}
}
/*
* Add an entry into the fdb.
*/
void
vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp)
{
uint64_t addr = 0;
int rv;
KEY_HASH(addr, vresp->rem_macaddr);
/*
* If the entry being added corresponds to an LDC_SERVICE resource
* (that is, the vswitch connection), it is added to the hash and is
* also cached; an additional reference count reflects this. The
* HYBRID resource is not added to the hash, but only
* cached, as it is only used for sending out packets for unknown
* unicast destinations.
*/
(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
(vresp->refcnt = 1) : (vresp->refcnt = 0);
/*
* Note: duplicate keys will be rejected by mod_hash.
*/
if (vresp->type != VIO_NET_RES_HYBRID) {
rv = mod_hash_insert(vnetp->fdb_hashp, (mod_hash_key_t)addr,
(mod_hash_val_t)vresp);
if (rv != 0) {
DWARN(vnetp, "Duplicate macaddr key(%lx)\n", addr);
return;
}
}
if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
/* Cache the fdb entry to vsw-port */
WRITE_ENTER(&vnetp->vsw_fp_rw);
if (vnetp->vsw_fp == NULL)
vnetp->vsw_fp = vresp;
RW_EXIT(&vnetp->vsw_fp_rw);
} else if (vresp->type == VIO_NET_RES_HYBRID) {
/* Cache the fdb entry to hybrid resource */
WRITE_ENTER(&vnetp->vsw_fp_rw);
if (vnetp->hio_fp == NULL)
vnetp->hio_fp = vresp;
RW_EXIT(&vnetp->vsw_fp_rw);
}
}
/*
* Remove an entry from fdb.
*/
static void
vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp)
{
uint64_t addr = 0;
int rv;
uint32_t refcnt;
vnet_res_t *tmp;
KEY_HASH(addr, vresp->rem_macaddr);
/*
* Remove the entry from fdb hash table.
* This prevents further references to this fdb entry.
*/
if (vresp->type != VIO_NET_RES_HYBRID) {
rv = mod_hash_remove(vnetp->fdb_hashp, (mod_hash_key_t)addr,
(mod_hash_val_t *)&tmp);
if (rv != 0) {
/*
* As the resources are added to the hash only
* after they are started, this can occur if
* a resource unregisters before it is ever started.
*/
return;
}
}
if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
WRITE_ENTER(&vnetp->vsw_fp_rw);
ASSERT(tmp == vnetp->vsw_fp);
vnetp->vsw_fp = NULL;
RW_EXIT(&vnetp->vsw_fp_rw);
} else if (vresp->type == VIO_NET_RES_HYBRID) {
WRITE_ENTER(&vnetp->vsw_fp_rw);
vnetp->hio_fp = NULL;
RW_EXIT(&vnetp->vsw_fp_rw);
}
/*
* If there are threads already ref holding before the entry was
* removed from hash table, then wait for ref count to drop to zero.
*/
(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
(refcnt = 1) : (refcnt = 0);
while (vresp->refcnt > refcnt) {
delay(drv_usectohz(vnet_fdbe_refcnt_delay));
}
}
/*
* Search fdb for a given mac address. If an entry is found, hold
* a reference to it and return the entry; else returns NULL.
*/
static vnet_res_t *
vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp)
{
uint64_t key = 0;
vnet_res_t *vresp;
int rv;
KEY_HASH(key, addrp->ether_addr_octet);
rv = mod_hash_find_cb(vnetp->fdb_hashp, (mod_hash_key_t)key,
(mod_hash_val_t *)&vresp, vnet_fdbe_find_cb);
if (rv != 0)
return (NULL);
return (vresp);
}
/*
* Callback function provided to mod_hash_find_cb(). After finding the fdb
* entry corresponding to the key (macaddr), this callback will be invoked by
* mod_hash_find_cb() to atomically increment the reference count on the fdb
* entry before returning the found entry.
*/
static void
vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
{
_NOTE(ARGUNUSED(key))
VNET_FDBE_REFHOLD((vnet_res_t *)val);
}
/*
* Frames received that are tagged with the pvid of the vnet device must be
* untagged before sending up the stack. This function walks the chain of rx
* frames, untags any such frames and returns the updated chain.
*
* Arguments:
* pvid: pvid of the vnet device for which packets are being received
* mp: head of pkt chain to be validated and untagged
*
* Returns:
* none; *mp is updated to point to the head of the processed chain
*/
static void
vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp)
{
struct ether_vlan_header *evhp;
mblk_t *bp;
mblk_t *bpt;
mblk_t *bph;
mblk_t *bpn;
bpn = bph = bpt = NULL;
for (bp = *mp; bp != NULL; bp = bpn) {
bpn = bp->b_next;
bp->b_next = bp->b_prev = NULL;
evhp = (struct ether_vlan_header *)bp->b_rptr;
if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN &&
VLAN_ID(ntohs(evhp->ether_tci)) == pvid) {
bp = vnet_vlan_remove_tag(bp);
if (bp == NULL) {
continue;
}
}
/* build a chain of processed packets */
if (bph == NULL) {
bph = bpt = bp;
} else {
bpt->b_next = bp;
bpt = bp;
}
}
*mp = bph;
}
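/*
 * Receive callback invoked by an underlying resource (LDC or Hybrid).
 * Passes the packet chain up to the mac layer on the pseudo rx ring that
 * is bound to the resource.
 */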
static void
vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
{
vnet_res_t *vresp = (vnet_res_t *)vrh;
vnet_t *vnetp = vresp->vnetp;
vnet_pseudo_rx_ring_t *ringp;
if ((vnetp == NULL) || (vnetp->mh == NULL)) {
freemsgchain(mp);
return;
}
ringp = vresp->rx_ringp;
mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}
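/*
 * Transmit update callback invoked by an underlying resource when it can
 * accept packets again after having flow controlled transmits.
 */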
void
vnet_tx_update(vio_net_handle_t vrh)
{
vnet_res_t *vresp = (vnet_res_t *)vrh;
vnet_t *vnetp = vresp->vnetp;
vnet_pseudo_tx_ring_t *tx_ringp;
vnet_pseudo_tx_group_t *tx_grp;
int i;
if (vnetp == NULL || vnetp->mh == NULL) {
return;
}
/*
* Currently, the tx hwring API (used to access rings that belong to
* a Hybrid IO resource) does not provide us a per ring flow ctrl
* update; also the pseudo rings are shared by the ports/ldcs in the
* vgen layer. Thus we can't figure out which pseudo ring is being
* re-enabled for transmits. To work around this, when we get a tx
* restart notification from below, we simply propagate that to all
* the tx pseudo rings registered with the mac layer above.
*
* There are a couple of side effects with this approach, but they are
* not harmful, as outlined below:
*
* A) We might send an invalid ring_update() for a ring that is not
* really flow controlled. This will not have any effect in the mac
* layer and packets will continue to be transmitted on that ring.
*
* B) We might end up clearing the flow control in the mac layer for
* a ring that is still flow controlled in the underlying resource.
* This will result in the mac layer restarting transmit, only to be
* flow controlled again on that ring.
*/
tx_grp = &vnetp->tx_grp[0];
for (i = 0; i < tx_grp->ring_cnt; i++) {
tx_ringp = &tx_grp->rings[i];
mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
}
}
/*
* vnet_tx_notify_thread:
*
* vnet_tx_ring_update() callback function wakes up this thread when
* it gets called. This thread will call mac_tx_ring_update() to
* notify upper mac of flow control getting relieved. Note that
* vnet_tx_ring_update() cannot call mac_tx_ring_update() directly
* because vnet_tx_ring_update() is called from lower mac with
* mi_rw_lock held and mac_tx_ring_update() would also try to grab
* the same lock.
*/
static void
vnet_tx_notify_thread(void *arg)
{
callb_cpr_t cprinfo;
vnet_pseudo_tx_group_t *tx_grp = (vnet_pseudo_tx_group_t *)arg;
vnet_pseudo_tx_ring_t *tx_ringp;
vnet_t *vnetp;
int i;
CALLB_CPR_INIT(&cprinfo, &tx_grp->flowctl_lock, callb_generic_cpr,
"vnet_tx_notify_thread");
mutex_enter(&tx_grp->flowctl_lock);
while (!tx_grp->flowctl_done) {
CALLB_CPR_SAFE_BEGIN(&cprinfo);
cv_wait(&tx_grp->flowctl_cv, &tx_grp->flowctl_lock);
CALLB_CPR_SAFE_END(&cprinfo, &tx_grp->flowctl_lock);
for (i = 0; i < tx_grp->ring_cnt; i++) {
tx_ringp = &tx_grp->rings[i];
if (tx_ringp->woken_up) {
tx_ringp->woken_up = B_FALSE;
vnetp = tx_ringp->vnetp;
mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
}
}
}
/*
* The tx_grp is being destroyed, exit the thread.
*/
tx_grp->flowctl_thread = NULL;
CALLB_CPR_EXIT(&cprinfo);
thread_exit();
}
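/*
 * Callback invoked when an underlying hardware tx ring (arg2) that is
 * mapped to one of our pseudo tx rings is unblocked. It wakes up
 * vnet_tx_notify_thread(), which then calls mac_tx_ring_update(); see the
 * comments above that thread for why the update cannot be done directly.
 */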
void
vnet_tx_ring_update(void *arg1, uintptr_t arg2)
{
vnet_t *vnetp = (vnet_t *)arg1;
vnet_pseudo_tx_group_t *tx_grp;
vnet_pseudo_tx_ring_t *tx_ringp;
int i;
tx_grp = &vnetp->tx_grp[0];
for (i = 0; i < tx_grp->ring_cnt; i++) {
tx_ringp = &tx_grp->rings[i];
if (tx_ringp->hw_rh == (mac_ring_handle_t)arg2) {
mutex_enter(&tx_grp->flowctl_lock);
tx_ringp->woken_up = B_TRUE;
cv_signal(&tx_grp->flowctl_cv);
mutex_exit(&tx_grp->flowctl_lock);
break;
}
}
}
/*
* Update the new mtu of vnet into the mac layer. First check if the device has
* been plumbed and if so fail the mtu update. Returns 0 on success.
*/
int
vnet_mtu_update(vnet_t *vnetp, uint32_t mtu)
{
int rv;
if (vnetp == NULL || vnetp->mh == NULL) {
return (EINVAL);
}
WRITE_ENTER(&vnetp->vrwlock);
if (vnetp->flags & VNET_STARTED) {
RW_EXIT(&vnetp->vrwlock);
cmn_err(CE_NOTE, "!vnet%d: Unable to process mtu "
"update as the device is plumbed\n",
vnetp->instance);
return (EBUSY);
}
/* update mtu in the mac layer */
rv = mac_maxsdu_update(vnetp->mh, mtu);
if (rv != 0) {
RW_EXIT(&vnetp->vrwlock);
cmn_err(CE_NOTE,
"!vnet%d: Unable to update mtu with mac layer\n",
vnetp->instance);
return (EIO);
}
vnetp->mtu = mtu;
RW_EXIT(&vnetp->vrwlock);
return (0);
}
/*
* Update the link state of vnet to the mac layer.
*/
void
vnet_link_update(vnet_t *vnetp, link_state_t link_state)
{
if (vnetp == NULL || vnetp->mh == NULL) {
return;
}
WRITE_ENTER(&vnetp->vrwlock);
if (vnetp->link_state == link_state) {
RW_EXIT(&vnetp->vrwlock);
return;
}
vnetp->link_state = link_state;
RW_EXIT(&vnetp->vrwlock);
mac_link_update(vnetp->mh, link_state);
}
/*
* vio_net_resource_reg -- An interface called to register a resource
* with vnet.
* macp -- a GLDv3 mac_register that has all the details of
* a resource and its callbacks etc.
* type -- resource type.
* local_macaddr -- resource's MAC address. This is used to
* associate a resource with a corresponding vnet.
* remote_macaddr -- remote side MAC address. This is ignored for
* the Hybrid resources.
* vhp -- A handle returned to the caller.
* vcb -- A set of callbacks provided to the callers.
*/
int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
vio_net_callbacks_t *vcb)
{
vnet_t *vnetp;
vnet_res_t *vresp;
vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
ether_copy(local_macaddr, vresp->local_macaddr);
ether_copy(rem_macaddr, vresp->rem_macaddr);
vresp->type = type;
bcopy(macp, &vresp->macreg, sizeof (mac_register_t));
DBG1(NULL, "Resource Registerig type=0%X\n", type);
READ_ENTER(&vnet_rw);
vnetp = vnet_headp;
while (vnetp != NULL) {
if (VNET_MATCH_RES(vresp, vnetp)) {
vresp->vnetp = vnetp;
/* Setup kstats for hio resource */
if (vresp->type == VIO_NET_RES_HYBRID) {
vresp->ksp = vnet_hio_setup_kstats(DRV_NAME,
"hio", vresp);
if (vresp->ksp == NULL) {
cmn_err(CE_NOTE, "!vnet%d: Cannot "
"create kstats for hio resource",
vnetp->instance);
}
}
vnet_add_resource(vnetp, vresp);
break;
}
vnetp = vnetp->nextp;
}
RW_EXIT(&vnet_rw);
if (vresp->vnetp == NULL) {
DWARN(NULL, "No vnet instance");
kmem_free(vresp, sizeof (vnet_res_t));
return (ENXIO);
}
*vhp = vresp;
vcb->vio_net_rx_cb = vnet_rx;
vcb->vio_net_tx_update = vnet_tx_update;
vcb->vio_net_report_err = vnet_handle_res_err;
/* Bind the resource to pseudo ring(s) */
if (vnet_bind_rings(vresp) != 0) {
(void) vnet_rem_resource(vnetp, vresp);
vnet_hio_destroy_kstats(vresp->ksp);
KMEM_FREE(vresp);
return (1);
}
/* Dispatch a task to start resources */
vnet_dispatch_res_task(vnetp);
return (0);
}
/*
* vio_net_resource_unreg -- An interface to unregister a resource.
*/
void
vio_net_resource_unreg(vio_net_handle_t vhp)
{
vnet_res_t *vresp = (vnet_res_t *)vhp;
vnet_t *vnetp = vresp->vnetp;
DBG1(NULL, "Resource Registerig hdl=0x%p", vhp);
ASSERT(vnetp != NULL);
/*
* Remove the resource from fdb; this ensures
* there are no references to the resource.
*/
vnet_fdbe_del(vnetp, vresp);
vnet_unbind_rings(vresp);
/* Now remove the resource from the list */
(void) vnet_rem_resource(vnetp, vresp);
vnet_hio_destroy_kstats(vresp->ksp);
KMEM_FREE(vresp);
}
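/*
 * Link a newly registered resource at the head of the per-vnet resource
 * list.
 */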
static void
vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp)
{
WRITE_ENTER(&vnetp->vrwlock);
vresp->nextp = vnetp->vres_list;
vnetp->vres_list = vresp;
RW_EXIT(&vnetp->vrwlock);
}
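/*
 * Unlink a resource from the per-vnet resource list and return it to the
 * caller for cleanup.
 */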
static vnet_res_t *
vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp)
{
vnet_res_t *vrp;
WRITE_ENTER(&vnetp->vrwlock);
if (vresp == vnetp->vres_list) {
vnetp->vres_list = vresp->nextp;
} else {
vrp = vnetp->vres_list;
while (vrp->nextp != NULL) {
if (vrp->nextp == vresp) {
vrp->nextp = vresp->nextp;
break;
}
vrp = vrp->nextp;
}
}
vresp->vnetp = NULL;
vresp->nextp = NULL;
RW_EXIT(&vnetp->vrwlock);
return (vresp);
}
/*
* vnet_dds_rx -- an interface called by vgen to deliver DDS messages.
*/
void
vnet_dds_rx(void *arg, void *dmsg)
{
vnet_t *vnetp = arg;
vdds_process_dds_msg(vnetp, dmsg);
}
/*
* vnet_send_dds_msg -- An interface provided to DDS to send
* DDS messages. This simply sends messages via vgen.
*/
int
vnet_send_dds_msg(vnet_t *vnetp, void *dmsg)
{
int rv = EINVAL;	/* fail if vgen is not yet initialized */
if (vnetp->vgenhdl != NULL) {
rv = vgen_dds_tx(vnetp->vgenhdl, dmsg);
}
return (rv);
}
/*
* vnet_dds_cleanup_hio -- an interface called by vgen to clean up HIO resources.
*/
void
vnet_dds_cleanup_hio(vnet_t *vnetp)
{
vdds_cleanup_hio(vnetp);
}
/*
* vnet_handle_res_err -- A callback function called by a resource
* to report an error. For example, vgen can call to report
* an LDC down/reset event. This will trigger cleanup of associated
* Hybrid resource.
*/
/* ARGSUSED */
static void
vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err)
{
vnet_res_t *vresp = (vnet_res_t *)vrh;
vnet_t *vnetp = vresp->vnetp;
if (vnetp == NULL) {
return;
}
if ((vresp->type != VIO_NET_RES_LDC_SERVICE) &&
(vresp->type != VIO_NET_RES_HYBRID)) {
return;
}
vdds_cleanup_hio(vnetp);
}
/*
* vnet_dispatch_res_task -- A function to dispatch a task to start resources.
*/
static void
vnet_dispatch_res_task(vnet_t *vnetp)
{
int rv;
/*
* Dispatch the task. It could be the case that vnetp->flags does
* not have VNET_STARTED set. This is ok as vnet_res_start_task()
* can abort the task when the task is started. See related comments
* in vnet_m_stop() and vnet_stop_resources().
*/
rv = ddi_taskq_dispatch(vnetp->taskqp, vnet_res_start_task,
vnetp, DDI_NOSLEEP);
if (rv != DDI_SUCCESS) {
cmn_err(CE_WARN,
"vnet%d:Can't dispatch start resource task",
vnetp->instance);
}
}
/*
* vnet_res_start_task -- A taskq callback function that starts a resource.
*/
static void
vnet_res_start_task(void *arg)
{
vnet_t *vnetp = arg;
WRITE_ENTER(&vnetp->vrwlock);
if (vnetp->flags & VNET_STARTED) {
vnet_start_resources(vnetp);
}
RW_EXIT(&vnetp->vrwlock);
}
/*
* vnet_start_resources -- starts all resources associated with
* a vnet.
*/
static void
vnet_start_resources(vnet_t *vnetp)
{
mac_register_t *macp;
mac_callbacks_t *cbp;
vnet_res_t *vresp;
int rv;
DBG1(vnetp, "enter\n");
ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));
for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
/* skip if it is already started */
if (vresp->flags & VNET_STARTED) {
continue;
}
macp = &vresp->macreg;
cbp = macp->m_callbacks;
rv = cbp->mc_start(macp->m_driver);
if (rv == 0) {
/*
* Successfully started the resource, so now
* add it to the fdb.
*/
vresp->flags |= VNET_STARTED;
vnet_fdbe_add(vnetp, vresp);
}
}
DBG1(vnetp, "exit\n");
}
/*
* vnet_stop_resources -- stop all resources associated with a vnet.
*/
static void
vnet_stop_resources(vnet_t *vnetp)
{
vnet_res_t *vresp;
mac_register_t *macp;
mac_callbacks_t *cbp;
DBG1(vnetp, "enter\n");
ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));
for (vresp = vnetp->vres_list; vresp != NULL; ) {
if (vresp->flags & VNET_STARTED) {
/*
* Release the lock while invoking mc_stop() of the
* underlying resource. We hold a reference to this
* resource to prevent being removed from the list in
* vio_net_resource_unreg(). Note that new resources
* can be added to the head of the list while the lock
* is released, but they won't be started, as
* VNET_STARTED flag has been cleared for the vnet
* device in vnet_m_stop(). Also, while the lock is
* released a resource could be removed from the list
* in vio_net_resource_unreg(); but that is ok, as we
* re-acquire the lock and only then access the forward
* link (vresp->nextp) to continue with the next
* resource.
*/
vresp->flags &= ~VNET_STARTED;
vresp->flags |= VNET_STOPPING;
macp = &vresp->macreg;
cbp = macp->m_callbacks;
VNET_FDBE_REFHOLD(vresp);
RW_EXIT(&vnetp->vrwlock);
cbp->mc_stop(macp->m_driver);
WRITE_ENTER(&vnetp->vrwlock);
vresp->flags &= ~VNET_STOPPING;
VNET_FDBE_REFRELE(vresp);
}
vresp = vresp->nextp;
}
DBG1(vnetp, "exit\n");
}
/*
* Setup kstats for the HIO statistics.
* NOTE: the synchronization for the statistics is the
* responsibility of the caller.
*/
kstat_t *
vnet_hio_setup_kstats(char *ks_mod, char *ks_name, vnet_res_t *vresp)
{
kstat_t *ksp;
vnet_t *vnetp = vresp->vnetp;
vnet_hio_kstats_t *hiokp;
size_t size;
ASSERT(vnetp != NULL);
size = sizeof (vnet_hio_kstats_t) / sizeof (kstat_named_t);
ksp = kstat_create(ks_mod, vnetp->instance, ks_name, "net",
KSTAT_TYPE_NAMED, size, 0);
if (ksp == NULL) {
return (NULL);
}
hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
kstat_named_init(&hiokp->ipackets, "ipackets",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->ierrors, "ierrors",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->opackets, "opackets",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->oerrors, "oerrors",
KSTAT_DATA_ULONG);
/* MIB II kstat variables */
kstat_named_init(&hiokp->rbytes, "rbytes",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->obytes, "obytes",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->multircv, "multircv",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->multixmt, "multixmt",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->brdcstrcv, "brdcstrcv",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->brdcstxmt, "brdcstxmt",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->norcvbuf, "norcvbuf",
KSTAT_DATA_ULONG);
kstat_named_init(&hiokp->noxmtbuf, "noxmtbuf",
KSTAT_DATA_ULONG);
ksp->ks_update = vnet_hio_update_kstats;
ksp->ks_private = (void *)vresp;
kstat_install(ksp);
return (ksp);
}
/*
* Destroy kstats.
*/
static void
vnet_hio_destroy_kstats(kstat_t *ksp)
{
if (ksp != NULL)
kstat_delete(ksp);
}
/*
* Update the kstats.
*/
static int
vnet_hio_update_kstats(kstat_t *ksp, int rw)
{
vnet_t *vnetp;
vnet_res_t *vresp;
vnet_hio_stats_t statsp;
vnet_hio_kstats_t *hiokp;
vresp = (vnet_res_t *)ksp->ks_private;
vnetp = vresp->vnetp;
bzero(&statsp, sizeof (vnet_hio_stats_t));
READ_ENTER(&vnetp->vsw_fp_rw);
if (vnetp->hio_fp == NULL) {
/* not using hio resources, just return */
RW_EXIT(&vnetp->vsw_fp_rw);
return (0);
}
VNET_FDBE_REFHOLD(vnetp->hio_fp);
RW_EXIT(&vnetp->vsw_fp_rw);
vnet_hio_get_stats(vnetp->hio_fp, &statsp);
VNET_FDBE_REFRELE(vnetp->hio_fp);
hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
if (rw == KSTAT_READ) {
/* Link Input/Output stats */
hiokp->ipackets.value.ul = (uint32_t)statsp.ipackets;
hiokp->ipackets64.value.ull = statsp.ipackets;
hiokp->ierrors.value.ul = statsp.ierrors;
hiokp->opackets.value.ul = (uint32_t)statsp.opackets;
hiokp->opackets64.value.ull = statsp.opackets;
hiokp->oerrors.value.ul = statsp.oerrors;
/* MIB II kstat variables */
hiokp->rbytes.value.ul = (uint32_t)statsp.rbytes;
hiokp->rbytes64.value.ull = statsp.rbytes;
hiokp->obytes.value.ul = (uint32_t)statsp.obytes;
hiokp->obytes64.value.ull = statsp.obytes;
hiokp->multircv.value.ul = statsp.multircv;
hiokp->multixmt.value.ul = statsp.multixmt;
hiokp->brdcstrcv.value.ul = statsp.brdcstrcv;
hiokp->brdcstxmt.value.ul = statsp.brdcstxmt;
hiokp->norcvbuf.value.ul = statsp.norcvbuf;
hiokp->noxmtbuf.value.ul = statsp.noxmtbuf;
} else {
return (EACCES);
}
return (0);
}
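/*
 * Collect the counter statistics of the underlying Hybrid device (nxge)
 * through its registered mc_getstat() entry point.
 */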
static void
vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp)
{
mac_register_t *macp;
mac_callbacks_t *cbp;
uint64_t val;
int stat;
/*
* get the specified statistics from the underlying nxge.
*/
macp = &vresp->macreg;
cbp = macp->m_callbacks;
for (stat = MAC_STAT_MIN; stat < MAC_STAT_OVERFLOWS; stat++) {
if (cbp->mc_getstat(macp->m_driver, stat, &val) == 0) {
switch (stat) {
case MAC_STAT_IPACKETS:
statsp->ipackets = val;
break;
case MAC_STAT_IERRORS:
statsp->ierrors = val;
break;
case MAC_STAT_OPACKETS:
statsp->opackets = val;
break;
case MAC_STAT_OERRORS:
statsp->oerrors = val;
break;
case MAC_STAT_RBYTES:
statsp->rbytes = val;
break;
case MAC_STAT_OBYTES:
statsp->obytes = val;
break;
case MAC_STAT_MULTIRCV:
statsp->multircv = val;
break;
case MAC_STAT_MULTIXMT:
statsp->multixmt = val;
break;
case MAC_STAT_BRDCSTRCV:
statsp->brdcstrcv = val;
break;
case MAC_STAT_BRDCSTXMT:
statsp->brdcstxmt = val;
break;
case MAC_STAT_NOXMTBUF:
statsp->noxmtbuf = val;
break;
case MAC_STAT_NORCVBUF:
statsp->norcvbuf = val;
break;
default:
/*
* other stats are not of interest; ignore them.
*/
break;
}
}
}
}
static boolean_t
vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
{
vnet_t *vnetp = (vnet_t *)arg;
if (vnetp == NULL) {
return (0);
}
switch (cap) {
case MAC_CAPAB_RINGS: {
mac_capab_rings_t *cap_rings = cap_data;
/*
* Rings Capability Notes:
* We advertise rings to make use of the rings framework in
* gldv3 mac layer, to improve the performance. This is
* specifically needed when a Hybrid resource (with multiple
* tx/rx hardware rings) is assigned to a vnet device. We also
* leverage this for the normal case when no Hybrid resource is
* assigned.
*
* Ring Allocation:
* - TX path:
* We expose a pseudo ring group with 2 pseudo tx rings (as
* currently HybridIO exports only 2 rings) In the normal case,
* transmit traffic that comes down to the driver through the
* mri_tx (vnet_tx_ring_send()) entry point goes through the
* distributed switching algorithm in vnet and gets transmitted
* over a port/LDC in the vgen layer to either the vswitch or a
* peer vnet. If and when a Hybrid resource is assigned to the
* vnet, we obtain the tx ring information of the Hybrid device
* (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings.
* Traffic being sent over the Hybrid resource by the mac layer
* gets spread across both hw rings, as they are mapped to the
* 2 pseudo tx rings in vnet.
*
* - RX path:
* We expose a pseudo ring group with 3 pseudo rx rings (static
* rings) initially. The first (default) pseudo rx ring is
* reserved for the resource that connects to the vswitch
* service. The next 2 rings are reserved for a Hybrid resource
* that may be assigned to the vnet device. If and when a
* Hybrid resource is assigned to the vnet, we obtain the rx
* ring information of the Hybrid device (nxge) and map these
* pseudo rings 1:1 to the 2 hw rx rings. For each additional
* resource that connects to a peer vnet, we dynamically
* allocate a pseudo rx ring and map it to that resource, when
* the resource gets added; and the pseudo rx ring is
* dynamically registered with the upper mac layer. We do the
* reverse and unregister the ring with the mac layer when
* the resource gets removed.
*
* Synchronization notes:
* We don't need any lock to protect members of ring structure,
* specifically ringp->hw_rh, in either the TX or the RX ring,
* as explained below.
* - TX ring:
* ring->hw_rh is initialized only when a Hybrid resource is
* associated; and gets referenced only in vnet_hio_tx(). The
* Hybrid resource itself is available in fdb only after tx
* hwrings are found and mapped; i.e, in vio_net_resource_reg()
* we call vnet_bind_rings() first and then call
* vnet_start_resources() which adds an entry to fdb. For
* traffic going over LDC resources, we don't reference
* ring->hw_rh at all.
* - RX ring:
* For rings mapped to Hybrid resource ring->hw_rh is
* initialized and only then do we add the rx callback for
* the underlying Hybrid resource; we disable callbacks before
* we unmap ring->hw_rh. For rings mapped to LDC resources, we
* stop the rx callbacks (in vgen) before we remove ring->hw_rh
* (vio_net_resource_unreg()).
* Also, we access ring->hw_rh in vnet_rx_ring_stat().
* Note that for rings mapped to Hybrid resource, though the
* rings are statically registered with the mac layer, its
* hardware ring mapping (ringp->hw_rh) can be torn down in
* vnet_unbind_hwrings() while the kstat operation is in
* progress. To protect against this, we hold a reference to
* the resource in FDB; this ensures that the thread in
* vio_net_resource_unreg() waits for the reference to be
* dropped before unbinding the ring.
*
* We don't need to do this for rings mapped to LDC resources.
* These rings are registered/unregistered dynamically with
* the mac layer and so any attempt to unregister the ring
* while kstat operation is in progress will block in
* mac_group_rem_ring(), which implicitly protects the
* resource (ringp->hw_rh) from disappearing.
*/
if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
/*
* The ring_cnt for rx grp is initialized in
* vnet_ring_grp_init(). Later, the ring_cnt gets
* updated dynamically whenever LDC resources are added
* or removed.
*/
cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt;
cap_rings->mr_rget = vnet_get_ring;
cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS;
cap_rings->mr_gget = vnet_get_group;
cap_rings->mr_gaddring = NULL;
cap_rings->mr_gremring = NULL;
} else {
cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
/*
* The ring_cnt for tx grp is initialized in
* vnet_ring_grp_init() and remains constant, as we
* do not support dynamic tx rings for now.
*/
cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt;
cap_rings->mr_rget = vnet_get_ring;
/*
* Transmit rings are not grouped; i.e, the number of
* transmit ring groups advertised should be set to 0.
*/
cap_rings->mr_gnum = 0;
cap_rings->mr_gget = vnet_get_group;
cap_rings->mr_gaddring = NULL;
cap_rings->mr_gremring = NULL;
}
return (B_TRUE);
}
default:
break;
}
return (B_FALSE);
}
/*
* Callback function for the MAC layer to get ring information.
*/
static void
vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
{
vnet_t *vnetp = arg;
switch (rtype) {
case MAC_RING_TYPE_RX: {
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_rx_ring_t *rx_ringp;
mac_intr_t *mintr;
/* We advertised only one RX group */
ASSERT(g_index == 0);
rx_grp = &vnetp->rx_grp[g_index];
/* Check the current # of rings in the rx group */
ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));
/* Get the ring based on the index */
rx_ringp = &rx_grp->rings[r_index];
rx_ringp->handle = r_handle;
/*
* Note: we don't need to save the incoming r_index in rx_ring,
* as vnet_ring_grp_init() would have initialized the index for
* each ring in the array.
*/
rx_ringp->grp = rx_grp;
rx_ringp->vnetp = vnetp;
mintr = &infop->mri_intr;
mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;
infop->mri_driver = (mac_ring_driver_t)rx_ringp;
infop->mri_start = vnet_rx_ring_start;
infop->mri_stop = vnet_rx_ring_stop;
infop->mri_stat = vnet_rx_ring_stat;
/* Set the poll function, as this is an rx ring */
infop->mri_poll = vnet_rx_poll;
/*
* MAC_RING_RX_ENQUEUE bit needed to be set for nxge
* which was not sending packet chains in interrupt
* context. For such drivers, packets are queued in
* Rx soft rings so that we get a chance to switch
* into a polling mode under backlog. This bug (not
* sending packet chains) has now been fixed. Once
* the performance impact is measured, this change
* will be removed.
*/
infop->mri_flags = (vnet_mac_rx_queuing ?
MAC_RING_RX_ENQUEUE : 0);
break;
}
case MAC_RING_TYPE_TX: {
vnet_pseudo_tx_group_t *tx_grp;
vnet_pseudo_tx_ring_t *tx_ringp;
/*
* No need to check grp index; mac layer passes -1 for it.
*/
tx_grp = &vnetp->tx_grp[0];
/* Check the # of rings in the tx group */
ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));
/* Get the ring based on the index */
tx_ringp = &tx_grp->rings[r_index];
tx_ringp->handle = r_handle;
tx_ringp->index = r_index;
tx_ringp->grp = tx_grp;
tx_ringp->vnetp = vnetp;
infop->mri_driver = (mac_ring_driver_t)tx_ringp;
infop->mri_start = vnet_tx_ring_start;
infop->mri_stop = vnet_tx_ring_stop;
infop->mri_stat = vnet_tx_ring_stat;
/* Set the transmit function, as this is a tx ring */
infop->mri_tx = vnet_tx_ring_send;
/*
* MAC_RING_TX_SERIALIZE bit needs to be set while
* hybridIO is enabled to workaround tx lock
* contention issues in nxge.
*/
infop->mri_flags = (vnet_mac_tx_serialize ?
MAC_RING_TX_SERIALIZE : 0);
break;
}
default:
break;
}
}
/*
* Callback function for the MAC layer to get group information.
*/
static void
vnet_get_group(void *arg, mac_ring_type_t type, const int index,
mac_group_info_t *infop, mac_group_handle_t handle)
{
vnet_t *vnetp = (vnet_t *)arg;
switch (type) {
case MAC_RING_TYPE_RX:
{
vnet_pseudo_rx_group_t *rx_grp;
/* We advertised only one RX group */
ASSERT(index == 0);
rx_grp = &vnetp->rx_grp[index];
rx_grp->handle = handle;
rx_grp->index = index;
rx_grp->vnetp = vnetp;
infop->mgi_driver = (mac_group_driver_t)rx_grp;
infop->mgi_start = NULL;
infop->mgi_stop = NULL;
infop->mgi_addmac = vnet_addmac;
infop->mgi_remmac = vnet_remmac;
infop->mgi_count = rx_grp->ring_cnt;
break;
}
case MAC_RING_TYPE_TX:
{
vnet_pseudo_tx_group_t *tx_grp;
/* We advertised only one TX group */
ASSERT(index == 0);
tx_grp = &vnetp->tx_grp[index];
tx_grp->handle = handle;
tx_grp->index = index;
tx_grp->vnetp = vnetp;
infop->mgi_driver = (mac_group_driver_t)tx_grp;
infop->mgi_start = NULL;
infop->mgi_stop = NULL;
infop->mgi_addmac = NULL;
infop->mgi_remmac = NULL;
infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;
break;
}
default:
break;
}
}
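/*
* Start an rx pseudo ring. For rings backed by an LDC resource, simply
* record the generation number and mark the ring started. For the ring
* reserved for the Hybrid resource, also start the underlying hwring if
* one is already bound; otherwise the hwring is started later, when the
* Hybrid resource is bound (see vnet_bind_hwrings()).
*/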
static int
vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
{
vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
int err;
/*
* If this ring is mapped to an LDC resource, simply mark the state to
* indicate the ring is started and return.
*/
if ((rx_ringp->state &
(VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
rx_ringp->gen_num = mr_gen_num;
rx_ringp->state |= VNET_RXRING_STARTED;
return (0);
}
ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
/*
* This must be a ring reserved for a hwring. If the hwring is not
* bound yet, simply mark the state to indicate the ring is started and
* return. If and when a hybrid resource is activated for this vnet
* device, we will bind the hwring and start it then. If a hwring is
* already bound, start it now.
*/
if (rx_ringp->hw_rh == NULL) {
rx_ringp->gen_num = mr_gen_num;
rx_ringp->state |= VNET_RXRING_STARTED;
return (0);
}
err = mac_hwring_start(rx_ringp->hw_rh);
if (err == 0) {
rx_ringp->gen_num = mr_gen_num;
rx_ringp->state |= VNET_RXRING_STARTED;
} else {
err = ENXIO;
}
return (err);
}
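/*
* Stop an rx pseudo ring. For rings backed by an LDC resource, simply
* clear the started state. For the Hybrid ring, also stop the underlying
* hwring if one is bound.
*/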
static void
vnet_rx_ring_stop(mac_ring_driver_t arg)
{
vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
/*
* If this ring is mapped to an LDC resource, simply mark the state to
* indicate the ring is now stopped and return.
*/
if ((rx_ringp->state &
(VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
rx_ringp->state &= ~VNET_RXRING_STARTED;
return;
}
ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
/*
* This must be a ring reserved for a hwring. If the hwring is not
* bound yet, simply mark the state to indicate the ring is stopped and
* return. If a hwring is already bound, stop it now.
*/
if (rx_ringp->hw_rh == NULL) {
rx_ringp->state &= ~VNET_RXRING_STARTED;
return;
}
mac_hwring_stop(rx_ringp->hw_rh);
rx_ringp->state &= ~VNET_RXRING_STARTED;
}
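/*
* Get a statistic for an rx pseudo ring. The request is forwarded to the
* underlying Hybrid hwring or to the LDC resource that the ring is
* mapped to.
*/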
static int
vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
{
vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)rdriver;
vnet_t *vnetp = (vnet_t *)rx_ringp->vnetp;
vnet_res_t *vresp;
mac_register_t *macp;
mac_callbacks_t *cbp;
/*
* Refer to vnet_m_capab() function for detailed comments on ring
* synchronization.
*/
if ((rx_ringp->state & VNET_RXRING_HYBRID) != 0) {
READ_ENTER(&vnetp->vsw_fp_rw);
if (vnetp->hio_fp == NULL) {
RW_EXIT(&vnetp->vsw_fp_rw);
return (0);
}
VNET_FDBE_REFHOLD(vnetp->hio_fp);
RW_EXIT(&vnetp->vsw_fp_rw);
(void) mac_hwring_getstat(rx_ringp->hw_rh, stat, val);
VNET_FDBE_REFRELE(vnetp->hio_fp);
return (0);
}
ASSERT((rx_ringp->state &
(VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0);
vresp = (vnet_res_t *)rx_ringp->hw_rh;
macp = &vresp->macreg;
cbp = macp->m_callbacks;
cbp->mc_getstat(macp->m_driver, stat, val);
return (0);
}
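/*
* Start a tx pseudo ring. There is no underlying ring to start here; we
* simply mark the ring as started.
*/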
/* ARGSUSED */
static int
vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
{
vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
tx_ringp->state |= VNET_TXRING_STARTED;
return (0);
}
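/*
* Stop a tx pseudo ring by clearing its started state.
*/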
static void
vnet_tx_ring_stop(mac_ring_driver_t arg)
{
vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
tx_ringp->state &= ~VNET_TXRING_STARTED;
}
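/*
* Get a statistic for a tx pseudo ring from the counters maintained in
* the ring itself.
*/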
static int
vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
{
vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)rdriver;
vnet_tx_ring_stats_t *statsp;
statsp = &tx_ringp->tx_ring_stats;
switch (stat) {
case MAC_STAT_OPACKETS:
*val = statsp->opackets;
break;
case MAC_STAT_OBYTES:
*val = statsp->obytes;
break;
default:
*val = 0;
return (ENOTSUP);
}
return (0);
}
/*
* Disable polling for a ring and enable its interrupt.
*/
static int
vnet_ring_enable_intr(void *arg)
{
vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
vnet_res_t *vresp;
if (rx_ringp->hw_rh == NULL) {
/*
* Ring enable intr function is being invoked, but the ring is
* not bound to any underlying resource? This must be a ring
* reserved for the Hybrid resource, and no such resource has
* been assigned to this vnet device yet. We simply return success.
*/
ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
return (0);
}
/*
* The rx ring has been bound to either an LDC or a Hybrid resource.
* Call the appropriate function to enable interrupts for the ring.
*/
if (rx_ringp->state & VNET_RXRING_HYBRID) {
return (mac_hwring_enable_intr(rx_ringp->hw_rh));
} else {
vresp = (vnet_res_t *)rx_ringp->hw_rh;
return (vgen_enable_intr(vresp->macreg.m_driver));
}
}
/*
* Enable polling for a ring and disable its interrupt.
*/
static int
vnet_ring_disable_intr(void *arg)
{
vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
vnet_res_t *vresp;
if (rx_ringp->hw_rh == NULL) {
/*
* Ring disable intr function is being invoked, but the ring is
* not bound to any underlying resource? This must be a ring
* reserved for the Hybrid resource, and no such resource has
* been assigned to this vnet device yet. We simply return success.
*/
ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
return (0);
}
/*
* The rx ring has been bound to either an LDC or a Hybrid resource.
* Call the appropriate function to disable interrupts for the ring.
*/
if (rx_ringp->state & VNET_RXRING_HYBRID) {
return (mac_hwring_disable_intr(rx_ringp->hw_rh));
} else {
vresp = (vnet_res_t *)rx_ringp->hw_rh;
return (vgen_disable_intr(vresp->macreg.m_driver));
}
}
/*
* Poll up to 'bytes_to_pickup' bytes of packets from the rx ring.
*/
static mblk_t *
vnet_rx_poll(void *arg, int bytes_to_pickup)
{
vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
mblk_t *mp = NULL;
vnet_res_t *vresp;
vnet_t *vnetp = rx_ringp->vnetp;
if (rx_ringp->hw_rh == NULL) {
return (NULL);
}
if (rx_ringp->state & VNET_RXRING_HYBRID) {
mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
/*
* Packets received over a hybrid resource need additional
* processing to remove the tag, for the pvid case. The
* underlying resource is not aware of the vnet's pvid and thus
* packets are received with the vlan tag in the header; unlike
* packets that are received over an LDC channel, in which case
* the peer vnet/vsw would have already removed the tag.
*/
if (vnetp->pvid != vnetp->default_vlan_id) {
vnet_rx_frames_untag(vnetp->pvid, &mp);
}
} else {
vresp = (vnet_res_t *)rx_ringp->hw_rh;
mp = vgen_rx_poll(vresp->macreg.m_driver, bytes_to_pickup);
}
return (mp);
}
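/*
* Receive callback for the Hybrid resource, installed via mac_rx_set() in
* vnet_hio_mac_init(). The resource handle identifies the pseudo rx ring
* that the underlying hwring is bound to. Strip the pvid tag if needed
* and pass the packets up to the MAC layer on that pseudo ring.
*/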
/* ARGSUSED */
void
vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
boolean_t loopback)
{
vnet_t *vnetp = (vnet_t *)arg;
vnet_pseudo_rx_ring_t *ringp = (vnet_pseudo_rx_ring_t *)mrh;
/*
* Packets received over a hybrid resource need additional processing
* to remove the tag, for the pvid case. The underlying resource is
* not aware of the vnet's pvid and thus packets are received with the
* vlan tag in the header; unlike packets that are received over an LDC
* channel, in which case the peer vnet/vsw would have already removed
* the tag.
*/
if (vnetp->pvid != vnetp->default_vlan_id) {
vnet_rx_frames_untag(vnetp->pvid, &mp);
if (mp == NULL) {
return;
}
}
mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}
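/*
* mgi_addmac entry point of the rx pseudo group. Only the vnet's current
* unicast address is supported; any other address is rejected.
*/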
static int
vnet_addmac(void *arg, const uint8_t *mac_addr)
{
vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg;
vnet_t *vnetp;
vnetp = rx_grp->vnetp;
if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
return (0);
}
cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
vnetp->instance, __func__);
return (EINVAL);
}
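/*
* mgi_remmac entry point of the rx pseudo group. Only the vnet's current
* unicast address is accepted; any other address is rejected as invalid.
*/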
static int
vnet_remmac(void *arg, const uint8_t *mac_addr)
{
vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg;
vnet_t *vnetp;
vnetp = rx_grp->vnetp;
if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
return (0);
}
cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
return (EINVAL);
}
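/*
* Set up the Hybrid resource over the given physical link: open the link
* by name (retrying while it is not yet available), create an exclusive
* mac client on it, add the vnet's unicast address, register the resource
* with the vio framework as VIO_NET_RES_HYBRID, and install the receive
* callback.
*/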
int
vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
{
mac_handle_t mh;
mac_client_handle_t mch = NULL;
mac_unicast_handle_t muh = NULL;
mac_diag_t diag;
mac_register_t *macp;
char client_name[MAXNAMELEN];
int rv;
uint16_t mac_flags = MAC_UNICAST_TAG_DISABLE |
MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
vio_net_callbacks_t vcb;
ether_addr_t rem_addr =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
uint32_t retries = 0;
if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
return (EAGAIN);
}
do {
rv = mac_open_by_linkname(ifname, &mh);
if (rv == 0) {
break;
}
if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) {
mac_free(macp);
return (rv);
}
drv_usecwait(vnet_mac_open_delay);
} while (rv == ENOENT);
vnetp->hio_mh = mh;
(void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance,
ifname);
rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE);
if (rv != 0) {
goto fail;
}
vnetp->hio_mch = mch;
rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0,
&diag);
if (rv != 0) {
goto fail;
}
vnetp->hio_muh = muh;
macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
macp->m_driver = vnetp;
macp->m_dip = NULL;
macp->m_src_addr = NULL;
macp->m_callbacks = &vnet_hio_res_callbacks;
macp->m_min_sdu = 0;
macp->m_max_sdu = ETHERMTU;
rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID,
vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb);
if (rv != 0) {
goto fail;
}
mac_free(macp);
/* add the recv callback */
mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp);
return (0);
fail:
mac_free(macp);
vnet_hio_mac_cleanup(vnetp);
return (1);
}
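/*
* Tear down the Hybrid resource state set up in vnet_hio_mac_init().
*/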
void
vnet_hio_mac_cleanup(vnet_t *vnetp)
{
if (vnetp->hio_vhp != NULL) {
vio_net_resource_unreg(vnetp->hio_vhp);
vnetp->hio_vhp = NULL;
}
if (vnetp->hio_muh != NULL) {
(void) mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh);
vnetp->hio_muh = NULL;
}
if (vnetp->hio_mch != NULL) {
mac_client_close(vnetp->hio_mch, 0);
vnetp->hio_mch = NULL;
}
if (vnetp->hio_mh != NULL) {
mac_close(vnetp->hio_mh);
vnetp->hio_mh = NULL;
}
}
/* Bind pseudo rings to hwrings */
static int
vnet_bind_hwrings(vnet_t *vnetp)
{
mac_ring_handle_t hw_rh[VNET_NUM_HYBRID_RINGS];
mac_perim_handle_t mph1;
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_rx_ring_t *rx_ringp;
vnet_pseudo_tx_group_t *tx_grp;
vnet_pseudo_tx_ring_t *tx_ringp;
int hw_ring_cnt;
int i;
int rv;
mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
/* Get the list of the underlying RX rings. */
hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh,
MAC_RING_TYPE_RX);
/* We expect the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */
if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
cmn_err(CE_WARN,
"!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n",
vnetp->instance, hw_ring_cnt);
goto fail;
}
if (vnetp->rx_hwgh != NULL) {
/*
* Quiesce the HW ring and the mac srs on the ring. Note
* that the HW ring will be restarted when the pseudo ring
* is started. At that time all the packets will be
* directly passed up to the pseudo RX ring and handled
* by mac srs created over the pseudo RX ring.
*/
mac_rx_client_quiesce(vnetp->hio_mch);
mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE);
}
/*
* Bind the pseudo rings to the hwrings and start the hwrings.
* Note we don't need to register these with the upper mac, as we have
* statically exported these pseudo rx rings, which are reserved for
* the rx rings of the Hybrid resource.
*/
rx_grp = &vnetp->rx_grp[0];
for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
/* Pick the rxrings reserved for Hybrid resource */
rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
/* Store the hw ring handle */
rx_ringp->hw_rh = hw_rh[i];
/* Bind the pseudo ring to the underlying hwring */
mac_hwring_setup(rx_ringp->hw_rh,
(mac_resource_handle_t)rx_ringp, NULL);
/* Start the hwring if needed */
if (rx_ringp->state & VNET_RXRING_STARTED) {
rv = mac_hwring_start(rx_ringp->hw_rh);
if (rv != 0) {
mac_hwring_teardown(rx_ringp->hw_rh);
rx_ringp->hw_rh = NULL;
goto fail;
}
}
}
/* Get the list of the underlying TX rings. */
hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh,
MAC_RING_TYPE_TX);
/* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */
if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
cmn_err(CE_WARN,
"!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n",
vnetp->instance, hw_ring_cnt);
goto fail;
}
/*
* Now map the pseudo txrings to the hw txrings. Note we don't need
* to register these with the upper mac, as we have statically exported
* these rings. Note that these rings will continue to be used for LDC
* resources to peer vnets and vswitch (shared ring).
*/
tx_grp = &vnetp->tx_grp[0];
for (i = 0; i < tx_grp->ring_cnt; i++) {
tx_ringp = &tx_grp->rings[i];
tx_ringp->hw_rh = hw_rh[i];
tx_ringp->state |= VNET_TXRING_HYBRID;
}
tx_grp->tx_notify_handle =
mac_client_tx_notify(vnetp->hio_mch, vnet_tx_ring_update, vnetp);
mac_perim_exit(mph1);
return (0);
fail:
mac_perim_exit(mph1);
vnet_unbind_hwrings(vnetp);
return (1);
}
/* Unbind pseudo rings from hwrings */
static void
vnet_unbind_hwrings(vnet_t *vnetp)
{
mac_perim_handle_t mph1;
vnet_pseudo_rx_ring_t *rx_ringp;
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_tx_group_t *tx_grp;
vnet_pseudo_tx_ring_t *tx_ringp;
int i;
mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
tx_grp = &vnetp->tx_grp[0];
for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
tx_ringp = &tx_grp->rings[i];
if (tx_ringp->state & VNET_TXRING_HYBRID) {
tx_ringp->state &= ~VNET_TXRING_HYBRID;
tx_ringp->hw_rh = NULL;
}
}
(void) mac_client_tx_notify(vnetp->hio_mch, NULL,
tx_grp->tx_notify_handle);
rx_grp = &vnetp->rx_grp[0];
for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
if (rx_ringp->hw_rh != NULL) {
/* Stop the hwring */
mac_hwring_stop(rx_ringp->hw_rh);
/* Teardown the hwring */
mac_hwring_teardown(rx_ringp->hw_rh);
rx_ringp->hw_rh = NULL;
}
}
if (vnetp->rx_hwgh != NULL) {
vnetp->rx_hwgh = NULL;
/*
* First clear the permanent-quiesced flag of the RX srs, then
* restart the HW ring and the mac srs on the ring.
*/
mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE);
mac_rx_client_restart(vnetp->hio_mch);
}
mac_perim_exit(mph1);
}
/* Bind pseudo ring to a LDC resource */
static int
vnet_bind_vgenring(vnet_res_t *vresp)
{
vnet_t *vnetp;
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_rx_ring_t *rx_ringp;
mac_perim_handle_t mph1;
int rv;
int type;
vnetp = vresp->vnetp;
type = vresp->type;
rx_grp = &vnetp->rx_grp[0];
if (type == VIO_NET_RES_LDC_SERVICE) {
/*
* Ring Index 0 is the default ring in the group and is
* reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
* is allocated statically and is reported to the mac layer
* in vnet_m_capab(). So, all we need to do here is save a
* reference to the associated vresp.
*/
rx_ringp = &rx_grp->rings[0];
rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
vresp->rx_ringp = (void *)rx_ringp;
return (0);
}
ASSERT(type == VIO_NET_RES_LDC_GUEST);
mac_perim_enter_by_mh(vnetp->mh, &mph1);
rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
if (rx_ringp == NULL) {
cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring",
vnetp->instance);
goto fail;
}
/* Store the LDC resource itself as the ring handle */
rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
/*
* Save a reference to the ring in the resource for lookup during
* unbind. Note this is only done for LDC resources. We don't need this
* in the case of a Hybrid resource (see vnet_bind_hwrings()), as its
* rx rings are mapped to reserved pseudo rx rings (index 1 and 2).
*/
vresp->rx_ringp = (void *)rx_ringp;
rx_ringp->state |= VNET_RXRING_LDC_GUEST;
/* Register the pseudo ring with upper-mac */
rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index);
if (rv != 0) {
rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
rx_ringp->hw_rh = NULL;
vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
goto fail;
}
mac_perim_exit(mph1);
return (0);
fail:
mac_perim_exit(mph1);
return (1);
}
/* Unbind pseudo ring from a LDC resource */
static void
vnet_unbind_vgenring(vnet_res_t *vresp)
{
vnet_t *vnetp;
vnet_pseudo_rx_group_t *rx_grp;
vnet_pseudo_rx_ring_t *rx_ringp;
mac_perim_handle_t mph1;
int type;
vnetp = vresp->vnetp;
type = vresp->type;
rx_grp = &vnetp->rx_grp[0];
if (vresp->rx_ringp == NULL) {
return;
}
if (type == VIO_NET_RES_LDC_SERVICE) {
/*
* Ring Index 0 is the default ring in the group and is
* reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
* is allocated statically and is reported to the mac layer
* in vnet_m_capab(). So, all we need to do here is remove its
* reference to the associated vresp.
*/
rx_ringp = &rx_grp->rings[0];
rx_ringp->hw_rh = NULL;
vresp->rx_ringp = NULL;
return;
}
ASSERT(type == VIO_NET_RES_LDC_GUEST);
mac_perim_enter_by_mh(vnetp->mh, &mph1);
rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp;
vresp->rx_ringp = NULL;
if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) {
/* Unregister the pseudo ring with upper-mac */
mac_group_rem_ring(rx_grp->handle, rx_ringp->handle);
rx_ringp->hw_rh = NULL;
rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
/* Free the pseudo rx ring */
vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
}
mac_perim_exit(mph1);
}
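/*
* Unbind the pseudo rings associated with the given resource, based on
* its resource type.
*/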
static void
vnet_unbind_rings(vnet_res_t *vresp)
{
switch (vresp->type) {
case VIO_NET_RES_LDC_SERVICE:
case VIO_NET_RES_LDC_GUEST:
vnet_unbind_vgenring(vresp);
break;
case VIO_NET_RES_HYBRID:
vnet_unbind_hwrings(vresp->vnetp);
break;
default:
break;
}
}
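/*
* Bind the pseudo rings associated with the given resource, based on its
* resource type.
*/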
static int
vnet_bind_rings(vnet_res_t *vresp)
{
int rv;
switch (vresp->type) {
case VIO_NET_RES_LDC_SERVICE:
case VIO_NET_RES_LDC_GUEST:
rv = vnet_bind_vgenring(vresp);
break;
case VIO_NET_RES_HYBRID:
rv = vnet_bind_hwrings(vresp->vnetp);
break;
default:
rv = 1;
break;
}
return (rv);
}
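/*
* Get a statistic of the physical link underlying the Hybrid resource.
*/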
/* ARGSUSED */
int
vnet_hio_stat(void *arg, uint_t stat, uint64_t *val)
{
vnet_t *vnetp = (vnet_t *)arg;
*val = mac_stat_get(vnetp->hio_mh, stat);
return (0);
}
/*
* The start() and stop() routines for the Hybrid resource below are just
* dummy functions. They are provided to avoid resource type specific code in
* vnet_start_resources() and vnet_stop_resources(). The starting and stopping
* of the Hybrid resource happens in the context of the mac_client interfaces
* that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup().
*/
/* ARGSUSED */
static int
vnet_hio_start(void *arg)
{
return (0);
}
/* ARGSUSED */
static void
vnet_hio_stop(void *arg)
{
}
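/*
* Transmit a chain of packets over the hwring bound to the tx pseudo
* ring. If the hwring does not accept a packet, reattach the rest of the
* chain to it and return the unsent packets to the caller.
*/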
mblk_t *
vnet_hio_tx(void *arg, mblk_t *mp)
{
vnet_pseudo_tx_ring_t *tx_ringp;
mblk_t *nextp;
mblk_t *ret_mp;
tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
for (;;) {
nextp = mp->b_next;
mp->b_next = NULL;
ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp);
if (ret_mp != NULL) {
ret_mp->b_next = nextp;
mp = ret_mp;
break;
}
if ((mp = nextp) == NULL)
break;
}
return (mp);
}
#ifdef VNET_IOC_DEBUG
/*
* The ioctl entry point is used only for debugging for now. The ioctl commands
* can be used to force the link state of the channel connected to vsw.
*/
static void
vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
struct iocblk *iocp;
vnet_t *vnetp;
iocp = (struct iocblk *)(uintptr_t)mp->b_rptr;
iocp->ioc_error = 0;
vnetp = (vnet_t *)arg;
if (vnetp == NULL) {
miocnak(q, mp, 0, EINVAL);
return;
}
switch (iocp->ioc_cmd) {
case VNET_FORCE_LINK_DOWN:
case VNET_FORCE_LINK_UP:
vnet_force_link_state(vnetp, q, mp);
break;
default:
iocp->ioc_error = EINVAL;
miocnak(q, mp, 0, iocp->ioc_error);
break;
}
}
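/*
* Forward the link-state ioctl to the LDC resource connected to the
* vswitch, which implements the actual link state change.
*/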
static void
vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp)
{
mac_register_t *macp;
mac_callbacks_t *cbp;
vnet_res_t *vresp;
READ_ENTER(&vnetp->vsw_fp_rw);
vresp = vnetp->vsw_fp;
if (vresp == NULL) {
RW_EXIT(&vnetp->vsw_fp_rw);
return;
}
macp = &vresp->macreg;
cbp = macp->m_callbacks;
cbp->mc_ioctl(macp->m_driver, q, mp);
RW_EXIT(&vnetp->vsw_fp_rw);
}
#else
static void
vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
vnet_t *vnetp;
vnetp = (vnet_t *)arg;
if (vnetp == NULL) {
miocnak(q, mp, 0, EINVAL);
return;
}
/* ioctl support only for debugging */
miocnak(q, mp, 0, ENOTSUP);
}
#endif