vsw.c revision 023505bcce788e9ac958a334707e49cddbf18d1d
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mac.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
/*
* Function prototypes.
*/
static int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
static int vsw_get_physaddr(vsw_t *);
static int vsw_setup_switching(vsw_t *);
static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);
/* MAC Ring table functions. */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create();
static void vsw_queue_destroy(vsw_queue_t *vqp);
/* MAC layer routines */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
mac_resource_t *mrp);
static int vsw_get_hw_maddr(vsw_t *);
static int vsw_set_hw(vsw_t *, vsw_port_t *, int);
static int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
static int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
static int vsw_unset_hw_addr(vsw_t *, int);
static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
static void vsw_reconfig_hw(vsw_t *);
static int vsw_prog_if(vsw_t *);
static int vsw_prog_ports(vsw_t *);
static int vsw_mac_attach(vsw_t *vswp);
static void vsw_mac_detach(vsw_t *vswp);
static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static int vsw_m_stat(void *, uint_t, uint64_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
static int vsw_m_unicst(void *arg, const uint8_t *);
static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
static int vsw_m_promisc(void *arg, boolean_t);
static mblk_t *vsw_m_tx(void *arg, mblk_t *);
/* MDEG routines */
static int vsw_mdeg_register(vsw_t *vswp);
static void vsw_mdeg_unregister(vsw_t *vswp);
static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
static void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
/* Port add/deletion routines */
static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
static int vsw_port_attach(vsw_t *vswp, int p_instance,
uint64_t *ldcids, int nids, struct ether_addr *macaddr);
static int vsw_detach_ports(vsw_t *vswp);
static int vsw_port_detach(vsw_t *vswp, int p_instance);
static int vsw_port_delete(vsw_port_t *port);
static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_init_ldcs(vsw_port_t *port);
static int vsw_uninit_ldcs(vsw_port_t *port);
static int vsw_ldc_init(vsw_ldc_t *ldcp);
static int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static int vsw_drain_ldcs(vsw_port_t *port);
static int vsw_drain_port_taskq(vsw_port_t *port);
static void vsw_marker_task(void *);
static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
/* Interrupt routines */
static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
/* Handshake routines */
static void vsw_ldc_reinit(vsw_ldc_t *);
static void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
static void vsw_conn_task(void *);
static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);
/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
/* Switching/data transmit routines */
static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
vsw_port_t *port, mac_resource_handle_t);
static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
vsw_port_t *port);
static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
vsw_port_t *port);
static int vsw_portsend(vsw_port_t *, mblk_t *);
static int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static int vsw_descrsend(vsw_ldc_t *, mblk_t *);
/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);
static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
/* Forwarding database (FDB) routines */
static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
static void vsw_del_addr(uint8_t, void *, uint64_t);
static void vsw_del_mcst_port(vsw_port_t *);
static void vsw_del_mcst_vsw(vsw_t *);
/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);
/* Misc support routines */
static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);
/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);
/*
 * Driver-wide tunables. They are ordinary non-static globals with
 * compile-time defaults; the code that consumes them lies outside this
 * section of the file.
 */
int vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
int vsw_wretries = 100; /* # of write attempts */
int vsw_chain_len = 150; /* max # of mblks in msg chain */
int vsw_desc_delay = 0; /* delay in us */
int vsw_read_attempts = 5; /* # of reads of descriptor */
/* NOTE(review): presumably size/count of mblks in a vio mblk pool - confirm */
uint32_t vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t vsw_num_mblks = VSW_NUM_MBLKS;
/*
 * Entry points this driver exports as a MAC provider. The initialiser is
 * positional, so the order must match the field order of mac_callbacks_t
 * (declared in <sys/mac.h>, not visible here).
 */
static mac_callbacks_t vsw_m_callbacks = {
0, /* NOTE(review): presumably the optional-callback flags word - confirm */
vsw_m_stat,
vsw_m_start,
vsw_m_stop,
vsw_m_promisc,
vsw_m_multicst,
vsw_m_unicst,
vsw_m_tx,
NULL, /* trailing callbacks are not provided by this driver */
NULL,
NULL
};
/*
 * Character/block entry points. open and close are no-ops (nulldev);
 * every data-path entry point is nodev.
 */
static struct cb_ops vsw_cb_ops = {
nulldev, /* cb_open */
nulldev, /* cb_close */
nodev, /* cb_strategy */
nodev, /* cb_print */
nodev, /* cb_dump */
nodev, /* cb_read */
nodev, /* cb_write */
nodev, /* cb_ioctl */
nodev, /* cb_devmap */
nodev, /* cb_mmap */
nodev, /* cb_segmap */
nochpoll, /* cb_chpoll */
ddi_prop_op, /* cb_prop_op */
NULL, /* cb_stream */
D_MP, /* cb_flag */
CB_REV, /* rev */
nodev, /* int (*cb_aread)() */
nodev /* int (*cb_awrite)() */
};
/*
 * Device operations vector. Only getinfo, attach and detach are
 * implemented by this driver; _init()/_fini() also pass this structure
 * to mac_init_ops()/mac_fini_ops().
 */
static struct dev_ops vsw_ops = {
DEVO_REV, /* devo_rev */
0, /* devo_refcnt */
vsw_getinfo, /* devo_getinfo */
nulldev, /* devo_identify */
nulldev, /* devo_probe */
vsw_attach, /* devo_attach */
vsw_detach, /* devo_detach */
nodev, /* devo_reset */
&vsw_cb_ops, /* devo_cb_ops */
(struct bus_ops *)NULL, /* devo_bus_ops */
ddi_power /* devo_power */
};
/*
 * Loadable-driver linkage: ties the module description string and the
 * dev_ops vector together. Referenced from 'modlinkage' below.
 */
extern struct mod_ops mod_driverops;
static struct modldrv vswmodldrv = {
&mod_driverops,
"sun4v Virtual Switch %I%", /* %I% is an SCCS keyword, as in #pragma ident */
&vsw_ops,
};
/*
 * Take / drop both per-channel mutexes. The callback lock (ldc_cblock)
 * is always acquired before the transmit lock (ldc_txlock) and they are
 * released in the reverse order.
 *
 * NOTE: each macro expands to two complete statements (including their
 * own semicolons) and is not wrapped in do { } while (0). Never use one
 * as the unbraced body of an if/else/loop - only the first mutex call
 * would be governed by the condition.
 */
#define LDC_ENTER_LOCK(ldcp) \
mutex_enter(&((ldcp)->ldc_cblock));\
mutex_enter(&((ldcp)->ldc_txlock));
#define LDC_EXIT_LOCK(ldcp) \
mutex_exit(&((ldcp)->ldc_txlock));\
mutex_exit(&((ldcp)->ldc_cblock));
/*
 * Driver soft state ptr. Initialised by ddi_soft_state_init() in _init()
 * and indexed by instance number.
 */
static void *vsw_state;
/*
 * Linked list of "vsw_t" structures - one per instance.
 *
 * Instances are pushed onto the head of the list in vsw_attach() and
 * unlinked in vsw_detach(); both do so with vsw_rw held as writer.
 */
vsw_t *vsw_head = NULL;
krwlock_t vsw_rw;
/*
 * Property names
 *
 * physdev_propname and smode_propname are read from a machine
 * description (MD) node with md_get_prop_data(); vsw_propname is the
 * "name" value placed in vsw_prop_template. The remaining names are
 * used outside this section of the file.
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";
/*
 * supported versions
 *
 * NOTE(review): each entry is presumably { major, minor } - confirm
 * against the ver_sup_t declaration.
 */
static ver_sup_t vsw_versions[] = { {1, 0} };
/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 *
 * The property list is terminated by an MDET_LIST_END entry.
 */
static md_prop_match_t vport_prop_match[] = {
{ MDET_PROP_VAL, "id" },
{ MDET_LIST_END, NULL }
};
static mdeg_node_match_t vport_match = { "virtual-device-port",
vport_prop_match };
/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
 * by their 'name' and 'cfg-handle' properties.
 *
 * 'name' is matched as a string property and 'cfg-handle' as a value
 * property; the list is terminated by an MDET_LIST_END entry.
 */
static md_prop_match_t vdev_prop_match[] = {
{ MDET_PROP_STR, "name" },
{ MDET_PROP_VAL, "cfg-handle" },
{ MDET_LIST_END, NULL }
};
static mdeg_node_match_t vdev_match = { "virtual-device",
vdev_prop_match };
/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
{ MDET_PROP_STR, "name", vsw_propname },
{ MDET_PROP_VAL, "cfg-handle", NULL },
{ MDET_LIST_END, NULL, NULL }
};
/*
 * Store 'val' in the second entry (index 1, the "cfg-handle" slot) of a
 * copy of the template above. The expansion already ends in a semicolon.
 */
#define VSW_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);
/*
 * From /etc/system enable/disable thread per ring. This is a mode
 * selection that is done at vsw driver attach time.
 *
 * vsw_mac_attach() and vsw_mac_detach() test vsw_multi_ring_enable to
 * decide whether the MAC ring table is initialised/destroyed and which
 * receive callback is registered.
 */
boolean_t vsw_multi_ring_enable = B_FALSE;
int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 *
 * The individual bits are tested by the D1/D2/D3/DWARN/DERR macros,
 * and only in DEBUG builds.
 */
int vswdbg = 0x0;
/*
* debug levels:
* 0x01: Function entry/exit tracing
* 0x02: Internal function messages
* 0x04: Verbose internal messages
* 0x08: Warning messages
* 0x10: Error messages
*/
/*
 * Back end of the D1/D2/D3/DWARN/DERR macros.
 *
 * Formats the message into a fixed-size stack buffer and emits it via
 * cmn_err() at CE_CONT level, prefixed with "vsw<instance>: " when an
 * instance is supplied. The formatting is bounded by the buffer size so
 * an over-long message is truncated instead of overrunning the stack.
 *
 * vswp - instance the message relates to, or NULL if there is none.
 * fmt  - printf-style format string, followed by its arguments.
 */
static void
vswdebug(vsw_t *vswp, const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	if (vswp == NULL)
		cmn_err(CE_CONT, "%s\n", buf);
	else
		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
}
/*
 * For the moment the state dump routines have their own
 * private flag.
 *
 * With DUMP_STATE defined as 0 (as below) all four DUMP_* / DISPLAY_*
 * macros expand to nothing. When enabled, the tag dumps go through D1,
 * so they also require the 0x01 bit of vswdbg in a DEBUG build.
 */
#define DUMP_STATE 0
#if DUMP_STATE
/* Print the three header fields of a VIO message tag passed by value. */
#define DUMP_TAG(tag) \
{ \
D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}
/* As DUMP_TAG, but 'tag' is a pointer. */
#define DUMP_TAG_PTR(tag) \
{ \
D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}
/* Note: DUMP_FLAGS carries its own trailing semicolon, DISPLAY_STATE does not */
#define DUMP_FLAGS(flags) dump_flags(flags);
#define DISPLAY_STATE() display_state()
#else
#define DUMP_TAG(tag)
#define DUMP_TAG_PTR(tag)
#define DUMP_FLAGS(state)
#define DISPLAY_STATE()
#endif /* DUMP_STATE */
/*
 * Debug message macros. Each one expands to a bare, unbraced
 * "if (<condition>) vswdebug", so the argument list written after the
 * macro name becomes the argument list of the vswdebug() call.
 *
 * In DEBUG builds the condition tests one bit of vswdbg:
 *	D1 0x01, D2 0x02, D3 0x04, DWARN 0x08, DERR 0x10.
 * In non-DEBUG builds the condition is the constant 0, so the call is
 * never made but its arguments are still compiled.
 *
 * Because the expansion is an unbraced 'if', a use such as
 *	if (x) D1(...); else ...
 * would bind the 'else' to the macro's hidden 'if'. Brace the
 * surrounding statement when an 'else' follows.
 */
#ifdef DEBUG
#define D1 \
if (vswdbg & 0x01) \
vswdebug
#define D2 \
if (vswdbg & 0x02) \
vswdebug
#define D3 \
if (vswdbg & 0x04) \
vswdebug
#define DWARN \
if (vswdbg & 0x08) \
vswdebug
#define DERR \
if (vswdbg & 0x10) \
vswdebug
#else
#define DERR if (0) vswdebug
#define DWARN if (0) vswdebug
#define D1 if (0) vswdebug
#define D2 if (0) vswdebug
#define D3 if (0) vswdebug
#endif /* DEBUG */
/*
 * Module linkage handed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info(). It lists the single driver linkage
 * structure followed by a NULL terminator.
 */
static struct modlinkage modlinkage = {
MODREV_1,
&vswmodldrv,
NULL
};
/*
 * Module load entry point.
 *
 * Sets up the global instance-list lock and the soft state anchor,
 * registers the dev_ops with the MAC framework and installs the module.
 * Every step is undone again if a later one fails, mirroring the
 * teardown performed by _fini(), so a failed load leaves nothing behind.
 *
 * Returns 0 on success, otherwise the error from ddi_soft_state_init()
 * or mod_install().
 */
int
_init(void)
{
	int status;

	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);

	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
	if (status != 0) {
		rw_destroy(&vsw_rw);
		return (status);
	}

	mac_init_ops(&vsw_ops, "vsw");
	status = mod_install(&modlinkage);
	if (status != 0) {
		/* undo in the reverse order of setup */
		mac_fini_ops(&vsw_ops);
		ddi_soft_state_fini(&vsw_state);
		rw_destroy(&vsw_rw);
	}
	return (status);
}
/*
 * Module unload entry point.
 *
 * The global state is torn down only once mod_remove() has succeeded;
 * if removal is refused its error is returned and nothing is touched.
 */
int
_fini(void)
{
	int rv = mod_remove(&modlinkage);

	if (rv == 0) {
		mac_fini_ops(&vsw_ops);
		ddi_soft_state_fini(&vsw_state);
		rw_destroy(&vsw_rw);
	}

	return (rv);
}
/*
 * Module information entry point; simply defers to mod_info().
 */
int
_info(struct modinfo *modinfop)
{
	int rv;

	rv = mod_info(&modlinkage, modinfop);
	return (rv);
}
/*
 * attach(9E) entry point.
 *
 * For DDI_ATTACH: allocates the per-instance soft state, initialises its
 * locks, creates the unicast and multicast forwarding tables, links the
 * instance onto the global vsw_head list, creates the control message
 * taskq and finally registers with the MDEG.
 *
 * DDI_RESUME is accepted as a no-op; every other command fails.
 *
 * 'progress' records which steps have completed so that the failure
 * path undoes exactly those steps. That includes unlinking the instance
 * from vsw_head again, so the global list never refers to soft state
 * that has been freed.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
static int
vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	vsw_t		*vswp;
	vsw_t		**vswpp;
	int		instance;
	char		hashname[MAXNAMELEN];
	char		qname[TASKQ_NAMELEN];
	enum		{ PROG_init = 0x00,
			    PROG_if_lock = 0x01,
			    PROG_fdb = 0x02,
			    PROG_mfdb = 0x04,
			    PROG_report_dev = 0x08,
			    PROG_plist = 0x10,
			    PROG_taskq = 0x20,
			    PROG_vsw_list = 0x40}
			progress;

	progress = PROG_init;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	case DDI_PM_RESUME:
	default:
		return (DDI_FAILURE);
	}

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
		return (DDI_FAILURE);
	}
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
		/* progress is still PROG_init, so vswp is never dereferenced */
		goto vsw_attach_fail;
	}

	vswp->dip = dip;
	vswp->instance = instance;
	ddi_set_driver_private(dip, (caddr_t)vswp);

	mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
	progress |= PROG_if_lock;

	/* setup the unicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
	    vswp->instance);
	D2(vswp, "creating unicast hash table (%s)...", hashname);
	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_fdb;

	/* setup the multicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
	    vswp->instance);
	D2(vswp, "creating multicast hash table %s)...", hashname);
	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_mfdb;

	/*
	 * create lock protecting list of multicast addresses
	 * which could come via m_multicst() entry point when plumbed.
	 */
	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
	vswp->mcap = NULL;

	ddi_report_dev(vswp->dip);

	progress |= PROG_report_dev;

	/* make the instance visible on the global list */
	WRITE_ENTER(&vsw_rw);
	vswp->next = vsw_head;
	vsw_head = vswp;
	RW_EXIT(&vsw_rw);

	progress |= PROG_vsw_list;

	/* setup the port list */
	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
	vswp->plist.head = NULL;

	progress |= PROG_plist;

	/*
	 * Create the taskq which will process all the VIO
	 * control messages.
	 */
	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
		    vswp->instance);
		goto vsw_attach_fail;
	}

	progress |= PROG_taskq;

	/* prevent auto-detaching */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
		    "instance %u", DDI_NO_AUTODETACH, instance);
	}

	/*
	 * Now we have everything setup, register an interest in
	 * specific MD nodes.
	 *
	 * The callback is invoked in 2 cases, firstly if upon mdeg
	 * registration there are existing nodes which match our specified
	 * criteria, and secondly if the MD is changed (and again, there
	 * are nodes which we are interested in present within it. Note
	 * that our callback will be invoked even if our specified nodes
	 * have not actually changed).
	 *
	 * Until the callback is invoked we cannot switch any pkts as
	 * we don't know basic information such as what mode we are
	 * operating in. However we expect the callback to be invoked
	 * immediately upon registration as this driver should only
	 * be attaching if there are vsw nodes in the MD.
	 */
	if (vsw_mdeg_register(vswp))
		goto vsw_attach_fail;

	return (DDI_SUCCESS);

vsw_attach_fail:
	DERR(NULL, "vsw_attach: failed");

	/*
	 * Unlink from the global list first so that nothing can find the
	 * instance while (or after) it is being dismantled and freed.
	 */
	if (progress & PROG_vsw_list) {
		WRITE_ENTER(&vsw_rw);
		for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
			if (*vswpp == vswp) {
				*vswpp = vswp->next;
				break;
			}
		}
		RW_EXIT(&vsw_rw);
	}

	if (progress & PROG_taskq)
		ddi_taskq_destroy(vswp->taskq_p);

	if (progress & PROG_plist)
		rw_destroy(&vswp->plist.lockrw);

	if (progress & PROG_report_dev) {
		ddi_remove_minor_node(dip, NULL);
		mutex_destroy(&vswp->mca_lock);
	}

	if (progress & PROG_mfdb) {
		mod_hash_destroy_hash(vswp->mfdb);
		vswp->mfdb = NULL;
		rw_destroy(&vswp->mfdbrw);
	}

	if (progress & PROG_fdb) {
		mod_hash_destroy_hash(vswp->fdb);
		vswp->fdb = NULL;
	}

	if (progress & PROG_if_lock) {
		rw_destroy(&vswp->if_lockrw);
		mutex_destroy(&vswp->mac_lock);
		mutex_destroy(&vswp->hw_lock);
	}

	ddi_soft_state_free(vsw_state, instance);
	return (DDI_FAILURE);
}
/*
 * detach(9E) entry point.
 *
 * Only DDI_DETACH is supported. The instance is unregistered from the
 * MAC layer (if it was registered) and from the MDEG, its receive
 * callback is removed, all ports are detached, the physical device is
 * stopped and closed, remaining mblk pools are destroyed, and finally
 * the forwarding tables, taskq, locks and soft state are released.
 *
 * Several of these steps can fail, in which case DDI_FAILURE is returned
 * and the instance stays attached. The per-instance locks are therefore
 * destroyed only after the last step that can fail, so a failed detach
 * never leaves a live instance holding destroyed locks.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vio_mblk_pool_t		*poolp, *npoolp;
	vsw_t			**vswpp, *vswp;
	int			instance;

	instance = ddi_get_instance(dip);
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		return (DDI_FAILURE);
	}

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
	case DDI_PM_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	D2(vswp, "detaching instance %d", instance);

	if (vswp->if_state & VSW_IF_REG) {
		if (vsw_mac_unregister(vswp) != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
			    "MAC layer", vswp->instance);
			return (DDI_FAILURE);
		}
	}

	vsw_mdeg_unregister(vswp);

	/* remove mac layer callback */
	mutex_enter(&vswp->mac_lock);
	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
		mac_rx_remove(vswp->mh, vswp->mrh);
		vswp->mrh = NULL;
	}
	mutex_exit(&vswp->mac_lock);

	if (vsw_detach_ports(vswp) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
		    vswp->instance);
		return (DDI_FAILURE);
	}

	/*
	 * Now that the ports have been deleted, stop and close
	 * the physical device.
	 */
	mutex_enter(&vswp->mac_lock);
	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);

		vswp->mh = NULL;
		vswp->txinfo = NULL;
	}
	mutex_exit(&vswp->mac_lock);

	/*
	 * Destroy any free pools that may still exist.
	 */
	poolp = vswp->rxh;
	while (poolp != NULL) {
		npoolp = vswp->rxh = poolp->nextp;
		if (vio_destroy_mblks(poolp) != 0) {
			/* put the pool back so a later detach can retry */
			vswp->rxh = poolp;
			return (DDI_FAILURE);
		}
		poolp = npoolp;
	}

	/*
	 * Nothing below can fail, so it is now safe to destroy the
	 * interface, hardware and MAC locks.
	 */
	rw_destroy(&vswp->if_lockrw);
	mutex_destroy(&vswp->hw_lock);
	mutex_destroy(&vswp->mac_lock);

	/*
	 * Remove this instance from any entries it may be on in
	 * the hash table by using the list of addresses maintained
	 * in the vsw_t structure.
	 */
	vsw_del_mcst_vsw(vswp);

	vswp->mcap = NULL;
	mutex_destroy(&vswp->mca_lock);

	/*
	 * By now any pending tasks have finished and the underlying
	 * ldc's have been destroyed, so its safe to delete the control
	 * message taskq.
	 */
	if (vswp->taskq_p != NULL)
		ddi_taskq_destroy(vswp->taskq_p);

	/*
	 * At this stage all the data pointers in the hash table
	 * should be NULL, as all the ports have been removed and will
	 * have deleted themselves from the port lists which the data
	 * pointers point to. Hence we can destroy the table using the
	 * default destructors.
	 */
	D2(vswp, "vsw_detach: destroying hash tables..");
	mod_hash_destroy_hash(vswp->fdb);
	vswp->fdb = NULL;

	WRITE_ENTER(&vswp->mfdbrw);
	mod_hash_destroy_hash(vswp->mfdb);
	vswp->mfdb = NULL;
	RW_EXIT(&vswp->mfdbrw);
	rw_destroy(&vswp->mfdbrw);

	ddi_remove_minor_node(dip, NULL);

	rw_destroy(&vswp->plist.lockrw);

	/* unlink the instance from the global list */
	WRITE_ENTER(&vsw_rw);
	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
		if (*vswpp == vswp) {
			*vswpp = vswp->next;
			break;
		}
	}
	RW_EXIT(&vsw_rw);

	ddi_soft_state_free(vsw_state, instance);

	return (DDI_SUCCESS);
}
/*
 * getinfo(9E) entry point.
 *
 * The instance number is taken from the minor number of the dev_t
 * passed in 'arg'.
 *
 *	DDI_INFO_DEVT2INSTANCE - *result is the instance number.
 *	DDI_INFO_DEVT2DEVINFO  - *result is the dev_info pointer stored in
 *				 that instance's soft state, if it exists.
 *
 * In every other case *result is set to NULL and DDI_FAILURE returned.
 */
static int
vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	_NOTE(ARGUNUSED(dip))

	vsw_t	*sp = NULL;
	int	minor_inst = getminor((dev_t)arg);

	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
		*result = (void *)(uintptr_t)minor_inst;
		return (DDI_SUCCESS);
	}

	if (infocmd == DDI_INFO_DEVT2DEVINFO) {
		sp = ddi_get_soft_state(vsw_state, minor_inst);
		if (sp != NULL) {
			*result = sp->dip;
			return (DDI_SUCCESS);
		}
	}

	/* unknown command, or no soft state for this instance */
	*result = NULL;
	return (DDI_FAILURE);
}
/*
 * Fetch the "vsw-phys-dev" property of the given MD node, i.e. the name
 * of the physical device the switch uses to reach the outside world,
 * and copy it (including the terminating NUL) into 'name'.
 *
 * The property must exist, although its value may be empty; callers are
 * expected to check that the value is what they need.
 *
 * Names which do not fit in LIFNAMSIZ bytes (terminator included) are
 * rejected.
 *
 * In DEBUG builds a "vsw_physname" property on the device node, if
 * present, overrides the value obtained from the MD.
 *
 * Returns 0 on success with the name stored in 'name', otherwise 1.
 */
static int
vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
{
	int	proplen = 0;
	char	*mdname = NULL;
	char	*dev;

	if (md_get_prop_data(mdp, node, physdev_propname,
	    (uint8_t **)(&mdname), &proplen) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
		    "device(s) from MD", vswp->instance);
		return (1);
	}

	if ((strlen(mdname) + 1) > LIFNAMSIZ) {
		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
		    vswp->instance, mdname);
		return (1);
	}

	(void) strncpy(name, mdname, strlen(mdname) + 1);
	D2(vswp, "%s: using first device specified (%s)",
	    __func__, mdname);

#ifdef DEBUG
	/*
	 * Testing aid: a vsw_physname property (e.g. from a vsw.conf
	 * file) takes precedence over the name found in the MD.
	 *
	 * Several devices may be listed, but only the first is used
	 * for now.
	 */
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
		int	toolong = ((strlen(dev) + 1) > LIFNAMSIZ);

		if (toolong) {
			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
			    vswp->instance, dev);
		} else {
			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
			    "config file", vswp->instance, dev);

			(void) strncpy(name, dev, strlen(dev) + 1);
		}

		ddi_prop_free(dev);

		if (toolong)
			return (1);
	}
#endif

	return (0);
}
/*
 * Parse the 'vsw-switch-mode' property of the given MD node.
 *
 * The property data is walked as a sequence of NUL terminated strings,
 * most preferred mode first. Each string is translated and stored in
 * 'modes' in the same order, up to a maximum of NUM_SMODES entries:
 *
 *	"switched"	VSW_LAYER2	   (layer 2, HW programmed)
 *	"promiscuous"	VSW_LAYER2_PROMISC (layer 2, HW promiscuous)
 *	"routed"	VSW_LAYER3	   (layer 3, HW non-promiscuous)
 *
 * An unrecognised string is reported and treated as "switched".
 *
 * Returns 0 with the number of modes stored in '*found'. If the
 * property cannot be read, returns 1 with '*found' set to 0.
 */
static int
vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
    uint8_t *modes, int *found)
{
	int	proplen = 0;
	int	nmodes = 0;
	char	*proplist = NULL;
	char	*cur;

	D1(vswp, "%s: enter", __func__);

	if (md_get_prop_data(mdp, node, smode_propname,
	    (uint8_t **)(&proplist), &proplen) != 0) {
		/* without the property there is nothing further to do */
		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
		    " from the MD", vswp->instance);
		*found = 0;
		return (1);
	}

	/* step from one string to the next until data or slots run out */
	for (cur = proplist;
	    (cur < (proplist + proplen)) && (nmodes < NUM_SMODES);
	    cur += strlen(cur) + 1) {
		D2(vswp, "%s: curr_mode = [%s]", __func__, cur);

		if (strcmp(cur, "switched") == 0) {
			modes[nmodes] = VSW_LAYER2;
		} else if (strcmp(cur, "promiscuous") == 0) {
			modes[nmodes] = VSW_LAYER2_PROMISC;
		} else if (strcmp(cur, "routed") == 0) {
			modes[nmodes] = VSW_LAYER3;
		} else {
			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
			    "setting to default switched mode",
			    vswp->instance, cur);
			modes[nmodes] = VSW_LAYER2;
		}
		nmodes++;
	}

	*found = nmodes;

	D2(vswp, "%s: %d modes found", __func__, nmodes);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Get the mac address of the physical device.
 *
 * The device named by vswp->physname is opened through the MAC layer
 * just long enough to read its unicast address into vswp->if_addr, and
 * VSW_DEV_MACADDR is then set in vswp->mdprops.
 *
 * if_addr is modified here, so if_lockrw is taken as writer for the
 * duration of the update.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_get_physaddr(vsw_t *vswp)
{
	mac_handle_t	mh;
	char		drv[LIFNAMSIZ];
	uint_t		ddi_instance;

	D1(vswp, "%s: enter", __func__);

	/* only the instance number is needed; 'drv' is scratch space */
	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
		return (1);

	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
		    vswp->instance, vswp->physname);
		return (1);
	}

	WRITE_ENTER(&vswp->if_lockrw);
	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
	RW_EXIT(&vswp->if_lockrw);

	mac_close(mh);

	vswp->mdprops |= VSW_DEV_MACADDR;

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Ask the underlying device whether it can be programmed with multiple
 * unicast addresses. On success the capability details are left in
 * vswp->maddr.
 *
 * The query is made with mac_lock held. It fails if no MAC handle is
 * open or if the device does not report MAC_CAPAB_MULTIADDRESS.
 *
 * Returns 0 if the capability is supported, otherwise 1.
 */
static int
vsw_get_hw_maddr(vsw_t *vswp)
{
	int	rv = 1;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh != NULL) {
		if (mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS,
		    &vswp->maddr)) {
			rv = 0;
		} else {
			cmn_err(CE_WARN, "!vsw%d: device (%s) does not support "
			    "setting multiple unicast addresses",
			    vswp->instance, vswp->physname);
		}
	}
	mutex_exit(&vswp->mac_lock);

	if (rv != 0)
		return (rv);

	D2(vswp, "%s: %d addrs : %d free", __func__,
	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Select and set up a switching mode.
 *
 * The modes in vswp->smode[] are tried in order and the first one whose
 * setup routine succeeds is kept; vswp->smode_idx is left pointing at
 * it. smode_idx is updated before each attempt because the layer 2
 * setup routine consults it.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_switching(vsw_t *vswp)
{
	int	idx;
	int	rv = 1;

	D1(vswp, "%s: enter", __func__);

	/* stop at the first mode which can be set up */
	for (idx = 0; (idx < vswp->smode_num) && (rv != 0); idx++) {
		vswp->smode_idx = idx;

		if ((vswp->smode[idx] == VSW_LAYER2) ||
		    (vswp->smode[idx] == VSW_LAYER2_PROMISC)) {
			rv = vsw_setup_layer2(vswp);
		} else if (vswp->smode[idx] == VSW_LAYER3) {
			rv = vsw_setup_layer3(vswp);
		} else {
			DERR(vswp, "unknown switch mode");
			rv = 1;
		}
	}

	if (rv == 1) {
		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
		    "switching mode", vswp->instance);
		return (rv);
	}

	D2(vswp, "%s: Operating in mode %d", __func__,
	    vswp->smode[vswp->smode_idx]);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Prepare for layer 2 switching.
 *
 * Installs the layer 2 frame switching routine and then attaches to the
 * physical device through the MAC layer, which requires a device name
 * from the MD. For the programmed (VSW_LAYER2) mode the device must
 * also support multiple unicast addresses; if it does not, the MAC
 * attachment is undone again.
 *
 * A return of 1 lets the caller fall back to its next preferred mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	vswp->vsw_switch_frame = vsw_switch_l2_frame;

	/* layer 2 is impossible without a physical device name */
	if (!(vswp->mdprops & VSW_MD_PHYSNAME)) {
		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
		    vswp->instance);
		return (1);
	}

	/* link into the MAC layer to send/receive over the adapter */
	if (vsw_mac_attach(vswp) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer "
		    "client", vswp->instance);
		return (1);
	}

	/* programmed mode needs multiple unicast address support */
	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) &&
	    (vsw_get_hw_maddr(vswp) != 0)) {
		cmn_err(CE_WARN, "!vsw%d: Unable to setup "
		    "layer2 switching", vswp->instance);
		vsw_mac_detach(vswp);
		return (1);
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Prepare for layer 3 operation. The only work required is to install
 * the layer 3 frame switching routine; no physical device is touched.
 *
 * Always returns 0.
 */
static int
vsw_setup_layer3(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	vswp->vsw_switch_frame = vsw_switch_l3_frame;
	D2(vswp, "%s: operating in layer 3 mode", __func__);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only when in layer 2 mode.
 *
 * With mac_lock held: the device named by vswp->physname is opened, a
 * receive callback is registered (the per-ring variant, plus a resource
 * callback, when vsw_multi_ring_enable is set), the transmit info is
 * fetched and the interface is started. mstarted is set before the lock
 * is dropped because its readers test it under mac_lock.
 *
 * On any failure everything done so far is undone via vsw_mac_detach().
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
	char	drv[LIFNAMSIZ];
	uint_t	ddi_instance;

	D1(vswp, "%s: enter", __func__);

	ASSERT(vswp->mh == NULL);
	ASSERT(vswp->mrh == NULL);
	ASSERT(vswp->mstarted == B_FALSE);
	ASSERT(vswp->mresources == B_FALSE);

	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);

	mutex_enter(&vswp->mac_lock);
	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
		    vswp->instance, vswp->physname);
		goto mac_fail_exit;
	}

	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
		    vswp->instance, vswp->physname);
		goto mac_fail_exit;
	}

	ASSERT(vswp->mh != NULL);

	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

	if (vsw_multi_ring_enable) {
		/*
		 * Initialize the ring table.
		 */
		vsw_mac_ring_tbl_init(vswp);

		/*
		 * Register our rx callback function.
		 */
		vswp->mrh = mac_rx_add(vswp->mh,
		    vsw_rx_queue_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);

		/*
		 * Register our mac resource callback.
		 */
		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
		vswp->mresources = B_TRUE;

		/*
		 * Get the ring resources available to us from
		 * the mac below us.
		 */
		mac_resources(vswp->mh);
	} else {
		/*
		 * Just register our rx callback function
		 */
		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);
	}

	/* Get the MAC tx fn */
	vswp->txinfo = mac_tx_get(vswp->mh);

	/* start the interface */
	if (mac_start(vswp->mh) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
		    vswp->instance);
		goto mac_fail_exit;
	}

	/* record the started state while still holding mac_lock */
	vswp->mstarted = B_TRUE;

	mutex_exit(&vswp->mac_lock);

	D1(vswp, "%s: exit", __func__);
	return (0);

mac_fail_exit:
	/* vsw_mac_detach() takes mac_lock itself, so drop it first */
	mutex_exit(&vswp->mac_lock);
	vsw_mac_detach(vswp);

	D1(vswp, "%s: exit", __func__);
	return (1);
}
/*
 * Undo whatever vsw_mac_attach() managed to set up: destroy the ring
 * table (multi-ring mode only), stop the interface, remove the receive
 * and resource callbacks and close the MAC handle.  All MAC related
 * state in the vsw_t is reset so that a later vsw_mac_attach() starts
 * from the clean state it asserts on entry.
 *
 * Safe to call on a partially attached instance; the steps which
 * depend on an open handle are skipped when vswp->mh is NULL.
 */
static void
vsw_mac_detach(vsw_t *vswp)
{
	ASSERT(vswp != NULL);

	D1(vswp, "vsw_mac_detach: enter");

	if (vsw_multi_ring_enable) {
		vsw_mac_ring_tbl_destroy(vswp);
	}

	mutex_enter(&vswp->mac_lock);

	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mrh != NULL)
			mac_rx_remove(vswp->mh, vswp->mrh);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);
	}

	vswp->mrh = NULL;
	vswp->mh = NULL;
	vswp->txinfo = NULL;
	vswp->mstarted = B_FALSE;
	/*
	 * vsw_mac_attach() asserts that mresources is B_FALSE, so it has
	 * to be cleared along with the rest of the attach state.
	 */
	vswp->mresources = B_FALSE;

	mutex_exit(&vswp->mac_lock);

	D1(vswp, "vsw_mac_detach: exit");
}
/*
 * Set up the physical device on behalf of a port (VSW_VNETPORT) or of
 * the vswitch interface itself (VSW_LOCALDEV), according to the
 * switching mode currently selected by vswp->smode_idx:
 *
 *   - layer 3: nothing to do.
 *
 *   - layer 2 promiscuous: put the card into promiscuous mode.
 *
 *   - layer 2 programmed: attempt to program the unicast address into
 *     the device.  If that is not possible, because of resource
 *     exhaustion or because the device does not support multiple
 *     unicast addresses, fall back to promiscuous mode when that is
 *     the next mode in the list.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
	mac_multi_addr_t	hwaddr;
	int			rv;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)
		return (vsw_set_hw_promisc(vswp, port, type));

	/*
	 * Programmed mode: pick up the unicast address to hand to the HW.
	 */
	hwaddr.mma_addrlen = ETHERADDRL;
	if (type != VSW_VNETPORT) {
		READ_ENTER(&vswp->if_lockrw);
		/*
		 * Nothing to program while the interface is not UP.
		 * This can happen when the address has just been
		 * changed in the MD node but the interface has not
		 * yet been plumbed.
		 */
		if ((vswp->if_state & VSW_IF_UP) == 0) {
			RW_EXIT(&vswp->if_lockrw);
			return (0);
		}
		ether_copy(&vswp->if_addr, &hwaddr.mma_addr);
		RW_EXIT(&vswp->if_lockrw);
	} else {
		ASSERT(port != NULL);
		ether_copy(&port->p_macaddr, &hwaddr.mma_addr);
	}

	rv = vsw_set_hw_addr(vswp, &hwaddr);
	if (rv == 0) {
		/* Remember which slot the address ended up in. */
		if (type == VSW_VNETPORT) {
			port->addr_slot = hwaddr.mma_slot;
			port->addr_set = VSW_ADDR_HW;
		} else {
			vswp->addr_slot = hwaddr.mma_slot;
			vswp->addr_set = VSW_ADDR_HW;
		}

		D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x into slot %d "
		    "of device %s",
		    hwaddr.mma_addr[0], hwaddr.mma_addr[1],
		    hwaddr.mma_addr[2], hwaddr.mma_addr[3],
		    hwaddr.mma_addr[4], hwaddr.mma_addr[5],
		    hwaddr.mma_slot, vswp->physname);

		D1(vswp, "%s: exit", __func__);
		return (0);
	}

	/*
	 * Could not program the address.  Note that a reconfiguration
	 * should be attempted at some later point, e.g. when a port is
	 * deleted.
	 */
	vswp->recfg_reqd = B_TRUE;

	/* With a single configured mode there is nothing to fall back to. */
	if (vswp->smode_num == 1)
		return (rv);

	/*
	 * Fall back to promiscuous mode if that is what was specified
	 * as the next mode.
	 */
	if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
	    (vswp->smode[vswp->smode_idx + 1] == VSW_LAYER2_PROMISC)) {
		vswp->smode_idx += 1;
		return (vsw_set_hw_promisc(vswp, port, type));
	}

	return (rv);
}
/*
 * Reverse the effect of vsw_set_hw() for a port (VSW_VNETPORT) or for
 * the vswitch interface itself (VSW_LOCALDEV).
 *
 * If in layer 3 mode do nothing.
 *
 * If the address was programmed into the physical device remove it
 * from there.
 *
 * If the address was being received through promiscuous mode, drop
 * that promiscuous mode reference.
 *
 * If no address is currently set for the port/interface there is
 * nothing to undo and the call succeeds.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 on success, non-zero on failure.
 */
static int
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
	mac_addr_slot_t	slot;
	/*
	 * Must start out as success: when addr_set is neither
	 * VSW_ADDR_PROMISC nor VSW_ADDR_HW neither branch below assigns
	 * rv before it is returned.
	 */
	int		rv = 0;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	switch (type) {
	case VSW_VNETPORT:
		ASSERT(port != NULL);

		if (port->addr_set == VSW_ADDR_PROMISC) {
			return (vsw_unset_hw_promisc(vswp, port, type));

		} else if (port->addr_set == VSW_ADDR_HW) {
			slot = port->addr_slot;
			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
				port->addr_set = VSW_ADDR_UNSET;
		}

		break;

	case VSW_LOCALDEV:
		if (vswp->addr_set == VSW_ADDR_PROMISC) {
			return (vsw_unset_hw_promisc(vswp, NULL, type));

		} else if (vswp->addr_set == VSW_ADDR_HW) {
			slot = vswp->addr_slot;
			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
				vswp->addr_set = VSW_ADDR_UNSET;
		}

		break;

	default:
		/* should never happen */
		DERR(vswp, "%s: unknown type %d", __func__, type);
		ASSERT(0);
		return (1);
	}

	D1(vswp, "%s: exit", __func__);
	return (rv);
}
/*
 * Try to program a unicast address into the HW using the multiple
 * address capability recorded in vswp->maddr.  On success the chosen
 * slot is expected to be reported back by the driver in 'mac'.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 on success, 1 on failure (including the case where no
 * address handle is available).
 */
static int
vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
{
	int	err;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	err = vswp->maddr.maddr_add(vswp->maddr.maddr_handle, mac);
	if (err == 0)
		return (0);

	/*
	 * Running out of address slots in the device (ENOSPC) is an
	 * expected condition and is not reported.  Anything else is
	 * flagged.
	 */
	if (err != ENOSPC) {
		cmn_err(CE_WARN, "!vsw%d: error programming "
		    "address %x:%x:%x:%x:%x:%x into HW "
		    "err (%d)", vswp->instance,
		    mac->mma_addr[0], mac->mma_addr[1],
		    mac->mma_addr[2], mac->mma_addr[3],
		    mac->mma_addr[4], mac->mma_addr[5], err);
	}

	D1(vswp, "%s: exit", __func__);
	return (1);
}
/*
 * Remove the unicast mac address held in the given slot of the
 * physical device, i.e. one previously programmed into HW.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 on success, 1 on failure (including the case where no
 * address handle is available).
 */
static int
vsw_unset_hw_addr(vsw_t *vswp, int slot)
{
	int	err;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT(slot >= 0);

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	err = vswp->maddr.maddr_remove(vswp->maddr.maddr_handle, slot);
	if (err == 0) {
		D2(vswp, "removed addr from slot %d in device %s",
		    slot, vswp->physname);

		D1(vswp, "%s: exit", __func__);
		return (0);
	}

	cmn_err(CE_WARN, "!vsw%d: unable to remove address "
	    "from slot %d in device %s (err %d)",
	    vswp->instance, slot, vswp->physname, err);
	return (1);
}
/*
 * Take a promiscuous mode reference on the network card on behalf of
 * a port or the local interface.  The card is only actually switched
 * into promiscuous mode when the first reference is taken.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
{
	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	mutex_enter(&vswp->mac_lock);

	/* Without an open device there is nothing to set. */
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	vswp->promisc_cnt++;
	if (vswp->promisc_cnt == 1) {
		/* First user: the device really has to change mode. */
		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
			/* Give back the reference we just took. */
			vswp->promisc_cnt--;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}
		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
		    "promiscuous mode", vswp->instance, vswp->physname);
	}

	mutex_exit(&vswp->mac_lock);

	/* Record how this port/interface is receiving its traffic. */
	if (type != VSW_VNETPORT) {
		vswp->addr_set = VSW_ADDR_PROMISC;
	} else {
		ASSERT(port != NULL);
		port->addr_set = VSW_ADDR_PROMISC;
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Drop a promiscuous mode reference on the network card on behalf of
 * a port or the local interface.  The card is only actually taken out
 * of promiscuous mode when the last reference goes away.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
{
	vsw_port_list_t	*ports = &vswp->plist;

	D2(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	mutex_enter(&vswp->mac_lock);

	/* Without an open device there is nothing to unset. */
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	vswp->promisc_cnt--;
	if (vswp->promisc_cnt == 0) {
		/* Last user gone: the device really has to change mode. */
		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
			/* Still promiscuous, so restore the reference. */
			vswp->promisc_cnt++;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}

		/*
		 * There are two reasons for leaving promiscuous mode.
		 * Either we were only in it because we had failed over
		 * from switched mode due to HW resource issues, or the
		 * user wanted the card in promiscuous mode for all
		 * ports and the last port is now being deleted.  Word
		 * the message accordingly.
		 */
		if (ports->num_ports == 0) {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
			    "promiscuous mode", vswp->instance,
			    vswp->physname);
		} else {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
			    "programmed mode", vswp->instance,
			    vswp->physname);
		}
	}

	mutex_exit(&vswp->mac_lock);

	/* This port/interface no longer has its address set anywhere. */
	if (type != VSW_VNETPORT) {
		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
		vswp->addr_set = VSW_ADDR_UNSET;
	} else {
		ASSERT(port != NULL);
		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
		port->addr_set = VSW_ADDR_UNSET;
	}

	D1(vswp, "%s: exit", __func__);
	return (0);
}
/*
 * Work out whether we are operating in our preferred mode and, if
 * not, whether the physical resources now allow us to do so.
 *
 * When called because a port is being removed, this must only be
 * invoked after the port has been taken off the port list.
 *
 * The caller must hold hw_lock.
 */
static void
vsw_reconfig_hw(vsw_t *vswp)
{
	int	want_idx;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->maddr.maddr_handle == NULL)
		return;

	/*
	 * Only layer 2 (i.e. switched) mode is of interest here: if we
	 * are in it, or would like to be in it, check whether any ports
	 * or the vswitch itself still need to be programmed into the HW.
	 *
	 * There are two ways to get here.  Either switched was the
	 * preferred mode but the HW resources were exhausted and we
	 * failed over to the next specified mode, or switched was the
	 * only mode specified and nothing more could be done once the
	 * HW resources ran out.
	 *
	 * Hence look at the mode preceding the current one, if any.
	 */
	want_idx = (vswp->smode_idx > 0) ?
	    vswp->smode_idx - 1 : vswp->smode_idx;

	if (vswp->smode[want_idx] != VSW_LAYER2)
		return;

	D2(vswp, "%s: attempting reconfig..", __func__);

	/*
	 * Program the vswitch's own mac address first (if required),
	 * then any ports which are not yet in HW.  Give up as soon as
	 * either step fails.
	 */
	if (vsw_prog_if(vswp) || vsw_prog_ports(vswp))
		return;

	/*
	 * Everything that was wanted is now programmed into HW, so the
	 * reconfiguration can be marked as complete.
	 */
	vswp->recfg_reqd = B_FALSE;

	vswp->smode_idx = want_idx;

	D1(vswp, "%s: exit", __func__);
}
/*
 * If the vsw interface itself is up and its mac address is not yet in
 * HW, program it into the physical device.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 if the address was set or did not need to be set.
 * Returns 1 if setting the address failed.
 */
static int
vsw_prog_if(vsw_t *vswp)
{
	mac_multi_addr_t	hwaddr;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	READ_ENTER(&vswp->if_lockrw);

	/* Nothing to do unless the interface is up and not yet in HW. */
	if (!(vswp->if_state & VSW_IF_UP) ||
	    (vswp->addr_set == VSW_ADDR_HW)) {
		RW_EXIT(&vswp->if_lockrw);

		D1(vswp, "%s: exit", __func__);
		return (0);
	}

	hwaddr.mma_addrlen = ETHERADDRL;
	ether_copy(&vswp->if_addr, &hwaddr.mma_addr);

	if (vsw_set_hw_addr(vswp, &hwaddr) != 0) {
		RW_EXIT(&vswp->if_lockrw);
		return (1);
	}

	vswp->addr_slot = hwaddr.mma_slot;

	/*
	 * If the interface had to be placed into promisc mode when it
	 * was plumbed, reverse that now.
	 *
	 * The device will only really leave promisc mode once the last
	 * port/interface has been programmed into HW.
	 */
	if (vswp->addr_set == VSW_ADDR_PROMISC)
		(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);

	vswp->addr_set = VSW_ADDR_HW;

	RW_EXIT(&vswp->if_lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
/*
 * Walk the port list and, for every port whose mac address is not yet
 * set in HW, attempt to program it into the physical device.  The
 * walk stops at the first port which cannot be programmed.
 *
 * The caller must hold hw_lock.
 *
 * Returns 0 if all ports which needed it (possibly none) were
 * programmed into HW.
 * Returns 1 if at least one mac address could not be set.
 */
static int
vsw_prog_ports(vsw_t *vswp)
{
	vsw_port_list_t		*ports = &vswp->plist;
	vsw_port_t		*portp;
	mac_multi_addr_t	hwaddr;
	int			status = 0;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	READ_ENTER(&ports->lockrw);
	for (portp = ports->head; portp != NULL; portp = portp->p_next) {
		/* Ports already in HW need no further attention. */
		if (portp->addr_set == VSW_ADDR_HW)
			continue;

		hwaddr.mma_addrlen = ETHERADDRL;
		ether_copy(&portp->p_macaddr, &hwaddr.mma_addr);

		if (vsw_set_hw_addr(vswp, &hwaddr) != 0) {
			status = 1;
			break;
		}

		portp->addr_slot = hwaddr.mma_slot;

		/*
		 * If the interface had to be placed into promisc mode
		 * when this port first attached, reverse that now.
		 *
		 * The device will not really change to non-promisc
		 * mode until all ports have been programmed.
		 */
		if (portp->addr_set == VSW_ADDR_PROMISC)
			(void) vsw_unset_hw_promisc(vswp,
			    portp, VSW_VNETPORT);

		portp->addr_set = VSW_ADDR_HW;
	}
	RW_EXIT(&ports->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (status);
}
/*
 * Put a ring table entry into its unused state: marked free, owned by
 * the given vsw instance and with no queue or mac resource attached.
 */
static void
vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
{
	ringp->ring_vswp = vswp;
	ringp->ring_state = VSW_MAC_RING_FREE;

	/* Nothing from the mac layer is associated with a free entry. */
	ringp->ring_vqp = NULL;
	ringp->ring_blank = NULL;
	ringp->ring_arg = NULL;
}
/*
 * Create the ring table for this vsw instance: initialize its lock,
 * allocate vsw_mac_rx_rings entries and mark every entry as free.
 */
static void
vsw_mac_ring_tbl_init(vsw_t *vswp)
{
	int	idx;

	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);

	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
	vswp->mac_ring_tbl = kmem_alloc(
	    vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);

	/* kmem_alloc() does not zero memory; set up each entry explicitly. */
	idx = 0;
	while (idx < vswp->mac_ring_tbl_sz) {
		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[idx]);
		idx++;
	}
}
/*
 * Tear down the ring table created by vsw_mac_ring_tbl_init(): stop
 * and destroy the queue of every ring in use, then release the lock
 * and the table itself.
 *
 * Does nothing when there is no table.  vsw_mac_detach() calls this
 * whenever multi-ring mode is enabled, including from the failure
 * path of vsw_mac_attach() before the table (and its lock) have been
 * set up, so the lock must not be touched in that case.
 * NOTE(review): this relies on vswp->mac_ring_tbl being NULL before
 * the first vsw_mac_ring_tbl_init(), i.e. on the vsw_t being zeroed
 * when allocated - confirm against the attach code.
 */
static void
vsw_mac_ring_tbl_destroy(vsw_t *vswp)
{
	int		i;
	vsw_mac_ring_t	*ringp;

	if (vswp->mac_ring_tbl == NULL)
		return;

	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		ringp = &vswp->mac_ring_tbl[i];

		if (ringp->ring_state != VSW_MAC_RING_FREE) {
			/*
			 * Destroy the queue.
			 */
			vsw_queue_stop(ringp->ring_vqp);
			vsw_queue_destroy(ringp->ring_vqp);

			/*
			 * Re-initialize the structure.
			 */
			vsw_mac_ring_tbl_entry_init(vswp, ringp);
		}
	}
	mutex_exit(&vswp->mac_ring_lock);

	mutex_destroy(&vswp->mac_ring_lock);
	kmem_free(vswp->mac_ring_tbl,
	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));

	/*
	 * Do not leave a dangling pointer behind; a second call must not
	 * free the table again or use the destroyed lock.
	 */
	vswp->mac_ring_tbl = NULL;
	vswp->mac_ring_tbl_sz = 0;
}
/*
 * Handle resource add callbacks from the driver below.
 *
 * For a MAC_RX_FIFO resource the first free entry in the ring table is
 * claimed, a packet queue is created for it and a worker thread is
 * started to service that queue.  We wait until the worker has either
 * reported that it is running or that it has already drained.
 *
 * Returns the ring table entry as the resource handle, or NULL if the
 * resource is of another type, the table is full, or the queue/worker
 * could not be set up.
 */
static mac_resource_handle_t
vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
{
	vsw_t		*vswp = (vsw_t *)arg;
	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
	vsw_mac_ring_t	*ringp;
	vsw_queue_t	*vqp;
	boolean_t	drained;
	int		i;

	ASSERT(vswp != NULL);
	ASSERT(mrp != NULL);
	ASSERT(vswp->mac_ring_tbl != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check to make sure we have the correct resource type.
	 */
	if (mrp->mr_type != MAC_RX_FIFO)
		return (NULL);

	/*
	 * Find an open entry in the ring table.
	 */
	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		ringp = &vswp->mac_ring_tbl[i];

		if (ringp->ring_state != VSW_MAC_RING_FREE)
			continue;

		/*
		 * Found an empty slot: create the queue for this ring.
		 */
		vqp = vsw_queue_create();

		/*
		 * Initialize the ring data structure.
		 */
		ringp->ring_vqp = vqp;
		ringp->ring_arg = mrfp->mrf_arg;
		ringp->ring_blank = mrfp->mrf_blank;
		ringp->ring_state = VSW_MAC_RING_INUSE;

		/*
		 * Create the worker thread.
		 */
		vqp->vq_worker = thread_create(NULL, 0,
		    vsw_queue_worker, ringp, 0, &p0,
		    TS_RUN, minclsyspri);
		if (vqp->vq_worker == NULL) {
			vsw_queue_destroy(vqp);
			vsw_mac_ring_tbl_entry_init(vswp, ringp);
			ringp = NULL;
		} else {
			/*
			 * Make sure the thread gets into running state
			 * for this ring.
			 */
			mutex_enter(&vqp->vq_lock);
			while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
			    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
				cv_wait(&vqp->vq_cv, &vqp->vq_lock);
			}
			drained = (vqp->vq_state == VSW_QUEUE_DRAINED) ?
			    B_TRUE : B_FALSE;
			mutex_exit(&vqp->vq_lock);

			/*
			 * If the thread is not running, cleanup.  This
			 * has to happen after vq_lock has been dropped:
			 * vsw_queue_destroy() destroys that lock and
			 * frees the queue, so neither may be touched
			 * afterwards.
			 */
			if (drained) {
				vsw_queue_destroy(vqp);
				vsw_mac_ring_tbl_entry_init(vswp, ringp);
				ringp = NULL;
			}
		}

		mutex_exit(&vswp->mac_ring_lock);
		D1(vswp, "%s: exit", __func__);
		return ((mac_resource_handle_t)ringp);
	}
	mutex_exit(&vswp->mac_ring_lock);

	/*
	 * No slots in the ring table available.
	 */
	D1(vswp, "%s: exit", __func__);
	return (NULL);
}
/*
 * Stop a packet queue.  If its worker thread is running, ask it to
 * stop and wait until it reports that it has drained.  In all cases
 * the queue is left in the VSW_QUEUE_STOPPED state.
 */
static void
vsw_queue_stop(vsw_queue_t *vqp)
{
	mutex_enter(&vqp->vq_lock);

	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
		/* No active worker to wait for. */
		vqp->vq_state = VSW_QUEUE_STOPPED;
		mutex_exit(&vqp->vq_lock);
		return;
	}

	/* Tell the worker to stop and wake it up in case it is idle. */
	vqp->vq_state = VSW_QUEUE_STOP;
	cv_signal(&vqp->vq_cv);

	/* The worker signals vq_cv once it has set the drained state. */
	while (vqp->vq_state != VSW_QUEUE_DRAINED) {
		cv_wait(&vqp->vq_cv, &vqp->vq_lock);
	}

	vqp->vq_state = VSW_QUEUE_STOPPED;

	mutex_exit(&vqp->vq_lock);
}
/*
 * Allocate and initialize an empty packet queue in the
 * VSW_QUEUE_STOPPED state.  The allocation sleeps until memory is
 * available.  The queue is released with vsw_queue_destroy().
 */
static vsw_queue_t *
vsw_queue_create()
{
	vsw_queue_t	*newq;

	newq = kmem_zalloc(sizeof (*newq), KM_SLEEP);

	cv_init(&newq->vq_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&newq->vq_lock, NULL, MUTEX_DRIVER, NULL);

	/* No packets queued and no worker running yet. */
	newq->vq_state = VSW_QUEUE_STOPPED;
	newq->vq_first = newq->vq_last = NULL;

	return (newq);
}
/*
 * Release a packet queue obtained from vsw_queue_create(): destroy its
 * condition variable and lock and free the structure.  The queue must
 * not be referenced again after this returns.
 */
static void
vsw_queue_destroy(vsw_queue_t *vqp)
{
	cv_destroy(&vqp->vq_cv);
	mutex_destroy(&vqp->vq_lock);

	kmem_free(vqp, sizeof (*vqp));
}
/*
 * Worker thread servicing the packet queue of one mac ring.
 *
 * Announces itself by moving the queue to VSW_QUEUE_RUNNING, then
 * repeatedly takes the whole chain of queued packets and passes it to
 * the switching function, until the queue state is changed to
 * something other than running.  On the way out any packets still
 * queued are freed, the state is set to VSW_QUEUE_DRAINED and the
 * waiter on vq_cv is signalled.
 */
static void
vsw_queue_worker(vsw_mac_ring_t *rrp)
{
	mblk_t		*mp;
	mblk_t		*nmp;
	vsw_queue_t	*vqp = rrp->ring_vqp;
	vsw_t		*vswp = rrp->ring_vswp;

	mutex_enter(&vqp->vq_lock);

	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);

	/*
	 * Set the state to running, since the thread is now active.
	 */
	vqp->vq_state = VSW_QUEUE_RUNNING;
	cv_signal(&vqp->vq_cv);

	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
		/*
		 * Wait for work to do or the state has changed
		 * to not running.
		 */
		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
		    (vqp->vq_first == NULL)) {
			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
		}

		/*
		 * Process packets that we received from the interface.
		 * The lock is dropped while switching so that the
		 * receive callback can keep queueing.
		 */
		if (vqp->vq_first != NULL) {
			mp = vqp->vq_first;

			vqp->vq_first = NULL;
			vqp->vq_last = NULL;

			mutex_exit(&vqp->vq_lock);

			/* switch the chain of packets received */
			vswp->vsw_switch_frame(vswp, mp,
			    VSW_PHYSDEV, NULL, NULL);

			mutex_enter(&vqp->vq_lock);
		}
	}

	/*
	 * Packets may have been queued while we were switching with the
	 * lock dropped, just before the stop request came in.  Take them
	 * off the queue now so they are not lost when the queue is
	 * destroyed.
	 */
	mp = vqp->vq_first;
	vqp->vq_first = NULL;
	vqp->vq_last = NULL;

	/*
	 * We are drained and signal we are done.
	 */
	vqp->vq_state = VSW_QUEUE_DRAINED;
	cv_signal(&vqp->vq_cv);

	/*
	 * Exit lock.  The queue may be destroyed by the waiter from here
	 * on, so vqp must not be touched any more.
	 */
	mutex_exit(&vqp->vq_lock);

	/*
	 * Free the remaining packets.  freemsg() only frees a single
	 * message, so walk the b_next chain.
	 */
	while (mp != NULL) {
		nmp = mp->b_next;
		mp->b_next = NULL;
		freemsg(mp);
		mp = nmp;
	}

	/*
	 * Exit the thread
	 */
	thread_exit();
}
/*
 * static void
 * vsw_rx_queue_cb() - Receive callback routine when
 *	vsw_multi_ring_enable is non-zero.  Queue the packets
 *	to a packet queue for a worker thread to process.
 *
 *	'mrh' is the ring table entry handed out by
 *	vsw_mac_ring_add_cb() and 'mp' a chain of packets linked
 *	through b_next.  If the queue of the ring is not running the
 *	whole chain is dropped.
 */
static void
vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
	vsw_t		*vswp = (vsw_t *)arg;
	vsw_queue_t	*vqp;
	mblk_t		*bp, *last;

	ASSERT(mrh != NULL);
	ASSERT(vswp != NULL);
	ASSERT(mp != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Find the last element in the mblk chain.
	 */
	bp = mp;
	do {
		last = bp;
		bp = bp->b_next;
	} while (bp != NULL);

	/* Get the queue for the packets */
	vqp = ringp->ring_vqp;

	/*
	 * Grab the lock such we can queue the packets.
	 */
	mutex_enter(&vqp->vq_lock);

	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
		mutex_exit(&vqp->vq_lock);

		/*
		 * Nobody will service the queue, so drop the packets.
		 * freemsg() frees one message only; every packet on the
		 * b_next chain has to be freed individually.
		 */
		while (mp != NULL) {
			bp = mp->b_next;
			mp->b_next = NULL;
			freemsg(mp);
			mp = bp;
		}
		goto vsw_rx_queue_cb_exit;
	}

	/*
	 * Add the mblk chain to the queue.  If there
	 * is some mblks in the queue, then add the new
	 * chain to the end.
	 */
	if (vqp->vq_first == NULL)
		vqp->vq_first = mp;
	else
		vqp->vq_last->b_next = mp;

	vqp->vq_last = last;

	/*
	 * Signal the worker thread that there is work to
	 * do.
	 */
	cv_signal(&vqp->vq_cv);

	/*
	 * Let go of the lock and exit.
	 */
	mutex_exit(&vqp->vq_lock);

vsw_rx_queue_cb_exit:
	D1(vswp, "%s: exit", __func__);
}
/*
 * Receive callback used when multiple rings are not enabled.  The MAC
 * layer invokes it with the chain of packets being passed up from the
 * physical device, and the chain is handed straight to the switching
 * function.
 *
 * PERF: When the card is in promisc mode it may be more efficient to
 * check the destination address of the packets here (against the FDB)
 * instead of later on.  Needs to be investigated.
 */
static void
vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
	_NOTE(ARGUNUSED(mrh))

	vsw_t	*vswp = arg;

	ASSERT(vswp != NULL);

	D1(vswp, "vsw_rx_cb: enter");

	/* packets arriving here always originate from the physical device */
	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);

	D1(vswp, "vsw_rx_cb: exit");
}
/*
 * Transmit a chain of packets over the physical device using the
 * transmit routine obtained from the MAC layer.  Packets are handed
 * down one at a time; transmission stops at the first packet the
 * device does not accept.
 *
 * Returns any mblks that could not be transmitted (the whole chain if
 * no device is attached), or NULL if everything was sent.
 */
static mblk_t *
vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
{
	const mac_txinfo_t	*txp;
	mblk_t			*rest;

	mutex_enter(&vswp->mac_lock);

	if (vswp->mh == NULL) {
		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
		mutex_exit(&vswp->mac_lock);
		return (mp);
	}

	do {
		/* Detach the head packet from the rest of the chain. */
		rest = mp->b_next;
		mp->b_next = NULL;

		txp = vswp->txinfo;
		mp = txp->mt_fn(txp->mt_arg, mp);
		if (mp != NULL) {
			/* Not accepted: re-link it in front of the rest. */
			mp->b_next = rest;
			break;
		}

		mp = rest;
	} while (mp != NULL);

	mutex_exit(&vswp->mac_lock);

	return (mp);
}
/*
 * Register the vswitch with the MAC layer as an ethernet network
 * device, so that it can be plumbed if necessary.  On success the
 * interface is flagged as registered (VSW_IF_REG).
 *
 * Returns 0 on success, EINVAL if no registration structure could be
 * allocated, otherwise the error from mac_register().
 */
static int
vsw_mac_register(vsw_t *vswp)
{
	mac_register_t	*regp;
	int		err;

	D1(vswp, "%s: enter", __func__);

	regp = mac_alloc(MAC_VERSION);
	if (regp == NULL)
		return (EINVAL);

	/* Who we are. */
	regp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	regp->m_dip = vswp->dip;
	regp->m_driver = vswp;
	regp->m_callbacks = &vsw_m_callbacks;

	/* Address and frame size limits we present. */
	regp->m_src_addr = (uint8_t *)&vswp->if_addr;
	regp->m_min_sdu = 0;
	regp->m_max_sdu = ETHERMTU;

	err = mac_register(regp, &vswp->if_mh);

	/* The registration structure is not needed any longer. */
	mac_free(regp);

	if (err == 0)
		vswp->if_state |= VSW_IF_REG;

	D1(vswp, "%s: exit", __func__);

	return (err);
}
/*
 * Unregister the vswitch interface from the MAC layer, if it is
 * currently registered, and mark it as down and unregistered.
 *
 * Returns 0 on success (or if there was nothing to unregister),
 * otherwise the error from mac_unregister(), in which case the
 * interface state is left untouched.
 */
static int
vsw_mac_unregister(vsw_t *vswp)
{
	int	err;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&vswp->if_lockrw);

	if ((vswp->if_state & VSW_IF_REG) == 0) {
		/* Never registered, or already unregistered. */
		RW_EXIT(&vswp->if_lockrw);

		D1(vswp, "%s: exit", __func__);
		return (0);
	}

	err = mac_unregister(vswp->if_mh);
	if (err == 0) {
		/* mark i/f as down and unregistered */
		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);

		RW_EXIT(&vswp->if_lockrw);

		D1(vswp, "%s: exit", __func__);
		return (0);
	}

	DWARN(vswp, "%s: unable to unregister from MAC "
	    "framework", __func__);

	RW_EXIT(&vswp->if_lockrw);
	D1(vswp, "%s: fail exit", __func__);
	return (err);
}
/*
 * Statistics entry point.  The requested statistic is read from the
 * underlying physical device and stored in *val.
 *
 * Returns 0 on success, EINVAL if no physical device is attached (in
 * which case *val is not written).
 */
static int
vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	vsw_t	*vswp = arg;
	int	rv = EINVAL;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh != NULL) {
		/* return stats from underlying device */
		*val = mac_stat_get(vswp->mh, stat);
		rv = 0;
	}
	mutex_exit(&vswp->mac_lock);

	return (rv);
}
/*
 * Stop entry point for the vswitch interface.  The interface is marked
 * as no longer up, its address is removed from the physical device
 * and, if a reconfiguration is pending, an attempt is made to
 * reconfigure the HW now that a resource may have been released.
 */
static void
vsw_m_stop(void *arg)
{
	vsw_t	*vswp = arg;

	D1(vswp, "%s: enter", __func__);

	/* Clear the UP flag first. */
	WRITE_ENTER(&vswp->if_lockrw);
	vswp->if_state &= ~VSW_IF_UP;
	RW_EXIT(&vswp->if_lockrw);

	mutex_enter(&vswp->hw_lock);

	(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);

	if (vswp->recfg_reqd) {
		vsw_reconfig_hw(vswp);
	}

	mutex_exit(&vswp->hw_lock);

	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
}
/*
 * Start entry point for the vswitch interface.  The interface is
 * marked as up and an attempt is made to set its address up in the
 * physical device.  The result of that attempt is ignored.
 *
 * Always returns 0.
 */
static int
vsw_m_start(void *arg)
{
	vsw_t	*vswp = arg;

	D1(vswp, "%s: enter", __func__);

	/* vsw_set_hw() only programs the local address once UP is set. */
	WRITE_ENTER(&vswp->if_lockrw);
	vswp->if_state |= VSW_IF_UP;
	RW_EXIT(&vswp->if_lockrw);

	mutex_enter(&vswp->hw_lock);
	(void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
	mutex_exit(&vswp->hw_lock);

	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);

	return (0);
}
/*
 * Entry point for changing the local interface address.
 *
 * This operation is deliberately not supported: the local mac address
 * of the switch can only be changed through the properties of its MD
 * node.  Both arguments are ignored.
 *
 * Always returns DDI_FAILURE.
 */
static int
vsw_m_unicst(void *arg, const uint8_t *macaddr)
{
	_NOTE(ARGUNUSED(arg, macaddr))

	return (DDI_FAILURE);
}
/*
 * Multicast entry point for the vswitch interface: add or remove the
 * multicast address 'mca' (ETHERADDRL bytes) for the local device.
 *
 * Adding records the address in the multicast hash table and in the
 * list kept in the vsw_t, and programs it into the physical device if
 * one is attached.  If any of the later steps fail, the earlier ones
 * are rolled back so that no stale entry is left behind.
 *
 * Removing takes the address out of the hash table, the list and the
 * physical device.
 *
 * Returns 0 on success.  Returns 1 if memory for the list entry could
 * not be allocated, or the error from mac_multicst_add() if the
 * device refused the address.  A failure to add the address to the
 * hash table is only logged, and 0 is returned.
 */
static int
vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
	vsw_t		*vswp = (vsw_t *)arg;
	mcst_addr_t	*mcst_p = NULL;
	uint64_t	addr = 0x0;
	int		i, ret = 0;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Convert address into form that can be used
	 * as hash table key.
	 */
	for (i = 0; i < ETHERADDRL; i++) {
		addr = (addr << 8) | mca[i];
	}

	D2(vswp, "%s: addr = 0x%llx", __func__, addr);

	if (add) {
		D2(vswp, "%s: adding multicast", __func__);

		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) != 0) {
			cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
			    "address", vswp->instance);
			return (ret);
		}

		/*
		 * Update the list of multicast addresses
		 * contained within the vsw_t structure to
		 * include this new one.
		 */
		mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
		if (mcst_p == NULL) {
			DERR(vswp, "%s unable to alloc mem", __func__);
			/*
			 * The address is already in the hash table;
			 * take it out again so the table and the list
			 * stay in step.
			 */
			(void) vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL);
			return (1);
		}
		mcst_p->addr = addr;

		mutex_enter(&vswp->mca_lock);
		mcst_p->nextp = vswp->mcap;
		vswp->mcap = mcst_p;
		mutex_exit(&vswp->mca_lock);

		/*
		 * Call into the underlying driver to program the
		 * address into HW.
		 */
		mutex_enter(&vswp->mac_lock);
		if (vswp->mh != NULL)
			ret = mac_multicst_add(vswp->mh, mca);
		mutex_exit(&vswp->mac_lock);

		if (ret != 0) {
			cmn_err(CE_WARN, "!vsw%d: unable to "
			    "add multicast address",
			    vswp->instance);

			/*
			 * Roll back the hash table and list entries
			 * and report the failure to the caller.  The
			 * address never made it into the device, so
			 * there is nothing to remove from there.
			 */
			if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr,
			    NULL) == 0)
				vsw_del_addr(VSW_LOCALDEV, vswp, addr);
			return (ret);
		}

		return (0);
	}

	D2(vswp, "%s: removing multicast", __func__);
	/*
	 * Remove the address from the hash table..
	 */
	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {

		/*
		 * ..and then from the list maintained in the
		 * vsw_t structure.
		 */
		vsw_del_addr(VSW_LOCALDEV, vswp, addr);

		mutex_enter(&vswp->mac_lock);
		if (vswp->mh != NULL)
			(void) mac_multicst_remove(vswp->mh, mca);
		mutex_exit(&vswp->mac_lock);
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Promiscuous mode entry point for the vswitch interface.  Only the
 * VSW_IF_PROMISC flag in the interface state is updated here; the
 * physical device is not touched.
 *
 * Always returns 0.
 */
static int
vsw_m_promisc(void *arg, boolean_t on)
{
	vsw_t	*vswp = arg;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&vswp->if_lockrw);
	if (!on) {
		vswp->if_state &= ~VSW_IF_PROMISC;
	} else {
		vswp->if_state |= VSW_IF_PROMISC;
	}
	RW_EXIT(&vswp->if_lockrw);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Transmit entry point for the vswitch interface.  Packets sent on the
 * local interface are handed to the switching function with
 * VSW_LOCALDEV as their origin.
 *
 * Always returns NULL, i.e. no packets are ever handed back to the
 * caller.
 */
static mblk_t *
vsw_m_tx(void *arg, mblk_t *mp)
{
	vsw_t	*vswp = arg;

	D1(vswp, "%s: enter", __func__);

	vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);

	D1(vswp, "%s: exit", __func__);

	return (NULL);
}
/*
 * Register with the mdeg framework for machine description (MD)
 * updates concerning this vsw instance and its ports.  On success the
 * node spec and both mdeg handles are saved in the vsw_t for
 * vsw_mdeg_unregister().
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_mdeg_register(vsw_t *vswp)
{
	mdeg_prop_spec_t	*props;
	mdeg_node_spec_t	*nodespec;
	mdeg_handle_t		vsw_hdl, port_hdl;
	size_t			propsz = sizeof (vsw_prop_template);
	int			inst;
	int			rv;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Each 'virtual-device' node in the MD carries a 'cfg-handle'
	 * property, which is the MD's notion of an instance number
	 * (and may be completely different from the device driver's
	 * instance #).  OBP reads that value and stores it in the
	 * 'reg' property of the corresponding node in the device tree.
	 * The 'reg' value is therefore what we register with the mdeg
	 * framework, so that we get events for the correct nodes.
	 */
	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
	    DDI_PROP_DONTPASS, reg_propname, -1);
	if (inst == -1) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from "
		    "OBP device tree", vswp->instance, reg_propname);
		return (1);
	}

	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);

	/*
	 * Make a per-instance copy of the global property spec array
	 * and stamp our instance into it, so that it uniquely
	 * identifies this vsw instance.
	 */
	props = kmem_zalloc(propsz, KM_SLEEP);
	bcopy(vsw_prop_template, props, propsz);

	VSW_SET_MDEG_PROP_INST(props, inst);

	/* Build the node spec which refers to that property spec. */
	nodespec = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
	nodespec->specp = props;
	nodespec->namep = "virtual-device";

	/*
	 * First registration: 'virtual-device' nodes with a 'name'
	 * property of 'virtual-network-switch'.
	 */
	rv = mdeg_register(nodespec, &vdev_match, vsw_mdeg_cb,
	    (void *)vswp, &vsw_hdl);
	if (rv != MDEG_SUCCESS) {
		DERR(vswp, "%s: mdeg_register failed (%d) for vsw node",
		    __func__, rv);
		goto mdeg_reg_fail;
	}

	/*
	 * Second registration: 'vsw-port' nodes.
	 */
	rv = mdeg_register(nodespec, &vport_match, vsw_port_mdeg_cb,
	    (void *)vswp, &port_hdl);
	if (rv != MDEG_SUCCESS) {
		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
		/* Back out the first registration before failing. */
		(void) mdeg_unregister(vsw_hdl);
		goto mdeg_reg_fail;
	}

	/* Keep what is needed to unregister later on. */
	vswp->mdeg_port_hdl = port_hdl;
	vswp->mdeg_hdl = vsw_hdl;
	vswp->inst_spec = nodespec;

	D1(vswp, "%s: exit", __func__);
	return (0);

mdeg_reg_fail:
	cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks",
	    vswp->instance);
	kmem_free(props, propsz);
	kmem_free(nodespec, sizeof (mdeg_node_spec_t));

	vswp->mdeg_hdl = NULL;
	vswp->mdeg_port_hdl = NULL;

	return (1);
}
/*
 * Undo vsw_mdeg_register(): unregister both mdeg callbacks, where
 * registered, and free the per-instance node and property specs.
 */
static void
vsw_mdeg_unregister(vsw_t *vswp)
{
	D1(vswp, "vsw_mdeg_unregister: enter");

	if (vswp->mdeg_hdl != NULL) {
		(void) mdeg_unregister(vswp->mdeg_hdl);
	}

	if (vswp->mdeg_port_hdl != NULL) {
		(void) mdeg_unregister(vswp->mdeg_port_hdl);
	}

	if (vswp->inst_spec == NULL) {
		/* No spec was ever saved, so nothing to free. */
		D1(vswp, "vsw_mdeg_unregister: exit");
		return;
	}

	/* The property spec is a copy of vsw_prop_template. */
	if (vswp->inst_spec->specp != NULL) {
		(void) kmem_free(vswp->inst_spec->specp,
		    sizeof (vsw_prop_template));
		vswp->inst_spec->specp = NULL;
	}

	(void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t));
	vswp->inst_spec = NULL;

	D1(vswp, "vsw_mdeg_unregister: exit");
}
/*
 * Mdeg callback invoked for the vsw node itself.
 *
 * Nodes reported as 'added' have their initial properties read;
 * nodes reported as currently matching are checked for property
 * updates.  Nodes missing the 'name' or 'cfg-handle' property are
 * skipped.
 *
 * Returns MDEG_FAILURE if either argument is NULL, MDEG_SUCCESS
 * otherwise.
 */
static int
vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
{
	vsw_t		*vswp;
	int		idx;
	md_t		*mdp;
	mde_cookie_t	node;
	uint64_t	inst;
	char		*node_name = NULL;

	/* Same argument checks as vsw_port_mdeg_cb(). */
	if ((resp == NULL) || (cb_argp == NULL))
		return (MDEG_FAILURE);

	vswp = (vsw_t *)cb_argp;

	D1(vswp, "%s: added %d : removed %d : curr matched %d"
	    " : prev matched %d", __func__, resp->added.nelem,
	    resp->removed.nelem, resp->match_curr.nelem,
	    resp->match_prev.nelem);

	/*
	 * Expect 'added' to be non-zero if virtual-network-switch
	 * nodes exist in the MD when the driver attaches.
	 */
	for (idx = 0; idx < resp->added.nelem; idx++) {
		mdp = resp->added.mdp;
		node = resp->added.mdep[idx];

		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
			DERR(vswp, "%s: unable to get node name for "
			    "node(%d) 0x%lx", __func__, idx, node);
			continue;
		}

		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
			    __func__, idx);
			continue;
		}

		/* inst is 64 bits wide, so it needs a long conversion. */
		D2(vswp, "%s: added node(%d) 0x%lx with name %s "
		    "and inst %ld", __func__, idx, node, node_name, inst);

		vsw_get_initial_md_properties(vswp, mdp, node);
	}

	/*
	 * A non-zero 'match' value indicates that the MD has been
	 * updated and that a virtual-network-switch node is present
	 * which may or may not have been updated.  It is up to the clients
	 * to examine their own nodes and determine if they have changed.
	 */
	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
		mdp = resp->match_curr.mdp;
		node = resp->match_curr.mdep[idx];

		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
			DERR(vswp, "%s: unable to get node name for "
			    "node(%d) 0x%lx", __func__, idx, node);
			continue;
		}

		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
			    __func__, idx);
			continue;
		}

		D2(vswp, "%s: changed node(%d) 0x%lx with name %s "
		    "and inst %ld", __func__, idx, node, node_name, inst);

		vsw_update_md_prop(vswp, mdp, node);
	}

	return (MDEG_SUCCESS);
}
/*
 * Mdeg callback invoked when vsw-port nodes under the vsw node are
 * added or removed.  Each added node is turned into a new port, and
 * each removed node has its port detached.  Failures for individual
 * ports are logged and do not stop the processing of the others.
 *
 * Returns MDEG_FAILURE if either argument is NULL, MDEG_SUCCESS
 * otherwise.
 */
static int
vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
{
	vsw_t		*vswp;
	md_t		*mdp;
	mde_cookie_t	node;
	uint64_t	port_id;
	int		n;

	if ((cb_argp == NULL) || (resp == NULL))
		return (MDEG_FAILURE);

	vswp = (vsw_t *)cb_argp;

	D2(vswp, "%s: added %d : removed %d : curr matched %d"
	    " : prev matched %d", __func__, resp->added.nelem,
	    resp->removed.nelem, resp->match_curr.nelem,
	    resp->match_prev.nelem);

	/* new ports */
	mdp = resp->added.mdp;
	for (n = 0; n < resp->added.nelem; n++) {
		node = resp->added.mdep[n];

		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, n, node);

		if (vsw_port_add(vswp, mdp, &node) != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
			    "(0x%lx)", vswp->instance, node);
		}
	}

	/* ports which have gone away */
	mdp = resp->removed.mdp;
	for (n = 0; n < resp->removed.nelem; n++) {
		node = resp->removed.mdep[n];

		/* The port is identified by the id property of its node. */
		if (md_get_prop_val(mdp, node, id_propname, &port_id)) {
			DERR(vswp, "%s: prop(%s) not found in port(%d)",
			    __func__, id_propname, n);
			continue;
		}

		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, n, node);

		if (vsw_port_detach(vswp, port_id) != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
			    vswp->instance, port_id);
		}
	}

	/*
	 * Updating ports which are already active is not supported at
	 * present, so the match_curr and match_prev arrays are ignored
	 * for now.
	 */

	D1(vswp, "%s: exit", __func__);

	return (MDEG_SUCCESS);
}
/*
 * Read the initial start-of-day values from the specified MD node:
 * the name of the physical device, the mac address of the vswitch
 * and the list of switching modes.  Then set up switching and, if a
 * mac address is available, register with the MAC layer as a
 * provider.
 *
 * Nothing is returned; failures are logged.  If the physical device
 * name cannot be read, or no switching mode can be set up, the
 * function gives up early.
 */
static void
vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
{
	int		i;
	uint64_t	macaddr = 0;

	D1(vswp, "%s: enter", __func__);

	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) {
		/*
		 * Note it is valid for the physname property to
		 * be NULL so check actual name length to determine
		 * if we have a actual device name.
		 */
		if (strlen(vswp->physname) > 0)
			vswp->mdprops |= VSW_MD_PHYSNAME;
	} else {
		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
		    "device from MD", vswp->instance);
		return;
	}

	/* mac address for vswitch device itself */
	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
		    vswp->instance);

		/*
		 * Fallback to using the mac address of the physical
		 * device.
		 */
		if (vsw_get_physaddr(vswp) == 0) {
			cmn_err(CE_NOTE, "!vsw%d: Using MAC address from "
			    "physical device (%s)", vswp->instance,
			    vswp->physname);
		} else {
			cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address"
			    " from device %s", vswp->instance,
			    vswp->physname);
		}
	} else {
		/*
		 * The MD holds the address as a 64 bit value; unpack
		 * it, least significant byte last, into the octets.
		 */
		WRITE_ENTER(&vswp->if_lockrw);
		for (i = ETHERADDRL - 1; i >= 0; i--) {
			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
			macaddr >>= 8;
		}
		RW_EXIT(&vswp->if_lockrw);
		vswp->mdprops |= VSW_MD_MACADDR;
	}

	if (vsw_get_md_smodes(vswp, mdp, node,
	    vswp->smode, &vswp->smode_num)) {
		cmn_err(CE_WARN, "vsw%d: Unable to read %s property from "
		    "MD, defaulting to programmed mode", vswp->instance,
		    smode_propname);

		for (i = 0; i < NUM_SMODES; i++)
			vswp->smode[i] = VSW_LAYER2;

		vswp->smode_num = NUM_SMODES;
	} else {
		ASSERT(vswp->smode_num != 0);
		vswp->mdprops |= VSW_MD_SMODE;
	}

	/*
	 * Unable to setup any switching mode, nothing more
	 * we can do.
	 */
	if (vsw_setup_switching(vswp))
		return;

	/* The interface starts out down until it is started. */
	WRITE_ENTER(&vswp->if_lockrw);
	vswp->if_state &= ~VSW_IF_UP;
	RW_EXIT(&vswp->if_lockrw);
	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
		if (vsw_mac_register(vswp) != 0) {
			/*
			 * Treat this as a non-fatal error as we may be
			 * able to operate in some other mode.
			 */
			cmn_err(CE_WARN, "vsw%d: Unable to register as "
			    "provider with MAC layer", vswp->instance);
		}
	}

	D1(vswp, "%s: exit", __func__);
}
/*
 * Check to see if the relevant properties in the specified node have
 * changed, and if so take the appropriate action.
 *
 * If any of the properties are missing or invalid we don't take
 * any action, as this function should only be invoked when modifications
 * have been made to what we assume is a working configuration, which
 * we leave active.
 *
 * Note it is legal for this routine to be invoked even if none of the
 * properties in the port node within the MD have actually changed.
 *
 * The routine works in two phases.  First the physical device name, the
 * MAC address and the switching modes are read from the MD and compared
 * with the values currently held in 'vswp'; any read or validation
 * failure in this phase leaves the running configuration untouched.
 * Second, the detected changes are applied:
 *
 *	- a new device name and/or a changed switching mode causes the port
 *	  addresses to be removed from the current device, the device to
 *	  be detached, switching to be set up again and the port addresses
 *	  to be programmed once more;
 *	- a new MAC address is stored, re-programmed for the local device
 *	  and reported to the MAC layer.
 *
 * A failure in the second phase is only logged; changes which have
 * already been made are not rolled back.
 */
static void
vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
{
	char		physname[LIFNAMSIZ];
	char		drv[LIFNAMSIZ];
	uint_t		ddi_instance;
	uint8_t		new_smode[NUM_SMODES];
	int		i, smode_num = 0;
	uint64_t	macaddr = 0;
	uint64_t	maddr;		/* scratch copy used for comparison */
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port = NULL;
	enum		{MD_init = 0x1,
				MD_physname = 0x2,
				MD_macaddr = 0x4,
				MD_smode = 0x8} updated;

	updated = MD_init;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if name of physical device in MD has changed.
	 */
	if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) {
		/*
		 * Do basic sanity check on new device name/instance,
		 * if its non NULL. It is valid for the device name to
		 * have changed from a non NULL to a NULL value, i.e.
		 * the vsw is being changed to 'routed' mode.
		 */
		if ((strlen(physname) != 0) &&
		    (ddi_parse(physname, drv,
		    &ddi_instance) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: new device name %s is not"
			    " a valid device name/instance",
			    vswp->instance, physname);
			goto fail_reconf;
		}

		if (strcmp(physname, vswp->physname)) {
			D2(vswp, "%s: device name changed from %s to %s",
			    __func__, vswp->physname, physname);

			updated |= MD_physname;
		} else {
			D2(vswp, "%s: device name unchanged at %s",
			    __func__, vswp->physname);
		}
	} else {
		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
		    "device from updated MD.", vswp->instance);
		goto fail_reconf;
	}

	/*
	 * Check if MAC address has changed.
	 */
	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
		    vswp->instance);
		goto fail_reconf;
	} else {
		/*
		 * The comparison consumes the value one octet at a time,
		 * so it is done on a copy.  'macaddr' itself must stay
		 * intact because the complete address is needed further
		 * down if it turns out to have changed.
		 */
		maddr = macaddr;

		READ_ENTER(&vswp->if_lockrw);
		for (i = ETHERADDRL - 1; i >= 0; i--) {
			if (vswp->if_addr.ether_addr_octet[i]
			    != (maddr & 0xFF)) {
				D2(vswp, "%s: octet[%d] 0x%x != 0x%x",
				    __func__, i,
				    vswp->if_addr.ether_addr_octet[i],
				    (maddr & 0xFF));
				updated |= MD_macaddr;
				break;
			}
			maddr >>= 8;
		}
		RW_EXIT(&vswp->if_lockrw);
	}

	/*
	 * Check if switching modes have changed.
	 */
	if (vsw_get_md_smodes(vswp, mdp, node,
	    new_smode, &smode_num)) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
		    vswp->instance, smode_propname);
		goto fail_reconf;
	} else {
		ASSERT(smode_num != 0);
		if (smode_num != vswp->smode_num) {
			D2(vswp, "%s: number of modes changed from %d to %d",
			    __func__, vswp->smode_num, smode_num);
		}

		/*
		 * NOTE(review): only the first smode_num entries are
		 * compared, so a change in the number of modes alone is
		 * reported above but does not set MD_smode - confirm
		 * that this is intended.
		 */
		for (i = 0; i < smode_num; i++) {
			if (new_smode[i] != vswp->smode[i]) {
				D2(vswp, "%s: mode changed from %d to %d",
				    __func__, vswp->smode[i], new_smode[i]);
				updated |= MD_smode;
				break;
			}
		}
	}

	/*
	 * Now make any changes which are needed...
	 */

	if (updated & (MD_physname | MD_smode)) {
		/*
		 * Disconnect all ports from the current card
		 */
		WRITE_ENTER(&plist->lockrw);
		for (port = plist->head; port != NULL; port = port->p_next) {
			/* Remove address if was programmed into HW. */
			mutex_enter(&vswp->hw_lock);
			if (vsw_unset_hw(vswp, port, VSW_VNETPORT)) {
				mutex_exit(&vswp->hw_lock);
				RW_EXIT(&plist->lockrw);
				goto fail_update;
			}
			mutex_exit(&vswp->hw_lock);
		}
		RW_EXIT(&plist->lockrw);

		/*
		 * Stop, detach the old device..
		 */
		vsw_mac_detach(vswp);

		/*
		 * Update phys name.
		 */
		if (updated & MD_physname) {
			cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s",
			    vswp->instance, vswp->physname, physname);
			(void) strncpy(vswp->physname,
			    physname, strlen(physname) + 1);

			if (strlen(vswp->physname) > 0)
				vswp->mdprops |= VSW_MD_PHYSNAME;
		}

		/*
		 * Update array with the new switch mode values.
		 */
		if (updated & MD_smode) {
			for (i = 0; i < smode_num; i++)
				vswp->smode[i] = new_smode[i];

			vswp->smode_num = smode_num;
			vswp->smode_idx = 0;
		}

		/*
		 * ..and attach, start the new device.
		 */
		if (vsw_setup_switching(vswp))
			goto fail_update;

		/*
		 * Connect ports to new card.
		 */
		WRITE_ENTER(&plist->lockrw);
		for (port = plist->head; port != NULL; port = port->p_next) {
			mutex_enter(&vswp->hw_lock);
			if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
				mutex_exit(&vswp->hw_lock);
				RW_EXIT(&plist->lockrw);
				goto fail_update;
			}
			mutex_exit(&vswp->hw_lock);
		}
		RW_EXIT(&plist->lockrw);
	}

	if (updated & MD_macaddr) {
		cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx",
		    vswp->instance, macaddr);

		/* last use of macaddr, so it can be consumed in place */
		WRITE_ENTER(&vswp->if_lockrw);
		for (i = ETHERADDRL - 1; i >= 0; i--) {
			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
			macaddr >>= 8;
		}
		RW_EXIT(&vswp->if_lockrw);

		/*
		 * Remove old address from HW (if programmed) and set
		 * new address.
		 */
		mutex_enter(&vswp->hw_lock);
		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
		(void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
		mutex_exit(&vswp->hw_lock);

		/*
		 * Notify the MAC layer of the changed address.
		 */
		mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr);
	}

	return;

fail_reconf:
	cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance);
	return;

fail_update:
	cmn_err(CE_WARN, "!vsw%d: update of configuration failed",
	    vswp->instance);
}
/*
 * Add a new port to the system, using the port node 'node' of the
 * machine description 'mdp' as the source of its configuration.
 *
 * The port instance is read from the id_propname property of the port
 * node, the channel id from the id_propname property of the first
 * chan_propname node found below it, and the peer's MAC address from the
 * remaddr_propname property. The port is then created by
 * vsw_port_attach().
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
{
	vsw_port_t		*new_port;
	struct ether_addr	peer_addr;
	mde_cookie_t		*chan_nodes = NULL;
	uint8_t			*prop_data;
	uint64_t		port_id = 0;
	uint64_t		chan_id;
	uint64_t		raw_addr;
	int			prop_len;
	int			total_nodes = 0;
	int			chan_cnt = 0;
	int			nodes_sz = 0;
	int			id_missing;
	int			octet;

	if (md_get_prop_val(mdp, *node, id_propname, &port_id) != 0) {
		DWARN(vswp, "%s: prop(%s) not found", __func__,
		    id_propname);
		return (1);
	}

	/*
	 * The channel endpoint node(s) holding the channel id(s) should
	 * sit underneath this port node. The scan below needs a result
	 * array big enough for every node in the MD.
	 */
	total_nodes = md_node_count(mdp);
	if (total_nodes <= 0) {
		DERR(vswp, "%s: invalid number of nodes found (%d)",
		    __func__, total_nodes);
		return (1);
	}

	D2(vswp, "%s: %d nodes found", __func__, total_nodes);

	nodes_sz = total_nodes * sizeof (mde_cookie_t);
	chan_nodes = kmem_zalloc(nodes_sz, KM_SLEEP);

	chan_cnt = md_scan_dag(mdp, *node,
	    md_find_name(mdp, chan_propname),
	    md_find_name(mdp, "fwd"), chan_nodes);

	if (chan_cnt <= 0) {
		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
		kmem_free(chan_nodes, nodes_sz);
		return (1);
	}

	D2(vswp, "%s: %d %s nodes found", __func__, chan_cnt, chan_propname);

	/*
	 * Only the first channel node is consulted; once its id has been
	 * read (or found to be missing) the node list is of no further use.
	 */
	id_missing = md_get_prop_val(mdp, chan_nodes[0], id_propname,
	    &chan_id);
	kmem_free(chan_nodes, nodes_sz);

	if (id_missing != 0) {
		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
		    id_propname);
		return (1);
	}

	D2(vswp, "%s: ldc_id 0x%llx", __func__, chan_id);

	/* fetch the peer's mac-address property */
	if (md_get_prop_data(mdp, *node, remaddr_propname,
	    &prop_data, &prop_len) != 0) {
		DWARN(vswp, "%s: prop(%s) not found",
		    __func__, remaddr_propname);
		return (1);
	}

	if (prop_len < ETHERADDRL) {
		DWARN(vswp, "%s: invalid address size", __func__);
		return (1);
	}

	/*
	 * NOTE(review): a full 64-bit word is read here although only
	 * ETHERADDRL bytes were checked for above - presumably the MD
	 * always supplies at least 8 bytes; confirm.
	 */
	raw_addr = *((uint64_t *)prop_data);
	D2(vswp, "%s: remote mac address 0x%llx", __func__, raw_addr);

	/* least significant byte of the word is the last octet */
	octet = ETHERADDRL;
	while (octet-- > 0) {
		peer_addr.ether_addr_octet[octet] = raw_addr & 0xFF;
		raw_addr >>= 8;
	}

	if (vsw_port_attach(vswp, (int)port_id, &chan_id, 1,
	    &peer_addr) != 0) {
		DERR(vswp, "%s: failed to attach port", __func__);
		return (1);
	}

	/* the port has just been created successfully, so it must exist */
	new_port = vsw_lookup_port(vswp, (int)port_id);
	ASSERT(new_port != NULL);

	return (0);
}
/*
 * Attach the specified port.
 *
 * Allocates and initialises a port structure for instance 'p_instance',
 * attaches up to VSW_PORT_MAX_LDCS of the 'nids' channel ids in 'ldcids'
 * to it, records 'macaddr' as the port's address and links the port at
 * the head of the port list of 'vswp'. Finally the channels under the
 * port are initialised.
 *
 * Returns 0 on success, 1 on failure (instance already attached, or a
 * channel could not be attached).
 */
static int
vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
    struct ether_addr *macaddr)
{
	vsw_port_list_t	*ports = &vswp->plist;
	vsw_port_t	*p;
	int		n;

	D1(vswp, "%s: enter : port %d", __func__, p_instance);

	/* refuse to attach an instance which is already on the list */
	READ_ENTER(&ports->lockrw);
	p = ports->head;
	while (p != NULL) {
		if (p->p_instance == p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p_instance);
			RW_EXIT(&ports->lockrw);
			return (1);
		}
		p = p->p_next;
	}
	RW_EXIT(&ports->lockrw);

	p = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
	p->p_vswp = vswp;
	p->p_instance = p_instance;
	p->p_ldclist.num_ldcs = 0;
	p->p_ldclist.head = NULL;
	p->addr_set = VSW_ADDR_UNSET;

	rw_init(&p->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);

	mutex_init(&p->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&p->mca_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_init(&p->ref_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&p->ref_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&p->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&p->state_cv, NULL, CV_DRIVER, NULL);
	p->state = VSW_PORT_INIT;

	/* silently ignore any channel ids beyond the supported maximum */
	if (nids > VSW_PORT_MAX_LDCS) {
		D2(vswp, "%s: using first of %d ldc ids",
		    __func__, nids);
		nids = VSW_PORT_MAX_LDCS;
	}

	D2(vswp, "%s: %d nids", __func__, nids);
	for (n = 0; n < nids; n++) {
		D2(vswp, "%s: ldcid (%llx)", __func__, ldcids[n]);
		if (vsw_ldc_attach(p, ldcids[n]) != 0) {
			DERR(vswp, "%s: ldc_attach failed", __func__);
			goto attach_fail;
		}
	}

	ether_copy(macaddr, &p->p_macaddr);

	WRITE_ENTER(&ports->lockrw);

	/* create the fdb entry for this port/mac address */
	(void) vsw_add_fdb(vswp, p);

	mutex_enter(&vswp->hw_lock);
	(void) vsw_set_hw(vswp, p, VSW_VNETPORT);
	mutex_exit(&vswp->hw_lock);

	/* push the new port onto the front of this instance's port list */
	p->p_next = ports->head;
	ports->head = p;
	ports->num_ports++;
	RW_EXIT(&ports->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_init_ldcs(p);

	D1(vswp, "%s: exit", __func__);
	return (0);

attach_fail:
	/* undo the lock/cv initialisation above and release the port */
	rw_destroy(&p->p_ldclist.lockrw);

	cv_destroy(&p->ref_cv);
	mutex_destroy(&p->ref_lock);

	cv_destroy(&p->state_cv);
	mutex_destroy(&p->state_lock);

	mutex_destroy(&p->tx_lock);
	mutex_destroy(&p->mca_lock);
	kmem_free(p, sizeof (vsw_port_t));
	return (1);
}
/*
 * Detach the specified port.
 *
 * The port with instance 'p_instance' is unlinked from the port list
 * of 'vswp', its fdb entry and multicast addresses are removed, its
 * address is removed from the HW, and the port is then deleted.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
	vsw_port_list_t	*ports = &vswp->plist;
	vsw_port_t	*victim;

	D1(vswp, "%s: enter: port id %d", __func__, p_instance);

	WRITE_ENTER(&ports->lockrw);

	/* the port has to exist, and has to come off the list cleanly */
	victim = vsw_lookup_port(vswp, p_instance);
	if ((victim == NULL) || (vsw_plist_del_node(vswp, victim) != 0)) {
		RW_EXIT(&ports->lockrw);
		return (1);
	}

	/* Remove the fdb entry for this port/mac address */
	(void) vsw_del_fdb(vswp, victim);

	/* Remove any multicast addresses.. */
	vsw_del_mcst_port(victim);

	/*
	 * The target port is off the list now, so the writer lock
	 * on the port list is not needed any longer.
	 */
	RW_EXIT(&ports->lockrw);

	/* Remove address if was programmed into HW. */
	mutex_enter(&vswp->hw_lock);
	(void) vsw_unset_hw(vswp, victim, VSW_VNETPORT);
	if (vswp->recfg_reqd)
		vsw_reconfig_hw(vswp);
	mutex_exit(&vswp->hw_lock);

	if (vsw_port_delete(victim) != 0)
		return (1);

	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
	return (0);
}
/*
 * Detach all active ports.
 *
 * Ports are taken from the head of the port list one at a time until
 * the list is empty. The list lock is only held while a port is being
 * unlinked and its addresses removed; it is dropped around the call to
 * vsw_port_delete().
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_detach_ports(vsw_t *vswp)
{
	vsw_port_list_t	*ports = &vswp->plist;
	vsw_port_t	*victim;

	D1(vswp, "%s: enter", __func__);

	for (;;) {
		WRITE_ENTER(&ports->lockrw);

		victim = ports->head;
		if (victim == NULL) {
			/* nothing left to detach */
			RW_EXIT(&ports->lockrw);
			break;
		}

		if (vsw_plist_del_node(vswp, victim) != 0) {
			DERR(vswp, "%s: Error deleting port %d"
			    " from port list", __func__,
			    victim->p_instance);
			RW_EXIT(&ports->lockrw);
			return (1);
		}

		/* Remove address if was programmed into HW. */
		mutex_enter(&vswp->hw_lock);
		(void) vsw_unset_hw(vswp, victim, VSW_VNETPORT);
		mutex_exit(&vswp->hw_lock);

		/* Remove the fdb entry for this port/mac address */
		(void) vsw_del_fdb(vswp, victim);

		/* Remove any multicast addresses.. */
		vsw_del_mcst_port(victim);

		/*
		 * The target port has been unlinked, so the list lock
		 * can be released before the port is deleted.
		 */
		RW_EXIT(&ports->lockrw);

		if (vsw_port_delete(victim) != 0) {
			DERR(vswp, "%s: Error deleting port %d",
			    __func__, victim->p_instance);
			return (1);
		}
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Delete the specified port.
 *
 * The sequence is: uninitialise the channels under the port, drain the
 * taskq of tasks referencing the port, wait for the port's reference
 * count to reach zero, drain the channel callbacks, detach every
 * channel and finally destroy the port's locks/cvs and free it.
 *
 * Returns 0 on success, 1 on failure. On failure the port structure is
 * not freed.
 */
static int
vsw_port_delete(vsw_port_t *port)
{
	vsw_t		*vswp = port->p_vswp;
	vsw_ldc_list_t	*chans = &port->p_ldclist;

	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);

	(void) vsw_uninit_ldcs(port);

	/*
	 * Let any pending ctrl msg tasks which reference this
	 * port run to completion first.
	 */
	if (vsw_drain_port_taskq(port) != 0)
		return (1);

	/*
	 * Block until nobody holds a reference on the port.
	 */
	mutex_enter(&port->ref_lock);
	while (port->ref_cnt != 0)
		cv_wait(&port->ref_cv, &port->ref_lock);
	mutex_exit(&port->ref_lock);

	/*
	 * Let any active callbacks finish.
	 */
	if (vsw_drain_ldcs(port) != 0)
		return (1);

	/* detach channels from the head of the list until none are left */
	WRITE_ENTER(&chans->lockrw);
	while (chans->num_ldcs > 0) {
		if (vsw_ldc_detach(port, chans->head->ldc_id) == 0)
			continue;

		cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
		    vswp->instance, chans->head->ldc_id);
		RW_EXIT(&chans->lockrw);
		return (1);
	}
	RW_EXIT(&chans->lockrw);

	rw_destroy(&port->p_ldclist.lockrw);

	mutex_destroy(&port->mca_lock);
	mutex_destroy(&port->tx_lock);
	cv_destroy(&port->ref_cv);
	mutex_destroy(&port->ref_lock);

	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	kmem_free(port, sizeof (vsw_port_t));

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Attach a logical domain channel (ldc) under a specified port.
 *
 * Allocates a channel structure for 'ldc_id', creates its pool of
 * receive mblks, initialises its locks and handshake state, initialises
 * the channel with the ldc framework, registers vsw_ldc_cb() as its
 * callback and links the channel at the head of the port's channel list.
 *
 * On failure everything which was set up is torn down again, including
 * the ldc handle and the registered callback.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t 		*vswp = port->p_vswp;
	vsw_ldc_list_t *ldcl = &port->p_ldclist;
	vsw_ldc_t 	*ldcp = NULL;
	ldc_attr_t 	attr;
	ldc_status_t	istatus;
	int 		status = DDI_FAILURE;
	int		rv;
	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
				PROG_callback = 0x2}
			progress;

	progress = PROG_init;

	D1(vswp, "%s: enter", __func__);

	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
	if (ldcp == NULL) {
		DERR(vswp, "%s: kmem_zalloc failed", __func__);
		return (1);
	}
	ldcp->ldc_id = ldc_id;

	/* allocate pool of receive mblks */
	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
	if (rv) {
		DWARN(vswp, "%s: unable to create free mblk pool for"
		    " channel %ld (rv %d)", __func__, ldc_id, rv);
		kmem_free(ldcp, sizeof (vsw_ldc_t));
		return (1);
	}

	progress |= PROG_mblks;

	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);

	/* required for handshake with peer */
	ldcp->local_session = (uint64_t)ddi_get_lbolt();
	ldcp->peer_session = 0;
	ldcp->session_status = 0;

	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
	ldcp->hss_id = 1;	/* Initial handshake session id */

	/* only set for outbound lane, inbound set by peer */
	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
	vsw_set_lane_attr(vswp, &ldcp->lane_out);

	attr.devclass = LDC_DEV_NT_SVC;
	attr.instance = ddi_get_instance(vswp->dip);
	attr.mode = LDC_MODE_UNRELIABLE;
	attr.mtu = VSW_LDC_MTU;
	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
		    __func__, ldc_id, status);
		goto ldc_attach_fail;
	}

	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
		    __func__, ldc_id, status);
		/*
		 * PROG_callback is not set on this path, so the handle
		 * has to be released here rather than at the fail label.
		 */
		(void) ldc_fini(ldcp->ldc_handle);
		goto ldc_attach_fail;
	}

	progress |= PROG_callback;

	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		mutex_destroy(&ldcp->status_lock);
		goto ldc_attach_fail;
	}

	ldcp->ldc_status = istatus;
	ldcp->ldc_port = port;
	ldcp->ldc_vswp = vswp;

	/* link it into the list of channels for this port */
	WRITE_ENTER(&ldcl->lockrw);
	ldcp->ldc_next = ldcl->head;
	ldcl->head = ldcp;
	ldcl->num_ldcs++;
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);

ldc_attach_fail:
	/*
	 * PROG_callback implies that both ldc_init() and
	 * ldc_reg_callback() succeeded. Unregister the callback before
	 * the locks it uses are destroyed, and release the handle so
	 * that it is not leaked.
	 */
	if (progress & PROG_callback) {
		(void) ldc_unreg_callback(ldcp->ldc_handle);
		(void) ldc_fini(ldcp->ldc_handle);
	}

	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_cblock);

	cv_destroy(&ldcp->drain_cv);

	rw_destroy(&ldcp->lane_in.dlistrw);
	rw_destroy(&ldcp->lane_out.dlistrw);

	if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) {
		if (vio_destroy_mblks(ldcp->rxh) != 0) {
			/*
			 * Something odd has happened, as the destroy
			 * will only fail if some mblks have been allocated
			 * from the pool already (which shouldn't happen)
			 * and have not been returned.
			 *
			 * Add the pool pointer to a list maintained in
			 * the device instance. Another attempt will be made
			 * to free the pool when the device itself detaches.
			 */
			cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld "
			    "failed and cannot destroy associated mblk "
			    "pool", vswp->instance, ldc_id);
			ldcp->rxh->nextp =  vswp->rxh;
			vswp->rxh = ldcp->rxh;
		}
	}
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->hss_lock);

	mutex_destroy(&ldcp->lane_in.seq_lock);
	mutex_destroy(&ldcp->lane_out.seq_lock);
	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (1);
}
/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 *
 * The channel with id 'ldc_id' is looked up on the port's channel list,
 * its lane resources are released, the channel is closed and finalised,
 * its mblk pool is destroyed (or queued on the device instance for a
 * later attempt), and the channel is unlinked from the list and freed.
 *
 * The list itself is not locked here; vsw_port_delete() calls this
 * routine with the writer lock on the port's channel list held.
 *
 * Returns 0 on success, 1 on failure (unknown id, or the channel could
 * not be closed). On failure the channel remains on the list.
 */
static int
vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t 		*vswp = port->p_vswp;
	vsw_ldc_t 	*ldcp, *prev_ldcp;
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	int 		rv;

	/* find the channel, remembering its predecessor for the unlink */
	prev_ldcp = NULL;
	for (ldcp = ldcl->head; ldcp != NULL;
	    prev_ldcp = ldcp, ldcp = ldcp->ldc_next) {
		if (ldcp->ldc_id == ldc_id) {
			break;
		}
	}

	/* specified ldc id not found */
	if (ldcp == NULL) {
		DERR(vswp, "%s: ldcp = NULL", __func__);
		return (1);
	}

	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

	/*
	 * Before we can close the channel we must release any mapped
	 * resources (e.g. drings).
	 */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	/*
	 * If the close fails we are in serious trouble, as won't
	 * be able to delete the parent port.
	 */
	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
		DERR(vswp, "%s: error %d closing channel %lld",
		    __func__, rv, ldcp->ldc_id);
		return (1);
	}

	(void) ldc_fini(ldcp->ldc_handle);

	ldcp->ldc_status = LDC_INIT;
	ldcp->ldc_handle = NULL;
	ldcp->ldc_vswp = NULL;

	if (ldcp->rxh != NULL) {
		if (vio_destroy_mblks(ldcp->rxh)) {
			/*
			 * Mostly likely some mblks are still in use and
			 * have not been returned to the pool. Add the pool
			 * to the list maintained in the device instance.
			 * Another attempt will be made to destroy the pool
			 * when the device detaches.
			 */
			ldcp->rxh->nextp =  vswp->rxh;
			vswp->rxh = ldcp->rxh;
		}
	}

	/*
	 * Unlink it from the list. This has to update the list itself
	 * (head or predecessor), as the structure is freed below and
	 * must not remain reachable from the port.
	 */
	if (prev_ldcp == NULL)
		ldcl->head = ldcp->ldc_next;
	else
		prev_ldcp->ldc_next = ldcp->ldc_next;
	ldcl->num_ldcs--;

	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_cblock);
	cv_destroy(&ldcp->drain_cv);
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->hss_lock);
	mutex_destroy(&ldcp->lane_in.seq_lock);
	mutex_destroy(&ldcp->lane_out.seq_lock);
	mutex_destroy(&ldcp->status_lock);
	rw_destroy(&ldcp->lane_in.dlistrw);
	rw_destroy(&ldcp->lane_out.dlistrw);

	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (0);
}
/*
 * Open and attempt to bring up the channel. Note that channel
 * can only be brought up if peer has also opened channel.
 *
 * If the channel is found to be UP after the ldc_up() call a
 * VSW_CONN_UP connection event is raised for it.
 *
 * Returns 1 if the channel cannot be opened, its status cannot be read
 * or is unexpected, or ldc_up() fails; otherwise returns 0.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
	vsw_t 		*vswp = ldcp->ldc_vswp;
	ldc_status_t	istatus = 0;
	int		rv;
	int		chan_up;

	D1(vswp, "%s: enter", __func__);

	LDC_ENTER_LOCK(ldcp);

	/* don't start at 0 in case clients don't like that */
	ldcp->next_ident = 1;

	if ((rv = ldc_open(ldcp->ldc_handle)) != 0) {
		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
		    __func__, ldcp->ldc_id, rv);
		goto init_fail;
	}

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		goto init_fail;
	}

	/* a freshly opened channel is expected to be OPEN or READY */
	if ((istatus != LDC_OPEN) && (istatus != LDC_READY)) {
		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
		    __func__, ldcp->ldc_id, istatus);
		goto init_fail;
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = istatus;
	mutex_exit(&ldcp->status_lock);

	if ((rv = ldc_up(ldcp->ldc_handle)) != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
		goto init_fail;
	}

	/*
	 * ldc_up() call is non-blocking so need to explicitly
	 * check channel status to see if in fact the channel
	 * is UP.
	 */
	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		mutex_exit(&ldcp->status_lock);
		goto init_fail;
	}

	chan_up = (ldcp->ldc_status == LDC_UP);
	if (chan_up) {
		/*
		 * NOTE(review): istatus still holds the status read
		 * before ldc_up() was called - confirm that this is the
		 * value meant to be logged.
		 */
		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
		    ldcp->ldc_id, istatus);
	}
	mutex_exit(&ldcp->status_lock);
	LDC_EXIT_LOCK(ldcp);

	if (chan_up) {
		/* raised only after both locks have been dropped */
		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		return (0);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);

init_fail:
	LDC_EXIT_LOCK(ldcp);
	return (1);
}
/*
 * Disable callbacks on the channel and record its status as LDC_INIT.
 *
 * Returns 0 on success, 1 if the callbacks could not be disabled (in
 * which case the recorded status is left untouched).
 */
static int
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	int	err;

	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

	LDC_ENTER_LOCK(ldcp);

	err = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
	if (err == 0) {
		mutex_enter(&ldcp->status_lock);
		ldcp->ldc_status = LDC_INIT;
		mutex_exit(&ldcp->status_lock);
	} else {
		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
		    "interrupts (rv = %d)\n", ldcp->ldc_id, err);
	}

	LDC_EXIT_LOCK(ldcp);

	if (err != 0)
		return (1);

	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);

	return (0);
}
/*
 * Run vsw_ldc_init() on every channel under the specified port, with
 * the reader lock on the port's channel list held. Failures of
 * individual channels are ignored.
 *
 * Always returns 0.
 */
static int
vsw_init_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*chans = &port->p_ldclist;
	vsw_ldc_t	*chan;

	READ_ENTER(&chans->lockrw);
	for (chan = chans->head; chan != NULL; chan = chan->ldc_next)
		(void) vsw_ldc_init(chan);
	RW_EXIT(&chans->lockrw);

	return (0);
}
/*
 * Run vsw_ldc_uninit() on every channel under the specified port, with
 * the reader lock on the port's channel list held. Failures of
 * individual channels are ignored.
 *
 * Always returns 0.
 */
static int
vsw_uninit_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*chans = &port->p_ldclist;
	vsw_ldc_t	*chan;

	D1(NULL, "vsw_uninit_ldcs: enter\n");

	READ_ENTER(&chans->lockrw);
	for (chan = chans->head; chan != NULL; chan = chan->ldc_next)
		(void) vsw_ldc_uninit(chan);
	RW_EXIT(&chans->lockrw);

	D1(NULL, "vsw_uninit_ldcs: exit\n");

	return (0);
}
/*
 * Wait until the callback(s) associated with the ldcs under the specified
 * port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * A short explanation of what we are doing below..
 *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
 * there is a tiny window here - before the callback is able to get the lock
 * on the channel it is interrupted and this function gets to execute. It
 * sees that the ref count is zero and believes it is free to delete the
 * associated data structures.
 *
 * We get around this by taking advantage of the fact that before the ldc
 * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If we attempt to
 * unregister a callback while this active flag is set then the unregister
 * will fail with EWOULDBLOCK.
 *
 * If the unregister fails we do a cv_timedwait. We will either be signaled
 * by the callback as it is exiting (note we have to wait a short period to
 * allow the callback to return fully to the ldc framework and it to clear
 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
 * callback.
 *
 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
 * the case where the callback has finished but the ldc framework has not yet
 * cleared the active flag. In this case we would never get a cv_signal.
 *
 * Always returns 0.
 */
static int
vsw_drain_ldcs(vsw_port_t *port)
{
	vsw_t		*vswp = port->p_vswp;
	vsw_ldc_list_t	*chans = &port->p_ldclist;
	vsw_ldc_t	*chan;

	D1(vswp, "%s: enter", __func__);

	READ_ENTER(&chans->lockrw);

	for (chan = chans->head; chan != NULL; chan = chan->ldc_next) {
		mutex_enter(&chan->drain_cv_lock);

		/* prompt active callbacks to quit */
		chan->drain_state = VSW_LDC_DRAINING;

		/*
		 * A successful unregister tells us that no callback is
		 * running, or scheduled to run, for this channel, so the
		 * next channel in the list can be looked at.
		 */
		if (ldc_unreg_callback(chan->ldc_handle) == 0) {
			D2(vswp, "%s: unreg callback for chan %ld", __func__,
			    chan->ldc_id);
			mutex_exit(&chan->drain_cv_lock);
			continue;
		}

		/*
		 * Getting here means that a callback either 1) is
		 * currently executing, 2) is about to start (i.e.
		 * the ldc framework has set the active flag but
		 * has not actually invoked the callback yet), or 3)
		 * has finished and has returned to the ldc framework
		 * but the ldc framework has not yet cleared the
		 * active bit.
		 *
		 * Keep retrying the unregister until it stops
		 * reporting EWOULDBLOCK.
		 */
		while (ldc_unreg_callback(chan->ldc_handle) == EWOULDBLOCK) {
			(void) cv_timedwait(&chan->drain_cv,
			    &chan->drain_cv_lock, lbolt + hz);
		}

		mutex_exit(&chan->drain_cv_lock);
		D2(vswp, "%s: unreg callback for chan %ld after "
		    "timeout", __func__, chan->ldc_id);
	}

	RW_EXIT(&chans->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
/*
 * Wait until all tasks which reference this port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * Returns 0 once the marker task has run, 1 if there is no taskq or the
 * marker task could not be dispatched. In the failure case the port is
 * left in the VSW_PORT_DETACHING state.
 */
static int
vsw_drain_port_taskq(vsw_port_t *port)
{
	vsw_t	*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Flag the port as being detached, then queue a marker task;
	 * when the marker runs we know that all relevant tasks have
	 * completed.
	 */
	mutex_enter(&port->state_lock);
	port->state = VSW_PORT_DETACHING;

	if ((vswp->taskq_p != NULL) &&
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
	    port, DDI_NOSLEEP) == DDI_SUCCESS)) {
		/*
		 * The marker task moves the port to DETACHABLE
		 * and signals state_cv when it runs.
		 */
		while (port->state != VSW_PORT_DETACHABLE)
			cv_wait(&port->state_cv, &port->state_lock);

		mutex_exit(&port->state_lock);

		D1(vswp, "%s: exit", __func__);

		return (0);
	}

	DERR(vswp, "%s: unable to dispatch marker task",
	    __func__);
	mutex_exit(&port->state_lock);
	return (1);
}
/*
 * Taskq marker used by vsw_drain_port_taskq(). 'arg' is the port being
 * detached; the task moves it to the VSW_PORT_DETACHABLE state and
 * signals state_cv.
 */
static void
vsw_marker_task(void *arg)
{
	vsw_port_t	*detaching = arg;

	D1(detaching->p_vswp, "%s: enter", __func__);

	mutex_enter(&detaching->state_lock);

	/*
	 * No further tasks should be dispatched which reference
	 * this port so ok to mark it as safe to detach.
	 */
	detaching->state = VSW_PORT_DETACHABLE;
	cv_signal(&detaching->state_cv);

	mutex_exit(&detaching->state_lock);

	D1(detaching->p_vswp, "%s: exit", __func__);
}
/*
 * Walk the port list of 'vswp' looking for the port whose instance
 * number is 'p_instance'.
 *
 * Returns the matching port, or NULL if there is none. No lock is taken
 * here.
 */
static vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
	vsw_port_t	*cand = vswp->plist.head;

	while ((cand != NULL) && (cand->p_instance != p_instance))
		cand = cand->p_next;

	if (cand != NULL)
		D2(vswp, "vsw_lookup_port: found p_instance\n");

	return (cand);
}
/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
 *
 * Only the list linkage and the port count are updated; the port
 * structure itself is left untouched. No lock is taken here.
 */
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*curr_p, *prev_p = NULL;

	for (curr_p = plist->head; curr_p != NULL;
	    prev_p = curr_p, curr_p = curr_p->p_next) {
		if (curr_p != port)
			continue;

		/* no predecessor means the port is at the head */
		if (prev_p == NULL)
			plist->head = curr_p->p_next;
		else
			prev_p->p_next = curr_p->p_next;

		plist->num_ports--;
		return (0);
	}

	/* empty list, or the port is not on it */
	return (1);
}
/*
 * Interrupt handler for ldc messages.
 *
 * 'event' is the bitmask of LDC_EVT_* events being reported and 'arg'
 * is the channel (vsw_ldc_t) the callback was registered for.
 *
 * Events are ignored if the channel's recorded status is LDC_INIT or it
 * has no handle. Otherwise UP and DOWN/RESET events are turned into
 * connection events and READ events cause the pending packets to be
 * processed. Before returning, any thread draining the channel is
 * signalled.
 *
 * Always returns LDC_SUCCESS.
 */
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
	vsw_t 		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	mutex_enter(&ldcp->ldc_cblock);

	mutex_enter(&ldcp->status_lock);
	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
		mutex_exit(&ldcp->status_lock);
		mutex_exit(&ldcp->ldc_cblock);
		return (LDC_SUCCESS);
	}
	mutex_exit(&ldcp->status_lock);

	if (event & LDC_EVT_UP) {
		/*
		 * Channel has come up.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
	}

	if (event & LDC_EVT_READ) {
		/*
		 * Data available for reading.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) data READ",
		    __func__, ldcp->ldc_id, event);

		vsw_process_pkt(ldcp);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);

		/* nothing below applies once a READ has been handled */
		goto vsw_cb_exit;
	}

	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
	}

	/*
	 * Catch either LDC_EVT_WRITE which we don't support or any
	 * unknown event.
	 */
	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
	    | LDC_EVT_DOWN | LDC_EVT_READ)) {
		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
	}

vsw_cb_exit:
	mutex_exit(&ldcp->ldc_cblock);

	/*
	 * Let the drain function know we are finishing if it
	 * is waiting.
	 */
	mutex_enter(&ldcp->drain_cv_lock);
	if (ldcp->drain_state == VSW_LDC_DRAINING)
		cv_signal(&ldcp->drain_cv);
	mutex_exit(&ldcp->drain_cv_lock);

	return (LDC_SUCCESS);
}
/*
 * Reinitialise data structures associated with the channel.
 *
 * The resources of both lanes are freed and their state cleared, the
 * parent port's fdb entry and multicast addresses are removed, and the
 * channel's session and handshake fields are reset.
 */
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*parent = ldcp->ldc_port;
	vsw_ldc_list_t	*chans = &parent->p_ldclist;

	D1(vswp, "%s: enter", __func__);

	/* lane resources are released with the channel list read-locked */
	READ_ENTER(&chans->lockrw);

	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);

	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	RW_EXIT(&chans->lockrw);

	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;

	/*
	 * Take the parent port out of any multicast groups it may
	 * have registered with. The client has to resend its
	 * multicast add command once the handshake has completed.
	 */
	(void) vsw_del_fdb(vswp, parent);

	vsw_del_mcst_port(parent);

	/* start over from the first handshake milestone */
	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hcnt = 0;
	ldcp->hphase = VSW_MILESTONE0;

	D1(vswp, "%s: exit", __func__);
}
/*
 * Process a connection event.
 *
 * The event is not handled here; it is packaged up and handed to
 * vsw_conn_task() via the taskq. Redundant events (a reset/restart while
 * one is already pending, or an UP for a channel already recorded as UP)
 * are dropped.
 *
 * Note - care must be taken to ensure that this function is
 * not called with the dlistrw lock held.
 */
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*req;
	int		is_reset;
	int		stale_up;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
	 * being received by the callback handler, or a ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 */
	is_reset = ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART));
	if (is_reset && ldstub((uint8_t *)&ldcp->reset_active))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and don't need to dispatch a new task.
	 *
	 * The reason for this check is that when we do a ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see has it come up (ldc_up() is asynch and will
	 * complete at some undefined time), and take the appropriate
	 * action.
	 *
	 * The flip side of this is that we may get a LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	stale_up = (evt == VSW_CONN_UP) &&
	    ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0));
	mutex_exit(&ldcp->status_lock);

	if (stale_up)
		return;

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	mutex_enter(&ldcp->hss_lock);
	ldcp->hss_id++;
	mutex_exit(&ldcp->hss_lock);

	ASSERT(vswp->taskq_p != NULL);

	req = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP);
	if (req != NULL) {
		req->evt = evt;
		req->ldcp = ldcp;

		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, req,
		    DDI_NOSLEEP) == DDI_SUCCESS) {
			/* the task now owns 'req' */
			D1(vswp, "%s: exit", __func__);
			return;
		}

		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(req, sizeof (vsw_conn_evt_t));
	} else {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
	}

	/*
	 * Have mostly likely failed due to memory shortage. Clear the flag so
	 * that future requests will at least be attempted and will hopefully
	 * succeed.
	 */
	if (is_reset)
		ldcp->reset_active = 0;
}
/*
 * Taskq callback which services a connection event (up, reset or
 * restart) for a single channel.
 *
 * arg points at a vsw_conn_evt_t naming the channel and the event; it is
 * freed here as soon as its contents have been copied out.
 *
 * The channel's status_lock is held for the duration of the processing.
 *
 * NOTE(review): the early exits below (status read failure, handshake
 * limit exceeded) leave reset_active untouched; only the normal path
 * clears it for RESET/RESTART events.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*evtp = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = evtp->ldcp;
	uint16_t	evt = evtp->evt;
	vsw_t		*vswp = ldcp->ldc_vswp;
	ldc_status_t	lstatus;

	D1(vswp, "%s: enter", __func__);

	/* everything needed has been copied out, release the event */
	kmem_free(evtp, sizeof (vsw_conn_evt_t));

	mutex_enter(&ldcp->status_lock);

	if (ldc_status(ldcp->ldc_handle, &lstatus) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		goto unlock;
	}

	/*
	 * A restart on a channel which is currently UP first takes the
	 * channel DOWN so that the underlying ldc queue gets flushed.
	 */
	if (evt == VSW_CONN_RESTART) {
		if (lstatus == LDC_UP)
			(void) ldc_down(ldcp->ldc_handle);
	}

	/* re-initialise all the data structures tied to the channel */
	vsw_ldc_reinit(ldcp);

	/*
	 * Now bring the channel (back) up. If it is already UP this is
	 * effectively a no-op and does no harm.
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * See whether the channel actually came UP; that only happens
	 * when the peer has done its own ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &lstatus) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		goto unlock;
	}

	ldcp->ldc_status = lstatus;

	/* channel is UP: restart the handshake by sending version info */
	if (lstatus == LDC_UP) {
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			goto unlock;
		}

		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * The version message never went out, so this does
			 * not count as a valid restart attempt.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Processing is complete, so clear the in-progress flag.
	 *
	 * The dispatch above may have failed (most likely through lack of
	 * memory); the flag is cleared regardless so that later requests
	 * are at least attempted.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
	return;

unlock:
	mutex_exit(&ldcp->status_lock);
}
/*
 * Decide whether the handshake event identified by 'flag' is legal given
 * the current handshake phase and the state of the lane selected by 'dir'
 * (INBOUND selects lane_in, anything else lane_out).
 *
 * Returns 0 if the event is legal. For ACK/NACK events the matching
 * *_INFO_SENT bit is cleared from the lane state before returning.
 *
 * Returns 1 if the event is not legal. For every recognised flag this
 * also requests a handshake restart; an unrecognised flag is only logged.
 * The lane state is left unmodified whenever 1 is returned.
 */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	lstate;
	uint64_t	phase;

	/* work on a private copy; it is only written back on success */
	lstate = (dir == INBOUND) ?
	    ldcp->lane_in.lstate : ldcp->lane_out.lstate;
	phase = ldcp->hphase;

	switch (flag) {
	case VSW_VER_INFO_RECV:
		if (phase > VSW_MILESTONE0) {
			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			goto restart;
		}
		break;

	case VSW_VER_ACK_RECV:
	case VSW_VER_NACK_RECV:
		if ((lstate & VSW_VER_INFO_SENT) == 0) {
			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
			    " or VER_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			goto restart;
		}
		lstate &= ~VSW_VER_INFO_SENT;
		break;

	case VSW_ATTR_INFO_RECV:
		if (!((phase >= VSW_MILESTONE1) && (phase < VSW_MILESTONE2))) {
			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			goto restart;
		}
		break;

	case VSW_ATTR_ACK_RECV:
	case VSW_ATTR_NACK_RECV:
		if ((lstate & VSW_ATTR_INFO_SENT) == 0) {
			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
			    " or ATTR_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			goto restart;
		}
		lstate &= ~VSW_ATTR_INFO_SENT;
		break;

	case VSW_DRING_INFO_RECV:
		if (phase < VSW_MILESTONE1) {
			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			goto restart;
		}
		break;

	case VSW_DRING_ACK_RECV:
	case VSW_DRING_NACK_RECV:
		if ((lstate & VSW_DRING_INFO_SENT) == 0) {
			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
			    " or DRING_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			goto restart;
		}
		lstate &= ~VSW_DRING_INFO_SENT;
		break;

	case VSW_RDX_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			goto restart;
		}
		break;

	case VSW_RDX_ACK_RECV:
	case VSW_RDX_NACK_RECV:
		if ((lstate & VSW_RDX_INFO_SENT) == 0) {
			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
			    " or RDX_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			goto restart;
		}
		lstate &= ~VSW_RDX_INFO_SENT;
		break;

	case VSW_MCST_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			goto restart;
		}
		break;

	default:
		/* unknown flags are rejected without restarting */
		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
		    ldcp->ldc_id, flag);
		return (1);
	}

	/* event was legal: publish the (possibly updated) lane state */
	if (dir == INBOUND)
		ldcp->lane_in.lstate = lstate;
	else
		ldcp->lane_out.lstate = lstate;

	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

	return (0);

restart:
	/* event arrived at an illegal time: start the handshake over */
	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
	return (1);
}
/*
 * Advance the handshake state machine for a channel, if the lane state
 * flags show that the requirements of the current phase (hphase) have
 * been met. Moving to a new phase triggers the next outbound handshake
 * message where one is required.
 */
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(ldcp->lane_in.lstate);
	DUMP_FLAGS(ldcp->lane_out.lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * Nothing sent to our peer yet, so kick off the
		 * handshake from our side.
		 */
		if (ldcp->lane_out.lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * This milestone is only passed once version info has
		 * been successfully negotiated in both directions.
		 */
		if (!(ldcp->lane_in.lstate & VSW_VER_ACK_SENT) ||
		    !(ldcp->lane_out.lstate & VSW_VER_ACK_RECV))
			break;

		D2(vswp, "%s: (chan %lld) leaving milestone 0",
		    __func__, ldcp->ldc_id);

		/*
		 * The following milestone is passed when attribute
		 * information has been successfully exchanged.
		 */
		ldcp->hphase = VSW_MILESTONE1;
		vsw_send_attr(ldcp);
		break;

	case VSW_MILESTONE1:
		/*
		 * This milestone is only passed once we have ACKed the
		 * peer's attribute information.
		 */
		if (!(ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT))
			break;

		ldcp->hphase = VSW_MILESTONE2;

		/*
		 * A peer which asked for descriptor rings is sent our
		 * ring info; otherwise a private ring backed by an
		 * internal buffer is used and nothing is sent here.
		 */
		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
			vsw_send_dring_info(ldcp);
		break;

	case VSW_MILESTONE2:
		/*
		 * A peer using descriptor rings must have given us valid
		 * dring info (which we ACKed) before we can move on. A
		 * peer not using descriptor rings passes straight through.
		 */
		if ((ldcp->lane_in.xfer_mode != VIO_DRING_MODE) ||
		    (ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)) {
			D2(vswp, "%s: (chan %lld) leaving milestone 2",
			    __func__, ldcp->ldc_id);

			ldcp->hphase = VSW_MILESTONE3;
			vsw_send_rdx(ldcp);
		}
		break;

	case VSW_MILESTONE3:
		/*
		 * Passed once all parameters have been exchanged and RDX
		 * has gone in both directions. The outbound lane is then
		 * marked as available for data transmission.
		 */
		if (!((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV))) {
			D2(vswp, "%s: still in milestone 3 (0x%llx :"
			    " 0x%llx", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
			break;
		}

		D2(vswp, "%s: (chan %lld) leaving milestone 3",
		    __func__, ldcp->ldc_id);
		D2(vswp, "%s: ** handshake complete (0x%llx : "
		    "0x%llx) **", __func__, ldcp->lane_in.lstate,
		    ldcp->lane_out.lstate);

		ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
		ldcp->hphase = VSW_MILESTONE4;
		/* handshake succeeded, so reset the attempt counter */
		ldcp->hcnt = 0;
		DISPLAY_STATE();
		break;

	case VSW_MILESTONE4:
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
		break;
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}
/*
 * Check if the major version carried in a version message is supported.
 *
 * Returns 0 if a matching major number is found. The minor field of the
 * message is lowered to our supported minor if the peer asked for a
 * higher one.
 *
 * Returns 1 if the major number can't be matched exactly. In that case
 * the message's major/minor fields are overwritten with the next lowest
 * pairing we support, or with zero if there is no lower version left to
 * offer.
 *
 * NOTE(review): the "next lowest" search assumes vsw_versions[] is sorted
 * with the highest version first; the table is defined outside this
 * function - confirm.
 */
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
	int	i;

	D1(NULL, "vsw_supported_version: enter");

	for (i = 0; i < VSW_NUM_VER; i++) {
		if (vsw_versions[i].ver_major == vp->ver_major) {
			/*
			 * Matching major version found. Update
			 * minor number if necessary.
			 */
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value"
				    " from %d to %d", __func__,
				    vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}

			return (0);
		}

		if (vsw_versions[i].ver_major < vp->ver_major) {
			/*
			 * The requested major is not supported but we do
			 * have a lower one. Hand back that complete
			 * major/minor pairing: leaving the unsupported
			 * major in place would make the caller NACK (or
			 * re-send INFO) with a version we cannot speak.
			 */
			D2(NULL, "%s: adjusting major and minor"
			    " values to %d, %d", __func__,
			    vsw_versions[i].ver_major,
			    vsw_versions[i].ver_minor);
			vp->ver_major = vsw_versions[i].ver_major;
			vp->ver_minor = vsw_versions[i].ver_minor;

			return (1);
		}
	}

	/* No match was possible, zero out fields */
	vp->ver_major = 0;
	vp->ver_minor = 0;

	D1(NULL, "vsw_supported_version: exit");

	return (1);
}
/*
 * Main routine for processing messages received over LDC.
 *
 * arg is the channel (vsw_ldc_t) to read from. Messages are read one at a
 * time until the channel reports no more data or a read fails, and each
 * one is handed to the ctrl, data or error handler according to the
 * message type in its tag.
 */
static void
vsw_process_pkt(void *arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	size_t		msglen;
	vio_msg_tag_t	tag;
	def_msg_t	dmsg;
	int		rv = 0;

	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	/*
	 * If channel is up read messages until channel is empty.
	 */
	do {
		msglen = sizeof (dmsg);
		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);

		if (rv != 0) {
			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
			    "len(%d)\n", __func__, ldcp->ldc_id,
			    rv, msglen);

			/* channel has been reset */
			if (rv == ECONNRESET)
				vsw_process_conn_evt(ldcp, VSW_CONN_RESET);

			/*
			 * Whatever the error, the contents of dmsg can't
			 * be trusted, so stop rather than dispatching a
			 * message which was never (fully) read.
			 */
			break;
		}

		if (msglen == 0) {
			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
			    ldcp->ldc_id);
			break;
		}

		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
		    ldcp->ldc_id, msglen);

		/*
		 * Figure out what sort of packet we have gotten by
		 * examining the msg tag, and then switch it appropriately.
		 */
		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));

		switch (tag.vio_msgtype) {
		case VIO_TYPE_CTRL:
			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_DATA:
			vsw_process_data_pkt(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_ERR:
			vsw_process_err_pkt(ldcp, &dmsg, tag);
			break;
		default:
			/*
			 * The format must be one string; previously its
			 * second half was passed as an argument.
			 */
			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
			    __func__, (unsigned long)tag.vio_msgtype,
			    (unsigned long)ldcp->ldc_id);
			break;
		}
	} while (msglen);

	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
}
/*
 * Dispatch a task to process a VIO control message.
 *
 * ldcp is the channel the message arrived on, cpkt points at the raw
 * message (a def_msg_t sized buffer) and tag is a copy of its tag.
 *
 * RDX ACK messages are handled immediately in the caller's context. All
 * other control messages are copied into a vsw_ctrl_task_t, stamped with
 * the current handshake session id and queued to the taskq, where
 * vsw_process_ctrl_pkt() takes ownership of (and frees) the task.
 *
 * If the task can't be allocated or dispatched the handshake is
 * restarted. If the port is being detached the message is dropped.
 */
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
{
	vsw_ctrl_task_t	*ctaskp = NULL;
	vsw_port_t	*port = ldcp->ldc_port;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * We need to handle RDX ACK messages in-band as once they
	 * are exchanged it is possible that we will get an
	 * immediate (legitimate) data packet.
	 */
	if ((tag.vio_subtype_env == VIO_RDX) &&
	    (tag.vio_subtype == VIO_SUBTYPE_ACK)) {

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;

		D2(vswp, "%s (%ld) handling RDX_ACK in place "
		    "(ostate 0x%llx : hphase %d)", __func__,
		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
		vsw_next_milestone(ldcp);
		return;
	}

	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);

	if (ctaskp == NULL) {
		DERR(vswp, "%s: unable to alloc space for ctrl"
		    " msg", __func__);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	ctaskp->ldcp = ldcp;
	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));

	/* tag the task so it can be recognised as stale after a restart */
	mutex_enter(&ldcp->hss_lock);
	ctaskp->hss_id = ldcp->hss_id;
	mutex_exit(&ldcp->hss_lock);

	/*
	 * Dispatch task to processing taskq if port is not in
	 * the process of being detached.
	 */
	mutex_enter(&port->state_lock);

	if (port->state != VSW_PORT_INIT) {
		DWARN(vswp, "%s: port %d detaching, not dispatching "
		    "task", __func__, port->p_instance);
		mutex_exit(&port->state_lock);

		/*
		 * Nothing was queued, so nobody else will ever free the
		 * task; release it here to avoid leaking it.
		 */
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));

		D1(vswp, "%s: exit", __func__);
		return;
	}

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p,
	    vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
	    != DDI_SUCCESS)) {
		DERR(vswp, "%s: unable to dispatch task to taskq",
		    __func__);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		mutex_exit(&port->state_lock);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	mutex_exit(&port->state_lock);

	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
	    ldcp->ldc_id);
	D1(vswp, "%s: exit", __func__);
}
/*
* Process a VIO ctrl message. Invoked from taskq.
*/
static void
vsw_process_ctrl_pkt(void *arg)
{
vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg;
vsw_ldc_t *ldcp = ctaskp->ldcp;
vsw_t *vswp = ldcp->ldc_vswp;
vio_msg_tag_t tag;
uint16_t env;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
env = tag.vio_subtype_env;
/* stale pkt check */
mutex_enter(&ldcp->hss_lock);
if (ctaskp->hss_id < ldcp->hss_id) {
DWARN(vswp, "%s: discarding stale packet belonging to"
" earlier (%ld) handshake session", __func__,
ctaskp->hss_id);
mutex_exit(&ldcp->hss_lock);
return;
}
mutex_exit(&ldcp->hss_lock);
/* session id check */
if (ldcp->session_status & VSW_PEER_SESSION) {
if (ldcp->peer_session != tag.vio_sid) {
DERR(vswp, "%s (chan %d): invalid session id (%llx)",
__func__, ldcp->ldc_id, tag.vio_sid);
kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
}
/*
* Switch on vio_subtype envelope, then let lower routines
* decide if its an INFO, ACK or NACK packet.
*/
switch (env) {
case VIO_VER_INFO:
vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_DRING_REG:
vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_DRING_UNREG:
vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_ATTR_INFO:
vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
break;
case VNET_MCAST_INFO:
vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
break;
case VIO_RDX:
vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
break;
default:
DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
__func__, env);
}
kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
 * Version negotiation. We can end up here either because our peer
 * has responded to a handshake message we have sent it, or our peer
 * has initiated a handshake with us. If it's the former then it can only
 * be an ACK or NACK, if it's the latter it can only be an INFO.
 *
 * If it's an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it's a NACK we see if we can specify another
 * version, if we can't we stop.
 *
 * If it is an INFO we reset all params associated with communication
 * in that direction over this channel (remember connection is
 * essentially 2 independent simplex channels).
 *
 * pkt points at the received message, which is modified in place and
 * re-used as the reply where one is sent.
 */
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_ver_msg_t	*ver_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/version packet so
	 * cast it into the correct structure.
	 */
	ver_pkt = (vio_ver_msg_t *)pkt;

	switch (ver_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");

		/*
		 * Record the session id, which we will use from now
		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if channel was reset.
		 */
		if ((ldcp->session_status & VSW_PEER_SESSION) &&
		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
			DERR(vswp, "%s: updating session id for chan %lld "
			    "from %llx to %llx", __func__, ldcp->ldc_id,
			    ldcp->peer_session, ver_pkt->tag.vio_sid);
		}

		ldcp->peer_session = ver_pkt->tag.vio_sid;
		ldcp->session_status |= VSW_PEER_SESSION;

		/* Legal message at this time ? */
		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
			return;

		/*
		 * First check the device class. Currently only expect
		 * to be talking to a network device. In the future may
		 * also talk to another switch.
		 */
		if (ver_pkt->dev_class != VDEV_NETWORK) {
			DERR(vswp, "%s: illegal device class %d", __func__,
			    ver_pkt->dev_class);

			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
			vsw_next_milestone(ldcp);
			return;
		} else {
			ldcp->dev_class = ver_pkt->dev_class;
		}

		/*
		 * Now check the version.
		 */
		if (vsw_supported_version(ver_pkt) == 0) {
			/*
			 * Support this major version and possibly
			 * adjusted minor version.
			 */

			D2(vswp, "%s: accepted ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store accepted values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
		} else {
			/*
			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any
			 * more versions then they will be set to zero).
			 */

			D2(vswp, "%s: replying with ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store updated values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
		}

		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
		ver_pkt->tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
		    sizeof (vio_ver_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
			return;

		/*
		 * Store updated values. The peer is ACKing the version
		 * INFO we sent, so the agreed version belongs to the
		 * outbound lane; writing it to lane_in would clobber the
		 * version we accepted from the peer for the inbound lane.
		 */
		ldcp->lane_out.ver_major = ver_pkt->ver_major;
		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
		vsw_next_milestone(ldcp);

		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
			return;

		/*
		 * If our peer sent us a NACK with the ver fields set to
		 * zero then there is nothing more we can do. Otherwise see
		 * if we support either the version suggested, or a lesser
		 * one.
		 */
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			DERR(vswp, "%s: peer unable to negotiate any "
			    "further.", __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Check to see if we support this major version or
		 * a lower one. If we don't then maj/min will be set
		 * to zero.
		 */
		(void) vsw_supported_version(ver_pkt);
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			/* Nothing more we can do */
			DERR(vswp, "%s: version negotiation failed.\n",
			    __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
		} else {
			/* found a supported major version */
			ldcp->lane_out.ver_major = ver_pkt->ver_major;
			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

			D2(vswp, "%s: resending with updated values (%x, %x)",
			    __func__, ver_pkt->ver_major,
			    ver_pkt->ver_minor);

			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
		}
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    ver_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}
/*
 * Handle an attribute control message. The peer is either replying
 * (ACK/NACK) to an ATTR message we sent earlier, or is sending us its
 * own attribute INFO.
 *
 * INFO: if the message is legal at this point of the handshake and the
 * attributes pass vsw_check_attr(), they are recorded for the inbound
 * lane, the port's MAC address and transmit routine are set up, and the
 * message is ACKed. Unacceptable attributes are NACKed after the inbound
 * lane resources have been released.
 *
 * ACK/NACK: the event is recorded in the outbound lane state.
 *
 * In all of these cases the handshake state machine is then
 * re-evaluated.
 *
 * FUTURE: attributes may eventually be negotiated; for now attributes
 * we cannot accept are simply rejected.
 */
void
vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_attr_msg_t	*attr_pkt = (vnet_attr_msg_t *)pkt;
	vsw_port_t	*port = ldcp->ldc_port;
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	macaddr = 0;
	int		i;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (attr_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		/* is an attribute INFO legal right now ? */
		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
			return;

		/* unacceptable attributes get NACKed straight back */
		if (vsw_check_attr(attr_pkt, port)) {
			DERR(vswp, "%s (chan %d): invalid attributes",
			    __func__, ldcp->ldc_id);

			vsw_free_lane_resources(ldcp, INBOUND);

			attr_pkt->tag.vio_sid = ldcp->local_session;
			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
			    sizeof (vnet_attr_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
			return;
		}

		/* attributes are fine: remember them for this lane */
		ldcp->lane_in.mtu = attr_pkt->mtu;
		ldcp->lane_in.addr = attr_pkt->addr;
		ldcp->lane_in.addr_type = attr_pkt->addr_type;
		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;

		/*
		 * Unpack the 64 bit address into the port's ethernet
		 * address, least significant byte into the last octet.
		 */
		macaddr = ldcp->lane_in.addr;
		i = ETHERADDRL;
		while (i-- > 0) {
			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
			macaddr >>= 8;
		}

		/* add the forwarding database entry for this port/mac */
		(void) vsw_add_fdb(vswp, port);

		/* select the transmit routine matching the xfer mode */
		mutex_enter(&port->tx_lock);
		switch (ldcp->lane_in.xfer_mode) {
		case VIO_DRING_MODE:
			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
			port->transmit = vsw_dringsend;
			break;
		case VIO_DESC_MODE:
			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
			vsw_create_privring(ldcp);
			port->transmit = vsw_descrsend;
			break;
		default:
			/* any other mode leaves port->transmit untouched */
			break;
		}
		mutex_exit(&port->tx_lock);

		/* turn the message around as an ACK */
		attr_pkt->tag.vio_sid = ldcp->local_session;
		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);

		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;

		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
		    sizeof (vnet_attr_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    attr_pkt->tag.vio_subtype);
		break;
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
/*
* Process a dring info packet. We can end up here either because our peer
* has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
* peer has sent us a dring INFO message.
*
* If we get a valid/acceptable INFO packet (and we have already negotiated
* a version) we ACK back and update the lane state, otherwise we NACK back.
*
* FUTURE: nothing to stop client from sending us info on multiple dring's
* but for the moment we will just use the first one we are given.
*
*/
void
vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
vio_dring_reg_msg_t *dring_pkt;
vsw_t *vswp = ldcp->ldc_vswp;
ldc_mem_info_t minfo;
dring_info_t *dp, *dbp;
int dring_found = 0;
/*
* We know this is a ctrl/dring packet so
* cast it into the correct structure.
*/
dring_pkt = (vio_dring_reg_msg_t *)pkt;
D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
switch (dring_pkt->tag.vio_subtype) {
case VIO_SUBTYPE_INFO:
D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
return;
/*
* If the dring params are unacceptable then we NACK back.
*/
if (vsw_check_dring_info(dring_pkt)) {
DERR(vswp, "%s (%lld): invalid dring info",
__func__, ldcp->ldc_id);
vsw_free_lane_resources(ldcp, INBOUND);
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
return;
}
/*
* Otherwise, attempt to map in the dring using the
* cookie. If that succeeds we send back a unique dring
* identifier that the sending side will use in future
* to refer to this descriptor ring.
*/
dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
dp->num_descriptors = dring_pkt->num_descriptors;
dp->descriptor_size = dring_pkt->descriptor_size;
dp->options = dring_pkt->options;
dp->ncookies = dring_pkt->ncookies;
/*
* Note: should only get one cookie. Enforced in
* the ldc layer.
*/
bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
sizeof (ldc_mem_cookie_t));
D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
dp->num_descriptors, dp->descriptor_size);
D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
dp->options, dp->ncookies);
if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
dp->ncookies, dp->num_descriptors,
dp->descriptor_size, LDC_SHADOW_MAP,
&(dp->handle))) != 0) {
DERR(vswp, "%s: dring_map failed\n", __func__);
kmem_free(dp, sizeof (dring_info_t));
vsw_free_lane_resources(ldcp, INBOUND);
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
return;
}
if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
DERR(vswp, "%s: dring_addr failed\n", __func__);
kmem_free(dp, sizeof (dring_info_t));
vsw_free_lane_resources(ldcp, INBOUND);
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
vsw_next_milestone(ldcp);
return;
} else {
/* store the address of the pub part of ring */
dp->pub_addr = minfo.vaddr;
}
/* no private section as we are importing */
dp->priv_addr = NULL;
/*
* Using simple mono increasing int for ident at
* the moment.
*/
dp->ident = ldcp->next_ident;
ldcp->next_ident++;
dp->end_idx = 0;
dp->next = NULL;
/*
* Link it onto the end of the list of drings
* for this lane.
*/
if (ldcp->lane_in.dringp == NULL) {
D2(vswp, "%s: adding first INBOUND dring", __func__);
ldcp->lane_in.dringp = dp;
} else {
dbp = ldcp->lane_in.dringp;
while (dbp->next != NULL)
dbp = dbp->next;
dbp->next = dp;
}
/* acknowledge it */
dring_pkt->tag.vio_sid = ldcp->local_session;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
dring_pkt->dring_ident = dp->ident;
(void) vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_reg_msg_t), B_TRUE);
ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_ACK:
D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
return;
/*
* Peer is acknowledging our dring info and will have
* sent us a dring identifier which we will use to
* refer to this ring w.r.t. our peer.
*/
dp = ldcp->lane_out.dringp;
if (dp != NULL) {
/*
* Find the ring this ident should be associated
* with.
*/
if (vsw_dring_match(dp, dring_pkt)) {
dring_found = 1;
} else while (dp != NULL) {
if (vsw_dring_match(dp, dring_pkt)) {
dring_found = 1;
break;
}
dp = dp->next;
}
if (dring_found == 0) {
DERR(NULL, "%s: unrecognised ring cookie",
__func__);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
} else {
DERR(vswp, "%s: DRING ACK received but no drings "
"allocated", __func__);
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
return;
}
/* store ident */
dp->ident = dring_pkt->dring_ident;
ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
vsw_next_milestone(ldcp);
break;
case VIO_SUBTYPE_NACK:
D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
return;
ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
vsw_next_milestone(ldcp);
break;
default:
DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
dring_pkt->tag.vio_subtype);
}
D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
/*
 * Handle a dring unregister control message from our peer.
 *
 * Unregistering a dring is not supported at present, so whatever the
 * subtype of the message the handshake is simply restarted.
 */
void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	/* the envelope told us this is a ctrl/dring unregister message */
	vio_dring_unreg_msg_t	*unreg_pkt = (vio_dring_unreg_msg_t *)pkt;
	vsw_t			*vswp = ldcp->ldc_vswp;
	int			known_subtype = 1;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	switch (unreg_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    unreg_pkt->tag.vio_subtype);
		known_subtype = 0;
		break;
	}

	/* the restart warning is only logged for recognised subtypes */
	if (known_subtype)
		DWARN(vswp, "%s: restarting handshake..", __func__);

	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
 * Turn the multicast message 'pkt' into a NACK carrying our local session
 * id and send it back over channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to a single
 * statement and stays safe in an unbraced if/else; the caller supplies
 * the terminating semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
/*
 * Process a multicast request from a vnet.
 *
 * Vnet's specify a multicast address that they are interested in. This
 * address is used as a key into the hash table which forms the multicast
 * forwarding database (mFDB).
 *
 * The table keys are the multicast addresses, while the table entries
 * are pointers to lists of ports which wish to receive packets for the
 * specified multicast address.
 *
 * When a multicast packet is being switched we use the address as a key
 * into the hash table, and then walk the appropriate port list forwarding
 * the pkt to each port in turn.
 *
 * If a vnet is no longer interested in a particular multicast grouping
 * we simply find the correct location in the hash table and then delete
 * the relevant port from the port list.
 *
 * To deal with the case whereby a port is being deleted without first
 * removing itself from the lists in the hash table, we maintain a list
 * of multicast addresses the port has registered an interest in, within
 * the port structure itself. We then simply walk that list of addresses
 * using them as keys into the hash table and remove the port from the
 * appropriate lists.
 *
 * An INFO request is ACKed once all its addresses have been added or
 * removed. It is NACKed if it claims more addresses than the message can
 * carry, if any address is not a multicast address, or if the add/remove
 * fails.
 */
static void
vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_mcast_msg_t	*mcst_pkt;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = ldcp->ldc_vswp;
	int			i;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/mcast packet so
	 * cast it into the correct structure.
	 */
	mcst_pkt = (vnet_mcast_msg_t *)pkt;

	switch (mcst_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		/*
		 * Check if in correct state to receive a multicast
		 * message (i.e. handshake complete). If not reset
		 * the handshake.
		 */
		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
			return;

		/*
		 * The count comes straight from our peer, so make sure
		 * it doesn't claim more addresses than the message can
		 * hold before it is used to index mca[].
		 * NOTE(review): relies on mca being a true array member
		 * of vnet_mcast_msg_t - confirm against the header.
		 */
		if (mcst_pkt->count >
		    (sizeof (mcst_pkt->mca) / sizeof (mcst_pkt->mca[0]))) {
			DERR(vswp, "%s: invalid multicast address count",
			    __func__);
			SND_MCST_NACK(ldcp, mcst_pkt);
			return;
		}

		/*
		 * Before attempting to add or remove address check
		 * that they are valid multicast addresses.
		 * If not, then NACK back.
		 */
		for (i = 0; i < mcst_pkt->count; i++) {
			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
				DERR(vswp, "%s: invalid multicast address",
				    __func__);
				SND_MCST_NACK(ldcp, mcst_pkt);
				return;
			}
		}

		/*
		 * Now add/remove the addresses. If this fails we
		 * NACK back.
		 */
		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
			SND_MCST_NACK(ldcp, mcst_pkt);
			return;
		}

		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		mcst_pkt->tag.vio_sid = ldcp->local_session;

		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);

		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
		    sizeof (vnet_mcast_msg_t), B_TRUE);
		break;

	case VIO_SUBTYPE_ACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		/*
		 * We shouldn't ever get a multicast ACK message as
		 * at the moment we never request multicast addresses
		 * to be set on some other device. This may change in
		 * the future if we have cascading switches.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
			return;

		/* Do nothing */
		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We shouldn't get a multicast NACK packet for the
		 * same reasons as we shouldn't get a ACK packet.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
			return;

		/* Do nothing */
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    mcst_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
 * Handle a ctrl/RDX message received on channel `ldcp'.
 *
 * INFO: after the state check passes, the message is converted in place
 *       into an ACK, VSW_RDX_ACK_SENT is recorded on the outbound lane,
 *       the ACK is sent and the handshake is advanced.
 * ACK:  not expected on this path; the connection is restarted.
 * NACK: after the state check passes, VSW_RDX_NACK_RECV is recorded on
 *       the inbound lane and the handshake is advanced.
 *
 * Note that a failed vsw_check_flag() returns straight away, skipping
 * the exit trace.
 */
static void
vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	/* the caller has already established this is a ctrl/rdx message */
	vio_rdx_msg_t	*rdx = (vio_rdx_msg_t *)pkt;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (rdx->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
			return;

		/* reuse the incoming message as our ACK */
		rdx->tag.vio_subtype = VIO_SUBTYPE_ACK;
		rdx->tag.vio_sid = ldcp->local_session;

		DUMP_TAG_PTR((vio_msg_tag_t *)rdx);

		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;

		(void) vsw_send_msg(ldcp, (void *)rdx,
		    sizeof (vio_rdx_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		/*
		 * RDX ACKs are meant to be dealt with in-band by the
		 * callback handler, so seeing one here is an error.
		 */
		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);

		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;

		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    rdx->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
 * Validate and dispatch a VIO data message received on channel `ldcp'.
 *
 * The message is rejected, and the connection restarted, when
 *  - a peer session id has been recorded (VSW_PEER_SESSION) and the
 *    session id in `tag' does not match it, or
 *  - the handshake has not yet reached VSW_MILESTONE4.
 *
 * Otherwise the message is passed to the dring, raw or in-band
 * descriptor handler according to the subtype envelope in `tag'. Those
 * handlers work out for themselves whether it is an INFO, ACK or NACK.
 */
static void
vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint16_t	env = tag.vio_subtype_env;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/* the message must belong to the session our peer established */
	if ((ldcp->session_status & VSW_PEER_SESSION) &&
	    (ldcp->peer_session != tag.vio_sid)) {
		DERR(vswp, "%s (chan %d): invalid session id (%llx)",
		    __func__, ldcp->ldc_id, tag.vio_sid);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	/* data is only legal once the handshake has completed */
	if (ldcp->hphase != VSW_MILESTONE4) {
		DERR(vswp, "%s: got data packet before handshake complete "
		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
		DUMP_FLAGS(ldcp->lane_in.lstate);
		DUMP_FLAGS(ldcp->lane_out.lstate);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	switch (env) {
	case VIO_DRING_DATA:
		vsw_process_data_dring_pkt(ldcp, dpkt);
		break;

	case VIO_PKT_DATA:
		vsw_process_data_raw_pkt(ldcp, dpkt);
		break;

	case VIO_DESC_DATA:
		vsw_process_data_ibnd_pkt(ldcp, dpkt);
		break;

	default:
		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
		    __func__, env);
		break;
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
 * Turn the dring message `pkt' into a NACK stamped with the local
 * session id of `ldcp' and send it back over that channel. The result
 * of the send is deliberately ignored.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesised so that the macro expands to exactly one statement and
 * is safe to use as the body of an unbraced if/else. Invoke it with a
 * trailing semicolon.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) \
	} while (0)
/*
 * Process a data/dring message received on channel `ldcp'.
 *
 * INFO: look up the inbound dring named by the message and walk the
 *       descriptors starting at start_idx. For a bounded range
 *       (end_idx >= 0) every descriptor must be READY; for an unbounded
 *       range (end_idx == -1) we stop at the first descriptor which is
 *       still not READY after vsw_read_attempts retries. The data of
 *       each descriptor is copied into an mblk, the mblks are chained
 *       and the chain is handed to the switching function. Descriptors
 *       with the ack bit set are ACK'd as they are completed and a final
 *       ACK with state VIO_DP_STOPPED is sent when we are done.
 * ACK:  reclaim (mark FREE) the DONE descriptors in the ACK'd range of
 *       the outbound dring. If the peer reports it has stopped and the
 *       next descriptor is already READY, prompt the peer to restart;
 *       otherwise note that a restart is required.
 * NACK: restart the connection.
 *
 * dpkt is modified in place and reused for the replies we send.
 */
static void
vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
{
	vio_dring_msg_t		*dring_pkt;
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	dring_info_t		*dp = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	mblk_t			*bp = NULL;
	mblk_t			*bpt = NULL;
	size_t			nbytes = 0;
	size_t			off = 0;
	uint64_t		ncookies = 0;
	uint64_t		chain = 0;
	uint64_t		j, len;
	uint32_t		pos, start, datalen;
	uint32_t		range_start, range_end;
	int32_t			end, num, cnt = 0;
	int			i, rv, msg_rv = 0;
	boolean_t		ack_needed = B_FALSE;
	boolean_t		prev_desc_ack = B_FALSE;
	int			read_attempts = 0;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a data/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_msg_t *)dpkt;

	/*
	 * Switch on the vio_subtype. If its INFO then we need to
	 * process the data. If its an ACK we need to make sure
	 * it makes sense (i.e did we send an earlier data/info),
	 * and if its a NACK then we maybe attempt a retry.
	 */
	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);

		READ_ENTER(&ldcp->lane_in.dlistrw);
		if ((dp = vsw_ident2dring(&ldcp->lane_in,
		    dring_pkt->dring_ident)) == NULL) {
			RW_EXIT(&ldcp->lane_in.dlistrw);

			DERR(vswp, "%s(%lld): unable to find dring from "
			    "ident 0x%llx", __func__, ldcp->ldc_id,
			    dring_pkt->dring_ident);

			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		start = pos = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;

		range_start = range_end = pos;

		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
		    __func__, ldcp->ldc_id, start, end);

		if (end == -1) {
			/* unbounded: process until we run out of READY descs */
			num = -1;
		} else if (end >= 0) {
			/* # descriptors in range, allowing for wrap around */
			num = end >= pos ?
			    end - pos + 1: (len - pos + 1) + end;

			/*
			 * Basic sanity check. Valid indices are
			 * 0 .. len - 1, so an endpoint equal to the ring
			 * length is already outside the ring.
			 */
			if (end >= len) {
				RW_EXIT(&ldcp->lane_in.dlistrw);
				DERR(vswp, "%s(%lld): endpoint %lld outside "
				    "ring length %lld", __func__,
				    ldcp->ldc_id, end, len);

				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}
		} else {
			RW_EXIT(&ldcp->lane_in.dlistrw);
			DERR(vswp, "%s(%lld): invalid endpoint %lld",
			    __func__, ldcp->ldc_id, end);
			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		while (cnt != num) {
vsw_recheck_desc:
			if ((rv = ldc_mem_dring_acquire(dp->handle,
			    pos, pos)) != 0) {
				RW_EXIT(&ldcp->lane_in.dlistrw);
				DERR(vswp, "%s(%lld): unable to acquire "
				    "descriptor at pos %d: err %d",
				    __func__, ldcp->ldc_id, pos, rv);
				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}

			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;

			/*
			 * When given a bounded range of descriptors
			 * to process, its an error to hit a descriptor
			 * which is not ready. In the non-bounded case
			 * (end_idx == -1) this simply indicates we have
			 * reached the end of the current active range.
			 */
			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
				/* unbound - no error */
				if (end == -1) {
					if (read_attempts == vsw_read_attempts)
						break;

					delay(drv_usectohz(vsw_desc_delay));
					read_attempts++;
					goto vsw_recheck_desc;
				}

				/* bounded - error - so NACK back */
				RW_EXIT(&ldcp->lane_in.dlistrw);
				DERR(vswp, "%s(%lld): descriptor not READY "
				    "(%d)", __func__, ldcp->ldc_id,
				    pub_addr->hdr.dstate);
				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}

			DTRACE_PROBE1(read_attempts, int, read_attempts);

			range_end = pos;

			/*
			 * If we ACK'd the previous descriptor then now
			 * record the new range start position for later
			 * ACK's.
			 */
			if (prev_desc_ack) {
				range_start = pos;

				D2(vswp, "%s(%lld): updating range start "
				    "to be %d", __func__, ldcp->ldc_id,
				    range_start);

				prev_desc_ack = B_FALSE;
			}

			/*
			 * Data is padded to align on 8 byte boundary,
			 * datalen is actual data length, i.e. minus that
			 * padding.
			 */
			datalen = pub_addr->nbytes;

			/*
			 * Does peer wish us to ACK when we have finished
			 * with this descriptor ?
			 */
			if (pub_addr->hdr.ack)
				ack_needed = B_TRUE;

			D2(vswp, "%s(%lld): processing desc %lld at pos"
			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
			    __func__, ldcp->ldc_id, pos, pub_addr,
			    pub_addr->hdr.dstate, datalen);

			/*
			 * Mark that we are starting to process descriptor.
			 */
			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;

			mp = vio_allocb(ldcp->rxh);
			if (mp == NULL) {
				/*
				 * No free receive buffers available, so
				 * fallback onto allocb(9F). Make sure that
				 * we get a data buffer which is a multiple
				 * of 8 as this is required by ldc_mem_copy.
				 */
				DTRACE_PROBE(allocb);
				mp = allocb(datalen + VNET_IPALIGN + 8,
				    BPRI_MED);
				if (mp == NULL) {
					/*
					 * allocb(9F) can fail as well. Give
					 * the descriptor back and stop
					 * processing, exactly as for a failed
					 * copy below, rather than
					 * dereferencing a NULL mblk.
					 */
					DERR(vswp, "%s(%lld): allocb failed",
					    __func__, ldcp->ldc_id);
					pub_addr->hdr.dstate = VIO_DESC_DONE;
					(void) ldc_mem_dring_release(
					    dp->handle, pos, pos);
					break;
				}
			}

			/*
			 * Ensure that we ask ldc for an aligned
			 * number of bytes.
			 */
			nbytes = datalen + VNET_IPALIGN;
			if (nbytes & 0x7) {
				off = 8 - (nbytes & 0x7);
				nbytes += off;
			}

			ncookies = pub_addr->ncookies;
			rv = ldc_mem_copy(ldcp->ldc_handle,
			    (caddr_t)mp->b_rptr, 0, &nbytes,
			    pub_addr->memcookie, ncookies,
			    LDC_COPY_IN);

			if (rv != 0) {
				DERR(vswp, "%s(%d): unable to copy in "
				    "data from %d cookies in desc %d"
				    " (rv %d)", __func__, ldcp->ldc_id,
				    ncookies, pos, rv);
				freemsg(mp);

				pub_addr->hdr.dstate = VIO_DESC_DONE;
				(void) ldc_mem_dring_release(dp->handle,
				    pos, pos);
				break;
			} else {
				D2(vswp, "%s(%d): copied in %ld bytes"
				    " using %d cookies", __func__,
				    ldcp->ldc_id, nbytes, ncookies);
			}

			/* adjust the read pointer to skip over the padding */
			mp->b_rptr += VNET_IPALIGN;

			/* point to the actual end of data */
			mp->b_wptr = mp->b_rptr + datalen;

			/* build a chain of received packets */
			if (bp == NULL) {
				/* first pkt */
				bp = mp;
				bp->b_next = bp->b_prev = NULL;
				bpt = bp;
				chain = 1;
			} else {
				mp->b_next = NULL;
				mp->b_prev = bpt;
				bpt->b_next = mp;
				bpt = mp;
				chain++;
			}

			/* mark we are finished with this descriptor */
			pub_addr->hdr.dstate = VIO_DESC_DONE;

			(void) ldc_mem_dring_release(dp->handle, pos, pos);

			/*
			 * Send an ACK back to peer if requested.
			 */
			if (ack_needed) {
				ack_needed = B_FALSE;

				dring_pkt->start_idx = range_start;
				dring_pkt->end_idx = range_end;

				DERR(vswp, "%s(%lld): processed %d %d, ACK"
				    " requested", __func__, ldcp->ldc_id,
				    dring_pkt->start_idx,
				    dring_pkt->end_idx);

				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
				dring_pkt->tag.vio_sid = ldcp->local_session;
				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
				    sizeof (vio_dring_msg_t),
				    B_FALSE);

				/*
				 * Check if ACK was successfully sent. If not
				 * we break and deal with that below.
				 */
				if (msg_rv != 0)
					break;

				prev_desc_ack = B_TRUE;
				range_start = pos;
			}

			/* next descriptor */
			pos = (pos + 1) % len;
			cnt++;

			/*
			 * Break out of loop here and stop processing to
			 * allow some other network device (or disk) to
			 * get access to the cpu.
			 */
			if (chain > vsw_chain_len) {
				D3(vswp, "%s(%lld): switching chain of %d "
				    "msgs", __func__, ldcp->ldc_id, chain);
				break;
			}
		}
		RW_EXIT(&ldcp->lane_in.dlistrw);

		/*
		 * If when we attempted to send the ACK we found that the
		 * channel had been reset then now handle this. We deal with
		 * it here as we cannot reset the channel while holding the
		 * dlistrw lock, and we don't want to acquire/release it
		 * continuously in the above loop, as a channel reset should
		 * be a rare event.
		 */
		if (msg_rv == ECONNRESET) {
			/*
			 * The frames received so far are not going to be
			 * switched, so free them here; nothing else holds
			 * a reference to the chain.
			 */
			while (bp != NULL) {
				mp = bp;
				bp = bp->b_next;
				mp->b_next = mp->b_prev = NULL;
				freemsg(mp);
			}

			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
			break;
		}

		/* send the chain of packets to be switched */
		if (bp != NULL) {
			D3(vswp, "%s(%lld): switching chain of %d msgs",
			    __func__, ldcp->ldc_id, chain);
			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
			    ldcp->ldc_port, NULL);
		}

		DTRACE_PROBE1(msg_cnt, int, cnt);

		/*
		 * We are now finished so ACK back with the state
		 * set to STOPPING so our peer knows we are finished
		 */
		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		dring_pkt->tag.vio_sid = ldcp->local_session;

		dring_pkt->dring_process_state = VIO_DP_STOPPED;

		DTRACE_PROBE(stop_process_sent);

		/*
		 * We have not processed any more descriptors beyond
		 * the last one we ACK'd.
		 */
		if (prev_desc_ack)
			range_start = range_end;

		dring_pkt->start_idx = range_start;
		dring_pkt->end_idx = range_end;

		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
		    dring_pkt->end_idx);

		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
		/*
		 * Verify that the relevant descriptors are all
		 * marked as DONE
		 */
		READ_ENTER(&ldcp->lane_out.dlistrw);
		if ((dp = vsw_ident2dring(&ldcp->lane_out,
		    dring_pkt->dring_ident)) == NULL) {
			RW_EXIT(&ldcp->lane_out.dlistrw);
			DERR(vswp, "%s: unknown ident in ACK", __func__);
			return;
		}

		start = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;

		/*
		 * Both indices are supplied by our peer and are used below
		 * to index the public and private rings, so refuse anything
		 * which does not lie inside the ring.
		 */
		if ((end < 0) || (start >= len) || (end >= len)) {
			RW_EXIT(&ldcp->lane_out.dlistrw);
			DERR(vswp, "%s(%lld): ACK range %d : %d outside "
			    "ring length %lld", __func__, ldcp->ldc_id,
			    start, end, len);
			return;
		}

		j = 0;
		/* calculate # descriptors taking into a/c wrap around */
		num = end >= start ? end - start + 1: (len - start + 1) + end;

		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
		    __func__, ldcp->ldc_id, start, end, num);

		mutex_enter(&dp->dlock);
		dp->last_ack_recv = end;
		mutex_exit(&dp->dlock);

		for (i = start; j < num; i = (i + 1) % len, j++) {
			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

			/*
			 * If the last descriptor in a range has the ACK
			 * bit set then we will get two messages from our
			 * peer relating to it. The normal ACK msg and then
			 * a subsequent STOP msg. The first message will have
			 * resulted in the descriptor being reclaimed and
			 * its state set to FREE so when we encounter a non
			 * DONE descriptor we need to check to see if its
			 * because we have just reclaimed it.
			 */
			mutex_enter(&priv_addr->dstate_lock);
			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
				/* clear all the fields */
				bzero(priv_addr->datap, priv_addr->datalen);
				priv_addr->datalen = 0;

				pub_addr->hdr.dstate = VIO_DESC_FREE;
				pub_addr->hdr.ack = 0;

				priv_addr->dstate = VIO_DESC_FREE;
				mutex_exit(&priv_addr->dstate_lock);

				D3(vswp, "clearing descp %d : pub state "
				    "0x%llx : priv state 0x%llx", i,
				    pub_addr->hdr.dstate,
				    priv_addr->dstate);

			} else {
				mutex_exit(&priv_addr->dstate_lock);

				if (dring_pkt->dring_process_state !=
				    VIO_DP_STOPPED) {
					DERR(vswp, "%s: descriptor %lld at pos "
					    " 0x%llx not DONE (0x%lx)\n",
					    __func__, i, pub_addr,
					    pub_addr->hdr.dstate);
					RW_EXIT(&ldcp->lane_out.dlistrw);
					return;
				}
			}
		}

		/*
		 * If our peer is stopping processing descriptors then
		 * we check to make sure it has processed all the descriptors
		 * we have updated. If not then we send it a new message
		 * to prompt it to restart.
		 */
		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
			DTRACE_PROBE(stop_process_recv);
			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
			    dring_pkt->end_idx);

			/*
			 * Check next descriptor in public section of ring.
			 * If its marked as READY then we need to prompt our
			 * peer to start processing the ring again.
			 */
			i = (end + 1) % len;
			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

			/*
			 * Hold the restart lock across all of this to
			 * make sure that its not possible for us to
			 * decide that a msg needs to be sent in the future
			 * but the sending code having already checked is
			 * about to exit.
			 */
			mutex_enter(&dp->restart_lock);
			mutex_enter(&priv_addr->dstate_lock);
			if (pub_addr->hdr.dstate == VIO_DESC_READY) {

				mutex_exit(&priv_addr->dstate_lock);

				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
				dring_pkt->tag.vio_sid = ldcp->local_session;

				mutex_enter(&ldcp->lane_out.seq_lock);
				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
				mutex_exit(&ldcp->lane_out.seq_lock);

				dring_pkt->start_idx = (end + 1) % len;
				dring_pkt->end_idx = -1;

				D2(vswp, "%s(%lld) : sending restart msg:"
				    " %d : %d", __func__, ldcp->ldc_id,
				    dring_pkt->start_idx,
				    dring_pkt->end_idx);

				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
				    sizeof (vio_dring_msg_t), B_FALSE);

			} else {
				mutex_exit(&priv_addr->dstate_lock);
				dp->restart_reqd = B_TRUE;
			}
			mutex_exit(&dp->restart_lock);
		}
		RW_EXIT(&ldcp->lane_out.dlistrw);

		/* only do channel reset after dropping dlistrw lock */
		if (msg_rv == ECONNRESET)
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);

		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
		    __func__, ldcp->ldc_id);
		/*
		 * Something is badly wrong if we are getting NACK's
		 * for our data pkts. So reset the channel.
		 */
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
/*
 * VIO_PKT_DATA (a.k.a raw data mode)
 *
 * Raw data mode is currently not supported. The message itself (dpkt)
 * is never examined; all this routine does is emit entry/exit traces
 * and an error message carrying the id of the channel the message
 * arrived on.
 */
static void
vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
{
/* dpkt is intentionally unused, tell lint so */
_NOTE(ARGUNUSED(dpkt))
D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
DERR(NULL, "%s (%lld): currently not supported",
__func__, ldcp->ldc_id);
D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
}
/*
 * Process an in-band descriptor message (most likely from
 * OBP).
 *
 * INFO: copy the data described by the message's cookies into an mblk,
 *       re-copy it into a second mblk offset by VNET_IPALIGN, ACK the
 *       message back to the peer and hand the frame to the switching
 *       function.
 * ACK:  free the private descriptor(s) from the last ACK we received up
 *       to the one being ACK'd now, and record the next expected index.
 * NACK: free the private descriptor named by the message.
 *
 * pkt is modified in place and reused for the ACK we send.
 */
static void
vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_ibnd_desc_t	*ibnd_desc;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	mblk_t			*nmp;
	size_t			nbytes = 0;
	size_t			off = 0;
	uint64_t		idx = 0;
	uint32_t		num = 1, len, datalen = 0;
	uint64_t		ncookies = 0;
	int			i, rv;
	int			j = 0;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ibnd_desc = (vnet_ibnd_desc_t *)pkt;

	switch (ibnd_desc->hdr.tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
			return;

		/*
		 * Data is padded to align on a 8 byte boundary,
		 * datalen is actual data length, i.e. minus that
		 * padding.
		 */
		datalen = ibnd_desc->nbytes;

		D2(vswp, "%s(%lld): processing inband desc : "
		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);

		ncookies = ibnd_desc->ncookies;

		/*
		 * allocb(9F) returns an aligned data block. We
		 * need to ensure that we ask ldc for an aligned
		 * number of bytes also.
		 */
		nbytes = datalen;
		if (nbytes & 0x7) {
			off = 8 - (nbytes & 0x7);
			nbytes += off;
		}

		/*
		 * ldc_mem_copy() below is asked for nbytes, i.e. datalen
		 * rounded up to a multiple of 8, so the buffer has to be
		 * sized for the rounded-up length and not just datalen.
		 */
		mp = allocb(nbytes, BPRI_MED);
		if (mp == NULL) {
			DERR(vswp, "%s(%lld): allocb failed",
			    __func__, ldcp->ldc_id);
			return;
		}

		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
		    LDC_COPY_IN);

		if (rv != 0) {
			DERR(vswp, "%s(%d): unable to copy in data from "
			    "%d cookie(s)", __func__,
			    ldcp->ldc_id, ncookies);
			freemsg(mp);
			return;
		}

		D2(vswp, "%s(%d): copied in %ld bytes using %d "
		    "cookies", __func__, ldcp->ldc_id, nbytes,
		    ncookies);

		/*
		 * Upper layer is expecting the IP header in the packet to
		 * be 4-bytes aligned, but the OBP is sending packets that
		 * are not aligned. So, copy the data to another message
		 * such that the alignment requirement is met.
		 */
		nmp = allocb(datalen + VNET_IPALIGN, BPRI_MED);
		if (nmp == NULL) {
			DERR(vswp, "%s(%lld): allocb failed",
			    __func__, ldcp->ldc_id);
			freemsg(mp);
			return;
		}
		nmp->b_rptr += VNET_IPALIGN;
		bcopy(mp->b_rptr, nmp->b_rptr, datalen);
		freemsg(mp);

		/* point to the actual end of data */
		nmp->b_wptr = nmp->b_rptr + datalen;

		/*
		 * We ACK back every in-band descriptor message we process
		 */
		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
		    sizeof (vnet_ibnd_desc_t), B_TRUE);

		/* send the packet to be switched */
		vswp->vsw_switch_frame(vswp, nmp, VSW_VNETPORT,
		    ldcp->ldc_port, NULL);

		break;

	case VIO_SUBTYPE_ACK:
		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		/* Verify the ACK is valid */
		idx = ibnd_desc->hdr.desc_handle;

		if (idx >= VSW_RING_NUM_EL) {
			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
			    "(idx %ld)", vswp->instance, idx);
			return;
		}

		if ((dp = ldcp->lane_out.dringp) == NULL) {
			DERR(vswp, "%s: no dring found", __func__);
			return;
		}

		len = dp->num_descriptors;
		/*
		 * If the descriptor we are being ACK'ed for is not the
		 * one we expected, then pkts were lost somewhere, either
		 * when we tried to send a msg, or a previous ACK msg from
		 * our peer. In either case we now reclaim the descriptors
		 * in the range from the last ACK we received up to the
		 * current ACK.
		 */
		if (idx != dp->last_ack_recv) {
			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
			    __func__, dp->last_ack_recv, idx);
			num = idx >= dp->last_ack_recv ?
			    idx - dp->last_ack_recv + 1:
			    (len - dp->last_ack_recv + 1) + idx;
		}

		/*
		 * When we sent the in-band message to our peer we
		 * marked the copy in our private ring as READY. We now
		 * check that the descriptor we are being ACK'ed for is in
		 * fact READY, i.e. it is one we have shared with our peer.
		 *
		 * If its not we flag an error, but still reset the descr
		 * back to FREE.
		 */
		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
			mutex_enter(&priv_addr->dstate_lock);
			if (priv_addr->dstate != VIO_DESC_READY) {
				DERR(vswp, "%s: (%ld) desc at index %ld not "
				    "READY (0x%lx)", __func__,
				    ldcp->ldc_id, idx, priv_addr->dstate);
				DERR(vswp, "%s: bound %d: ncookies %ld : "
				    "datalen %ld", __func__,
				    priv_addr->bound, priv_addr->ncookies,
				    priv_addr->datalen);
			}
			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
			    ldcp->ldc_id, idx);
			/* release resources associated with sent msg */
			bzero(priv_addr->datap, priv_addr->datalen);
			priv_addr->datalen = 0;
			priv_addr->dstate = VIO_DESC_FREE;
			mutex_exit(&priv_addr->dstate_lock);
		}
		/* update to next expected value */
		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;

		break;

	case VIO_SUBTYPE_NACK:
		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We should only get a NACK if our peer doesn't like
		 * something about a message we have sent it. If this
		 * happens we just release the resources associated with
		 * the message. (We are relying on higher layers to decide
		 * whether or not to resend.)
		 */

		/* limit check */
		idx = ibnd_desc->hdr.desc_handle;

		if (idx >= VSW_RING_NUM_EL) {
			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
			    __func__, idx);
			return;
		}

		if ((dp = ldcp->lane_out.dringp) == NULL) {
			DERR(vswp, "%s: no dring found", __func__);
			return;
		}

		priv_addr = (vsw_private_desc_t *)dp->priv_addr;

		/* move to correct location in ring */
		priv_addr += idx;

		/* release resources associated with sent msg */
		mutex_enter(&priv_addr->dstate_lock);
		bzero(priv_addr->datap, priv_addr->datalen);
		priv_addr->datalen = 0;
		priv_addr->dstate = VIO_DESC_FREE;
		mutex_exit(&priv_addr->dstate_lock);

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
/*
 * Handle a VIO error message received on channel ldcp.
 *
 * Nothing is done with the message beyond tracing: the body (epkt) is
 * never examined and only the subtype envelope taken from tag is
 * reported in the debug output.
 */
static void
vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
{
/* epkt is intentionally unused, tell lint so */
_NOTE(ARGUNUSED(epkt))
vsw_t *vswp = ldcp->ldc_vswp;
uint16_t env = tag.vio_subtype_env;
D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
/*
* Error vio_subtypes have yet to be defined. So for
* the moment we can't do anything.
*/
D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
}
/*
 * If the vsw interface state satisfies VSW_U_P() (checked under
 * if_lockrw), pass a copy of `mp' up the stack. `mp' itself is left
 * untouched; if the copy cannot be made nothing is sent.
 */
static void
vsw_l2_copy_to_stack(vsw_t *vswp, mblk_t *mp, mac_resource_handle_t mrh)
{
	boolean_t	wanted;
	mblk_t		*copy;

	READ_ENTER(&vswp->if_lockrw);
	wanted = VSW_U_P(vswp->if_state) ? B_TRUE : B_FALSE;
	RW_EXIT(&vswp->if_lockrw);

	if (!wanted)
		return;

	copy = copymsg(mp);
	if (copy != NULL)
		mac_rx(vswp->if_mh, mrh, copy);
}

/*
 * Switch the given ethernet frame when operating in layer 2 mode.
 *
 * vswp:	pointer to the vsw instance
 * mp:		pointer to chain of ethernet frame(s) to be switched
 * caller:	identifies the source of this frame as:
 *		1. VSW_VNETPORT - a vsw port (connected to a vnet).
 *		2. VSW_PHYSDEV - the physical ethernet device
 *		3. VSW_LOCALDEV - vsw configured as a virtual interface
 * arg:		argument provided by the caller.
 *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
 *		2. for PHYSDEV - NULL
 *		3. for LOCALDEV - pointer to this vsw_t (self)
 * mrh:		resource handle passed through to mac_rx().
 *
 * Every frame in the chain is consumed: it is either handed on to
 * another component or freed.
 */
void
vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *arg, mac_resource_handle_t mrh)
{
	struct ether_header	*ehp;
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*dest;
	mblk_t			*nextp;
	mblk_t			*unsent;
	boolean_t		to_self, if_up, up_promisc;

	D1(vswp, "%s: enter (caller %d)", __func__, caller);

	/*
	 * PERF: rather than breaking up the chain here, scan it
	 * to find all mblks heading to same destination and then
	 * pass that sub-chain to the lower transmit functions.
	 */

	/* detach and switch the frames one at a time */
	for (nextp = mp; nextp != NULL; ) {
		mp = nextp;
		nextp = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		ehp = (struct ether_header *)mp->b_rptr;

		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
		    __func__, MBLKSIZE(mp), MBLKL(mp));

		/* snapshot what we need from the interface under its lock */
		READ_ENTER(&vswp->if_lockrw);
		to_self = (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) ?
		    B_TRUE : B_FALSE;
		if_up = (vswp->if_state & VSW_IF_UP) ? B_TRUE : B_FALSE;
		RW_EXIT(&vswp->if_lockrw);

		if (to_self) {
			/*
			 * Addressed to the vsw interface itself. Send it
			 * up the stack provided it did not come from the
			 * stack in the first place and the interface is
			 * up; otherwise drop it.
			 */
			if ((caller != VSW_LOCALDEV) && if_up)
				mac_rx(vswp->if_mh, mrh, mp);
			else
				freemsg(mp);
			continue;
		}

		READ_ENTER(&plist->lockrw);
		dest = vsw_lookup_fdb(vswp, ehp);
		if (dest != NULL) {
			/* take a reference on the port before dropping lock */
			mutex_enter(&dest->ref_lock);
			dest->ref_cnt++;
			mutex_exit(&dest->ref_lock);
			RW_EXIT(&plist->lockrw);

			/*
			 * If plumbed and in promisc mode then copy msg
			 * and send up the stack.
			 */
			vsw_l2_copy_to_stack(vswp, mp, mrh);

			/*
			 * Destination is in the FDB, so forward the frame
			 * to the corresponding vsw_port (connected to a
			 * vnet device - VSW_VNETPORT).
			 */
			(void) vsw_portsend(dest, mp);

			/*
			 * Drop our reference and wake the delete thread
			 * if we were the last user.
			 */
			mutex_enter(&dest->ref_lock);
			dest->ref_cnt--;
			if (dest->ref_cnt == 0)
				cv_signal(&dest->ref_cv);
			mutex_exit(&dest->ref_lock);
			continue;
		}
		RW_EXIT(&plist->lockrw);

		/*
		 * Destination not in FDB.
		 *
		 * Broadcast and multicast frames are forwarded to all
		 * interested parties (VNETPORTs, PHYSDEV, LOCALDEV),
		 * except the caller.
		 */
		if (IS_BROADCAST(ehp)) {
			D3(vswp, "%s: BROADCAST pkt", __func__);
			(void) vsw_forward_all(vswp, mp,
			    caller, arg);
			continue;
		}
		if (IS_MULTICAST(ehp)) {
			D3(vswp, "%s: MULTICAST pkt", __func__);
			(void) vsw_forward_grp(vswp, mp,
			    caller, arg);
			continue;
		}

		/*
		 * Unicast frame for an unknown destination.
		 *
		 * NOTE: The assumption here is that if we cannot find
		 * the destination in our fdb, its a unicast address,
		 * and came from either a vnet or down the stack (when
		 * plumbed) it must be destined for an ethernet device
		 * outside our ldoms.
		 */
		switch (caller) {
		case VSW_VNETPORT:
			/*
			 * From a vnet: also goes up the stack if the
			 * logical interface is in promiscuous mode, and
			 * then out over the physical device.
			 */
			vsw_l2_copy_to_stack(vswp, mp, mrh);
			/* FALLTHRU */
		case VSW_LOCALDEV:
			/*
			 * Pkt came from a vnet or down the stack, send
			 * out over physical device.
			 */
			if ((unsent = vsw_tx_msg(vswp, mp)) != NULL) {
				DERR(vswp, "%s: drop mblks to "
				    "phys dev", __func__);
				freemsg(unsent);
			}
			break;

		case VSW_PHYSDEV:
			/*
			 * Pkt seen because card in promisc mode. Send
			 * up stack if plumbed in promisc mode, else
			 * drop it.
			 */
			READ_ENTER(&vswp->if_lockrw);
			up_promisc = VSW_U_P(vswp->if_state) ?
			    B_TRUE : B_FALSE;
			RW_EXIT(&vswp->if_lockrw);

			if (up_promisc)
				mac_rx(vswp->if_mh, mrh, mp);
			else
				freemsg(mp);
			break;

		default:
			break;
		}
	}
	D1(vswp, "%s: exit\n", __func__);
}
/*
 * Switch ethernet frame when in layer 3 mode (i.e. using IP
 * layer to do the routing).
 *
 * There is a large amount of overlap between this function and
 * vsw_switch_l2_frame. At some stage we need to revisit and refactor
 * both these functions.
 *
 * vswp:	pointer to the vsw instance
 * mp:		pointer to chain of ethernet frame(s) to be switched
 * caller:	source of the frames; must be VSW_LOCALDEV or VSW_VNETPORT,
 *		otherwise the whole chain is freed.
 * arg:		passed through to the broadcast/multicast forwarding
 *		routines.
 * mrh:		resource handle passed through to mac_rx().
 *
 * Every frame in the chain is consumed: it is either handed on to
 * another component or freed.
 */
void
vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *arg, mac_resource_handle_t mrh)
{
	struct ether_header	*ehp;
	vsw_port_t		*port = NULL;
	mblk_t			*bp = NULL;
	vsw_port_list_t		*plist = &vswp->plist;

	D1(vswp, "%s: enter (caller %d)", __func__, caller);

	/*
	 * In layer 3 mode should only ever be switching packets
	 * between IP layer and vnet devices. So make sure thats
	 * who is invoking us.
	 */
	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
		freemsgchain(mp);
		return;
	}

	/* process the chain of packets */
	bp = mp;
	while (bp) {
		/* detach the frame at the head of the chain */
		mp = bp;
		bp = bp->b_next;
		mp->b_next = mp->b_prev = NULL;
		ehp = (struct ether_header *)mp->b_rptr;

		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
		    __func__, MBLKSIZE(mp), MBLKL(mp));

		READ_ENTER(&plist->lockrw);
		port = vsw_lookup_fdb(vswp, ehp);
		if (port) {
			/*
			 * Mark port as in-use.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt++;
			mutex_exit(&port->ref_lock);
			RW_EXIT(&plist->lockrw);

			D2(vswp, "%s: sending to target port", __func__);
			(void) vsw_portsend(port, mp);

			/*
			 * Finished with port so decrement ref count and
			 * check if should wake delete thread.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt--;
			if (port->ref_cnt == 0)
				cv_signal(&port->ref_cv);
			mutex_exit(&port->ref_lock);
		} else {
			RW_EXIT(&plist->lockrw);
			/*
			 * Destination not in FDB
			 *
			 * If the destination is broadcast or
			 * multicast forward the packet to all
			 * (VNETPORTs, PHYSDEV, LOCALDEV),
			 * except the caller.
			 */
			if (IS_BROADCAST(ehp)) {
				D2(vswp, "%s: BROADCAST pkt", __func__);
				(void) vsw_forward_all(vswp, mp,
				    caller, arg);
			} else if (IS_MULTICAST(ehp)) {
				D2(vswp, "%s: MULTICAST pkt", __func__);
				(void) vsw_forward_grp(vswp, mp,
				    caller, arg);
			} else if (caller == VSW_VNETPORT) {
				/*
				 * Unicast pkt from vnet that we don't have
				 * an FDB entry for, so must be destined for
				 * the outside world. Attempt to send up to the
				 * IP layer to allow it to deal with it.
				 */
				READ_ENTER(&vswp->if_lockrw);
				if (vswp->if_state & VSW_IF_UP) {
					RW_EXIT(&vswp->if_lockrw);
					D2(vswp, "%s: sending up",
					    __func__);
					mac_rx(vswp->if_mh, mrh, mp);
				} else {
					RW_EXIT(&vswp->if_lockrw);
					/* Interface down, drop pkt */
					D2(vswp, "%s I/F down",
					    __func__);
					freemsg(mp);
				}
			} else {
				/*
				 * Unicast pkt which came down the stack
				 * (the only other caller allowed in above)
				 * for a destination we have no FDB entry
				 * for. There is nowhere to send it, and as
				 * it has already been unlinked from the
				 * chain it must be freed here or it would
				 * be leaked.
				 */
				D2(vswp, "%s: no destination, dropping pkt",
				    __func__);
				freemsg(mp);
			}
		}
	}

	D1(vswp, "%s: exit", __func__);
}
/*
 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
 * except the caller (port on which frame arrived).
 *
 * vswp:	pointer to the vsw instance
 * mp:		the frame to forward. Each recipient is given its own
 *		dupmsg()/copymsg() of it and the original is freed before
 *		returning.
 * caller:	VSW_VNETPORT, VSW_PHYSDEV or VSW_LOCALDEV
 * arg:		for VSW_VNETPORT, the port the frame arrived on (which is
 *		skipped); not examined for other callers.
 *
 * Always returns 0.
 */
static int
vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*portp;
	mblk_t		*dup;
	mblk_t		*unsent;
	boolean_t	from_vnet;
	boolean_t	if_up;

	D1(vswp, "vsw_forward_all: enter\n");

	from_vnet = (caller == VSW_VNETPORT) ? B_TRUE : B_FALSE;

	/*
	 * A broadcast which originated inside ldoms (from a vnet or
	 * down the stack) also goes to the outside world when we are in
	 * either of the layer 2 modes.
	 */
	if (((caller == VSW_LOCALDEV) || from_vnet) &&
	    ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC))) {
		dup = dupmsg(mp);
		if (dup != NULL) {
			unsent = vsw_tx_msg(vswp, dup);
			if (unsent != NULL) {
				DERR(vswp, "%s: dropping pkt(s) "
				    "consisting of %ld bytes of data for"
				    " physical device", __func__,
				    MBLKL(unsent));
				freemsg(unsent);
			}
		}
	}

	/*
	 * Broadcast message from other vnet (layer 2 or 3) or outside
	 * world (layer 2 only), send up stack if plumbed.
	 */
	if ((caller == VSW_PHYSDEV) || from_vnet) {
		READ_ENTER(&vswp->if_lockrw);
		if_up = (vswp->if_state & VSW_IF_UP) ? B_TRUE : B_FALSE;
		RW_EXIT(&vswp->if_lockrw);

		if (if_up) {
			dup = copymsg(mp);
			if (dup != NULL)
				mac_rx(vswp->if_mh, NULL, dup);
		}
	}

	/* finally, give every VNETPORT its own duplicate */
	READ_ENTER(&plist->lockrw);
	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);

		/*
		 * Don't send the frame back to the port it came from.
		 * arg is only meaningful when the caller is a VNETPORT
		 * (it is NULL for PHYSDEV), so test the caller first.
		 */
		if (from_vnet && (portp == arg))
			continue;

		dup = dupmsg(mp);
		if (dup == NULL) {
			DERR(vswp, "vsw_forward_all: nmp NULL");
			continue;
		}
		(void) vsw_portsend(portp, dup);
	}
	RW_EXIT(&plist->lockrw);

	/* everyone has their own copy, so the original can go */
	freemsg(mp);

	D1(vswp, "vsw_forward_all: exit\n");
	return (0);
}
/*
 * Deliver a multicast frame to everything that has registered an
 * interest in its destination address.
 *
 *	vswp	- switch instance doing the forwarding.
 *	mp	- frame to forward; it is always freed before returning.
 *	caller	- origin of the frame (VSW_VNETPORT, VSW_LOCALDEV or
 *		  VSW_PHYSDEV).
 *	arg	- originating port; only compared against when the caller
 *		  is VSW_VNETPORT.
 *
 * The destination address is hashed into a key and looked up in the
 * multicast table (mfdb); each entry found there names either a vnet
 * port or the local device.
 *
 * Always returns 0.
 */
static int
vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
{
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	mfdb_ent_t		*chain = NULL;
	mfdb_ent_t		*ent;
	vsw_port_t		*port;
	uint64_t		key = 0;
	mblk_t			*copy;
	mblk_t			*unsent;
	boolean_t		check_if = B_TRUE;
	boolean_t		deliver;

	/* turn the destination address into a hash table key */
	KEY_HASH(key, ehp->ether_dhost);

	D1(vswp, "%s: key 0x%llx", __func__, key);

	/*
	 * In either layer 2 mode a frame that came from a vnet or down
	 * our own stack is first sent out over the physical adapter;
	 * interested vnets are dealt with afterwards.
	 */
	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
		copy = dupmsg(mp);
		if (copy != NULL) {
			unsent = vsw_tx_msg(vswp, copy);
			if (unsent != NULL) {
				DERR(vswp, "%s: dropping pkt(s) "
				    "consisting of %ld bytes of "
				    "data for physical device",
				    __func__, MBLKL(unsent));
				freemsg(unsent);
			}
		}
	}

	READ_ENTER(&vswp->mfdbrw);
	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&chain) == 0) {
		/* walk the list of devices registered for this address */
		ent = chain;
		while (ent != NULL) {
			if ((caller == VSW_VNETPORT) &&
			    (ent->d_addr == (void *)arg)) {
				/* never echo back to the sending port */
				port = (vsw_port_t *)ent->d_addr;
				D3(vswp, "%s: not sending to ourselves"
				    " : port %d", __func__,
				    port->p_instance);
			} else if ((caller == VSW_LOCALDEV) &&
			    (ent->d_type == VSW_LOCALDEV)) {
				/* came down the stack, don't send it back */
				D3(vswp, "%s: not sending back up stack",
				    __func__);
			} else if (ent->d_type == VSW_VNETPORT) {
				port = (vsw_port_t *)ent->d_addr;
				D3(vswp, "%s: sending to port %ld for "
				    " addr 0x%llx", __func__,
				    port->p_instance, key);

				copy = dupmsg(mp);
				if (copy != NULL)
					(void) vsw_portsend(port, copy);
			} else if (vswp->if_state & VSW_IF_UP) {
				/*
				 * Local device entry: pass the frame up
				 * the stack and remember that we tried, so
				 * the promiscuous check below is skipped.
				 */
				copy = copymsg(mp);
				if (copy != NULL)
					mac_rx(vswp->if_mh, NULL, copy);
				check_if = B_FALSE;
				D3(vswp, "%s: sending up stack"
				    " for addr 0x%llx", __func__,
				    key);
			}
			ent = ent->nextp;
		}
	} else {
		D3(vswp, "%s: no table entry found for addr 0x%llx",
		    __func__, key);
	}
	RW_EXIT(&vswp->mfdbrw);

	/*
	 * A frame from a vnet or the physical device which has not yet
	 * gone up the stack is delivered now if VSW_U_P() says the
	 * interface state allows it. The state is sampled under the
	 * lock; delivery happens once the lock has been dropped.
	 */
	if (check_if &&
	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
		READ_ENTER(&vswp->if_lockrw);
		deliver = VSW_U_P(vswp->if_state) ? B_TRUE : B_FALSE;
		RW_EXIT(&vswp->if_lockrw);

		if (deliver) {
			D3(vswp, "%s: (caller %d) finally sending up stack"
			    " for addr 0x%llx", __func__, caller, key);
			copy = copymsg(mp);
			if (copy != NULL)
				mac_rx(vswp->if_mh, NULL, copy);
		}
	}

	freemsg(mp);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
/*
 * Transmit a packet over the given port.
 *
 * The packet is handed to the port's transmit routine together with
 * the first channel on the port's channel list. Ownership of mp always
 * passes to this function: if there is no channel or no transmit
 * routine the packet is freed here.
 *
 * Returns 1 if the port has no channel, 0 if it has no transmit
 * routine, otherwise whatever the transmit routine returned.
 */
static int
vsw_portsend(vsw_port_t *port, mblk_t *mp)
{
	vsw_ldc_list_t	*chans = &port->p_ldclist;
	vsw_ldc_t	*chan;
	int		rv = 0;

	READ_ENTER(&chans->lockrw);

	/* for now a port only ever has a single channel */
	chan = chans->head;
	if (chan == NULL) {
		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
		freemsg(mp);
		rv = 1;
	} else {
		/*
		 * The transmit routine is expected to free the mblk once
		 * it is done with it; with no routine installed we have
		 * to do that ourselves.
		 */
		mutex_enter(&port->tx_lock);
		if (port->transmit == NULL) {
			freemsg(mp);
		} else {
			rv = (*port->transmit)(chan, mp);
		}
		mutex_exit(&port->tx_lock);
	}

	RW_EXIT(&chans->lockrw);

	return (rv);
}
/*
 * Send packet out via descriptor ring to a logical device.
 *
 * The packet data is copied into the data buffer of a free descriptor
 * in the channel's outbound ring and the descriptor is marked ready.
 * If the ring's restart_reqd flag is set, a dring data message is also
 * sent to the peer to prompt it to start reading the ring.
 *
 * mp is freed on every path, whatever the outcome.
 *
 * Returns:
 *	LDC_TX_SUCCESS		descriptor was filled in (the result of any
 *				prompting message is ignored).
 *	LDC_TX_NORESOURCES	no free descriptor was available.
 *	LDC_TX_FAILURE		lane not active, channel not up, no handle,
 *				no outbound ring, or packet larger than
 *				ETHERMAX.
 */
static int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
vio_dring_msg_t dring_pkt;
dring_info_t *dp = NULL;
vsw_private_desc_t *priv_desc = NULL;
vnet_public_desc_t *pub = NULL;
vsw_t *vswp = ldcp->ldc_vswp;
mblk_t *bp;
size_t n, size;
caddr_t bufp;
int idx;
int status = LDC_TX_SUCCESS;
D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
/* TODO: make test a macro */
/* Refuse to send unless the outbound lane is active and the channel up. */
if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
ldcp->lane_out.lstate);
freemsg(mp);
return (LDC_TX_FAILURE);
}
/*
* Note - using first ring only, this may change
* in the future.
*
* The ring list lock is held (as reader) from here until just
* before we return or call vsw_send_msg().
*/
READ_ENTER(&ldcp->lane_out.dlistrw);
if ((dp = ldcp->lane_out.dringp) == NULL) {
RW_EXIT(&ldcp->lane_out.dlistrw);
DERR(vswp, "%s(%lld): no dring for outbound lane on"
" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
freemsg(mp);
return (LDC_TX_FAILURE);
}
/* Total payload across all mblks in the chain; reject oversize frames. */
size = msgsize(mp);
if (size > (size_t)ETHERMAX) {
RW_EXIT(&ldcp->lane_out.dlistrw);
DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
ldcp->ldc_id, size);
freemsg(mp);
return (LDC_TX_FAILURE);
}
/*
* Find a free descriptor
*
* Note: for the moment we are assuming that we will only
* have one dring going from the switch to each of its
* peers. This may change in the future.
*/
if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
D2(vswp, "%s(%lld): no descriptor available for ring "
"at 0x%llx", __func__, ldcp->ldc_id, dp);
/* nothing more we can do */
status = LDC_TX_NORESOURCES;
goto vsw_dringsend_free_exit;
} else {
D2(vswp, "%s(%lld): free private descriptor found at pos "
"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
priv_desc);
}
/*
* copy data into the descriptor, starting VNET_IPALIGN bytes into
* its buffer and flattening the mblk chain as we go
*/
bufp = priv_desc->datap;
bufp += VNET_IPALIGN;
for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
n = MBLKL(bp);
bcopy(bp->b_rptr, bufp, n);
bufp += n;
}
/* report at least ETHERMIN bytes, even for a shorter frame */
priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
pub = priv_desc->descp;
pub->nbytes = priv_desc->datalen;
/* publish the descriptor: flip its public state to READY */
mutex_enter(&priv_desc->dstate_lock);
pub->hdr.dstate = VIO_DESC_READY;
mutex_exit(&priv_desc->dstate_lock);
/*
* Determine whether or not we need to send a message to our
* peer prompting them to read our newly updated descriptor(s).
*/
mutex_enter(&dp->restart_lock);
if (dp->restart_reqd) {
/* clear the flag so only one prompt is sent per restart request */
dp->restart_reqd = B_FALSE;
mutex_exit(&dp->restart_lock);
/*
* Send a vio_dring_msg to peer to prompt them to read
* the updated descriptor ring.
*/
dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
dring_pkt.tag.vio_sid = ldcp->local_session;
/* Note - for now using first ring */
dring_pkt.dring_ident = dp->ident;
/* sequence numbers are allocated under the lane's seq_lock */
mutex_enter(&ldcp->lane_out.seq_lock);
dring_pkt.seq_num = ldcp->lane_out.seq_num++;
mutex_exit(&ldcp->lane_out.seq_lock);
/*
* If last_ack_recv is -1 then we know we've not
* received any ack's yet, so this must be the first
* msg sent, so set the start to the beginning of the ring.
* Otherwise start at the descriptor following the last one
* acknowledged, wrapping at the end of the ring.
*/
mutex_enter(&dp->dlock);
if (dp->last_ack_recv == -1) {
dring_pkt.start_idx = 0;
} else {
dring_pkt.start_idx = (dp->last_ack_recv + 1) %
dp->num_descriptors;
}
dring_pkt.end_idx = -1;
mutex_exit(&dp->dlock);
D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
ldcp->ldc_id, dp, dring_pkt.dring_ident);
D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
__func__, ldcp->ldc_id, dring_pkt.start_idx,
dring_pkt.end_idx, dring_pkt.seq_num);
/*
* Drop the ring list lock before sending: vsw_send_msg() is
* called with handle_reset set and so may reset the channel.
* Its return value is ignored; status stays LDC_TX_SUCCESS.
*/
RW_EXIT(&ldcp->lane_out.dlistrw);
(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
sizeof (vio_dring_msg_t), B_TRUE);
/* free the message block */
freemsg(mp);
return (status);
} else {
mutex_exit(&dp->restart_lock);
D2(vswp, "%s(%lld): updating descp %d", __func__,
ldcp->ldc_id, idx);
}
/* common exit for the paths that still hold the ring list lock */
vsw_dringsend_free_exit:
RW_EXIT(&ldcp->lane_out.dlistrw);
/* free the message block */
freemsg(mp);
D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
return (status);
}
/*
 * Send a packet to the peer as an in-band descriptor message.
 *
 * The packet data is copied into the buffer of a free descriptor taken
 * from the outbound ring (used here purely as a pool of buffers), and
 * a message carrying that buffer's memory cookies is written to the
 * channel.
 *
 * mp is freed on every path, whatever the outcome.
 *
 * Returns:
 *	LDC_TX_SUCCESS		message was built and handed to
 *				vsw_send_msg() (whose result is ignored).
 *	LDC_TX_NORESOURCES	no free descriptor was available.
 *	LDC_TX_FAILURE		lane not active, channel not up, no handle,
 *				no outbound ring, or packet larger than
 *				ETHERMAX.
 */
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vnet_ibnd_desc_t	ibnd_msg;
	vsw_private_desc_t	*priv_desc = NULL;
	dring_info_t		*dp = NULL;
	size_t			len, size = 0;
	caddr_t			dst;
	mblk_t			*blk;
	int			idx, c;
	int			status = LDC_TX_SUCCESS;
	/* limits the "no descriptor" error to once per shortage */
	static int		warn_msg = 1;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ASSERT(mp != NULL);

	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
		    __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		freemsg(mp);
		return (LDC_TX_FAILURE);
	}

	/*
	 * Only a single ring is expected; it serves as an internal
	 * buffer rather than as a transfer channel.
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	dp = ldcp->lane_out.dringp;
	if (dp == NULL) {
		DERR(vswp, "%s(%lld): no dring for outbound lane",
		    __func__, ldcp->ldc_id);
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
		    __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		RW_EXIT(&ldcp->lane_out.dlistrw);
		freemsg(mp);
		return (LDC_TX_FAILURE);
	}

	size = msgsize(mp);
	if (size > (size_t)ETHERMAX) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		freemsg(mp);
		return (LDC_TX_FAILURE);
	}

	/* look for a free descriptor in our buffer ring */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		if (warn_msg) {
			DERR(vswp, "%s(%lld): no descriptor available for ring "
			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
			warn_msg = 0;
		}

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos "
		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
		    priv_desc);
		warn_msg = 1;

		/* flatten the mblk chain into the descriptor's buffer */
		dst = priv_desc->datap;
		blk = mp;
		while (blk != NULL) {
			len = MBLKL(blk);
			bcopy(blk->b_rptr, dst, len);
			dst += len;
			blk = blk->b_cont;
		}

		/* the descriptor records at least ETHERMIN bytes */
		if (size < (size_t)ETHERMIN)
			priv_desc->datalen = ETHERMIN;
		else
			priv_desc->datalen = size;

		/* build the in-band descriptor message */
		ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
		ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
		ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
		ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;

		mutex_enter(&ldcp->lane_out.seq_lock);
		ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
		mutex_exit(&ldcp->lane_out.seq_lock);

		/*
		 * The cookies describing the data buffer live in the
		 * private descriptor; copy them across into the message.
		 */
		for (c = 0; c < priv_desc->ncookies; c++) {
			bcopy(&priv_desc->memcookie[c], &ibnd_msg.memcookie[c],
			    sizeof (ldc_mem_cookie_t));
		}

		ibnd_msg.hdr.desc_handle = idx;
		ibnd_msg.ncookies = priv_desc->ncookies;
		ibnd_msg.nbytes = size;

		/* the ring list lock must be dropped before sending */
		RW_EXIT(&ldcp->lane_out.dlistrw);

		(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
		    sizeof (vnet_ibnd_desc_t), B_TRUE);
	}

	/* the data has been copied (or dropped); release the mblks */
	freemsg(mp);

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
	return (status);
}
/*
 * Send a version INFO message over the channel, proposing the first
 * entry of vsw_versions[].
 *
 * arg is the channel (vsw_ldc_t *) to send on. The proposed version is
 * also recorded in the outbound lane, and the lane is flagged as having
 * sent its version info before the message is written. The result of
 * the write is ignored.
 */
static void
vsw_send_ver(void *arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lane = &ldcp->lane_out;
	vio_ver_msg_t	msg;

	D1(vswp, "%s enter", __func__);

	/* message tag */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_VER_INFO;
	msg.tag.vio_sid = ldcp->local_session;

	/* payload: the version we propose and what kind of device we are */
	msg.ver_major = vsw_versions[0].ver_major;
	msg.ver_minor = vsw_versions[0].ver_minor;
	msg.dev_class = VDEV_NETWORK_SWITCH;

	/* remember what was proposed on this lane */
	lane->lstate |= VSW_VER_INFO_SENT;
	lane->ver_major = msg.ver_major;
	lane->ver_minor = msg.ver_minor;

	DUMP_TAG(msg.tag);

	(void) vsw_send_msg(ldcp, &msg, sizeof (vio_ver_msg_t), B_TRUE);

	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
}
/*
 * Send an attribute INFO message over the channel.
 *
 * The payload is taken from the settings held for the outbound lane
 * (mtu, address type, transfer mode and ack frequency) together with
 * the switch's own interface address, which is read under if_lockrw.
 * The lane is flagged as having sent its attribute info before the
 * message is written; the result of the write is ignored.
 */
static void
vsw_send_attr(vsw_ldc_t *ldcp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	lane_t			*lp = &ldcp->lane_out;
	vnet_attr_msg_t		attr_msg;

	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);

	/*
	 * Subtype is set to INFO by default
	 */
	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
	attr_msg.tag.vio_sid = ldcp->local_session;

	/* payload copied from default settings for lane */
	attr_msg.mtu = lp->mtu;
	attr_msg.addr_type = lp->addr_type;
	attr_msg.xfer_mode = lp->xfer_mode;
	/*
	 * Advertise the lane's ack frequency. This used to be filled in
	 * from xfer_mode, which put the transfer mode value on the wire
	 * in the ack frequency field.
	 */
	attr_msg.ack_freq = lp->ack_freq;

	READ_ENTER(&vswp->if_lockrw);
	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
	RW_EXIT(&vswp->if_lockrw);

	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;

	DUMP_TAG(attr_msg.tag);

	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);

	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}
/*
 * Build a descriptor ring registration (VIO_DRING_REG) message for the
 * channel. As a side effect a new descriptor ring is created via
 * vsw_create_dring().
 *
 * Returns a kmem-allocated message, sizeof (vio_dring_reg_msg_t) bytes
 * long, which the caller must free; or NULL if the ring could not be
 * created.
 */
static vio_dring_reg_msg_t *
vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
{
	vio_dring_reg_msg_t	*msg;
	dring_info_t		*ring;
	vsw_t			*vswp = ldcp->ldc_vswp;

	D1(vswp, "vsw_create_dring_info_pkt enter\n");

	/* without a ring there is nothing to tell the peer about */
	ring = vsw_create_dring(ldcp);
	if (ring == NULL)
		return (NULL);

	msg = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);

	/* message tag */
	msg->tag.vio_msgtype = VIO_TYPE_CTRL;
	msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg->tag.vio_subtype_env = VIO_DRING_REG;
	msg->tag.vio_sid = ldcp->local_session;

	/* payload: geometry of the ring plus its first cookie */
	msg->num_descriptors = ring->num_descriptors;
	msg->descriptor_size = ring->descriptor_size;
	msg->options = ring->options;
	msg->ncookies = ring->ncookies;
	bcopy(&ring->cookie[0], &msg->cookie[0], sizeof (ldc_mem_cookie_t));

	/* no identifier is filled in on our side */
	msg->dring_ident = 0;

	D1(vswp, "vsw_create_dring_info_pkt exit\n");

	return (msg);
}
/*
 * Create a descriptor ring for the channel and send the matching
 * registration message to the peer.
 *
 * If the message (and hence the ring) cannot be created a warning is
 * logged and nothing is sent. Otherwise the outbound lane is flagged
 * as having sent its dring info, the message is written (result
 * ignored) and then freed.
 */
static void
vsw_send_dring_info(vsw_ldc_t *ldcp)
{
	vio_dring_reg_msg_t	*msg;
	vsw_t			*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);

	msg = vsw_create_dring_info_pkt(ldcp);
	if (msg == NULL) {
		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
		    vswp->instance, __func__);
		return;
	}

	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;

	DUMP_TAG_PTR((vio_msg_tag_t *)msg);

	(void) vsw_send_msg(ldcp, msg, sizeof (vio_dring_reg_msg_t), B_TRUE);

	/* the message was only needed for the duration of the write */
	kmem_free(msg, sizeof (vio_dring_reg_msg_t));

	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
}
/*
 * Send an RDX INFO message over the channel.
 *
 * The message consists of a tag only. The inbound lane is flagged as
 * having sent RDX before the message is written; the result of the
 * write is ignored.
 */
static void
vsw_send_rdx(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vio_rdx_msg_t	msg;

	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);

	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = ldcp->local_session;

	/* note that the flag is kept on the inbound lane */
	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;

	DUMP_TAG(msg.tag);

	(void) vsw_send_msg(ldcp, &msg, sizeof (vio_rdx_msg_t), B_TRUE);

	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}
/*
 * Generic routine to send message out over ldc channel.
 *
 *	ldcp		- channel to write to.
 *	msgp		- message to send; it starts with a vio_msg_tag_t.
 *	size		- length of the message in bytes.
 *	handle_reset	- if B_TRUE and the write reports that the channel
 *			  was reset, the reset is processed here.
 *
 * The write is attempted under ldc_txlock and is retried while it
 * returns EWOULDBLOCK, up to vsw_wretries attempts in total (always at
 * least one). The retry budget is taken from vsw_wretries afresh on
 * every call; the tunable itself is never modified.
 *
 * It is possible that when we attempt to write over the ldc channel
 * that we get notified that it has been reset. Depending on the value
 * of the handle_reset flag we either handle that event here or simply
 * notify the caller that the channel was reset.
 *
 * Returns the result of the last ldc_write() attempt (0 on success).
 */
static int
vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
{
	int		rv;
	int		retries = vsw_wretries;	/* per-call retry budget */
	size_t		msglen = size;
	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
	    ldcp->ldc_id, size);

	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);

	mutex_enter(&ldcp->ldc_txlock);
	do {
		/* ldc_write() updates msglen, so reset it for each attempt */
		msglen = size;
		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
	} while (rv == EWOULDBLOCK && --retries > 0);

	if ((rv != 0) || (msglen != size)) {
		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
		    "rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
		    rv, size, msglen);
	}
	mutex_exit(&ldcp->ldc_txlock);

	/*
	 * If channel has been reset we either handle it here or
	 * simply report back that it has been reset and let caller
	 * decide what to do.
	 */
	if (rv == ECONNRESET) {
		DWARN(vswp, "%s (%lld) channel reset",
		    __func__, ldcp->ldc_id);

		/*
		 * N.B - must never be holding the dlistrw lock when
		 * we do a reset of the channel.
		 */
		if (handle_reset) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
		}
	}

	return (rv);
}
/*
 * Register a port in the forwarding database (FDB), keyed by a hash
 * of the port's MAC address.
 *
 * Returns 0 on success, 1 if the entry could not be inserted.
 *
 * The caller must hold the lock protecting the FDB.
 */
static int
vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
{
	uint64_t	key = 0;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	KEY_HASH(key, port->p_macaddr);

	D2(vswp, "%s: key = 0x%llx", __func__, key);

	/* mod_hash refuses duplicate keys, which shows up as a failure */
	rv = mod_hash_insert(vswp->fdb, (mod_hash_key_t)key,
	    (mod_hash_val_t)port);
	if (rv != 0) {
		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
		return (1);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);
}
/*
 * Remove the entry for the given port's MAC address from the FDB.
 *
 * Always returns 0: the result of mod_hash_destroy() is deliberately
 * ignored, so asking to remove a port which has no entry is not
 * treated as an error.
 */
static int
vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
{
	uint64_t	addr = 0;

	D1(vswp, "%s: enter", __func__);

	KEY_HASH(addr, port->p_macaddr);

	D2(vswp, "%s: key = 0x%llx", __func__, addr);

	/* addr is the hash key here, so cast it as one */
	(void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Look up the port registered in the FDB for the destination address
 * of the given ethernet header.
 *
 * Returns the port, or NULL if no entry exists for that address.
 */
static vsw_port_t *
vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
{
	uint64_t	hkey = 0;
	vsw_port_t	*found = NULL;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	KEY_HASH(hkey, ehp->ether_dhost);

	D2(vswp, "%s: key = 0x%llx", __func__, hkey);

	rv = mod_hash_find(vswp->fdb, (mod_hash_key_t)hkey,
	    (mod_hash_val_t *)&found);
	if (rv != 0) {
		D2(vswp, "%s: no port found", __func__);
		return (NULL);
	}

	D1(vswp, "%s: exit", __func__);
	return (found);
}
/*
 * Add or remove multicast address(es) on behalf of a port.
 *
 *	mcst_pkt - multicast message received from the port's peer; its
 *		   'set' field selects add (0x1) or remove (anything
 *		   else) and 'count' gives the number of addresses in
 *		   'mca'.
 *	port	 - port the request applies to.
 *
 * For each address three pieces of state are kept in step: the
 * switch-wide multicast table (via vsw_add_mcst()/vsw_del_mcst()), the
 * port's own list of addresses (port->mcap), and the address filter of
 * the underlying MAC (mac_multicst_add()/mac_multicst_remove()).
 *
 * Processing stops at the first address that fails; addresses handled
 * before it are left as they are. When an add fails part way through,
 * the steps already taken for that one address are undone.
 *
 * NOTE(review): mcst_pkt->count is used as the loop bound without being
 * range checked here - presumably the caller validates it; confirm.
 *
 * Returns 0 on success, 1 on failure (including when no MAC handle is
 * available).
 */
static int
vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
{
	mcst_addr_t	*mcst_p = NULL;
	vsw_t		*vswp = port->p_vswp;
	uint64_t	addr = 0x0;
	int		i;

	D1(vswp, "%s: enter", __func__);

	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);

	/* nothing can be programmed without a MAC handle */
	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}
	mutex_exit(&vswp->mac_lock);

	for (i = 0; i < mcst_pkt->count; i++) {
		/*
		 * Convert address into form that can be used
		 * as hash table key.
		 */
		KEY_HASH(addr, mcst_pkt->mca[i]);

		/*
		 * Add or delete the specified address/port combination.
		 */
		if (mcst_pkt->set == 0x1) {
			D3(vswp, "%s: adding multicast address 0x%llx for "
			    "port %ld", __func__, addr, port->p_instance);
			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
				/*
				 * Update the list of multicast
				 * addresses contained within the
				 * port structure to include this new
				 * one.
				 */
				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
				    KM_NOSLEEP);
				if (mcst_p == NULL) {
					DERR(vswp, "%s: unable to alloc mem",
					    __func__);
					/*
					 * Back out the table entry added
					 * above. Without a matching entry
					 * on the port's list it would
					 * never be removed again and would
					 * keep referring to this port.
					 */
					(void) vsw_del_mcst(vswp,
					    VSW_VNETPORT, addr, port);
					return (1);
				}

				mcst_p->addr = addr;

				/* push onto the head of the port's list */
				mutex_enter(&port->mca_lock);
				mcst_p->nextp = port->mcap;
				port->mcap = mcst_p;
				mutex_exit(&port->mca_lock);

				/*
				 * Program the address into HW. If the addr
				 * has already been programmed then the MAC
				 * just increments a ref counter (which is
				 * used when the address is being deleted)
				 */
				mutex_enter(&vswp->mac_lock);
				if ((vswp->mh == NULL) ||
				    mac_multicst_add(vswp->mh,
				    (uchar_t *)&mcst_pkt->mca[i])) {
					mutex_exit(&vswp->mac_lock);
					cmn_err(CE_WARN, "!vsw%d: unable to "
					    "add multicast address",
					    vswp->instance);
					/* undo both bookkeeping steps */
					(void) vsw_del_mcst(vswp, VSW_VNETPORT,
					    addr, port);
					vsw_del_addr(VSW_VNETPORT, port, addr);
					return (1);
				}
				mutex_exit(&vswp->mac_lock);

			} else {
				DERR(vswp, "%s: error adding multicast "
				    "address 0x%llx for port %ld",
				    __func__, addr, port->p_instance);
				return (1);
			}
		} else {
			/*
			 * Delete an entry from the multicast hash
			 * table and update the address list
			 * appropriately.
			 */
			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
				D3(vswp, "%s: deleting multicast address "
				    "0x%llx for port %ld", __func__, addr,
				    port->p_instance);

				vsw_del_addr(VSW_VNETPORT, port, addr);

				/*
				 * Remove the address from HW. The address
				 * will actually only be removed once the ref
				 * count within the MAC layer has dropped to
				 * zero. I.e. we can safely call this fn even
				 * if other ports are interested in this
				 * address.
				 */
				mutex_enter(&vswp->mac_lock);
				if ((vswp->mh == NULL) ||
				    mac_multicst_remove(vswp->mh,
				    (uchar_t *)&mcst_pkt->mca[i])) {
					mutex_exit(&vswp->mac_lock);
					cmn_err(CE_WARN, "!vsw%d: unable to "
					    "remove multicast address",
					    vswp->instance);
					return (1);
				}
				mutex_exit(&vswp->mac_lock);

			} else {
				DERR(vswp, "%s: error deleting multicast "
				    "addr 0x%llx for port %ld",
				    __func__, addr, port->p_instance);
				return (1);
			}
		}
	}
	D1(vswp, "%s: exit", __func__);
	return (0);
}
/*
 * Register a device's interest in a multicast address.
 *
 *	vswp	- switch instance owning the multicast table (mfdb).
 *	devtype	- VSW_VNETPORT when called on behalf of a vnet port;
 *		  any other value is treated as the switch itself.
 *	addr	- multicast address in hash key form.
 *	arg	- the port (vsw_port_t *) when devtype is VSW_VNETPORT;
 *		  not used otherwise.
 *
 * The table maps an address to a chain of interested devices. If the
 * address is not yet present a new one-element chain is inserted;
 * otherwise the device is appended to the end of the existing chain,
 * unless it is already on it.
 *
 * Returns 0 on success, 1 if the device was already registered for
 * the address or the table insertion failed.
 */
static int
vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
{
	mfdb_ent_t	*head = NULL;
	mfdb_ent_t	*walk;
	mfdb_ent_t	*tail = NULL;
	mfdb_ent_t	*fresh;
	void		*tgt;
	int		rv = 0;

	/* work out what identifies the device on the chain */
	if (devtype != VSW_VNETPORT) {
		/*
		 * We are being invoked via the m_multicst mac entry
		 * point.
		 */
		D2(NULL, "%s: address 0x%llx", __func__, addr);
		tgt = (void *)vswp;
	} else {
		/* being invoked on behalf of a vnet */
		ASSERT(arg != NULL);
		tgt = arg;
		D2(NULL, "%s: port %d : address 0x%llx", __func__,
		    ((vsw_port_t *)arg)->p_instance, addr);
	}

	WRITE_ENTER(&vswp->mfdbrw);

	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
	    (mod_hash_val_t *)&head) != 0) {
		/* first device to ask for this address: start a chain */
		fresh = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
		fresh->d_addr = (void *)tgt;
		fresh->d_type = devtype;
		fresh->nextp = NULL;

		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
		    (mod_hash_val_t)fresh) == 0) {
			D2(vswp, "%s: added initial entry for 0x%llx to "
			    "table", __func__, addr);
		} else {
			DERR(vswp, "%s: hash table insertion failed", __func__);
			kmem_free(fresh, sizeof (mfdb_ent_t));
			rv = 1;
		}
	} else {
		/*
		 * The address already has a chain. Walk it once, looking
		 * for this device and remembering the last element in
		 * case we need to append.
		 */
		for (walk = head; walk != NULL; walk = walk->nextp) {
			if (walk->d_addr == (void *)tgt)
				break;
			tail = walk;
		}

		if (walk != NULL) {
			/* device is already registered for this address */
			if (devtype == VSW_VNETPORT) {
				DERR(vswp, "%s: duplicate port entry "
				    "found for portid %ld and key "
				    "0x%llx", __func__,
				    ((vsw_port_t *)arg)->p_instance,
				    addr);
			} else {
				DERR(vswp, "%s: duplicate entry found"
				    "for key 0x%llx",
				    __func__, addr);
			}
			rv = 1;
		} else {
			/* not on the chain yet, so add it to the end */
			D2(vswp, "%s: added entry for 0x%llx to table",
			    __func__, addr);
			fresh = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
			fresh->d_addr = (void *)tgt;
			fresh->d_type = devtype;
			fresh->nextp = NULL;

			tail->nextp = fresh;
		}
	}

	RW_EXIT(&vswp->mfdbrw);
	return (rv);
}
/*
 * Remove a device's interest in a multicast address.
 *
 *	vswp	- switch instance owning the multicast table (mfdb).
 *	devtype	- VSW_VNETPORT when called on behalf of a vnet port;
 *		  any other value is treated as the switch itself.
 *	addr	- multicast address in hash key form.
 *	arg	- the port (vsw_port_t *) when devtype is VSW_VNETPORT;
 *		  not used otherwise.
 *
 * Search hash table based on address. If match found, scan
 * list of devices associated with address. If the specified device is
 * found it is removed from the list; when it was the only element the
 * table entry for the address is destroyed as well.
 *
 * Returns 0 if the device was removed, 1 if the address is not in the
 * table or the device is not registered for it. Reporting the second
 * case as a failure stops callers from releasing other per-address
 * state (such as the MAC layer reference) that was never taken.
 */
static int
vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
{
	mfdb_ent_t	*ment = NULL;
	mfdb_ent_t	*curr_p, *prev_p;
	void		*tgt = NULL;
	int		rv = 1;		/* until the device is found */

	D1(vswp, "%s: enter", __func__);

	if (devtype == VSW_VNETPORT) {
		tgt = (vsw_port_t *)arg;
		D2(vswp, "%s: removing port %d from mFDB for address"
		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
		    addr);
	} else {
		D2(vswp, "%s: removing entry", __func__);
		tgt = (void *)vswp;
	}

	WRITE_ENTER(&vswp->mfdbrw);
	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
	    (mod_hash_val_t *)&ment) != 0) {
		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
		RW_EXIT(&vswp->mfdbrw);
		return (1);
	}

	prev_p = curr_p = ment;

	while (curr_p != NULL) {
		if (curr_p->d_addr == (void *)tgt) {
			if (devtype == VSW_VNETPORT) {
				D2(vswp, "%s: port %d found", __func__,
				    ((vsw_port_t *)tgt)->p_instance);
			} else {
				D2(vswp, "%s: instance found", __func__);
			}

			if (prev_p == curr_p) {
				/*
				 * head of list, if no other element is in
				 * list then destroy this entry, otherwise
				 * just replace it with updated value.
				 */
				ment = curr_p->nextp;
				kmem_free(curr_p, sizeof (mfdb_ent_t));
				if (ment == NULL) {
					/* addr is the hash key here */
					(void) mod_hash_destroy(vswp->mfdb,
					    (mod_hash_key_t)addr);
				} else {
					(void) mod_hash_replace(vswp->mfdb,
					    (mod_hash_key_t)addr,
					    (mod_hash_val_t)ment);
				}
			} else {
				/*
				 * Not head of list, no need to do
				 * replacement, just adjust list pointers.
				 */
				prev_p->nextp = curr_p->nextp;
				kmem_free(curr_p, sizeof (mfdb_ent_t));
			}
			rv = 0;
			break;
		}

		prev_p = curr_p;
		curr_p = curr_p->nextp;
	}

	RW_EXIT(&vswp->mfdbrw);

	D1(vswp, "%s: exit", __func__);

	return (rv);
}
/*
 * Drop every multicast registration held by a port that is being
 * deleted.
 *
 * The port keeps a list (port->mcap) of the addresses it registered.
 * For each one the port is removed from the switch's multicast table
 * and the list element is freed, leaving the list empty. The whole
 * operation runs under the port's mca_lock.
 */
static void
vsw_del_mcst_port(vsw_port_t *port)
{
	mcst_addr_t	*cur;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&port->mca_lock);
	while ((cur = port->mcap) != NULL) {
		(void) vsw_del_mcst(vswp, VSW_VNETPORT, cur->addr, port);

		/* unlink the head of the list, then release it */
		port->mcap = cur->nextp;
		kmem_free(cur, sizeof (mcst_addr_t));
	}
	mutex_exit(&port->mca_lock);

	D1(vswp, "%s: exit", __func__);
}
/*
 * Drop every multicast registration held by the switch instance
 * itself, for use when the instance is detaching.
 *
 * The instance keeps a list (vswp->mcap) of the addresses it
 * registered. For each one the instance is removed from its multicast
 * table and the list element is freed, leaving the list empty. The
 * whole operation runs under the instance's mca_lock.
 */
static void
vsw_del_mcst_vsw(vsw_t *vswp)
{
	mcst_addr_t	*cur;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&vswp->mca_lock);
	while ((cur = vswp->mcap) != NULL) {
		DERR(vswp, "%s: deleting addr 0x%llx",
		    __func__, cur->addr);
		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, cur->addr, NULL);

		/* unlink the head of the list, then release it */
		vswp->mcap = cur->nextp;
		kmem_free(cur, sizeof (mcst_addr_t));
	}
	mutex_exit(&vswp->mca_lock);

	D1(vswp, "%s: exit", __func__);
}
/*
 * Remove one address from the list of multicast addresses kept by a
 * port or by the switch instance.
 *
 *	devtype	- VSW_VNETPORT if arg is a port (vsw_port_t *); for any
 *		  other value arg is taken to be the instance (vsw_t *).
 *	arg	- owner of the address list.
 *	addr	- address to remove, in hash key form.
 *
 * The first list element with a matching address is unlinked and
 * freed; if there is no such element the list is left untouched. The
 * list is modified under its owner's mca_lock.
 */
static void
vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
{
	kmutex_t	*lockp;
	mcst_addr_t	**linkp;
	mcst_addr_t	*cur;

	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
	    __func__, devtype, addr);

	/* pick the list, and the lock guarding it, for this owner */
	if (devtype == VSW_VNETPORT) {
		lockp = &((vsw_port_t *)arg)->mca_lock;
		linkp = &((vsw_port_t *)arg)->mcap;
	} else {
		lockp = &((vsw_t *)arg)->mca_lock;
		linkp = &((vsw_t *)arg)->mcap;
	}

	mutex_enter(lockp);

	/*
	 * linkp always addresses the pointer that leads to the current
	 * element, so unlinking works the same way at the head of the
	 * list as anywhere else.
	 */
	for (cur = *linkp; cur != NULL; cur = *linkp) {
		if (cur->addr == addr) {
			D2(NULL, "%s: address found", __func__);
			*linkp = cur->nextp;
			kmem_free(cur, sizeof (mcst_addr_t));
			break;
		}
		linkp = &cur->nextp;
	}

	mutex_exit(lockp);

	D1(NULL, "%s: exit", __func__);
}
/*
 * Create a descriptor ring (dring) with both a public and a private
 * section, bind it to the channel and append it to the channel's list
 * of outbound rings.
 *
 * Returns the new ring, or NULL if any step of the creation failed.
 */
static dring_info_t *
vsw_create_dring(vsw_ldc_t *ldcp)
{
	vsw_private_desc_t	*desc = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;
	dring_info_t		*dp, *tail;
	int			idx;

	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);

	/* public section of the ring */
	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
	    VSW_PUB_SIZE, &dp->handle)) != 0) {
		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
		    "failed", ldcp->ldc_id);
		goto fail_free_ring;
	}

	ASSERT(dp->handle != NULL);

	/* find out where the public section lives */
	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
		    ldcp->ldc_id);
		goto fail_destroy_dring;
	}
	ASSERT(minfo.vaddr != 0);
	dp->pub_addr = minfo.vaddr;

	dp->num_descriptors = VSW_RING_NUM_EL;
	dp->descriptor_size = VSW_PUB_SIZE;
	dp->options = VIO_TX_DRING;
	dp->ncookies = 1;	/* guaranteed by ldc */

	/* private section of the ring */
	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);

	if (vsw_setup_ring(ldcp, dp)) {
		DERR(vswp, "%s: unable to setup ring", __func__);
		goto fail_destroy_dring;
	}

	/* no descriptors used, no acks seen yet */
	dp->end_idx = 0;
	dp->last_ack_recv = -1;

	/*
	 * Bind the ring to the channel.
	 *
	 * NOTE(review): if this fails after vsw_setup_ring() succeeded,
	 * the cleanup below frees the descriptor memory handles but does
	 * not appear to release dp->data_addr - confirm whether that is
	 * dealt with elsewhere.
	 */
	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
	    LDC_SHADOW_MAP, LDC_MEM_RW,
	    &dp->cookie[0], &dp->ncookies)) != 0) {
		DERR(vswp, "vsw_create_dring: unable to bind to channel "
		    "%lld", ldcp->ldc_id);
		goto fail_destroy_dring;
	}

	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
	dp->restart_reqd = B_TRUE;

	/*
	 * Rings are only ever created for the outgoing lane; the new
	 * one goes on the end of that lane's list.
	 */
	WRITE_ENTER(&ldcp->lane_out.dlistrw);
	tail = ldcp->lane_out.dringp;
	if (tail != NULL) {
		while (tail->next != NULL)
			tail = tail->next;
		tail->next = dp;
	} else {
		D2(vswp, "vsw_create_dring: adding first outbound ring");
		ldcp->lane_out.dringp = dp;
	}
	RW_EXIT(&ldcp->lane_out.dlistrw);

	return (dp);

fail_destroy_dring:
	(void) ldc_mem_dring_destroy(dp->handle);

fail_free_ring:
	/* release whatever part of the private section exists */
	if (dp->priv_addr != NULL) {
		for (idx = 0; idx < VSW_RING_NUM_EL; idx++) {
			desc = &dp->priv_addr[idx];
			if (desc->memhandle != NULL)
				(void) ldc_mem_free_handle(desc->memhandle);
		}
		kmem_free(dp->priv_addr,
		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
	}
	mutex_destroy(&dp->dlock);

	kmem_free(dp, sizeof (dring_info_t));
	return (NULL);
}
/*
 * Create a ring which has a private section only, and append it to the
 * list of rings for the channel's outbound lane.
 *
 * Rings of this kind serve mainly as temporary data storage (i.e. as
 * data buffers).
 *
 * Nothing is returned: if the ring cannot be set up an error is logged,
 * everything allocated here is released and the lane's list is left
 * unchanged.
 */
void
vsw_create_privring(vsw_ldc_t *ldcp)
{
	dring_info_t	*ring, *tail;
	vsw_t		*vswp = ldcp->ldc_vswp;
	size_t		priv_sz = sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ring = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

	mutex_init(&ring->dlock, NULL, MUTEX_DRIVER, NULL);

	/* there is deliberately no public section */
	ring->pub_addr = NULL;
	ring->priv_addr = kmem_zalloc(priv_sz, KM_SLEEP);
	ring->num_descriptors = VSW_RING_NUM_EL;

	if (vsw_setup_ring(ldcp, ring)) {
		DERR(vswp, "%s: setup of ring failed", __func__);
		kmem_free(ring->priv_addr, priv_sz);
		mutex_destroy(&ring->dlock);
		kmem_free(ring, sizeof (dring_info_t));
		return;
	}

	/* no descriptors have been used yet */
	ring->end_idx = 0;

	mutex_init(&ring->restart_lock, NULL, MUTEX_DRIVER, NULL);
	ring->restart_reqd = B_TRUE;

	/*
	 * Rings are only ever created for the outgoing lane; the new
	 * one goes on the end of that lane's list.
	 */
	WRITE_ENTER(&ldcp->lane_out.dlistrw);
	tail = ldcp->lane_out.dringp;
	if (tail != NULL) {
		while (tail->next != NULL)
			tail = tail->next;
		tail->next = ring;
	} else {
		D2(vswp, "%s: adding first outbound privring", __func__);
		ldcp->lane_out.dringp = ring;
	}
	RW_EXIT(&ldcp->lane_out.dlistrw);

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
/*
 * Setup the descriptors in the dring.
 *
 * Allocates one data area of VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ bytes
 * (recorded in dp->data_addr / dp->data_sz) and, for every private
 * descriptor, initialises its state lock, allocates an LDC memory handle,
 * binds that handle to the descriptor's slice of the data area and
 * records the resulting cookies. When the ring has a public section the
 * cookies and the FREE state are mirrored into the matching public
 * descriptor.
 *
 * dp->priv_addr must be non-NULL; dp->pub_addr may be NULL.
 *
 * Returns 0 on success, 1 on failure. On failure every descriptor touched
 * so far (including the one that was being set up when the error
 * occurred) is unbound, its handle freed and cleared, its lock destroyed,
 * and the data area is released; dp itself and dp->priv_addr remain owned
 * by the caller.
 */
int
vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
{
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	uint64_t		*tmpp;
	uint64_t		offset = 0;
	uint32_t		ncookies = 0;
	static char		*name = "vsw_setup_ring";
	int			i, j, nc, rv;

	priv_addr = dp->priv_addr;
	pub_addr = dp->pub_addr;

	/* public section may be null but private should never be */
	ASSERT(priv_addr != NULL);

	/*
	 * Allocate the region of memory which will be used to hold
	 * the data the descriptors will refer to.
	 */
	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);

	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
	    dp->data_sz, dp->data_addr);

	/*
	 * tmpp walks the data area in uint64_t units, so the per-element
	 * stride is the element size divided by the size of what tmpp
	 * points at (not the size of the pointer itself).
	 */
	tmpp = (uint64_t *)dp->data_addr;
	offset = VSW_RING_EL_DATA_SZ / sizeof (*tmpp);

	/*
	 * Initialise some of the private and public (if they exist)
	 * descriptor fields.
	 */
	for (i = 0; i < VSW_RING_NUM_EL; i++) {
		/*
		 * Start from a known state so the cleanup path can tell
		 * exactly how far setup of this descriptor got.
		 */
		priv_addr->memhandle = NULL;
		priv_addr->bound = 0;

		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);

		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
		    &priv_addr->memhandle)) != 0) {
			DERR(vswp, "%s: alloc mem handle failed", name);
			/* make sure cleanup does not free a bogus handle */
			priv_addr->memhandle = NULL;
			goto setup_ring_cleanup;
		}

		priv_addr->datap = (void *)tmpp;

		rv = ldc_mem_bind_handle(priv_addr->memhandle,
		    (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
		    &(priv_addr->memcookie[0]), &ncookies);
		if (rv != 0) {
			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
			    "(rv %d)", name, ldcp->ldc_id, rv);
			goto setup_ring_cleanup;
		}
		priv_addr->bound = 1;

		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
		    name, i, priv_addr->memcookie[0].addr,
		    priv_addr->memcookie[0].size);

		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
			    "invalid num of cookies (%d) for size 0x%llx",
			    name, ldcp->ldc_id, ncookies,
			    VSW_RING_EL_DATA_SZ);

			goto setup_ring_cleanup;
		} else {
			/* cookie 0 came back from the bind; fetch the rest */
			for (j = 1; j < ncookies; j++) {
				rv = ldc_mem_nextcookie(priv_addr->memhandle,
				    &(priv_addr->memcookie[j]));
				if (rv != 0) {
					DERR(vswp, "%s: ldc_mem_nextcookie "
					    "failed rv (%d)", name, rv);
					goto setup_ring_cleanup;
				}
				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
				    "size 0x%llx", name, j,
				    priv_addr->memcookie[j].addr,
				    priv_addr->memcookie[j].size);
			}

		}
		priv_addr->ncookies = ncookies;
		priv_addr->dstate = VIO_DESC_FREE;

		if (pub_addr != NULL) {

			/* link pub and private sides */
			priv_addr->descp = pub_addr;

			pub_addr->ncookies = priv_addr->ncookies;

			for (nc = 0; nc < pub_addr->ncookies; nc++) {
				bcopy(&priv_addr->memcookie[nc],
				    &pub_addr->memcookie[nc],
				    sizeof (ldc_mem_cookie_t));
			}

			pub_addr->hdr.dstate = VIO_DESC_FREE;
			pub_addr++;
		}

		/*
		 * move to next element in the dring and the next
		 * position in the data buffer.
		 */
		priv_addr++;
		tmpp += offset;
	}

	return (0);

setup_ring_cleanup:
	/*
	 * Descriptors 0 .. i-1 are fully set up. Descriptor i always has
	 * an initialised lock and may additionally hold an allocated and
	 * possibly bound handle, so it has to be torn down as well
	 * (hence "<= i"). Handles are cleared once freed so that a caller
	 * which frees every non-NULL memhandle during its own error
	 * handling cannot free them a second time.
	 */
	priv_addr = dp->priv_addr;

	for (j = 0; j <= i; j++) {
		if (priv_addr->memhandle != NULL) {
			if (priv_addr->bound == 1) {
				(void) ldc_mem_unbind_handle(
				    priv_addr->memhandle);
				priv_addr->bound = 0;
			}
			(void) ldc_mem_free_handle(priv_addr->memhandle);
			priv_addr->memhandle = NULL;
		}

		mutex_destroy(&priv_addr->dstate_lock);

		priv_addr++;
	}
	kmem_free(dp->data_addr, dp->data_sz);
	/* do not leave a dangling pointer to the released data area */
	dp->data_addr = NULL;

	return (1);
}
/*
 * Try to claim a free descriptor from the private section of a ring.
 *
 * Only the descriptor at dringp->end_idx (the position following the
 * last descriptor handed out) is examined. If it is in the
 * VIO_DESC_FREE state it is switched to VIO_DESC_READY under its state
 * lock, *priv_p and *idx are set to the descriptor and its index, and
 * end_idx advances by one, wrapping at VSW_RING_NUM_EL.
 *
 * Returns 0 if a descriptor was claimed, otherwise 1 (in which case
 * *priv_p and *idx are left untouched).
 *
 * FUTURE: might need to return contiguous range of descriptors
 * as dring info msg assumes all will be contiguous.
 */
static int
vsw_dring_find_free_desc(dring_info_t *dringp,
    vsw_private_desc_t **priv_p, int *idx)
{
	vsw_private_desc_t	*cand;
	int			nelems = VSW_RING_NUM_EL;
	int			status;

	D1(NULL, "%s enter\n", __func__);

	ASSERT(dringp->priv_addr != NULL);

	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
	    __func__, dringp, dringp->end_idx);

	/* candidate is the slot right after the last one handed out */
	cand = &((vsw_private_desc_t *)dringp->priv_addr)[dringp->end_idx];

	mutex_enter(&cand->dstate_lock);
	if (cand->dstate != VIO_DESC_FREE) {
		status = 1;
	} else {
		cand->dstate = VIO_DESC_READY;
		*priv_p = cand;
		*idx = dringp->end_idx;
		dringp->end_idx = (dringp->end_idx + 1) % nelems;
		status = 0;
	}
	mutex_exit(&cand->dstate_lock);

	/* ring full */
	if (status == 1) {
		D2(NULL, "%s: no desp free: started at %d", __func__,
		    dringp->end_idx);
	}

	D1(NULL, "%s: exit\n", __func__);

	return (status);
}
/*
 * Look up a ring on the given lane by its dring identifier.
 *
 * Walks the lane's ring list from the head and returns the first ring
 * whose ident equals the one supplied, or NULL when the list is empty
 * or holds no such ring.
 *
 * Should be called with dlistrw rwlock held as reader.
 */
static dring_info_t *
vsw_ident2dring(lane_t *lane, uint64_t ident)
{
	dring_info_t	*ring;

	for (ring = lane->dringp; ring != NULL; ring = ring->next) {
		if (ring->ident == ident)
			return (ring);
	}

	return (NULL);
}
/*
 * Reset a lane to the default attributes. These are copied into
 * the attr msg we send to our peer. If they are not acceptable
 * then (currently) the handshake ends.
 *
 * The whole lane_t is zeroed first; afterwards the MTU, address type,
 * transfer mode, ack frequency, initial sequence number and the
 * switch's own interface address are filled in.
 *
 * NOTE(review): the bzero() also clears seq_lock, which is acquired a
 * few lines further down, together with every other lane member (other
 * functions in this file use lp->dlistrw and lp->dringp). Confirm that
 * callers only pass lanes holding no live rings and that the locks are
 * valid in an all-zero state.
 */
static void
vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
{
	bzero(lp, sizeof (lane_t));

	/* fixed defaults */
	lp->ack_freq = 0;	/* for shared mode */
	lp->xfer_mode = VIO_DRING_MODE;
	lp->addr_type = ADDR_TYPE_MAC;
	lp->mtu = VSW_MTU;

	/* our address is the switch's interface address */
	READ_ENTER(&vswp->if_lockrw);
	ether_copy(&(vswp->if_addr), &(lp->addr));
	RW_EXIT(&vswp->if_lockrw);

	/* sequence numbering restarts at the initial value */
	mutex_enter(&lp->seq_lock);
	lp->seq_num = VNET_ISS;
	mutex_exit(&lp->seq_lock);
}
/*
 * Verify that the attributes in a peer's attr message are acceptable.
 *
 * Every check is evaluated (there is no early exit), so each problem
 * gets its own debug message. Returns 0 if all attributes are
 * acceptable, 1 otherwise. A MAC address that differs from the one
 * recorded for the port is only reported; it does not cause a failure.
 *
 * FUTURE: If some attributes are not acceptable, change them
 * to our desired values.
 */
static int
vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
{
	int	bad = 0;

	D1(NULL, "vsw_check_attr enter\n");

	/*
	 * Note we currently only support in-band descriptors
	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
	 */
	if (!((pkt->xfer_mode == VIO_DESC_MODE) ||
	    (pkt->xfer_mode == VIO_DRING_MODE))) {
		D2(NULL, "vsw_check_attr: unknown mode %x\n",
		    pkt->xfer_mode);
		bad = 1;
	}

	/* Only support MAC addresses at moment. */
	if (!((pkt->addr_type == ADDR_TYPE_MAC) && (pkt->addr != 0))) {
		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
		    "or address 0x%llx\n", pkt->addr_type,
		    pkt->addr);
		bad = 1;
	}

	/*
	 * MAC address supplied by device should match that stored
	 * in the vsw-port OBP node. Need to decide what to do if they
	 * don't match, for the moment just warn but don't fail.
	 */
	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
		DERR(NULL, "vsw_check_attr: device supplied address "
		    "0x%llx doesn't match node address 0x%llx\n",
		    pkt->addr, port->p_macaddr);
	}

	/*
	 * Ack freq only makes sense in pkt mode, in shared
	 * mode the ring descriptors say whether or not to
	 * send back an ACK.
	 */
	if ((pkt->ack_freq > 0) &&
	    (pkt->xfer_mode == VIO_DRING_MODE)) {
		D2(NULL, "vsw_check_attr: non zero ack freq "
		    " in SHM mode\n");
		bad = 1;
	}

	/*
	 * Note: for the moment we only support ETHER
	 * frames. This may change in the future.
	 */
	if (!((pkt->mtu > 0) && (pkt->mtu <= VSW_MTU))) {
		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
		    pkt->mtu);
		bad = 1;
	}

	D1(NULL, "vsw_check_attr exit\n");

	return (bad);
}
/*
 * Sanity check a dring registration message.
 *
 * The message is rejected when it describes zero descriptors, a zero
 * descriptor size, or anything other than exactly one cookie.
 *
 * Returns 1 if there is a problem, 0 otherwise.
 */
static int
vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
{
	int	ret = 0;

	D1(NULL, "vsw_check_dring_info enter\n");

	/*
	 * pkt is dereferenced here, so the function no longer carries
	 * an ARGUNUSED lint annotation for it.
	 */
	if ((pkt->num_descriptors == 0) ||
	    (pkt->descriptor_size == 0) ||
	    (pkt->ncookies != 1)) {
		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
		ret = 1;
	}

	D1(NULL, "vsw_check_dring_info exit\n");

	return (ret);
}
/*
 * Compare two LDC memory cookies.
 *
 * Returns 1 if both the address and the size of the two cookies are
 * equal. Otherwise returns 0.
 */
static int
vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
{
	/* both fields have to agree; compare m1 against m2 for each */
	if ((m1->addr != m2->addr) ||
	    (m1->size != m2->size)) {
		return (0);
	} else {
		return (1);
	}
}
/*
 * Decide whether a dring registration message describes the ring held
 * in a dring_info structure.
 *
 * Descriptor size, descriptor count and cookie count must be equal and
 * the first cookie of each must match (only cookie[0] is compared).
 *
 * Returns 1 on a match. Otherwise returns 0.
 */
static int
vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
{
	if (msg->descriptor_size != dp->descriptor_size)
		return (0);

	if (msg->num_descriptors != dp->num_descriptors)
		return (0);

	if (msg->ncookies != dp->ncookies)
		return (0);

	if (!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0])))
		return (0);

	return (1);
}
/*
 * Format the six octets at 'a' as colon separated hex (no zero
 * padding, e.g. "0:14:4f:ab:cd:1") into ebuf and return ebuf.
 *
 * No length is passed in, so ebuf must have room for the longest
 * possible result: 6 * 2 hex digits + 5 colons + terminating NUL,
 * i.e. 18 bytes.
 */
static caddr_t
vsw_print_ethaddr(uint8_t *a, char *ebuf)
{
	uint8_t	*octet = a;

	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
	    octet[0], octet[1], octet[2],
	    octet[3], octet[4], octet[5]);

	return (ebuf);
}
/*
 * Reset and free all the resources associated with
 * one lane of the channel.
 *
 * dir selects the lane: INBOUND for ldcp->lane_in, anything else for
 * ldcp->lane_out. The lane is marked VSW_LANE_INACTIV, its sequence
 * number is reset to VNET_ISS and its descriptor rings are released:
 *
 *   - inbound rings are unmapped (when they have a handle) and their
 *     dring_info_t structures freed;
 *   - outbound rings are handed to vsw_free_ring().
 *
 * The lane's ring list head is cleared only if the rings were released
 * successfully, and it is cleared while the list's write lock is still
 * held so that a thread which subsequently takes dlistrw cannot pick
 * up a pointer to rings that have already been freed.
 */
static void
vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
{
	dring_info_t		*dp, *dpp;
	lane_t			*lp = NULL;
	int			rv = 0;

	ASSERT(ldcp != NULL);

	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);

	if (dir == INBOUND) {
		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_in;
	} else {
		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_out;
	}

	lp->lstate = VSW_LANE_INACTIV;

	mutex_enter(&lp->seq_lock);
	lp->seq_num = VNET_ISS;
	mutex_exit(&lp->seq_lock);

	if (lp->dringp) {
		WRITE_ENTER(&lp->dlistrw);
		dp = lp->dringp;
		if (dir == INBOUND) {
			/* imported rings: unmap and free the bookkeeping */
			while (dp != NULL) {
				dpp = dp->next;
				if (dp->handle != NULL)
					(void) ldc_mem_dring_unmap(dp->handle);
				kmem_free(dp, sizeof (dring_info_t));
				dp = dpp;
			}
		} else {
			/*
			 * unbind, destroy exported dring, free dring struct
			 */
			rv = vsw_free_ring(dp);
		}

		/*
		 * Drop the list head before releasing the lock; rv is
		 * only ever non-zero when vsw_free_ring() failed.
		 */
		if (rv == 0) {
			lp->dringp = NULL;
		}
		RW_EXIT(&lp->dlistrw);
	}

	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
}
/*
 * Free a list of rings and all associated resources.
 *
 * Starting at dp and following the next links, each ring has its
 * per-descriptor memory handles unbound (if bound) and freed, the
 * descriptor state locks destroyed, the private descriptor array
 * freed, the dring handle (if any) unbound and destroyed, the data
 * area freed and finally its locks destroyed and the dring_info_t
 * itself freed.
 *
 * Returns 0 once the whole list has been released (also for an empty
 * list). If unbinding or freeing a memory handle fails, the LDC error
 * code is returned immediately.
 *
 * NOTE(review): on such a failure the current ring is left partly torn
 * down and any rings earlier in the list have already been freed;
 * callers need to take that into account.
 *
 * Should be called with dlistrw rwlock held as writer.
 */
static int
vsw_free_ring(dring_info_t *dp)
{
	vsw_private_desc_t	*desc;
	dring_info_t		*next_ring;
	int			idx, err;

	while (dp != NULL) {
		mutex_enter(&dp->dlock);
		/* remember the successor; dp itself is freed below */
		next_ring = dp->next;

		if (dp->priv_addr != NULL) {
			/*
			 * First unbind and free the memory handles
			 * stored in each descriptor within the ring.
			 */
			desc = (vsw_private_desc_t *)dp->priv_addr;
			for (idx = 0; idx < VSW_RING_NUM_EL; idx++, desc++) {
				if (desc->memhandle == NULL) {
					/* no handle: only the lock to go */
					mutex_destroy(&desc->dstate_lock);
					continue;
				}

				if (desc->bound == 1) {
					err = ldc_mem_unbind_handle(
					    desc->memhandle);
					if (err != 0) {
						DERR(NULL, "error "
						    "unbinding handle for "
						    "ring 0x%llx at pos %d",
						    dp, idx);
						mutex_exit(&dp->dlock);
						return (err);
					}
					desc->bound = 0;
				}

				err = ldc_mem_free_handle(desc->memhandle);
				if (err != 0) {
					DERR(NULL, "error freeing "
					    "handle for ring "
					    "0x%llx at pos %d",
					    dp, idx);
					mutex_exit(&dp->dlock);
					return (err);
				}
				desc->memhandle = NULL;

				mutex_destroy(&desc->dstate_lock);
			}
			kmem_free(dp->priv_addr,
			    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
		}

		/*
		 * Now unbind and destroy the ring itself.
		 */
		if (dp->handle != NULL) {
			(void) ldc_mem_dring_unbind(dp->handle);
			(void) ldc_mem_dring_destroy(dp->handle);
		}

		if (dp->data_addr != NULL) {
			kmem_free(dp->data_addr, dp->data_sz);
		}

		mutex_exit(&dp->dlock);
		mutex_destroy(&dp->dlock);
		mutex_destroy(&dp->restart_lock);
		kmem_free(dp, sizeof (dring_info_t));

		dp = next_ring;
	}

	return (0);
}
/*
* Debugging routines
*/
/*
 * Dump, via cmn_err(), a summary of every vsw instance on the vsw_head
 * list: the ports attached to it, the channels of each port (id, device
 * class, status, handshake phase and session ids) and the state of each
 * channel's inbound and outbound lane.
 *
 * The port list and each port's channel list are read-locked while
 * they are being walked.
 */
static void
display_state(void)
{
	vsw_t		*sw;
	vsw_port_list_t	*ports;
	vsw_port_t	*p;
	vsw_ldc_list_t	*chans;
	vsw_ldc_t	*ch;

	cmn_err(CE_NOTE, "***** system state *****");

	sw = vsw_head;
	while (sw) {
		ports = &sw->plist;
		READ_ENTER(&ports->lockrw);
		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
		    sw->instance, ports->num_ports);

		p = ports->head;
		while (p != NULL) {
			chans = &p->p_ldclist;
			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
			    p->p_instance, chans->num_ldcs);

			READ_ENTER(&chans->lockrw);
			ch = chans->head;
			while (ch != NULL) {
				cmn_err(CE_CONT, "chan %lu : dev %d : "
				    "status %d : phase %u\n",
				    ch->ldc_id, ch->dev_class,
				    ch->ldc_status, ch->hphase);
				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
				    "psession %lu\n",
				    ch->ldc_id,
				    ch->local_session,
				    ch->peer_session);

				cmn_err(CE_CONT, "Inbound lane:\n");
				display_lane(&ch->lane_in);
				cmn_err(CE_CONT, "Outbound lane:\n");
				display_lane(&ch->lane_out);

				ch = ch->ldc_next;
			}
			RW_EXIT(&chans->lockrw);

			p = p->p_next;
		}
		RW_EXIT(&ports->lockrw);

		sw = sw->next;
	}

	cmn_err(CE_NOTE, "***** system state *****");
}
/*
 * Print, via cmn_err(), the attributes of a lane (version, state, MTU,
 * address type and address, transfer mode, ring list head) followed by
 * the details and free-descriptor counts of every ring on its list.
 */
static void
display_lane(lane_t *lp)
{
	dring_info_t	*ring;

	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
	    lp->addr_type, lp->addr, lp->xfer_mode);
	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);

	cmn_err(CE_CONT, "Dring info:\n");

	ring = lp->dringp;
	while (ring != NULL) {
		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
		    ring->num_descriptors, ring->descriptor_size);
		cmn_err(CE_CONT, "\thandle 0x%lx\n", ring->handle);
		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
		    (uint64_t)ring->pub_addr, (uint64_t)ring->priv_addr);
		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
		    ring->ident, ring->end_idx);

		/* per-descriptor free counts */
		display_ring(ring);

		ring = ring->next;
	}
}
/*
 * Count how many of a ring's VSW_RING_NUM_EL descriptors are in the
 * VIO_DESC_FREE state, separately for the public and the private
 * section, and print the totals via cmn_err(). A section whose address
 * is NULL contributes a count of zero.
 */
static void
display_ring(dring_info_t *dringp)
{
	uint64_t	n;
	uint64_t	free_priv = 0;
	uint64_t	free_pub = 0;

	for (n = 0; n < VSW_RING_NUM_EL; n++) {
		if ((dringp->pub_addr != NULL) &&
		    (((vnet_public_desc_t *)dringp->pub_addr)[n].hdr.dstate ==
		    VIO_DESC_FREE)) {
			free_pub++;
		}

		if ((dringp->priv_addr != NULL) &&
		    (((vsw_private_desc_t *)dringp->priv_addr)[n].dstate ==
		    VIO_DESC_FREE)) {
			free_priv++;
		}
	}

	/* n now equals the number of elements examined */
	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
	    n, free_priv, free_pub);
}
static void
dump_flags(uint64_t state)
{
int i;
typedef struct flag_name {
int flag_val;
char *flag_name;
} flag_name_t;
flag_name_t flags[] = {
VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
DERR(NULL, "DUMP_FLAGS: %llx\n", state);
for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
if (state & flags[i].flag_val)
DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
}
}