bridge.c revision 4249d844b6c24a328b579c3d5ebc44595a61e046
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* This module implements a STREAMS driver that provides layer-two (Ethernet)
* bridging functionality. The STREAMS interface is used to provide
* observability (snoop/wireshark) and control, but not for interface plumbing.
*/
#include <sys/types.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/errno.h>
#include <sys/kstat.h>
#include <sys/modctl.h>
#include <sys/note.h>
#include <sys/param.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/dlpi.h>
#include <sys/dls.h>
#include <sys/mac_ether.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_impl.h>
#include <sys/vlan.h>
#include <net/bridge.h>
#include <net/bridge_impl.h>
#include <net/trill.h>
/*
* Locks and reference counts: object lifetime and design.
*
* bridge_mac_t
* Bridge mac (snoop) instances are in bmac_list, which is protected by
* bmac_rwlock. They're allocated by bmac_alloc and freed by bridge_timer().
* Every bridge_inst_t has a single bridge_mac_t, but when bridge_inst_t goes
* away, the bridge_mac_t remains until either all of the users go away
* (detected by a timer) or until the instance is picked up again by the same
* bridge starting back up.
*
* bridge_inst_t
* Bridge instances are in inst_list, which is protected by inst_lock.
* They're allocated by inst_alloc() and freed by inst_free(). After
* allocation, an instance is placed in inst_list, and the reference count is
* incremented to represent this. That reference is decremented when the
* BIF_SHUTDOWN flag is set, and no new increments may occur. When the last
* reference is freed, the instance is removed from the list.
*
* Bridge instances have lists of links and an AVL tree of forwarding
* entries. Each of these structures holds one reference on the bridge
* instance. These lists and tree are protected by bi_rwlock.
*
* bridge_stream_t
* Bridge streams are allocated by stream_alloc() and freed by stream_free().
* These streams are created when "bridged" opens /dev/bridgectl, and are
* used to create new bridge instances (via BRIOC_NEWBRIDGE) and control the
* links on the bridge. When a stream closes, the bridge instance created is
* destroyed. There's at most one bridge instance for a given control
* stream.
*
* bridge_link_t
* Links are allocated by bridge_add_link() and freed by link_free(). The
* bi_links list holds a reference to the link. When the BLF_DELETED flag is
* set, that reference is dropped. The link isn't removed from the list
* until the last reference drops. Each forwarding entry that uses a given
* link holds a reference, as does each thread transmitting a packet via the
* link. The MAC layer calls in via bridge_ref_cb() to hold a reference on
* a link when transmitting.
*
* It's important that once BLF_DELETED is set, there's no way for the
* reference count to increase again. If it can, then the link may be
* double-freed. The BLF_FREED flag is intended for use with assertions to
* guard against this in testing.
*
* bridge_fwd_t
* Bridge forwarding entries are allocated by bridge_recv_cb() and freed by
* fwd_free(). The bi_fwd AVL tree holds one reference to the entry. Unlike
* other data structures, the reference is dropped when the entry is removed
* from the tree by fwd_delete(), and the BFF_INTREE flag is removed. Each
* thread that's forwarding a packet to a known destination holds a reference
* to a forwarding entry.
*
* TRILL notes:
*
* The TRILL module does all of its I/O through bridging. It uses references
* on the bridge_inst_t and bridge_link_t structures, and has seven entry
* points and four callbacks. One entry point is for setting the callbacks
* (bridge_trill_register_cb). There are four entry points for taking bridge
* and link references (bridge_trill_{br,ln}{ref,unref}). The final two
* entry points are for decapsulated packets from TRILL (bridge_trill_decaps)
* that need to be bridged locally, and for TRILL-encapsulated output packets
* (bridge_trill_output).
*
* The four callbacks comprise two notification functions for bridges and
* links being deleted, one function for raw received TRILL packets, and one
* for bridge output to non-local TRILL destinations (tunnel entry).
*/
/*
* Ethernet reserved multicast addresses for TRILL; used also in TRILL module.
*/
/* All-IS-IS-RBridges address; non-static so it is visible outside this file. */
const uint8_t all_isis_rbridges[] = ALL_ISIS_RBRIDGES;
/* All-ESADI-RBridges address; private to this file. */
static const uint8_t all_esadi_rbridges[] = ALL_ESADI_RBRIDGES;
/* Bridge group address; non-static so it is visible outside this file. */
const uint8_t bridge_group_address[] = BRIDGE_GROUP_ADDRESS;
/* Names for the per-instance kstats; handed to kstat_setup() by bridge_create(). */
static const char *inst_kstats_list[] = { KSINST_NAMES };
/* Names for the per-link kstats. */
static const char *link_kstats_list[] = { KSLINK_NAMES };
/*
 * Kstat counter helpers.
 *
 * KREF yields the 64-bit counter "vn" inside kstat block "m" of the structure
 * pointed to by "p"; KINCR/KDECR pre-increment/pre-decrement it.  The KIP* and
 * KLP* forms select the instance (bi_kstats) or link (bl_kstats) block, and
 * the KI* / KL* forms additionally assume a local variable named "bip" or
 * "blp" is in scope.
 *
 * The pointer argument and each expansion are fully parenthesized so that an
 * expression such as "bip + 1" may be passed safely and the result may be
 * embedded in a larger expression.
 */
#define	KREF(p, m, vn)	((p)->m.vn.value.ui64)
#define	KINCR(p, m, vn)	(++KREF(p, m, vn))
#define	KDECR(p, m, vn)	(--KREF(p, m, vn))
#define	KIPINCR(p, vn)	KINCR(p, bi_kstats, vn)
#define	KIPDECR(p, vn)	KDECR(p, bi_kstats, vn)
#define	KLPINCR(p, vn)	KINCR(p, bl_kstats, vn)
#define	KIINCR(vn)	KIPINCR(bip, vn)
#define	KIDECR(vn)	KIPDECR(bip, vn)
#define	KLINCR(vn)	KLPINCR(blp, vn)
/* Number of elements in a true array (not valid on a pointer) */
#define	Dim(x)		(sizeof (x) / sizeof (*(x)))
/* Amount of overhead added when encapsulating with VLAN headers */
#define	VLAN_INCR	(sizeof (struct ether_vlan_header) - \
	sizeof (struct ether_header))
/* Device info pointer; passed to the MAC layer as m_dip in bmac_alloc(). */
static dev_info_t *bridge_dev_info;
/* Major number; combined with a MAC minor to form bi_dev in bridge_create(). */
static major_t bridge_major;
/* Task queue on which link_shutdown() is dispatched. */
static ddi_taskq_t *bridge_taskq;
/*
* These are the bridge instance management data structures. The mutex lock
* protects the list of bridge instances. A reference count is then used on
* each instance to determine when to free it. We use mac_minor_hold() to
* allocate minor_t values, which are used both for self-cloning /dev/net/
* device nodes as well as client streams. Minor node 0 is reserved for the
* allocation control node.
*/
static list_t inst_list;
static kcondvar_t inst_cv; /* Allows us to wait for shutdown */
static kmutex_t inst_lock;
/* bmac_rwlock protects bmac_list and each bridge_mac_t's bm_inst pointer. */
static krwlock_t bmac_rwlock;
static list_t bmac_list;
/* Wait for taskq entries that use STREAMS */
static kcondvar_t stream_ref_cv;
static kmutex_t stream_ref_lock;
/* Identifier of the pending bridge_timer() timeout; armed in bmac_alloc(). */
static timeout_id_t bridge_timerid;
/* Delay passed to timeout() when arming bridge_timer(). */
static clock_t bridge_scan_interval;
static clock_t bridge_fwd_age;
/* Forward declarations for functions used before their definitions. */
static bridge_inst_t *bridge_find_name(const char *);
static void bridge_timer(void *);
static void bridge_unref(bridge_inst_t *);
/* All-zeros Ethernet address, used to mean "no address". */
static const uint8_t zero_addr[ETHERADDRL] = { 0 };
/* Global TRILL linkage */
/* These four pointers are set by bridge_trill_register_cb(). */
static trill_recv_pkt_t trill_recv_fn;
static trill_encap_pkt_t trill_encap_fn;
static trill_br_dstr_t trill_brdstr_fn;
static trill_ln_dstr_t trill_lndstr_fn;
/* special settings to accommodate DLD flow control; see dld_str.c */
static struct module_info bridge_dld_modinfo = {
0, /* mi_idnum */
"bridge", /* mi_idname */
0, /* mi_minpsz */
INFPSZ, /* mi_maxpsz */
1, /* mi_hiwat */
0 /* mi_lowat */
};
/* Read-side queue: open and close are handled by DLD; no put or service. */
static struct qinit bridge_dld_rinit = {
NULL, /* qi_putp */
NULL, /* qi_srvp */
dld_open, /* qi_qopen */
dld_close, /* qi_qclose */
NULL, /* qi_qadmin */
&bridge_dld_modinfo, /* qi_minfo */
NULL /* qi_mstat */
};
/* Write-side queue: put and service procedures are DLD's. */
static struct qinit bridge_dld_winit = {
(int (*)())dld_wput, /* qi_putp */
(int (*)())dld_wsrv, /* qi_srvp */
NULL, /* qi_qopen */
NULL, /* qi_qclose */
NULL, /* qi_qadmin */
&bridge_dld_modinfo, /* qi_minfo */
NULL /* qi_mstat */
};
static int bridge_ioc_listfwd(void *, intptr_t, int, cred_t *, int *);
/* GLDv3 control ioctls used by Bridging */
/* Single entry: BRIDGE_IOC_LISTFWD copies a bridge_listfwd_t in and out. */
static dld_ioc_info_t bridge_ioc_list[] = {
{BRIDGE_IOC_LISTFWD, DLDCOPYINOUT, sizeof (bridge_listfwd_t),
bridge_ioc_listfwd, NULL},
};
/*
 * Translate a bridge mac pointer into its bridge instance, returning the
 * instance with an extra reference held, or NULL if the mac currently has no
 * instance attached.  The global bmac_rwlock is held (as reader) across the
 * read of bm_inst and the reference bump so that the pointer cannot be
 * cleared between the two steps.
 */
static bridge_inst_t *
mac_to_inst(const bridge_mac_t *bmp)
{
	bridge_inst_t *inst;

	rw_enter(&bmac_rwlock, RW_READER);
	inst = bmp->bm_inst;
	if (inst != NULL)
		atomic_inc_uint(&inst->bi_refs);
	rw_exit(&bmac_rwlock);

	return (inst);
}
/*
* Mark a link as SDU-failed (failed == B_TRUE) or clear that mark, update the
* link states reported through the MAC layer, and queue a bridge_ctl_t
* notification on *mlist for later delivery by send_up_messages().
*
* Nothing happens if the link is already in the requested state. The message
* is prepended to *mlist via b_next. The caller visible in this file
* (bridge_m_setprop) holds bi_rwlock as writer around this call.
*/
static void
link_sdu_fail(bridge_link_t *blp, boolean_t failed, mblk_t **mlist)
{
mblk_t *mp;
bridge_ctl_t *bcp;
bridge_link_t *blcmp;
bridge_inst_t *bip;
bridge_mac_t *bmp;
/* Return early when BLF_SDUFAIL already matches the request. */
if (failed) {
if (blp->bl_flags & BLF_SDUFAIL)
return;
blp->bl_flags |= BLF_SDUFAIL;
} else {
if (!(blp->bl_flags & BLF_SDUFAIL))
return;
blp->bl_flags &= ~BLF_SDUFAIL;
}
/*
* If this link is otherwise up, then check if there are any other
* non-failed non-down links. If not, then we control the state of the
* whole bridge.
*/
bip = blp->bl_inst;
bmp = bip->bi_mac;
if (blp->bl_linkstate != LINK_STATE_DOWN) {
for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
blcmp = list_next(&bip->bi_links, blcmp)) {
if (blp != blcmp &&
!(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
blcmp->bl_linkstate != LINK_STATE_DOWN)
break;
}
/* blcmp == NULL means the loop found no other usable link. */
if (blcmp == NULL) {
bmp->bm_linkstate = failed ? LINK_STATE_DOWN :
LINK_STATE_UP;
mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
}
}
/*
* If we're becoming failed, then the link's current true state needs
* to be reflected upwards to this link's clients. If we're becoming
* unfailed, then we get the state of the bridge instead on all
* clients.
*/
if (failed) {
if (bmp->bm_linkstate != blp->bl_linkstate)
mac_link_redo(blp->bl_mh, blp->bl_linkstate);
} else {
mac_link_redo(blp->bl_mh, bmp->bm_linkstate);
}
/* get the current mblk we're going to send up */
if ((mp = blp->bl_lfailmp) == NULL &&
(mp = allocb(sizeof (bridge_ctl_t), BPRI_MED)) == NULL)
return;
/* get a new one for next time */
blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
/* if none for next time, then report only failures */
if (blp->bl_lfailmp == NULL && !failed) {
/* Keep the block in reserve rather than spend it on a recovery notice. */
blp->bl_lfailmp = mp;
return;
}
/* LINTED: alignment */
bcp = (bridge_ctl_t *)mp->b_rptr;
bcp->bc_linkid = blp->bl_linkid;
bcp->bc_failed = failed;
mp->b_wptr = (uchar_t *)(bcp + 1);
/* Push onto the head of the caller's pending-message chain. */
mp->b_next = *mlist;
*mlist = mp;
}
/*
 * Send control messages (link SDU changes) using the stream to the
 * bridge instance daemon.
 *
 * "mp" is a chain of messages linked through b_next (as built by
 * link_sdu_fail); each is unlinked and passed upstream on the read side of
 * the control stream.
 *
 * shutdown_inst() clears bi_control, and callers such as bridge_m_setprop()
 * reach this function holding only an instance reference.  If the control
 * stream is already gone, there is nobody to notify, so the chain is freed
 * instead of dereferencing a NULL pointer.  bi_control is sampled once so the
 * check and the use agree; note that this narrows but does not by itself
 * close a race with a concurrent shutdown.
 */
static void
send_up_messages(bridge_inst_t *bip, mblk_t *mp)
{
	bridge_stream_t *bsp;
	mblk_t *mnext;
	queue_t *rq;

	if ((bsp = bip->bi_control) == NULL) {
		freemsgchain(mp);
		return;
	}

	rq = bsp->bs_wq;
	rq = OTHERQ(rq);
	while (mp != NULL) {
		mnext = mp->b_next;
		mp->b_next = NULL;
		putnext(rq, mp);
		mp = mnext;
	}
}
/*
 * MAC getstat entry point.  No statistics are provided for the bridge
 * observability node, so every request is rejected with ENOTSUP.
 */
static int
bridge_m_getstat(void *arg, uint_t stat, uint64_t *val)
{
	_NOTE(ARGUNUSED(arg, stat, val));

	return (ENOTSUP);
}
static int
bridge_m_start(void *arg)
{
bridge_mac_t *bmp = arg;
bmp->bm_flags |= BMF_STARTED;
return (0);
}
static void
bridge_m_stop(void *arg)
{
bridge_mac_t *bmp = arg;
bmp->bm_flags &= ~BMF_STARTED;
}
/*
 * MAC setpromisc entry point.  Nothing is done here; the request is simply
 * reported as successful.
 */
static int
bridge_m_setpromisc(void *arg, boolean_t on)
{
	_NOTE(ARGUNUSED(arg, on));

	return (0);
}
/*
 * MAC multicast add/remove entry point.  Nothing is done here; the request is
 * simply reported as successful.
 */
static int
bridge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
	_NOTE(ARGUNUSED(arg, add, mca));

	return (0);
}
/*
 * MAC unicast-address entry point.  Setting an address on the bridge node is
 * not supported; always returns ENOTSUP.
 */
static int
bridge_m_unicst(void *arg, const uint8_t *macaddr)
{
	_NOTE(ARGUNUSED(arg, macaddr));

	return (ENOTSUP);
}
/*
 * MAC transmit entry point.  The whole message chain is freed and NULL is
 * returned, so nothing is ever handed back to the caller as unsent.
 */
/* ARGSUSED */
static mblk_t *
bridge_m_tx(void *arg, mblk_t *mp)
{
	freemsgchain(mp);

	return (NULL);
}
/*
 * BRIDGE_IOC_LISTFWD handler: return the forwarding entry that follows the
 * one identified by blf_dest on the bridge named by blf_name.
 *
 * karg points to the kernel copy of the caller's bridge_listfwd_t, which is
 * overwritten with the result.  If there is no following entry, the whole
 * structure is zeroed.  Returns ENOENT if the named bridge does not exist (or
 * is shutting down), otherwise 0.
 */
/* ARGSUSED */
static int
bridge_ioc_listfwd(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
	bridge_listfwd_t *blf = karg;
	bridge_inst_t *bip;
	bridge_fwd_t *bfp, match;
	avl_index_t where;

	bip = bridge_find_name(blf->blf_name);
	if (bip == NULL)
		return (ENOENT);

	/*
	 * Build the search key from a zeroed structure.  fwd_compare() looks
	 * at bf_dest, the BFF_VLANLOCAL bit of bf_flags, and bf_vlanid, so
	 * none of those may be left as uninitialized stack contents.
	 */
	bzero(&match, sizeof (match));
	bcopy(blf->blf_dest, match.bf_dest, ETHERADDRL);
	match.bf_flags = BFF_VLANLOCAL;

	rw_enter(&bip->bi_rwlock, RW_READER);
	/*
	 * On an exact hit, step to the successor; otherwise take the first
	 * entry that sorts after the key.
	 */
	if ((bfp = avl_find(&bip->bi_fwd, &match, &where)) == NULL)
		bfp = avl_nearest(&bip->bi_fwd, where, AVL_AFTER);
	else
		bfp = AVL_NEXT(&bip->bi_fwd, bfp);
	if (bfp == NULL) {
		bzero(blf, sizeof (*blf));
	} else {
		bcopy(bfp->bf_dest, blf->blf_dest, ETHERADDRL);
		blf->blf_trill_nick = bfp->bf_trill_nick;
		/* age in milliseconds since the entry was last heard */
		blf->blf_ms_age =
		    drv_hztousec(lbolt - bfp->bf_lastheard) / 1000;
		blf->blf_is_local =
		    (bfp->bf_flags & BFF_LOCALADDR) != 0;
		blf->blf_linkid = bfp->bf_links[0]->bl_linkid;
	}
	rw_exit(&bip->bi_rwlock);
	bridge_unref(bip);	/* drop the hold from bridge_find_name */
	return (0);
}
/*
 * MAC setprop entry point.  Only MAC_PROP_MTU is supported.
 *
 * Changing the bridge MTU re-evaluates every non-deleted link: links whose
 * SDU equals the new value have their SDU-failure cleared, and links whose
 * SDU equalled the old value are marked SDU-failed.  Notifications generated
 * along the way are sent up the control stream after the lock is dropped.
 *
 * Returns ENOTSUP for other properties, EINVAL if the value is too small,
 * ENXIO if the mac has no bridge instance, and 0 otherwise.
 */
static int
bridge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_valsize, const void *pr_val)
{
	bridge_mac_t *bmp = arg;
	bridge_inst_t *bip;
	bridge_link_t *blp;
	uint_t maxsdu;
	mblk_t *mlist = NULL;

	_NOTE(ARGUNUSED(pr_name));

	if (pr_num != MAC_PROP_MTU)
		return (ENOTSUP);
	if (pr_valsize < sizeof (bmp->bm_maxsdu))
		return (EINVAL);

	(void) bcopy(pr_val, &maxsdu, sizeof (maxsdu));
	if (maxsdu == bmp->bm_maxsdu)
		return (0);		/* no change requested */

	bip = mac_to_inst(bmp);
	if (bip == NULL)
		return (ENXIO);

	rw_enter(&bip->bi_rwlock, RW_WRITER);
	blp = list_head(&bip->bi_links);
	while (blp != NULL) {
		if (!(blp->bl_flags & BLF_DELETED)) {
			if (blp->bl_maxsdu == maxsdu)
				link_sdu_fail(blp, B_FALSE, &mlist);
			else if (blp->bl_maxsdu == bmp->bm_maxsdu)
				link_sdu_fail(blp, B_TRUE, &mlist);
		}
		blp = list_next(&bip->bi_links, blp);
	}
	rw_exit(&bip->bi_rwlock);

	bmp->bm_maxsdu = maxsdu;
	(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
	send_up_messages(bip, mlist);
	bridge_unref(bip);	/* drop the hold from mac_to_inst */

	return (0);
}
/*
 * MAC getprop entry point.
 *
 * MAC_PROP_MTU:    only the "possible values" query (MAC_PROP_POSSIBLE) is
 *                  answered; the result is a single-element uint32 range
 *                  whose min and max are both the current bridge SDU.
 * MAC_PROP_STATUS: returns the bridge's current link state, read-only.
 *
 * Returns EINVAL if pr_val is too small for the result, ENOTSUP for any
 * other property or query type, and 0 on success.
 */
static int
bridge_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm)
{
	bridge_mac_t *bmp = arg;
	int err = 0;

	_NOTE(ARGUNUSED(pr_name));

	switch (pr_num) {
	case MAC_PROP_MTU: {
		mac_propval_range_t range;

		if (!(pr_flags & MAC_PROP_POSSIBLE))
			return (ENOTSUP);
		if (pr_valsize < sizeof (mac_propval_range_t))
			return (EINVAL);
		/* don't hand uninitialized stack bytes back to the caller */
		bzero(&range, sizeof (range));
		range.mpr_count = 1;
		range.mpr_type = MAC_PROPVAL_UINT32;
		range.range_uint32[0].mpur_min =
		    range.range_uint32[0].mpur_max = bmp->bm_maxsdu;
		bcopy(&range, pr_val, sizeof (range));
		*perm = MAC_PROP_PERM_RW;
		break;
	}
	case MAC_PROP_STATUS:
		if (pr_valsize < sizeof (bmp->bm_linkstate)) {
			err = EINVAL;
		} else {
			/*
			 * Copy exactly the size of the field that was checked
			 * above -- not the size of a pointer to it.
			 */
			bcopy(&bmp->bm_linkstate, pr_val,
			    sizeof (bmp->bm_linkstate));
			*perm = MAC_PROP_PERM_READ;
		}
		break;

	default:
		err = ENOTSUP;
		break;
	}
	return (err);
}
/*
* MAC callback vector for the bridge node; installed as m_callbacks in
* bmac_alloc(). The first member flags the optional setprop and getprop
* entries as present.
*/
static mac_callbacks_t bridge_m_callbacks = {
MC_SETPROP | MC_GETPROP,
bridge_m_getstat,
bridge_m_start,
bridge_m_stop,
bridge_m_setpromisc,
bridge_m_multicst,
bridge_m_unicst,
bridge_m_tx,
NULL, /* ioctl */
NULL, /* getcapab */
NULL, /* open */
NULL, /* close */
bridge_m_setprop,
bridge_m_getprop
};
/*
 * Build and install a virtual named kstat in the "bridge" module.
 *
 * knt      - caller-owned array of nstat kstat_named_t entries; used directly
 *            as the kstat's data area
 * names    - nstat statistic names, one per entry in knt
 * unitname - kstat name
 *
 * Every entry is initialized as a 64-bit unsigned counter.  Returns the
 * installed kstat, or NULL if it could not be created.
 */
static kstat_t *
kstat_setup(kstat_named_t *knt, const char **names, int nstat,
    const char *unitname)
{
	kstat_t *ksp;
	int idx = 0;

	while (idx < nstat) {
		kstat_named_init(knt + idx, names[idx], KSTAT_DATA_UINT64);
		idx++;
	}

	ksp = kstat_create_zone("bridge", 0, unitname, "net",
	    KSTAT_TYPE_NAMED, nstat, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
	if (ksp == NULL)
		return (NULL);

	ksp->ks_data = knt;
	kstat_install(ksp);
	return (ksp);
}
/*
* Find an existing bridge_mac_t structure or allocate a new one for the given
* bridge instance. This creates the mac driver instance that snoop can use.
*
* On success, returns 0 and stores the bridge_mac_t (with bm_inst pointing at
* bip) in *bmacp. On failure, returns an errno value and leaves *bmacp NULL:
* EINVAL if mac_alloc() fails, otherwise the error from mac_register().
*/
static int
bmac_alloc(bridge_inst_t *bip, bridge_mac_t **bmacp)
{
bridge_mac_t *bmp, *bnew;
mac_register_t *mac;
int err;
*bmacp = NULL;
if ((mac = mac_alloc(MAC_VERSION)) == NULL)
return (EINVAL);
/* Allocate up front, before taking the lock; freed below if not needed. */
bnew = kmem_zalloc(sizeof (*bnew), KM_SLEEP);
rw_enter(&bmac_rwlock, RW_WRITER);
/* Reuse a lingering mac with the same name, if there is one. */
for (bmp = list_head(&bmac_list); bmp != NULL;
bmp = list_next(&bmac_list, bmp)) {
if (strcmp(bip->bi_name, bmp->bm_name) == 0) {
ASSERT(bmp->bm_inst == NULL);
bmp->bm_inst = bip;
rw_exit(&bmac_rwlock);
kmem_free(bnew, sizeof (*bnew));
mac_free(mac);
*bmacp = bmp;
return (0);
}
}
mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
mac->m_driver = bnew;
mac->m_dip = bridge_dev_info;
mac->m_instance = (uint_t)-1;
mac->m_src_addr = (uint8_t *)zero_addr;
mac->m_callbacks = &bridge_m_callbacks;
/*
* Note that the SDU limits are irrelevant, as nobody transmits on the
* bridge node itself. It's mainly for monitoring but we allow
* setting the bridge MTU for quick transition of all links part of the
* bridge to a new MTU.
*/
mac->m_min_sdu = 1;
mac->m_max_sdu = 1500;
err = mac_register(mac, &bnew->bm_mh);
/* The registration template is freed whether or not registration worked. */
mac_free(mac);
if (err != 0) {
rw_exit(&bmac_rwlock);
kmem_free(bnew, sizeof (*bnew));
return (err);
}
bnew->bm_inst = bip;
(void) strcpy(bnew->bm_name, bip->bi_name);
/* The first mac on the list arms the periodic bridge_timer(). */
if (list_is_empty(&bmac_list)) {
bridge_timerid = timeout(bridge_timer, NULL,
bridge_scan_interval);
}
list_insert_tail(&bmac_list, bnew);
rw_exit(&bmac_rwlock);
/*
* Mark the MAC as unable to go "active" so that only passive clients
* (such as snoop) can bind to it.
*/
mac_no_active(bnew->bm_mh);
*bmacp = bnew;
return (0);
}
/*
* Disconnect the given bridge_mac_t from its bridge instance. The bridge
* instance is going away. The mac instance can't go away until the clients
* are gone (see bridge_timer).
*
* The mac's link state is forced to LINK_STATE_DOWN and reported before the
* bi_mac and bm_inst pointers are both cleared.
*/
static void
bmac_disconnect(bridge_mac_t *bmp)
{
bridge_inst_t *bip;
bmp->bm_linkstate = LINK_STATE_DOWN;
mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
/*
* NOTE(review): bm_inst is written here while bmac_rwlock is held only as
* reader, and mac_to_inst() reads bm_inst under a reader hold as well, so
* the lock does not exclude the two from each other. Confirm whether a
* writer hold is intended (and safe with respect to callers that may
* already hold the lock).
*/
rw_enter(&bmac_rwlock, RW_READER);
bip = bmp->bm_inst;
bip->bi_mac = NULL;
bmp->bm_inst = NULL;
rw_exit(&bmac_rwlock);
}
/*
 * AVL comparison routine for forwarding table entries.
 *
 * Entries are ordered first by destination MAC address.  For equal
 * addresses, the VLAN ID is used as a tie-breaker only when exactly one of
 * the two entries has BFF_VLANLOCAL set; otherwise the entries compare equal.
 * Returns -1, 0, or 1.
 */
static int
fwd_compare(const void *addr1, const void *addr2)
{
	const bridge_fwd_t *lhs = addr1;
	const bridge_fwd_t *rhs = addr2;
	int order;

	order = memcmp(lhs->bf_dest, rhs->bf_dest, ETHERADDRL);
	if (order > 0)
		return (1);
	if (order < 0)
		return (-1);

	/* same address: VLAN matters only if the VLANLOCAL bits differ */
	if (((lhs->bf_flags ^ rhs->bf_flags) & BFF_VLANLOCAL) == 0)
		return (0);

	if (lhs->bf_vlanid > rhs->bf_vlanid)
		return (1);
	if (lhs->bf_vlanid < rhs->bf_vlanid)
		return (-1);
	return (0);
}
/*
 * Tear down and free a bridge instance.  The instance must already be
 * detached from its bridge mac (bi_mac == NULL).  The lock, link list,
 * condition variable and forwarding tree are destroyed, the kstat (if one
 * was created) is deleted, and the memory is released.
 */
static void
inst_free(bridge_inst_t *bip)
{
	ASSERT(bip->bi_mac == NULL);

	/* synchronization objects and containers */
	rw_destroy(&bip->bi_rwlock);
	list_destroy(&bip->bi_links);
	cv_destroy(&bip->bi_linkwait);
	avl_destroy(&bip->bi_fwd);

	/* the kstat is optional; kstat_setup() may have failed */
	if (bip->bi_ksp != NULL) {
		kstat_delete(bip->bi_ksp);
	}

	kmem_free(bip, sizeof (*bip));
}
/*
 * Allocate and initialize a bridge instance named "bridge".  The allocation
 * sleeps, so this never returns NULL.  The new instance starts with one
 * reference, an empty link list and an empty forwarding tree; it is not yet
 * on inst_list.
 */
static bridge_inst_t *
inst_alloc(const char *bridge)
{
	bridge_inst_t *inst = kmem_zalloc(sizeof (*inst), KM_SLEEP);

	inst->bi_refs = 1;
	(void) strcpy(inst->bi_name, bridge);

	rw_init(&inst->bi_rwlock, NULL, RW_DRIVER, NULL);
	list_create(&inst->bi_links, sizeof (bridge_link_t),
	    offsetof(bridge_link_t, bl_node));
	cv_init(&inst->bi_linkwait, NULL, CV_DRIVER, NULL);
	avl_create(&inst->bi_fwd, fwd_compare, sizeof (bridge_fwd_t),
	    offsetof(bridge_fwd_t, bf_node));

	return (inst);
}
/*
 * Look up a bridge instance by name on inst_list.  Instances that have
 * BIF_SHUTDOWN set are skipped.  On a match the instance is returned with an
 * additional reference held (release with bridge_unref); otherwise NULL is
 * returned.
 */
static bridge_inst_t *
bridge_find_name(const char *bridge)
{
	bridge_inst_t *bip;

	mutex_enter(&inst_lock);
	bip = list_head(&inst_list);
	while (bip != NULL) {
		if ((bip->bi_flags & BIF_SHUTDOWN) == 0 &&
		    strcmp(bridge, bip->bi_name) == 0) {
			atomic_inc_uint(&bip->bi_refs);
			break;
		}
		bip = list_next(&inst_list, bip);
	}
	mutex_exit(&inst_lock);

	return (bip);
}
/*
 * Create a new bridge instance named "bridge" and attach it to a bridge mac
 * (new or lingering) and a DLS devnet node for linkid.
 *
 * On success, returns 0 and stores the new instance in *bipc; the instance is
 * on inst_list and carries the list reference.  On failure, returns an errno
 * value (EEXIST if a live bridge of that name already exists, otherwise the
 * error from bmac_alloc() or dls_devnet_create()) and leaves *bipc NULL.
 *
 * If an instance of the same name is still shutting down, this waits on
 * inst_cv for it to leave the list.
 */
static int
bridge_create(datalink_id_t linkid, const char *bridge, bridge_inst_t **bipc)
{
	bridge_inst_t *bip, *bipnew;
	bridge_mac_t *bmp = NULL;
	int err;

	*bipc = NULL;
	bipnew = inst_alloc(bridge);

	mutex_enter(&inst_lock);
lookup_retry:
	for (bip = list_head(&inst_list); bip != NULL;
	    bip = list_next(&inst_list, bip)) {
		if (strcmp(bridge, bip->bi_name) == 0)
			break;
	}

	/* This should not take long; if it does, we've got a design problem */
	if (bip != NULL && (bip->bi_flags & BIF_SHUTDOWN)) {
		cv_wait(&inst_cv, &inst_lock);
		goto lookup_retry;
	}

	if (bip != NULL) {
		/*
		 * We weren't expecting to find anything.  The new instance
		 * was never placed on the list, so nobody else can have seen
		 * it and it can be freed directly.
		 */
		mutex_exit(&inst_lock);
		ASSERT(bipnew->bi_trilldata == NULL);
		bipnew->bi_flags |= BIF_SHUTDOWN;
		inst_free(bipnew);
		return (EEXIST);
	}

	bip = bipnew;
	list_insert_tail(&inst_list, bip);
	mutex_exit(&inst_lock);

	bip->bi_ksp = kstat_setup((kstat_named_t *)&bip->bi_kstats,
	    inst_kstats_list, Dim(inst_kstats_list), bip->bi_name);

	err = bmac_alloc(bip, &bmp);
	if ((bip->bi_mac = bmp) == NULL)
		goto fail_create;

	/*
	 * bm_inst is set, so the timer cannot yank the DLS rug from under us.
	 * No extra locking is needed here.
	 */
	if (!(bmp->bm_flags & BMF_DLS)) {
		if ((err = dls_devnet_create(bmp->bm_mh, linkid)) != 0)
			goto fail_create;
		bmp->bm_flags |= BMF_DLS;
	}

	bip->bi_dev = makedevice(bridge_major, mac_minor(bmp->bm_mh));

	*bipc = bip;
	return (0);

fail_create:
	/*
	 * The instance is already on inst_list, so it must not simply be
	 * freed: it has to come off the list first, and another thread may
	 * have found it by name and taken a reference in the meantime.  Mark
	 * it as shutting down (under inst_lock, as shutdown_inst does) so no
	 * new references are handed out, then drop the list reference.  The
	 * final bridge_unref() disconnects the mac, unlinks the instance,
	 * wakes any waiters on inst_cv, and frees it.
	 */
	ASSERT(bip->bi_trilldata == NULL);
	mutex_enter(&inst_lock);
	bip->bi_flags |= BIF_SHUTDOWN;
	mutex_exit(&inst_lock);
	bridge_unref(bip);
	return (err);
}
/*
 * Release one reference on a bridge instance.  When the last reference goes
 * away (which may only happen after BIF_SHUTDOWN has been set), the instance
 * is disconnected from its bridge mac, removed from inst_list, threads
 * waiting on inst_cv are woken, and the instance is freed.
 */
static void
bridge_unref(bridge_inst_t *bip)
{
	if (atomic_dec_uint_nv(&bip->bi_refs) != 0)
		return;

	ASSERT(bip->bi_flags & BIF_SHUTDOWN);

	/* free up mac for reuse before leaving global list */
	if (bip->bi_mac != NULL)
		bmac_disconnect(bip->bi_mac);

	mutex_enter(&inst_lock);
	list_remove(&inst_list, bip);
	cv_broadcast(&inst_cv);
	mutex_exit(&inst_lock);

	inst_free(bip);
}
/*
 * Stream instances exist only to allocate bridges and to act as the control
 * node; they carry no data traffic.
 *
 * Allocate a stream structure together with a minor number obtained from
 * mac_minor_hold().  Returns NULL if no minor number is available.
 */
static bridge_stream_t *
stream_alloc(void)
{
	bridge_stream_t *bsp;
	minor_t mn = mac_minor_hold(B_FALSE);

	if (mn == 0)
		return (NULL);

	bsp = kmem_zalloc(sizeof (*bsp), KM_SLEEP);
	bsp->bs_minor = mn;
	return (bsp);
}
/*
 * Undo stream_alloc(): give back the minor number, then release the
 * structure itself.
 */
static void
stream_free(bridge_stream_t *bsp)
{
	minor_t mn = bsp->bs_minor;

	mac_minor_rele(mn);
	kmem_free(bsp, sizeof (*bsp));
}
/*
 * Reference hold function for STREAMS-related taskq work: bump the stream's
 * count of outstanding taskq entries under stream_ref_lock.
 */
static void
stream_ref(bridge_stream_t *bsp)
{
	mutex_enter(&stream_ref_lock);
	bsp->bs_taskq_cnt += 1;
	mutex_exit(&stream_ref_lock);
}
/*
 * Reference release function for STREAMS-related taskq work: drop the
 * stream's count of outstanding taskq entries and, when it reaches zero,
 * wake everything waiting on stream_ref_cv.
 */
static void
stream_unref(bridge_stream_t *bsp)
{
	mutex_enter(&stream_ref_lock);
	bsp->bs_taskq_cnt -= 1;
	if (bsp->bs_taskq_cnt == 0)
		cv_broadcast(&stream_ref_cv);
	mutex_exit(&stream_ref_lock);
}
/*
* Free a link structure and then release the bridge instance reference it
* held. BLF_FREED is set (and asserted clear beforehand) to catch a double
* free on debug builds.
*/
static void
link_free(bridge_link_t *blp)
{
/* Saved now because blp is freed before the instance is unreferenced. */
bridge_inst_t *bip = blp->bl_inst;
ASSERT(!(blp->bl_flags & BLF_FREED));
blp->bl_flags |= BLF_FREED;
if (blp->bl_ksp != NULL)
kstat_delete(blp->bl_ksp);
/* Release the pre-allocated SDU-failure message, if one is held. */
if (blp->bl_lfailmp != NULL)
freeb(blp->bl_lfailmp);
cv_destroy(&blp->bl_trillwait);
mutex_destroy(&blp->bl_trilllock);
kmem_free(blp, sizeof (*blp));
/* Don't unreference the bridge until the MAC is closed */
bridge_unref(bip);
}
/*
 * Release one reference on a link.  When the last reference is dropped
 * (which may only happen after BLF_DELETED is set), the link is removed from
 * its bridge's link list, anyone waiting on bi_linkwait is woken if TRILL is
 * attached and the list is now empty, and the link is freed.
 */
static void
link_unref(bridge_link_t *blp)
{
	bridge_inst_t *bip;

	if (atomic_dec_uint_nv(&blp->bl_refs) != 0)
		return;

	bip = blp->bl_inst;
	ASSERT(blp->bl_flags & BLF_DELETED);

	rw_enter(&bip->bi_rwlock, RW_WRITER);
	list_remove(&bip->bi_links, blp);
	rw_exit(&bip->bi_rwlock);

	if (bip->bi_trilldata != NULL && list_is_empty(&bip->bi_links))
		cv_broadcast(&bip->bi_linkwait);

	link_free(blp);
}
/*
 * Allocate a forwarding entry for "addr" with room for "nlinks" link
 * pointers stored immediately after the structure.  The allocation does not
 * sleep; NULL is returned if memory is unavailable.  The entry starts with
 * no links in use, a last-heard time of now, and the given TRILL nickname.
 */
static bridge_fwd_t *
fwd_alloc(const uint8_t *addr, uint_t nlinks, uint16_t nick)
{
	bridge_fwd_t *bfp;

	bfp = kmem_zalloc(sizeof (*bfp) + (nlinks * sizeof (bridge_link_t *)),
	    KM_NOSLEEP);
	if (bfp == NULL)
		return (NULL);

	bcopy(addr, bfp->bf_dest, ETHERADDRL);
	bfp->bf_trill_nick = nick;
	bfp->bf_lastheard = lbolt;
	bfp->bf_maxlinks = nlinks;
	/* the link pointer array lives directly behind the structure */
	bfp->bf_links = (bridge_link_t **)(bfp + 1);

	return (bfp);
}
/*
* Look up the forwarding entry for addr on the given bridge. If the entry
* found has a different VLAN ID and a non-zero bf_vcnt, a second lookup is
* made for an entry flagged BFF_VLANLOCAL with the requested vlanid, and that
* one is preferred if it exists. The result is returned with a reference
* held (release with fwd_unref), or NULL if nothing matched.
*/
static bridge_fwd_t *
fwd_find(bridge_inst_t *bip, const uint8_t *addr, uint16_t vlanid)
{
bridge_fwd_t *bfp, *vbfp;
bridge_fwd_t match;
bcopy(addr, match.bf_dest, ETHERADDRL);
match.bf_flags = 0;
/*
* NOTE(review): match.bf_vlanid is not set before the first avl_find();
* fwd_compare() reads it when compared against an entry that has
* BFF_VLANLOCAL set. Confirm whether it should be initialized here.
*/
rw_enter(&bip->bi_rwlock, RW_READER);
if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
if (bfp->bf_vlanid != vlanid && bfp->bf_vcnt > 0) {
match.bf_vlanid = vlanid;
match.bf_flags = BFF_VLANLOCAL;
vbfp = avl_find(&bip->bi_fwd, &match, NULL);
if (vbfp != NULL)
bfp = vbfp;
}
/* Take the caller's reference while the tree lock is still held. */
atomic_inc_uint(&bfp->bf_refs);
}
rw_exit(&bip->bi_rwlock);
return (bfp);
}
/*
* Free a forwarding entry: decrement the instance's bki_count kstat, drop
* the reference held on each link in bf_links, and release the memory
* (sized by bf_maxlinks, matching fwd_alloc).
*/
static void
fwd_free(bridge_fwd_t *bfp)
{
uint_t i;
/* The instance is found through the first link; KIDECR below uses bip. */
bridge_inst_t *bip = bfp->bf_links[0]->bl_inst;
KIDECR(bki_count);
for (i = 0; i < bfp->bf_nlinks; i++)
link_unref(bfp->bf_links[i]);
kmem_free(bfp,
sizeof (*bfp) + bfp->bf_maxlinks * sizeof (bridge_link_t *));
}
/*
 * Release one reference on a forwarding entry, freeing it when the last
 * reference is dropped.  An entry can only reach zero references after it
 * has been taken out of the AVL tree.
 */
static void
fwd_unref(bridge_fwd_t *bfp)
{
	if (atomic_dec_uint_nv(&bfp->bf_refs) != 0)
		return;

	ASSERT(!(bfp->bf_flags & BFF_INTREE));
	fwd_free(bfp);
}
/*
* Remove a forwarding entry from its bridge's AVL tree, if it is still in
* the tree, and drop the tree's reference on it. The caller's own reference
* on bfp is not consumed.
*/
static void
fwd_delete(bridge_fwd_t *bfp)
{
bridge_inst_t *bip;
bridge_fwd_t *bfpzero;
/* Unlocked pre-check; the flag is tested again under the lock below. */
if (bfp->bf_flags & BFF_INTREE) {
ASSERT(bfp->bf_nlinks > 0);
bip = bfp->bf_links[0]->bl_inst;
rw_enter(&bip->bi_rwlock, RW_WRITER);
/* Another thread could beat us to this */
if (bfp->bf_flags & BFF_INTREE) {
avl_remove(&bip->bi_fwd, bfp);
bfp->bf_flags &= ~BFF_INTREE;
/*
* For a VLAN-local entry, clear the flag and look up the remaining
* entry that now compares equal to it, then decrement that entry's
* bf_vcnt.
*/
if (bfp->bf_flags & BFF_VLANLOCAL) {
bfp->bf_flags &= ~BFF_VLANLOCAL;
bfpzero = avl_find(&bip->bi_fwd, bfp, NULL);
if (bfpzero != NULL && bfpzero->bf_vcnt > 0)
bfpzero->bf_vcnt--;
}
rw_exit(&bip->bi_rwlock);
fwd_unref(bfp); /* no longer in avl tree */
} else {
rw_exit(&bip->bi_rwlock);
}
}
}
/*
 * Try to add a forwarding entry to the bridge's AVL tree.  The insert is
 * refused if the bridge is shutting down, if the table has reached
 * bi_tablemax entries, or if an equal entry is already present.  On success
 * the entry is flagged BFF_INTREE and gains one reference for the tree.
 * Returns B_TRUE if the entry was inserted.
 */
static boolean_t
fwd_insert(bridge_inst_t *bip, bridge_fwd_t *bfp)
{
	avl_index_t where;
	boolean_t inserted = B_FALSE;

	rw_enter(&bip->bi_rwlock, RW_WRITER);
	if ((bip->bi_flags & BIF_SHUTDOWN) == 0 &&
	    avl_numnodes(&bip->bi_fwd) < bip->bi_tablemax &&
	    avl_find(&bip->bi_fwd, bfp, &where) == NULL) {
		avl_insert(&bip->bi_fwd, bfp, where);
		bfp->bf_flags |= BFF_INTREE;
		atomic_inc_uint(&bfp->bf_refs);	/* avl entry */
		inserted = B_TRUE;
	}
	rw_exit(&bip->bi_rwlock);

	return (inserted);
}
/*
 * Move a link's local-address forwarding entry from oldaddr to newaddr.
 *
 * The link is first removed from the entry for oldaddr (the entry is taken
 * out of the tree if that leaves it with no links), and then added to the
 * entry for newaddr, creating or enlarging that entry as needed.  An
 * all-zeros address means "none" on either side.  Nothing is added if the
 * bridge is shutting down.
 *
 * Each slot in an entry's bf_links array holds one reference on the link.
 * The reference from the old entry is carried over to the new one when
 * possible (tracked by drop_ref) and dropped at the end otherwise.
 */
static void
fwd_update_local(bridge_link_t *blp, const uint8_t *oldaddr,
    const uint8_t *newaddr)
{
	bridge_inst_t *bip = blp->bl_inst;
	bridge_fwd_t *bfp;
	bridge_fwd_t *bfnew = NULL;	/* read at no_new_addr on all paths */
	bridge_fwd_t match;
	avl_index_t idx;
	boolean_t drop_ref = B_FALSE;

	if (bcmp(oldaddr, newaddr, ETHERADDRL) == 0)
		return;

	/*
	 * fwd_compare() examines bf_flags and bf_vlanid as well as bf_dest,
	 * so the search key must not contain uninitialized stack contents.
	 * Local address entries are not VLAN-local, hence zero flags.
	 */
	bzero(&match, sizeof (match));

	if (bcmp(oldaddr, zero_addr, ETHERADDRL) == 0)
		goto no_old_addr;

	/*
	 * Find the previous entry, and remove our link from it.
	 */
	bcopy(oldaddr, match.bf_dest, ETHERADDRL);
	rw_enter(&bip->bi_rwlock, RW_WRITER);
	if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
		int i;

		/*
		 * See if we're in the list, and remove if so.
		 */
		for (i = 0; i < bfp->bf_nlinks; i++) {
			if (bfp->bf_links[i] == blp) {
				/*
				 * We assume writes are atomic, so no special
				 * MT handling is needed.  The list length is
				 * decremented first, and then we remove
				 * entries.
				 */
				bfp->bf_nlinks--;
				for (; i < bfp->bf_nlinks; i++)
					bfp->bf_links[i] = bfp->bf_links[i + 1];
				drop_ref = B_TRUE;
				break;
			}
		}
		/* If no more links, then remove and free up */
		if (bfp->bf_nlinks == 0) {
			avl_remove(&bip->bi_fwd, bfp);
			bfp->bf_flags &= ~BFF_INTREE;
		} else {
			bfp = NULL;
		}
	}
	rw_exit(&bip->bi_rwlock);
	if (bfp != NULL)
		fwd_unref(bfp);		/* no longer in avl tree */

	/*
	 * Now get the new link address and add this link to the list.  The
	 * list should be of length 1 unless the user has configured multiple
	 * NICs with the same address.  (That's an incorrect configuration, but
	 * we support it anyway.)
	 */
no_old_addr:
	bfp = NULL;
	if ((bip->bi_flags & BIF_SHUTDOWN) ||
	    bcmp(newaddr, zero_addr, ETHERADDRL) == 0)
		goto no_new_addr;

	bcopy(newaddr, match.bf_dest, ETHERADDRL);
	rw_enter(&bip->bi_rwlock, RW_WRITER);
	if ((bfp = avl_find(&bip->bi_fwd, &match, &idx)) == NULL) {
		bfnew = fwd_alloc(newaddr, 1, RBRIDGE_NICKNAME_NONE);
		if (bfnew != NULL)
			KIINCR(bki_count);
	} else if (bfp->bf_nlinks < bfp->bf_maxlinks) {
		/* special case: link fits in existing entry */
		bfnew = bfp;
	} else {
		bfnew = fwd_alloc(newaddr, bfp->bf_nlinks + 1,
		    RBRIDGE_NICKNAME_NONE);
		if (bfnew != NULL) {
			uint_t n;

			KIINCR(bki_count);
			avl_remove(&bip->bi_fwd, bfp);
			bfp->bf_flags &= ~BFF_INTREE;
			bfnew->bf_nlinks = bfp->bf_nlinks;
			bcopy(bfp->bf_links, bfnew->bf_links,
			    bfp->bf_nlinks * sizeof (*bfp->bf_links));
			/*
			 * The old entry keeps its own link references until
			 * it is freed (fwd_free drops one per link), so the
			 * copies in the replacement entry need references of
			 * their own.
			 */
			for (n = 0; n < bfnew->bf_nlinks; n++)
				atomic_inc_uint(&bfnew->bf_links[n]->bl_refs);
			/* reset the idx value due to removal above */
			(void) avl_find(&bip->bi_fwd, &match, &idx);
		}
	}

	if (bfnew != NULL) {
		bfnew->bf_links[bfnew->bf_nlinks++] = blp;
		if (drop_ref)
			drop_ref = B_FALSE;	/* reuse the old entry's ref */
		else
			atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */

		if (bfnew != bfp) {
			/* local addresses are not subject to table limits */
			avl_insert(&bip->bi_fwd, bfnew, idx);
			bfnew->bf_flags |= (BFF_INTREE | BFF_LOCALADDR);
			atomic_inc_uint(&bfnew->bf_refs);	/* avl entry */
		}
	}
	rw_exit(&bip->bi_rwlock);

no_new_addr:
	/*
	 * If we found an existing entry and we replaced it with a new one,
	 * then drop the table reference from the old one.  We removed it from
	 * the AVL tree above.
	 */
	if (bfnew != NULL && bfp != NULL && bfnew != bfp)
		fwd_unref(bfp);

	/* Account for removed entry. */
	if (drop_ref)
		link_unref(blp);
}
/*
 * Refresh a link's notion of its own unicast address: fetch the current
 * primary address from the MAC layer, move the local forwarding entry from
 * the previously recorded address to the new one, and then record the new
 * address in the link.
 */
static void
bridge_new_unicst(bridge_link_t *blp)
{
	uint8_t curaddr[ETHERADDRL];

	mac_unicast_primary_get(blp->bl_mh, curaddr);

	/* must run before bl_local_mac is overwritten: it is the "old" side */
	fwd_update_local(blp, blp->bl_local_mac, curaddr);

	bcopy(curaddr, blp->bl_local_mac, ETHERADDRL);
}
/*
* We must shut down a link prior to freeing it, and doing that requires
* blocking to wait for running MAC threads while holding a reference. This is
* run from a taskq to accomplish proper link shutdown followed by reference
* drop.
*
* arg is the bridge_link_t being shut down. The steps are: notify TRILL,
* undo whatever MAC-layer setup the link's flags say was done, report the
* underlying link state to clients, strip the link from every forwarding
* entry, close the MAC client and handle, and finally drop one link
* reference.
*/
static void
link_shutdown(void *arg)
{
bridge_link_t *blp = arg;
mac_handle_t mh = blp->bl_mh;
bridge_inst_t *bip;
bridge_fwd_t *bfp, *bfnext;
avl_tree_t fwd_scavenge;
int i;
/*
* This link is being destroyed. Notify TRILL now that it's no longer
* possible to send packets. Data packets may still arrive until TRILL
* calls bridge_trill_lnunref.
*/
if (blp->bl_trilldata != NULL)
trill_lndstr_fn(blp->bl_trilldata, blp);
/* Each flag below records a setup step that now has to be undone. */
if (blp->bl_flags & BLF_PROM_ADDED)
(void) mac_promisc_remove(blp->bl_mphp);
if (blp->bl_flags & BLF_SET_BRIDGE)
mac_bridge_clear(mh, (mac_handle_t)blp);
if (blp->bl_flags & BLF_MARGIN_ADDED) {
(void) mac_notify_remove(blp->bl_mnh, B_TRUE);
(void) mac_margin_remove(mh, blp->bl_margin);
}
/* Tell the clients the real link state when we leave */
mac_link_redo(blp->bl_mh,
mac_stat_get(blp->bl_mh, MAC_STAT_LOWLINK_STATE));
/* Destroy all of the forwarding entries related to this link */
/*
* Entries whose only link is this one are moved to a private tree while
* the lock is held, and are unreferenced after the lock is dropped.
*/
avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
offsetof(bridge_fwd_t, bf_node));
bip = blp->bl_inst;
rw_enter(&bip->bi_rwlock, RW_WRITER);
bfnext = avl_first(&bip->bi_fwd);
while ((bfp = bfnext) != NULL) {
/* Fetch the successor first; bfp may be removed from the tree below. */
bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
for (i = 0; i < bfp->bf_nlinks; i++) {
if (bfp->bf_links[i] == blp)
break;
}
/* This entry does not use the link; leave it alone. */
if (i >= bfp->bf_nlinks)
continue;
if (bfp->bf_nlinks > 1) {
/* note that this can't be the last reference */
link_unref(blp);
bfp->bf_nlinks--;
for (; i < bfp->bf_nlinks; i++)
bfp->bf_links[i] = bfp->bf_links[i + 1];
} else {
ASSERT(bfp->bf_flags & BFF_INTREE);
avl_remove(&bip->bi_fwd, bfp);
bfp->bf_flags &= ~BFF_INTREE;
avl_add(&fwd_scavenge, bfp);
}
}
rw_exit(&bip->bi_rwlock);
/* Drop the tree reference on each scavenged entry, outside the lock. */
bfnext = avl_first(&fwd_scavenge);
while ((bfp = bfnext) != NULL) {
bfnext = AVL_NEXT(&fwd_scavenge, bfp);
avl_remove(&fwd_scavenge, bfp);
fwd_unref(bfp);
}
avl_destroy(&fwd_scavenge);
if (blp->bl_flags & BLF_CLIENT_OPEN)
mac_client_close(blp->bl_mch, 0);
mac_close(mh);
/*
* We are now completely removed from the active list, so drop the
* reference (see bridge_add_link).
*/
link_unref(blp);
}
/*
* Begin shutting down a bridge instance. Does nothing if BIF_SHUTDOWN is
* already set. Otherwise it sets the flag, detaches the control stream
* pointer, dispatches link_shutdown() for every link not yet deleted,
* removes every forwarding entry, waits for the links to go away if TRILL
* is attached, notifies TRILL, and drops one instance reference.
*/
static void
shutdown_inst(bridge_inst_t *bip)
{
bridge_link_t *blp, *blnext;
bridge_fwd_t *bfp;
mutex_enter(&inst_lock);
if (bip->bi_flags & BIF_SHUTDOWN) {
mutex_exit(&inst_lock);
return;
}
/*
* Once on the inst_list, the bridge instance must not leave that list
* without having the shutdown flag set first. When the shutdown flag
* is set, we own the list reference, so we must drop it before
* returning.
*/
bip->bi_flags |= BIF_SHUTDOWN;
mutex_exit(&inst_lock);
bip->bi_control = NULL;
rw_enter(&bip->bi_rwlock, RW_READER);
blnext = list_head(&bip->bi_links);
while ((blp = blnext) != NULL) {
blnext = list_next(&bip->bi_links, blp);
/* Mark the link deleted and hand its teardown to the taskq. */
if (!(blp->bl_flags & BLF_DELETED)) {
blp->bl_flags |= BLF_DELETED;
(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
blp, DDI_SLEEP);
}
}
/*
* Empty the forwarding tree one entry at a time. fwd_delete() takes the
* lock as writer itself, so the reader hold is dropped around the call; a
* temporary reference keeps the entry valid in between.
*/
while ((bfp = avl_first(&bip->bi_fwd)) != NULL) {
atomic_inc_uint(&bfp->bf_refs);
rw_exit(&bip->bi_rwlock);
fwd_delete(bfp);
fwd_unref(bfp);
rw_enter(&bip->bi_rwlock, RW_READER);
}
rw_exit(&bip->bi_rwlock);
/*
* This bridge is being destroyed. Notify TRILL once all of the
* links are gone.
*/
/*
* NOTE(review): the wait below uses inst_lock with bi_linkwait, while
* link_unref() broadcasts on bi_linkwait without holding inst_lock.
* Confirm that a wakeup cannot be missed between the test and cv_wait().
*/
mutex_enter(&inst_lock);
while (bip->bi_trilldata != NULL && !list_is_empty(&bip->bi_links))
cv_wait(&bip->bi_linkwait, &inst_lock);
mutex_exit(&inst_lock);
if (bip->bi_trilldata != NULL)
trill_brdstr_fn(bip->bi_trilldata, bip);
/* Drop the list reference described above. */
bridge_unref(bip);
}
/*
* This is called once by the TRILL module when it starts up. It just sets the
* global TRILL callback function pointers -- data transmit/receive and bridge
* and link destroy notification. There's only one TRILL module, so only one
* registration is needed.
*
* TRILL should call this function with NULL pointers before unloading. It
* must not do so before dropping all references to bridges and links. We
* assert that this is true on debug builds.
*
* recv_fn, encap_fn, brdstr_fn and lndstr_fn are stored in trill_recv_fn,
* trill_encap_fn, trill_brdstr_fn and trill_lndstr_fn respectively.
*/
void
bridge_trill_register_cb(trill_recv_pkt_t recv_fn, trill_encap_pkt_t encap_fn,
trill_br_dstr_t brdstr_fn, trill_ln_dstr_t lndstr_fn)
{
#ifdef DEBUG
/* Unregistering: check that no bridge or link still has TRILL data. */
if (recv_fn == NULL && trill_recv_fn != NULL) {
bridge_inst_t *bip;
bridge_link_t *blp;
mutex_enter(&inst_lock);
for (bip = list_head(&inst_list); bip != NULL;
bip = list_next(&inst_list, bip)) {
ASSERT(bip->bi_trilldata == NULL);
rw_enter(&bip->bi_rwlock, RW_READER);
for (blp = list_head(&bip->bi_links); blp != NULL;
blp = list_next(&bip->bi_links, blp)) {
ASSERT(blp->bl_trilldata == NULL);
}
rw_exit(&bip->bi_rwlock);
}
mutex_exit(&inst_lock);
}
#endif
trill_recv_fn = recv_fn;
trill_encap_fn = encap_fn;
trill_brdstr_fn = brdstr_fn;
trill_lndstr_fn = lndstr_fn;
}
/*
 * Attach a TRILL instance pointer to the bridge named by bname.  Until this
 * pointer is set, the forwarding, TRILL receive, and bridge destructor
 * functions are not called for the bridge.
 *
 * The bridge is looked up under the link name "<bname>0".  Returns the
 * bridge instance on success or NULL when no such bridge exists.  TRILL
 * holds a reference on the returned bridge and must release it with
 * bridge_trill_brunref().
 */
bridge_inst_t *
bridge_trill_brref(const char *bname, void *ptr)
{
	char linkname[MAXLINKNAMELEN];
	bridge_inst_t *bip;

	(void) snprintf(linkname, sizeof (linkname), "%s0", bname);

	if ((bip = bridge_find_name(linkname)) == NULL)
		return (NULL);

	ASSERT(bip->bi_trilldata == NULL && ptr != NULL);
	bip->bi_trilldata = ptr;
	return (bip);
}
/*
 * Counterpart of bridge_trill_brref(): detach the TRILL instance pointer
 * from the bridge and drop the bridge reference.  The bridge must currently
 * have TRILL data attached (asserted on DEBUG builds).
 */
void
bridge_trill_brunref(bridge_inst_t *bip)
{
	ASSERT(bip->bi_trilldata != NULL);

	/* Clear the pointer before releasing our hold on the instance. */
	bip->bi_trilldata = NULL;
	bridge_unref(bip);
}
/*
 * TRILL calls this to bind itself to one particular link on a bridge.
 *
 * The link list is searched for a link that has the requested linkid, is
 * not marked deleted, and has no TRILL data attached yet.  If one is found,
 * ptr is stored on it, the TRILL-active flag and the bl_afs array are
 * cleared, and a link reference is taken on behalf of TRILL.  TRILL must
 * release that reference with bridge_trill_lnunref() when it unbinds.
 *
 * Returns the link, or NULL if no suitable link exists.
 */
bridge_link_t *
bridge_trill_lnref(bridge_inst_t *bip, datalink_id_t linkid, void *ptr)
{
	bridge_link_t *blp;

	ASSERT(ptr != NULL);

	rw_enter(&bip->bi_rwlock, RW_READER);

	blp = list_head(&bip->bi_links);
	while (blp != NULL) {
		if (!(blp->bl_flags & BLF_DELETED) &&
		    blp->bl_linkid == linkid && blp->bl_trilldata == NULL)
			break;
		blp = list_next(&bip->bi_links, blp);
	}

	if (blp != NULL) {
		blp->bl_trilldata = ptr;
		blp->bl_flags &= ~BLF_TRILLACTIVE;
		(void) memset(blp->bl_afs, 0, sizeof (blp->bl_afs));
		atomic_inc_uint(&blp->bl_refs);
	}

	rw_exit(&bip->bi_rwlock);
	return (blp);
}
/*
 * Counterpart of bridge_trill_lnref(): unbind TRILL from a link.
 *
 * The TRILL data pointer and TRILL-active flag are cleared under
 * bl_trilllock, and then we wait until every thread currently inside a
 * TRILL callback for this link (counted in bl_trillthreads) has left.
 * Afterwards the bl_afs array is filled with 0xff bytes and the link
 * reference taken by bridge_trill_lnref() is dropped.
 */
void
bridge_trill_lnunref(bridge_link_t *blp)
{
	mutex_enter(&blp->bl_trilllock);
	ASSERT(blp->bl_trilldata != NULL);
	blp->bl_trilldata = NULL;
	blp->bl_flags &= ~BLF_TRILLACTIVE;

	/* Callers in flight broadcast on bl_trillwait as they leave. */
	while (blp->bl_trillthreads > 0)
		cv_wait(&blp->bl_trillwait, &blp->bl_trilllock);
	mutex_exit(&blp->bl_trilllock);

	/*
	 * NOTE(review): all-ones presumably means "forwarder for every VLAN"
	 * once TRILL no longer controls the link -- confirm against
	 * BRIDGE_AF_ISSET.
	 */
	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));

	link_unref(blp);
}
/*
 * Periodic housekeeping timer.  Each run does three things:
 *   1. Ages the learned forwarding entries, removing those that have not
 *	been heard from within the age limit.  The limit is halved on each
 *	run for as long as the table is above its configured maximum.
 *   2. Decays the bl_learns counter on every link.
 *   3. Scans the observability (bridge_mac_t) nodes and frees the ones that
 *	no longer belong to a bridge and can be unregistered.
 *
 * The timer rearms itself unless the observability node list is empty.
 */
/* ARGSUSED */
static void
bridge_timer(void *arg)
{
	bridge_inst_t *bip;
	bridge_fwd_t *bfp, *bfnext;
	bridge_mac_t *bmp, *bmnext;
	bridge_link_t *blp;
	int err;
	datalink_id_t tmpid;
	avl_tree_t fwd_scavenge;
	clock_t age_limit;
	uint32_t ldecay;

	/* Expired entries are parked here until the instance lock is dropped */
	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
	    offsetof(bridge_fwd_t, bf_node));

	mutex_enter(&inst_lock);
	for (bip = list_head(&inst_list); bip != NULL;
	    bip = list_next(&inst_list, bip)) {
		if (bip->bi_flags & BIF_SHUTDOWN)
			continue;

		rw_enter(&bip->bi_rwlock, RW_WRITER);

		/* compute scaled maximum age based on table limit */
		if (avl_numnodes(&bip->bi_fwd) > bip->bi_tablemax)
			bip->bi_tshift++;
		else
			bip->bi_tshift = 0;
		age_limit = bridge_fwd_age >> bip->bi_tshift;
		if (age_limit == 0) {
			/* Shifted too far; back off and use the minimum. */
			if (bip->bi_tshift != 0)
				bip->bi_tshift--;
			age_limit = 1;
		}

		/* Move every stale, non-local entry to the scavenge tree. */
		for (bfp = avl_first(&bip->bi_fwd); bfp != NULL;
		    bfp = bfnext) {
			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
			if (bfp->bf_flags & BFF_LOCALADDR)
				continue;
			if ((lbolt - bfp->bf_lastheard) <= age_limit)
				continue;
			ASSERT(bfp->bf_flags & BFF_INTREE);
			avl_remove(&bip->bi_fwd, bfp);
			bfp->bf_flags &= ~BFF_INTREE;
			avl_add(&fwd_scavenge, bfp);
		}

		/* Decay the per-link learning counters, clamping at zero. */
		for (blp = list_head(&bip->bi_links); blp != NULL;
		    blp = list_next(&bip->bi_links, blp)) {
			ldecay = mac_get_ldecay(blp->bl_mh);
			if (blp->bl_learns > ldecay)
				atomic_add_int(&blp->bl_learns, -(int)ldecay);
			else
				blp->bl_learns = 0;
		}

		rw_exit(&bip->bi_rwlock);

		/* Release the expired entries now that the lock is dropped. */
		while ((bfp = avl_first(&fwd_scavenge)) != NULL) {
			avl_remove(&fwd_scavenge, bfp);
			KIINCR(bki_expire);
			fwd_unref(bfp);	/* drop tree reference */
		}
	}
	mutex_exit(&inst_lock);
	avl_destroy(&fwd_scavenge);

	/*
	 * Scan the bridge_mac_t entries and try to free up the ones that are
	 * no longer active.  This must be done by polling, as neither DLS nor
	 * MAC provides a driver any sort of positive control over clients.
	 */
	rw_enter(&bmac_rwlock, RW_WRITER);
	for (bmp = list_head(&bmac_list); bmp != NULL; bmp = bmnext) {
		bmnext = list_next(&bmac_list, bmp);

		/* ignore active bridges */
		if (bmp->bm_inst != NULL)
			continue;

		/* First step: remove the DLS devnet, if it is still there. */
		if (bmp->bm_flags & BMF_DLS) {
			err = dls_devnet_destroy(bmp->bm_mh, &tmpid, B_FALSE);
			ASSERT(err == 0 || err == EBUSY);
			if (err == 0)
				bmp->bm_flags &= ~BMF_DLS;
		}

		/* Second step: unregister from MAC and free the node. */
		if (!(bmp->bm_flags & BMF_DLS)) {
			err = mac_unregister(bmp->bm_mh);
			ASSERT(err == 0 || err == EBUSY);
			if (err == 0) {
				list_remove(&bmac_list, bmp);
				kmem_free(bmp, sizeof (*bmp));
			}
		}
	}

	/* Keep polling only while there is something left to poll for. */
	if (!list_is_empty(&bmac_list)) {
		bridge_timerid = timeout(bridge_timer, NULL,
		    bridge_scan_interval);
	} else {
		bridge_timerid = 0;
	}
	rw_exit(&bmac_rwlock);
}
/*
 * STREAMS open entry point.
 *
 * A queue that already has private data is a re-open and succeeds
 * immediately; module opens are rejected with EINVAL.  A non-zero minor is a
 * regular DLPI stream (snoop or the like) and is redirected to DLD.  Minor
 * zero allocates a new bridge control stream and returns its own minor
 * number through devp.  Returns ENOSR if the control stream cannot be
 * allocated.
 */
static int
bridge_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
{
	bridge_stream_t *bsp;

	if (rq->q_ptr != NULL)
		return (0);

	if (sflag & MODOPEN)
		return (EINVAL);

	/*
	 * Check the minor node number being opened.  This tells us which
	 * bridge instance the user wants.
	 */
	if (getminor(*devp) != 0) {
		/*
		 * This is a regular DLPI stream for snoop or the like.
		 * Redirect it through DLD.
		 */
		rq->q_qinfo = &bridge_dld_rinit;
		OTHERQ(rq)->q_qinfo = &bridge_dld_winit;
		return (dld_open(rq, devp, oflag, sflag, credp));
	}

	/* Minor zero: allocate the bridge control stream structure. */
	bsp = stream_alloc();
	if (bsp == NULL)
		return (ENOSR);

	WR(rq)->q_ptr = (caddr_t)bsp;
	rq->q_ptr = (caddr_t)bsp;
	bsp->bs_wq = WR(rq);
	*devp = makedevice(getmajor(*devp), bsp->bs_minor);
	qprocson(rq);
	return (0);
}
/*
 * STREAMS close entry point.  This is used only for bridge control streams;
 * DLPI streams go through dld instead.
 *
 * Closing the control stream shuts down the bridge instance bound to it (if
 * any), frees the stream structure, and drops the stream's reference on the
 * instance.  Always returns 0.
 */
static int
bridge_close(queue_t *rq)
{
	bridge_stream_t *bsp = rq->q_ptr;
	bridge_inst_t *bip;

	/*
	 * Wait for any stray taskq (add/delete link) entries related to this
	 * stream to leave the system.
	 */
	mutex_enter(&stream_ref_lock);
	while (bsp->bs_taskq_cnt != 0)
		cv_wait(&stream_ref_cv, &stream_ref_lock);
	mutex_exit(&stream_ref_lock);

	qprocsoff(rq);

	/* Remember the instance; bsp is gone after stream_free(). */
	bip = bsp->bs_inst;
	if (bip != NULL)
		shutdown_inst(bip);

	WR(rq)->q_ptr = NULL;
	rq->q_ptr = NULL;
	stream_free(bsp);

	if (bip != NULL)
		bridge_unref(bip);

	return (0);
}
/*
 * Learn (or refresh) the location of a source address.
 *
 * blp is the link the packet was seen on, saddr the source MAC address,
 * ingress_nick the TRILL nickname of the ingress RBridge as passed by the
 * caller, and vlanid the VLAN the packet belongs to.
 *
 * An existing entry that already lists this link (with the same nickname) is
 * simply refreshed.  Otherwise, subject to the per-link learning limit, a
 * new entry is created; it either replaces the old entry (same VLAN: the
 * node moved) or is added beside it as a VLAN-local entry (different VLAN).
 */
static void
bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
    uint16_t vlanid)
{
	bridge_inst_t *bip = blp->bl_inst;
	bridge_fwd_t *bfp, *bfpnew;
	int i;
	boolean_t replaced = B_FALSE;

	/* Ignore multi-destination address used as source; it's nonsense. */
	if (*saddr & 1)
		return;

	/*
	 * If the source is known, then check whether it belongs on this link.
	 * If not, and this isn't a fixed local address, then we've detected a
	 * move.  If it's not known, learn it.
	 */
	bfp = fwd_find(bip, saddr, vlanid);
	if (bfp != NULL) {
		/*
		 * A fixed local source address teaches us nothing, so quit.
		 * On receive this means the sender has stolen our address,
		 * which we can do nothing about; on transmit it is simply
		 * the normal case.
		 */
		if (bfp->bf_flags & BFF_LOCALADDR) {
			fwd_unref(bfp);
			return;
		}

		/*
		 * If the link (and TRILL sender, if any) is already among
		 * the ones registered for this address, then this is old
		 * news: just refresh the timestamp.
		 */
		if (bfp->bf_trill_nick == ingress_nick) {
			for (i = 0; i < bfp->bf_nlinks; i++) {
				if (bfp->bf_links[i] != blp)
					continue;
				bfp->bf_lastheard = lbolt;
				fwd_unref(bfp);
				return;
			}
		}
	}

	/*
	 * Note that we intentionally "unlearn" things that appear to be under
	 * attack on this link.  The forwarding cache is a negative thing for
	 * security -- it disables reachability as a performance optimization
	 * -- so leaving out entries optimizes for success and defends against
	 * the attack.  Thus, the bare increment without a check in the delete
	 * code is right.  (And it's ok if we skid over the limit a little, so
	 * there's no synchronization needed on the test.)
	 */
	if (blp->bl_learns >= mac_get_llimit(blp->bl_mh)) {
		if (bfp != NULL) {
			if (bfp->bf_vcnt == 0)
				fwd_delete(bfp);
			fwd_unref(bfp);
		}
		return;
	}

	atomic_inc_uint(&blp->bl_learns);

	bfpnew = fwd_alloc(saddr, 1, ingress_nick);
	if (bfpnew == NULL) {
		if (bfp != NULL)
			fwd_unref(bfp);
		return;
	}
	KIINCR(bki_count);

	if (bfp != NULL) {
		/*
		 * A new destination for the same VLAN replaces the old
		 * entry.  For a different VLAN the original stays and the
		 * new entry is split off as an IVL entry.
		 */
		if (bfp->bf_vlanid != vlanid) {
			bfp->bf_vcnt++;
			bfpnew->bf_flags |= BFF_VLANLOCAL;
		} else {
			/* save the count of IVL duplicates */
			bfpnew->bf_vcnt = bfp->bf_vcnt;

			/* entry deletes count as learning events */
			atomic_inc_uint(&blp->bl_learns);

			/* destroy and create anew; node moved */
			fwd_delete(bfp);
			replaced = B_TRUE;
			KIINCR(bki_moved);
		}
		fwd_unref(bfp);
	}

	bfpnew->bf_links[0] = blp;
	bfpnew->bf_nlinks = 1;
	atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */

	if (fwd_insert(bip, bfpnew)) {
		if (!replaced)
			KIINCR(bki_source);
	} else {
		fwd_free(bfpnew);
	}
}
/*
 * Process the VLAN headers for output on a given link.  There are several
 * cases (noting that we don't map VLANs):
 *   1. The input packet is good as it is; either
 *	a. It has no tag, and output has same PVID
 *	b. It has a non-zero priority-only tag for PVID, and b_band is same
 *	c. It has a tag with VLAN different from PVID, and b_band is same
 *   2. The tag must change: non-zero b_band is different from tag priority
 *   3. The packet has a tag and should not (VLAN same as PVID, b_band zero)
 *   4. The packet has no tag and needs one:
 *	a. VLAN ID same as PVID, but b_band is non-zero
 *	b. VLAN ID different from PVID
 * We exclude case 1 first, then modify the packet.  Note that output packets
 * get a priority set by the mblk, not by the header, because QoS in bridging
 * requires priority recalculation at each node.
 *
 * The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
 *
 * Ownership: the message passed in is consumed.  The return value is the
 * message to transmit (possibly a different mblk than the one passed in), or
 * NULL if mp was NULL or an allocation failed; on allocation failure the
 * input message has been freed.
 */
static mblk_t *
reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
{
	boolean_t source_has_tag = (tci != 0xFFFF);
	mblk_t *mpcopy;
	size_t mlen, minlen;
	struct ether_vlan_header *evh;
	int pri;

	/* This helps centralize error handling in the caller. */
	if (mp == NULL)
		return (mp);

	/* No forwarded packet can have hardware checksum enabled */
	DB_CKSUMFLAGS(mp) = 0;

	/* Get the no-modification cases out of the way first */
	if (!source_has_tag && vlanid == pvid)		/* 1a */
		return (mp);

	pri = VLAN_PRI(tci);
	if (source_has_tag && mp->b_band == pri) {
		if (vlanid != pvid)			/* 1c */
			return (mp);
		if (pri != 0 && VLAN_ID(tci) == 0)	/* 1b */
			return (mp);
	}

	/*
	 * We now know that we must modify the packet.  Prepare for that.  Note
	 * that if a tag is present, the caller has already done a pullup for
	 * the VLAN header, so we're good to go.
	 */
	if (MBLKL(mp) < sizeof (struct ether_header)) {
		mpcopy = msgpullup(mp, sizeof (struct ether_header));
		/*
		 * msgpullup() builds a new message and leaves the original
		 * unaltered, so the original must be freed whether or not the
		 * pullup succeeded; otherwise it is leaked.
		 */
		freemsg(mp);
		if (mpcopy == NULL)
			return (NULL);
		mp = mpcopy;
	}

	/*
	 * The header is edited in place, so the first mblk must be private
	 * (not shared), 16-bit aligned, and -- when a tag is being inserted --
	 * have tail room for it.  If not, build a fresh leading mblk.
	 */
	if (DB_REF(mp) > 1 || !IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)) ||
	    (!source_has_tag && MBLKTAIL(mp) < VLAN_INCR)) {
		minlen = mlen = MBLKL(mp);
		if (!source_has_tag)
			minlen += VLAN_INCR;
		ASSERT(minlen >= sizeof (struct ether_vlan_header));
		/*
		 * We're willing to copy some data to avoid fragmentation, but
		 * not a lot.
		 */
		if (minlen > 256)
			minlen = sizeof (struct ether_vlan_header);
		mpcopy = allocb(minlen, BPRI_MED);
		if (mpcopy == NULL) {
			freemsg(mp);
			return (NULL);
		}
		if (mlen <= minlen) {
			/* We toss the first mblk when we can. */
			bcopy(mp->b_rptr, mpcopy->b_rptr, mlen);
			mpcopy->b_wptr += mlen;
			mpcopy->b_cont = mp->b_cont;
			freeb(mp);
		} else {
			/* If not, then just copy what we need */
			if (!source_has_tag)
				minlen = sizeof (struct ether_header);
			bcopy(mp->b_rptr, mpcopy->b_rptr, minlen);
			mpcopy->b_wptr += minlen;
			mpcopy->b_cont = mp;
			mp->b_rptr += minlen;
		}
		mp = mpcopy;
	}

	/* LINTED: pointer alignment */
	evh = (struct ether_vlan_header *)mp->b_rptr;
	if (source_has_tag) {
		if (mp->b_band == 0 && vlanid == pvid) {	/* 3 */
			/* Strip the tag: slide the payload down over it. */
			evh->ether_tpid = evh->ether_type;
			mlen = MBLKL(mp);
			if (mlen > sizeof (struct ether_vlan_header))
				ovbcopy(mp->b_rptr +
				    sizeof (struct ether_vlan_header),
				    mp->b_rptr + sizeof (struct ether_header),
				    mlen - sizeof (struct ether_vlan_header));
			mp->b_wptr -= VLAN_INCR;
		} else {					/* 2 */
			/* Rewrite the existing tag with the new priority. */
			if (vlanid == pvid)
				vlanid = VLAN_ID_NONE;
			tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
			evh->ether_tci = htons(tci);
		}
	} else {
		/* case 4: no header present, but one is needed */
		mlen = MBLKL(mp);
		if (mlen > sizeof (struct ether_header))
			ovbcopy(mp->b_rptr + sizeof (struct ether_header),
			    mp->b_rptr + sizeof (struct ether_vlan_header),
			    mlen - sizeof (struct ether_header));
		mp->b_wptr += VLAN_INCR;
		ASSERT(mp->b_wptr <= DB_LIM(mp));
		if (vlanid == pvid)
			vlanid = VLAN_ID_NONE;
		tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
		/* The old Ethertype sits where the TPID now belongs. */
		evh->ether_type = evh->ether_tpid;
		evh->ether_tpid = htons(ETHERTYPE_VLAN);
		evh->ether_tci = htons(tci);
	}
	return (mp);
}
/* Record VLAN information and strip header if requested. */
static void
update_header(mblk_t *mp, mac_header_info_t *hdr_info, boolean_t striphdr)
{
	struct ether_vlan_header *evhp;
	uint16_t inner_type;

	/* Untagged frame: note that, and optionally skip the plain header. */
	if (hdr_info->mhi_bindsap != ETHERTYPE_VLAN) {
		hdr_info->mhi_istagged = B_FALSE;
		hdr_info->mhi_tci = VLAN_ID_NONE;
		if (striphdr)
			mp->b_rptr += sizeof (struct ether_header);
		return;
	}

	/* LINTED: alignment */
	evhp = (struct ether_vlan_header *)mp->b_rptr;
	hdr_info->mhi_istagged = B_TRUE;
	hdr_info->mhi_tci = ntohs(evhp->ether_tci);
	if (!striphdr)
		return;

	/*
	 * For VLAN tagged frames, update the SAP fields in hdr_info from the
	 * encapsulated Ethertype before stripping the header.  Values that
	 * are not above ETHERMTU are lengths, so the frame is bound as LLC.
	 */
	inner_type = ntohs(evhp->ether_type);
	hdr_info->mhi_origsap = inner_type;
	if (inner_type > ETHERMTU)
		hdr_info->mhi_bindsap = inner_type;
	else
		hdr_info->mhi_bindsap = DLS_SAP_LLC;
	mp->b_rptr = (uchar_t *)(evhp + 1);
}
/*
 * Return B_TRUE if we're allowed to send on this link with the given VLAN ID.
 *
 * Sending is refused on a deleted link, and on a link without TRILL data
 * that is not in forwarding state.  Otherwise the VLAN must be set in both
 * the link's allowed-VLAN set and its appointed-forwarder set.
 */
static boolean_t
bridge_can_send(bridge_link_t *blp, uint16_t vlanid)
{
	ASSERT(vlanid != VLAN_ID_NONE);

	if ((blp->bl_flags & BLF_DELETED) ||
	    (blp->bl_trilldata == NULL && blp->bl_state != BLS_FORWARDING))
		return (B_FALSE);

	return (BRIDGE_VLAN_ISSET(blp, vlanid) && BRIDGE_AF_ISSET(blp, vlanid));
}
/*
 * This function scans the bridge forwarding tables in order to forward a given
 * packet.  If the packet either doesn't need forwarding (the current link is
 * correct) or the current link needs a copy as well, then the packet is
 * returned to the caller.
 *
 * If a packet has been decapsulated from TRILL, then it must *NOT* reenter a
 * TRILL tunnel.  If the destination points there, then drop instead.
 *
 * Arguments:
 *   blp	link the packet arrived on (or is being transmitted on)
 *   hdr_info	decoded MAC header for mp
 *   mp		the packet; ownership passes to this function
 *   vlanid	validated VLAN ID for the packet
 *   tci	tag control information, or 0xFFFF when the packet is untagged
 *   from_trill	B_TRUE when the packet was just decapsulated by TRILL
 *   is_xmit	B_TRUE on the transmit path, B_FALSE on the receive path
 *
 * Returns the packet if the caller must continue processing it on blp, or
 * NULL if it was consumed (forwarded elsewhere or dropped).
 */
static mblk_t *
bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
    uint16_t vlanid, uint16_t tci, boolean_t from_trill, boolean_t is_xmit)
{
	mblk_t *mpsend, *mpcopy;
	bridge_inst_t *bip = blp->bl_inst;
	bridge_link_t *blpsend, *blpnext;
	bridge_fwd_t *bfp;
	uint_t i;
	boolean_t selfseen = B_FALSE;
	void *tdp;
	const uint8_t *daddr = hdr_info->mhi_daddr;

	/*
	 * Check for the IEEE "reserved" multicast addresses.  Messages sent to
	 * these addresses are used for link-local control (STP and pause), and
	 * are never forwarded or redirected.
	 */
	if (daddr[0] == 1 && daddr[1] == 0x80 && daddr[2] == 0xc2 &&
	    daddr[3] == 0 && daddr[4] == 0 && (daddr[5] & 0xf0) == 0) {
		if (from_trill) {
			freemsg(mp);
			mp = NULL;
		}
		return (mp);
	}

	if ((bfp = fwd_find(bip, daddr, vlanid)) != NULL) {

		/*
		 * If trill indicates a destination for this node, then it's
		 * clearly not intended for local delivery.  We must tell TRILL
		 * to encapsulate, as long as we didn't just decapsulate it.
		 */
		if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE) {
			/*
			 * Error case: can't reencapsulate if the protocols are
			 * working correctly.
			 */
			if (from_trill) {
				freemsg(mp);
				return (NULL);
			}
			mutex_enter(&blp->bl_trilllock);
			if ((tdp = blp->bl_trilldata) != NULL) {
				blp->bl_trillthreads++;
				mutex_exit(&blp->bl_trilllock);
				update_header(mp, hdr_info, B_FALSE);
				if (is_xmit)
					mp = mac_fix_cksum(mp);
				/* all trill data frames have Inner.VLAN */
				mp = reform_vlan_header(mp, vlanid, tci, 0);
				if (mp == NULL) {
					/*
					 * Dropped.  Fall through rather than
					 * returning here so that the thread
					 * count taken above is released;
					 * otherwise bridge_trill_lnunref()
					 * would wait on it forever.
					 */
					KIINCR(bki_drops);
				} else {
					trill_encap_fn(tdp, blp, hdr_info, mp,
					    bfp->bf_trill_nick);
				}
				mutex_enter(&blp->bl_trilllock);
				if (--blp->bl_trillthreads == 0 &&
				    blp->bl_trilldata == NULL)
					cv_broadcast(&blp->bl_trillwait);
			}
			mutex_exit(&blp->bl_trilllock);

			/* if TRILL has been disabled, then kill this stray */
			if (tdp == NULL) {
				freemsg(mp);
				fwd_delete(bfp);
			}
			fwd_unref(bfp);
			return (NULL);
		}

		/* find first link we can send on */
		for (i = 0; i < bfp->bf_nlinks; i++) {
			blpsend = bfp->bf_links[i];
			if (blpsend == blp)
				selfseen = B_TRUE;
			else if (bridge_can_send(blpsend, vlanid))
				break;
		}

		while (i < bfp->bf_nlinks) {
			blpsend = bfp->bf_links[i];
			/* Look ahead so we know whether this is the last one */
			for (i++; i < bfp->bf_nlinks; i++) {
				blpnext = bfp->bf_links[i];
				if (blpnext == blp)
					selfseen = B_TRUE;
				else if (bridge_can_send(blpnext, vlanid))
					break;
			}
			/*
			 * The original can be given away only on the last
			 * destination, and only if the caller's own link does
			 * not need it.
			 */
			if (i == bfp->bf_nlinks && !selfseen) {
				mpsend = mp;
				mp = NULL;
			} else {
				mpsend = copymsg(mp);
			}

			if (!from_trill && is_xmit)
				mpsend = mac_fix_cksum(mpsend);

			mpsend = reform_vlan_header(mpsend, vlanid, tci,
			    blpsend->bl_pvid);
			if (mpsend == NULL) {
				KIINCR(bki_drops);
				continue;
			}

			KIINCR(bki_forwards);
			/*
			 * No need to bump up the link reference count, as
			 * the forwarding entry itself holds a reference to
			 * the link.
			 */
			if (bfp->bf_flags & BFF_LOCALADDR) {
				mac_rx_common(blpsend->bl_mh, NULL, mpsend);
			} else {
				KLPINCR(blpsend, bkl_xmit);
				MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
				    mpsend);
				freemsg(mpsend);
			}
		}
		/*
		 * Handle a special case: if we're transmitting to the original
		 * link, then check whether the localaddr flag is set.  If it
		 * is, then receive instead.  This doesn't happen with ordinary
		 * bridging, but does happen often with TRILL decapsulation.
		 */
		if (mp != NULL && is_xmit && (bfp->bf_flags & BFF_LOCALADDR)) {
			mac_rx_common(blp->bl_mh, NULL, mp);
			mp = NULL;
		}
		fwd_unref(bfp);
	} else {
		/*
		 * TRILL has two cases to handle.  If the packet is off the
		 * wire (not from TRILL), then we need to send up into the
		 * TRILL module to have the distribution tree computed.  If the
		 * packet is from TRILL (decapsulated), then we're part of the
		 * distribution tree, and we need to copy the packet on member
		 * interfaces.
		 *
		 * Thus, the from TRILL case is identical to the STP case.
		 */
		if (!from_trill && blp->bl_trilldata != NULL) {
			mutex_enter(&blp->bl_trilllock);
			if ((tdp = blp->bl_trilldata) != NULL) {
				blp->bl_trillthreads++;
				mutex_exit(&blp->bl_trilllock);
				if ((mpsend = copymsg(mp)) != NULL) {
					update_header(mpsend,
					    hdr_info, B_FALSE);
					/*
					 * all trill data frames have
					 * Inner.VLAN
					 */
					mpsend = reform_vlan_header(mpsend,
					    vlanid, tci, 0);
					if (mpsend == NULL) {
						KIINCR(bki_drops);
					} else {
						trill_encap_fn(tdp, blp,
						    hdr_info, mpsend,
						    RBRIDGE_NICKNAME_NONE);
					}
				}
				mutex_enter(&blp->bl_trilllock);
				if (--blp->bl_trillthreads == 0 &&
				    blp->bl_trilldata == NULL)
					cv_broadcast(&blp->bl_trillwait);
			}
			mutex_exit(&blp->bl_trilllock);
		}

		/*
		 * This is an unknown destination, so flood.  Each candidate
		 * link is held with a reference while the list lock is
		 * dropped for the actual transmit.
		 */
		rw_enter(&bip->bi_rwlock, RW_READER);
		for (blpnext = list_head(&bip->bi_links); blpnext != NULL;
		    blpnext = list_next(&bip->bi_links, blpnext)) {
			if (blpnext == blp)
				selfseen = B_TRUE;
			else if (bridge_can_send(blpnext, vlanid))
				break;
		}
		if (blpnext != NULL)
			atomic_inc_uint(&blpnext->bl_refs);
		rw_exit(&bip->bi_rwlock);
		while ((blpsend = blpnext) != NULL) {
			rw_enter(&bip->bi_rwlock, RW_READER);
			for (blpnext = list_next(&bip->bi_links, blpsend);
			    blpnext != NULL;
			    blpnext = list_next(&bip->bi_links, blpnext)) {
				if (blpnext == blp)
					selfseen = B_TRUE;
				else if (bridge_can_send(blpnext, vlanid))
					break;
			}
			if (blpnext != NULL)
				atomic_inc_uint(&blpnext->bl_refs);
			rw_exit(&bip->bi_rwlock);
			if (blpnext == NULL && !selfseen) {
				mpsend = mp;
				mp = NULL;
			} else {
				mpsend = copymsg(mp);
			}

			if (!from_trill && is_xmit)
				mpsend = mac_fix_cksum(mpsend);

			mpsend = reform_vlan_header(mpsend, vlanid, tci,
			    blpsend->bl_pvid);
			if (mpsend == NULL) {
				KIINCR(bki_drops);
				/*
				 * Release the hold taken on this link when it
				 * was selected; skipping this would leak the
				 * reference and keep the link from ever being
				 * freed.
				 */
				link_unref(blpsend);
				continue;
			}

			if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
				KIINCR(bki_unknown);
			else
				KIINCR(bki_mbcast);
			KLPINCR(blpsend, bkl_xmit);
			if ((mpcopy = copymsg(mpsend)) != NULL)
				mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
			MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
			freemsg(mpsend);
			link_unref(blpsend);
		}
	}

	/*
	 * At this point, if mp is non-NULL, it means that the caller needs to
	 * continue on the selected link.
	 */
	return (mp);
}
/*
 * Extract and validate the VLAN information for a given packet.  This checks
 * conformance with the rules for use of the PVID on the link, and for the
 * allowed (configured) VLAN set.
 *
 * On success the effective VLAN ID is stored through vlanidp and the tag
 * control information through tcip (0xFFFF when the packet carries no tag).
 *
 * Returns B_TRUE if the packet passes, B_FALSE if it fails.
 */
static boolean_t
bridge_get_vlan(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
    uint16_t *vlanidp, uint16_t *tcip)
{
	uint16_t tci, vlanid;

	if (hdr_info->mhi_bindsap != ETHERTYPE_VLAN) {
		/* No tag at all: behaves like a priority-only tag below. */
		tci = 0xFFFF;
		vlanid = VLAN_ID_NONE;
	} else {
		ptrdiff_t tpos = offsetof(struct ether_vlan_header, ether_tci);
		ptrdiff_t mlen;

		/*
		 * Extract the VLAN ID information, regardless of alignment,
		 * and without a pullup.  This isn't attractive, but we do this
		 * to avoid having to deal with the pointers stashed in
		 * hdr_info moving around or having the caller deal with a new
		 * mblk_t pointer.
		 *
		 * First find the mblk that holds the high byte of the TCI.
		 */
		for (; mp != NULL; mp = mp->b_cont) {
			mlen = MBLKL(mp);
			if (mlen > tpos && mlen > 0)
				break;
			tpos -= mlen;
		}
		if (mp == NULL)
			return (B_FALSE);
		tci = mp->b_rptr[tpos] << 8;

		/* The low byte may live in a following, non-empty mblk. */
		tpos++;
		if (tpos >= mlen) {
			do {
				mp = mp->b_cont;
			} while (mp != NULL && MBLKL(mp) == 0);
			if (mp == NULL)
				return (B_FALSE);
			tpos = 0;
		}
		tci |= mp->b_rptr[tpos];

		vlanid = VLAN_ID(tci);
		if (VLAN_CFI(tci) != ETHER_CFI || vlanid > VLAN_ID_MAX)
			return (B_FALSE);
	}

	if (vlanid == VLAN_ID_NONE || vlanid == blp->bl_pvid) {
		/*
		 * The packet belongs to the port VLAN.  If PVID is set to
		 * zero, then untagged traffic is not supported here.  Do not
		 * learn or forward.
		 */
		if ((vlanid = blp->bl_pvid) == VLAN_ID_NONE)
			return (B_FALSE);
	} else if (!BRIDGE_VLAN_ISSET(blp, vlanid)) {
		/* Explicit VLAN that is not configured on this link. */
		return (B_FALSE);
	}

	*tcip = tci;
	*vlanidp = vlanid;
	return (B_TRUE);
}
/*
 * Handle MAC notifications for a bridge link; arg is the bridge_link_t.
 *
 * Two notifications are acted upon: a unicast address change is passed to
 * bridge_new_unicst(), and a maximum SDU change is reconciled against the
 * SDU of the bridge's observability node.  All other notification types are
 * ignored.
 */
static void
bridge_notify_cb(void *arg, mac_notify_type_t note_type)
{
	bridge_link_t *blp = arg;
	bridge_inst_t *bip;
	bridge_mac_t *bmp;
	uint_t maxsdu;
	boolean_t notify = B_FALSE;
	mblk_t *mlist = NULL;

	if (note_type == MAC_NOTE_UNICST) {
		bridge_new_unicst(blp);
		return;
	}
	if (note_type != MAC_NOTE_SDU_SIZE)
		return;

	bip = blp->bl_inst;
	bmp = bip->bi_mac;
	mac_sdu_get(blp->bl_mh, NULL, &maxsdu);

	rw_enter(&bip->bi_rwlock, RW_READER);

	/* If this is the only link, the bridge simply adopts the new SDU. */
	if (list_prev(&bip->bi_links, blp) == NULL &&
	    list_next(&bip->bi_links, blp) == NULL) {
		notify = (maxsdu != bmp->bm_maxsdu);
		bmp->bm_maxsdu = maxsdu;
	}
	blp->bl_maxsdu = maxsdu;

	/* Otherwise a mismatch with the bridge marks the link as failed. */
	if (maxsdu != bmp->bm_maxsdu)
		link_sdu_fail(blp, B_TRUE, &mlist);
	else if (notify)
		(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);

	rw_exit(&bip->bi_rwlock);

	/* Deliver any queued messages only after the lock is dropped. */
	send_up_messages(bip, mlist);
}
/*
* This is called by the MAC layer. As with the transmit side, we're right in
* the data path for all I/O on this port, so if we don't need to forward this
* packet anywhere, we have to send it upwards via mac_rx_common.
*
* mh is really the bridge_link_t for the receiving link (see the cast below),
* rsrc is passed through untouched to mac_rx_common and the TRILL receive
* callback, and mpnext is a chain of packets linked through b_next.
*
* Processing happens in two passes. When TRILL is bound to the link, the
* first pass pulls TRILL, raw IS-IS and bridge-group (BPDU) frames out of the
* chain and hands them to TRILL. The second pass learns from and forwards
* whatever remains, packet by packet.
*/
static void
bridge_recv_cb(mac_handle_t mh, mac_resource_handle_t rsrc, mblk_t *mpnext)
{
mblk_t *mp, *mpcopy;
bridge_link_t *blp = (bridge_link_t *)mh;
bridge_inst_t *bip = blp->bl_inst;
bridge_mac_t *bmp = bip->bi_mac;
mac_header_info_t hdr_info;
uint16_t vlanid, tci;
boolean_t trillmode = B_FALSE;
KIINCR(bki_recv);
KLINCR(bkl_recv);
/*
* Regardless of state, check for inbound TRILL packets when TRILL is
* active. These are pulled out of band and sent for TRILL handling.
*/
if (blp->bl_trilldata != NULL) {
void *tdp;
mblk_t *newhead;
mblk_t *tail = NULL;
/*
* The unlocked test above is only a hint; re-check under the lock and
* count ourselves in bl_trillthreads so that bridge_trill_lnunref waits
* for us before TRILL goes away.
*/
mutex_enter(&blp->bl_trilllock);
if ((tdp = blp->bl_trilldata) != NULL) {
blp->bl_trillthreads++;
mutex_exit(&blp->bl_trilllock);
trillmode = B_TRUE;
/*
* newhead tracks the head of the residual (non-TRILL) chain and tail
* is the last packet that was kept on it.
*/
newhead = mpnext;
while ((mp = mpnext) != NULL) {
boolean_t raw_isis, bridge_group;
mpnext = mp->b_next;
/*
* If the header isn't readable, then leave on
* the list and continue.
*/
if (mac_header_info(blp->bl_mh, mp,
&hdr_info) != 0) {
tail = mp;
continue;
}
/*
* The TRILL document specifies that, on
* Ethernet alone, IS-IS packets arrive with
* LLC rather than Ethertype, and using a
* specific destination address. We must check
* for that here. Also, we need to give BPDUs
* to TRILL for processing.
*/
raw_isis = bridge_group = B_FALSE;
if (hdr_info.mhi_dsttype ==
MAC_ADDRTYPE_MULTICAST) {
if (memcmp(hdr_info.mhi_daddr,
all_isis_rbridges, ETHERADDRL) == 0)
raw_isis = B_TRUE;
else if (memcmp(hdr_info.mhi_daddr,
bridge_group_address, ETHERADDRL) ==
0)
bridge_group = B_TRUE;
}
/*
* Not for TRILL: neither a special destination nor a TRILL Ethertype
* (directly or inside a VLAN tag). Keep it on the residual chain.
*/
if (!raw_isis && !bridge_group &&
hdr_info.mhi_bindsap != ETHERTYPE_TRILL &&
(hdr_info.mhi_bindsap != ETHERTYPE_VLAN ||
/* LINTED: alignment */
((struct ether_vlan_header *)mp->b_rptr)->
ether_type != htons(ETHERTYPE_TRILL))) {
tail = mp;
continue;
}
/*
* We've got TRILL input. Remove from the list
* and send up through the TRILL module. (Send
* a copy through promiscuous receive just to
* support snooping on TRILL. Order isn't
* preserved strictly, but that doesn't matter
* here.)
*/
if (tail != NULL)
tail->b_next = mpnext;
mp->b_next = NULL;
if (mp == newhead)
newhead = mpnext;
mac_trill_snoop(blp->bl_mh, mp);
update_header(mp, &hdr_info, B_TRUE);
/*
* On raw IS-IS and BPDU frames, we have to
* make sure that the length is trimmed
* properly. We use origsap in order to cope
* with jumbograms for IS-IS. (Regular mac
* can't.)
*/
if (raw_isis || bridge_group) {
size_t msglen = msgdsize(mp);
/* Trim trailing excess; drop frames shorter than they claim to be. */
if (msglen > hdr_info.mhi_origsap) {
(void) adjmsg(mp,
hdr_info.mhi_origsap -
msglen);
} else if (msglen <
hdr_info.mhi_origsap) {
freemsg(mp);
continue;
}
}
trill_recv_fn(tdp, blp, rsrc, mp, &hdr_info);
}
/* Continue below with only the packets TRILL did not take. */
mpnext = newhead;
mutex_enter(&blp->bl_trilllock);
if (--blp->bl_trillthreads == 0 &&
blp->bl_trilldata == NULL)
cv_broadcast(&blp->bl_trillwait);
}
/*
* The lock is held on both paths here: either it was never dropped
* (TRILL data vanished) or it was re-acquired just above.
*/
mutex_exit(&blp->bl_trilllock);
if (mpnext == NULL)
return;
}
/*
* If this is a TRILL RBridge, then just check whether this link is
* used at all for forwarding. If not, then we're done.
*/
if (trillmode) {
if (!(blp->bl_flags & BLF_TRILLACTIVE) ||
(blp->bl_flags & BLF_SDUFAIL)) {
mac_rx_common(blp->bl_mh, rsrc, mpnext);
return;
}
} else {
/*
* For regular (STP) bridges, if we're in blocking or listening
* state, then do nothing. We don't learn or forward until
* told to do so.
*/
if (blp->bl_state == BLS_BLOCKLISTEN) {
mac_rx_common(blp->bl_mh, rsrc, mpnext);
return;
}
}
/*
* Send a copy of the message chain up to the observability node users.
* For TRILL, we must obey the VLAN AF rules, so we go packet-by-
* packet.
*/
if (!trillmode && blp->bl_state == BLS_FORWARDING &&
(bmp->bm_flags & BMF_STARTED) &&
(mp = copymsgchain(mpnext)) != NULL) {
mac_rx(bmp->bm_mh, NULL, mp);
}
/*
* We must be in learning or forwarding state, or using TRILL on a link
* with one or more VLANs active. For each packet in the list, process
* the source address, and then attempt to forward.
*/
while ((mp = mpnext) != NULL) {
mpnext = mp->b_next;
mp->b_next = NULL;
/*
* If we can't decode the header or if the header specifies a
* multicast source address (impossible!), then don't bother
* learning or forwarding, but go ahead and forward up the
* stack for subsequent processing.
*/
if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
(hdr_info.mhi_saddr[0] & 1) != 0) {
KIINCR(bki_drops);
KLINCR(bkl_drops);
mac_rx_common(blp->bl_mh, rsrc, mp);
continue;
}
/*
* Extract and validate the VLAN ID for this packet.
*/
if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
!BRIDGE_AF_ISSET(blp, vlanid)) {
mac_rx_common(blp->bl_mh, rsrc, mp);
continue;
}
if (trillmode) {
/*
* Special test required by TRILL document: must
* discard frames with outer address set to ESADI.
*/
if (memcmp(hdr_info.mhi_daddr, all_esadi_rbridges,
ETHERADDRL) == 0) {
mac_rx_common(blp->bl_mh, rsrc, mp);
continue;
}
/*
* If we're in TRILL mode, then the call above to get
* the VLAN ID has also checked that we're the
* appointed forwarder, so report that we're handling
* this packet to any observability node users.
*/
if ((bmp->bm_flags & BMF_STARTED) &&
(mpcopy = copymsg(mp)) != NULL)
mac_rx(bmp->bm_mh, NULL, mpcopy);
}
/*
* First process the source address and learn from it. For
* TRILL, we learn only if we're the appointed forwarder.
*/
bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
vlanid);
/*
* Now check whether we're forwarding and look up the
* destination. If we can forward, do so.
*/
if (trillmode || blp->bl_state == BLS_FORWARDING) {
mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
B_FALSE, B_FALSE);
}
/* Whatever bridge_forward did not consume goes up the local stack. */
if (mp != NULL)
mac_rx_common(blp->bl_mh, rsrc, mp);
}
}
/*
 * Transmit-side hook: handles a chain of outbound packets (linked through
 * b_next) for the link identified by mh, which is really the bridge_link_t.
 *
 * Each packet is validated, learned from, offered to bridge_forward(), and
 * then -- if still ours -- transmitted on the local ring rh.  The return
 * value is the chain of packets that could not be sent, or NULL when
 * everything was consumed.
 */
/* ARGSUSED */
static mblk_t *
bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
{
	bridge_link_t *blp = (bridge_link_t *)mh;
	bridge_inst_t *bip = blp->bl_inst;
	bridge_mac_t *bmp = bip->bi_mac;
	mac_header_info_t hdr_info;
	uint16_t vlanid, tci;
	mblk_t *mp, *mpcopy;
	boolean_t trillmode = (blp->bl_trilldata != NULL);
	boolean_t bypass;

	/*
	 * If we're using STP and we're in blocking or listening state, or if
	 * we're using TRILL and no VLANs are active, then behave as though the
	 * bridge isn't here at all, and send on the local link alone.
	 */
	if (trillmode) {
		bypass = !(blp->bl_flags & BLF_TRILLACTIVE) ||
		    (blp->bl_flags & BLF_SDUFAIL);
	} else {
		bypass = (blp->bl_state == BLS_BLOCKLISTEN);
	}
	if (bypass) {
		KIINCR(bki_sent);
		KLINCR(bkl_xmit);
		MAC_RING_TX(blp->bl_mh, rh, mpnext, mp);
		return (mp);
	}

	/*
	 * Send a copy of the message up to the observability node users.
	 * TRILL needs to check on a packet-by-packet basis.
	 */
	if (!trillmode && blp->bl_state == BLS_FORWARDING &&
	    (bmp->bm_flags & BMF_STARTED)) {
		mp = copymsgchain(mpnext);
		if (mp != NULL)
			mac_rx(bmp->bm_mh, NULL, mp);
	}

	while ((mp = mpnext) != NULL) {
		mpnext = mp->b_next;
		mp->b_next = NULL;

		/*
		 * Drop anything whose header cannot be decoded, or whose VLAN
		 * fails validation or is not one we are forwarder for.
		 */
		if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
		    !bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
		    !BRIDGE_AF_ISSET(blp, vlanid)) {
			freemsg(mp);
			continue;
		}

		/*
		 * If we're using TRILL, then we've now validated that we're
		 * the forwarder for this VLAN, so go ahead and let
		 * observability node users know about the packet.
		 */
		if (trillmode && (bmp->bm_flags & BMF_STARTED)) {
			mpcopy = copymsg(mp);
			if (mpcopy != NULL)
				mac_rx(bmp->bm_mh, NULL, mpcopy);
		}

		/*
		 * We have to learn from our own transmitted packets, because
		 * there may be a Solaris DLPI raw sender (who can specify his
		 * own source address) using promiscuous mode for receive.  The
		 * mac layer information won't (and can't) tell us everything
		 * we need to know.
		 */
		bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
		    vlanid);

		/* attempt forwarding */
		if (trillmode || blp->bl_state == BLS_FORWARDING) {
			mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
			    B_FALSE, B_TRUE);
		}

		/* Forwarding consumed the packet; nothing left for us. */
		if (mp == NULL)
			continue;

		MAC_RING_TX(blp->bl_mh, rh, mp, mp);
		if (mp == NULL) {
			KIINCR(bki_sent);
			KLINCR(bkl_xmit);
			continue;
		}

		/*
		 * If we get stuck, then stop.  Don't let the user's output
		 * packets get out of order.  (More importantly: don't try to
		 * bridge the same packet multiple times if flow control is
		 * asserted.)
		 */
		mp->b_next = mpnext;
		break;
	}
	return (mp);
}
/*
* This is called by TRILL when it decapsulates a packet, and we must forward
* locally. On failure, we just drop.
*
* Note that the ingress_nick reported by TRILL must not represent this local
* node.
*/
void
bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
{
	mac_header_info_t hdr_info;
	uint16_t vlanid, tci;
	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */
	mblk_t *mpcopy;
	struct ether_vlan_header *evhp;

	if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
		freemsg(mp);
		return;
	}

	/* Inner VLAN headers are required in TRILL data packets */
	if (hdr_info.mhi_bindsap != ETHERTYPE_VLAN) {
		DTRACE_PROBE3(bridge__trill__decaps__novlan, bridge_link_t *,
		    blp, mblk_t *, mp, uint16_t, ingress_nick);
		freemsg(mp);
		return;
	}

	/* Extract VLAN ID for this packet. */
	/* LINTED: alignment */
	evhp = (struct ether_vlan_header *)mp->b_rptr;
	tci = ntohs(evhp->ether_tci);
	vlanid = VLAN_ID(tci);

	/* Learn the location of this sender in the RBridge network */
	bridge_learn(blp, hdr_info.mhi_saddr, ingress_nick, vlanid);

	/* attempt forwarding */
	mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci, B_TRUE, B_TRUE);
	if (mp == NULL)
		return;

	/* The packet is (also) meant for the link TRILL handed it to us on. */
	if (bridge_can_send(blp, vlanid)) {
		/* Deliver a copy locally as well */
		mpcopy = copymsg(mp);
		if (mpcopy != NULL)
			mac_rx_common(blp->bl_mh, NULL, mpcopy);
		MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
	}

	/* Anything still in hand was not sent; on failure, we just drop. */
	if (mp != NULL) {
		freemsg(mp);
	} else {
		KIINCR(bki_sent);
		KLINCR(bkl_xmit);
	}
}
/*
 * Used by TRILL _only_ to transmit TRILL-encapsulated packets.  The message
 * goes out on the one underlying link; no bridging is performed.
 *
 * Returns whatever MAC_RING_TX leaves in mp.  When that is NULL the sent and
 * xmit counters are incremented.
 */
mblk_t *
bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
{
	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */

	mac_trill_snoop(blp->bl_mh, mp);
	MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
	if (mp != NULL)
		return (mp);

	KIINCR(bki_sent);
	KLINCR(bkl_xmit);
	return (NULL);
}
/*
 * Install a new "appointed forwarder" array on this link.  TRILL controls
 * forwarding per VLAN; BLF_TRILLACTIVE is an optimization for the forwarder
 * and ends up set exactly when at least one byte of the new array is
 * non-zero.
 *
 * arr must supply BRIDGE_VLAN_ARR_SIZE bytes.
 */
void
bridge_trill_setvlans(bridge_link_t *blp, const uint8_t *arr)
{
	uint_t active = 0;
	int idx = 0;

	while (idx < BRIDGE_VLAN_ARR_SIZE) {
		blp->bl_afs[idx] = arr[idx];
		if (arr[idx] != 0)
			active = BLF_TRILLACTIVE;
		idx++;
	}

	/* One store, so the flag is never observed in a half-updated state. */
	blp->bl_flags = (blp->bl_flags & ~BLF_TRILLACTIVE) | active;
}
/*
 * Flush forwarding entries on behalf of TRILL.
 *
 * Entries flagged BFF_LOCALADDR are never touched.  With dotrill set, every
 * entry that carries a TRILL nickname is removed, whichever link it is on.
 * Otherwise only entries without a nickname that list blp among their links
 * are removed.  The vlan argument is currently unused.
 *
 * Matching entries are moved to a private tree while bi_rwlock is held as
 * writer; their tree references are dropped only after the lock is released.
 */
void
bridge_trill_flush(bridge_link_t *blp, uint16_t vlan, boolean_t dotrill)
{
	bridge_inst_t *bip = blp->bl_inst;
	bridge_fwd_t *bfp, *following;
	avl_tree_t doomed;
	int slot;

	_NOTE(ARGUNUSED(vlan));

	avl_create(&doomed, fwd_compare, sizeof (bridge_fwd_t),
	    offsetof(bridge_fwd_t, bf_node));

	rw_enter(&bip->bi_rwlock, RW_WRITER);
	for (bfp = avl_first(&bip->bi_fwd); bfp != NULL; bfp = following) {
		/* Fetch the successor before bfp can leave the tree. */
		following = AVL_NEXT(&bip->bi_fwd, bfp);

		if (bfp->bf_flags & BFF_LOCALADDR)
			continue;

		if (dotrill) {
			/* port doesn't matter if we're flushing TRILL */
			if (bfp->bf_trill_nick == RBRIDGE_NICKNAME_NONE)
				continue;
		} else {
			if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE)
				continue;

			/* Only entries that reference this link qualify. */
			slot = 0;
			while (slot < bfp->bf_nlinks &&
			    bfp->bf_links[slot] != blp)
				slot++;
			if (slot >= bfp->bf_nlinks)
				continue;
		}

		ASSERT(bfp->bf_flags & BFF_INTREE);
		avl_remove(&bip->bi_fwd, bfp);
		bfp->bf_flags &= ~BFF_INTREE;
		avl_add(&doomed, bfp);
	}
	rw_exit(&bip->bi_rwlock);

	/* Release the scavenged entries now that the lock is dropped. */
	for (bfp = avl_first(&doomed); bfp != NULL; bfp = following) {
		following = AVL_NEXT(&doomed, bfp);
		avl_remove(&doomed, bfp);
		fwd_unref(bfp);
	}
	avl_destroy(&doomed);
}
/*
 * Callback through which the mac module takes (hold is true) or drops (hold
 * is false) a reference on a bridge link.  The mac module holds
 * mi_bridge_lock when it calls this, so the link cannot be in the middle of
 * entering or leaving a bridge.
 *
 * mh is really the bridge_link_t that was passed to mac_bridge_set().
 */
static void
bridge_ref_cb(mac_handle_t mh, boolean_t hold)
{
	bridge_link_t *blp = (bridge_link_t *)mh;

	if (!hold) {
		link_unref(blp);
		return;
	}
	atomic_inc_uint(&blp->bl_refs);
}
/*
 * Handle link state changes reported by the mac layer.  This acts as a filter
 * for link state changes: if a link is reporting down, but there are other
 * links still up on the bridge, then the state is changed to "up."  When the
 * last link goes down, all are marked down, and when the first link goes up,
 * all are marked up.  (Recursion is avoided by the use of the "redo" function.)
 *
 * We treat unknown as equivalent to "up."
 *
 * mh is really the bridge_link_t registered for the link; newls is the state
 * the underlying mac reported.  The return value is the state the mac layer
 * should advertise for this link.
 */
static link_state_t
bridge_ls_cb(mac_handle_t mh, link_state_t newls)
{
	bridge_link_t *blp = (bridge_link_t *)mh;
	bridge_link_t *blcmp;
	bridge_inst_t *bip;
	bridge_mac_t *bmp;

	/*
	 * Fast path: a transition between two non-down states, or any change
	 * on a link that is deleted or has failed the SDU check, cannot alter
	 * the bridge-wide state.  Just record it and pass it through.
	 */
	if ((newls != LINK_STATE_DOWN &&
	    blp->bl_linkstate != LINK_STATE_DOWN) ||
	    (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL))) {
		blp->bl_linkstate = newls;
		return (newls);
	}

	/*
	 * Scan first to see if there are any other non-down links.  If there
	 * are, then we're done.  Otherwise, if all others are down, then the
	 * state of this link is the state of the bridge.
	 *
	 * The link list is only walked, never modified, so the lock is taken
	 * as reader.  bridge_rem_link() performs the same walk and the same
	 * mac_link_redo() fan-out as reader.
	 */
	bip = blp->bl_inst;
	rw_enter(&bip->bi_rwlock, RW_READER);
	for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
	    blcmp = list_next(&bip->bi_links, blcmp)) {
		if (blcmp != blp &&
		    !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
		    blcmp->bl_linkstate != LINK_STATE_DOWN)
			break;
	}

	if (blcmp != NULL) {
		/*
		 * If there are other links that are considered up, then tell
		 * the caller that the link is actually still up, regardless of
		 * this link's underlying state.
		 */
		blp->bl_linkstate = newls;
		newls = LINK_STATE_UP;
	} else if (blp->bl_linkstate != newls) {
		/*
		 * If we've found no other 'up' links, and this link has
		 * changed state, then report the new state of the bridge to
		 * all other clients.
		 */
		blp->bl_linkstate = newls;
		for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
		    blcmp = list_next(&bip->bi_links, blcmp)) {
			if (blcmp != blp && !(blcmp->bl_flags & BLF_DELETED))
				mac_link_redo(blcmp->bl_mh, newls);
		}

		/* The bridge's own mac reports only "down" or "up". */
		bmp = bip->bi_mac;
		if ((bmp->bm_linkstate = newls) != LINK_STATE_DOWN)
			bmp->bm_linkstate = LINK_STATE_UP;
		mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
	}
	rw_exit(&bip->bi_rwlock);
	return (newls);
}
/*
 * Taskq handler for BRIOC_ADDLINK: attach a datalink to the bridge instance
 * owned by the requesting stream.
 *
 * arg is the M_IOCTL message.  bridge_ioctl() stored the requesting
 * bridge_stream_t pointer in b_next and took a reference on that stream.  The
 * ioctl payload (b_cont) starts with the datalink_id_t of the link to add;
 * any bytes after it are used as the link name.
 *
 * The ioctl is always answered here: miocack() on success, miocnak() with an
 * errno on failure (EBUSY if some bridge already has the link, ENOTSUP for
 * non-Ethernet media, ENOMEM on allocation failure, or whatever the mac layer
 * returned).  The stream reference is dropped in either case.
 */
static void
bridge_add_link(void *arg)
{
	mblk_t *mp = arg;
	bridge_stream_t *bsp;
	bridge_inst_t *bip, *bipt;
	bridge_mac_t *bmp;
	datalink_id_t linkid;
	int err;
	mac_handle_t mh;
	uint_t maxsdu;
	bridge_link_t *blp = NULL, *blpt;
	const mac_info_t *mip;
	boolean_t macopen = B_FALSE;
	char linkname[MAXLINKNAMELEN];
	char kstatname[KSTAT_STRLEN];
	int i;
	link_state_t linkstate;
	mblk_t *mlist;

	/* Recover the stream pointer that bridge_ioctl() stashed in b_next. */
	bsp = (bridge_stream_t *)mp->b_next;
	mp->b_next = NULL;
	bip = bsp->bs_inst;
	/* LINTED: alignment */
	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;

	/*
	 * First make sure that there is no other bridge that has this link.
	 * We don't want to overlap operations from two bridges; the MAC layer
	 * supports only one bridge on a given MAC at a time.
	 *
	 * We rely on the fact that there's just one taskq thread for the
	 * bridging module: once we've checked for a duplicate, we can drop the
	 * lock, because no other thread could possibly be adding another link
	 * until we're done.
	 */
	mutex_enter(&inst_lock);
	for (bipt = list_head(&inst_list); bipt != NULL;
	    bipt = list_next(&inst_list, bipt)) {
		rw_enter(&bipt->bi_rwlock, RW_READER);
		for (blpt = list_head(&bipt->bi_links); blpt != NULL;
		    blpt = list_next(&bipt->bi_links, blpt)) {
			if (linkid == blpt->bl_linkid)
				break;
		}
		rw_exit(&bipt->bi_rwlock);
		if (blpt != NULL)
			break;
	}
	mutex_exit(&inst_lock);
	if (bipt != NULL) {
		err = EBUSY;
		goto fail;
	}

	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
		goto fail;
	macopen = B_TRUE;

	/* we bridge only Ethernet */
	mip = mac_info(mh);
	if (mip->mi_media != DL_ETHER) {
		err = ENOTSUP;
		goto fail;
	}

	/*
	 * Get the current maximum SDU on this interface.  The first link on
	 * an empty bridge sets the SDU for the bridge.  A later link with a
	 * different SDU is still added, but is reported through
	 * link_sdu_fail() once it is on the list (see below).
	 */
	mac_sdu_get(mh, NULL, &maxsdu);
	bmp = bip->bi_mac;
	if (list_is_empty(&bip->bi_links)) {
		bmp->bm_maxsdu = maxsdu;
		(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
	}

	/* figure the kstat name; also used as the mac client name */
	i = MBLKL(mp->b_cont) - sizeof (datalink_id_t);
	if (i < 0 || i >= MAXLINKNAMELEN)
		i = MAXLINKNAMELEN - 1;
	bcopy(mp->b_cont->b_rptr + sizeof (datalink_id_t), linkname, i);
	linkname[i] = '\0';
	(void) snprintf(kstatname, sizeof (kstatname), "%s-%s", bip->bi_name,
	    linkname);

	if ((blp = kmem_zalloc(sizeof (*blp), KM_NOSLEEP)) == NULL) {
		err = ENOMEM;
		goto fail;
	}
	blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
	if (blp->bl_lfailmp == NULL) {
		/*
		 * The link was never set up, so free it directly and clear
		 * the pointer.  Otherwise the failure path would pass freed
		 * memory to link_shutdown()/link_free(), and the mac handle
		 * opened above would never be closed.
		 */
		kmem_free(blp, sizeof (*blp));
		blp = NULL;
		err = ENOMEM;
		goto fail;
	}

	/* From here on, failures are unwound via the link itself. */
	atomic_inc_uint(&bip->bi_refs);
	blp->bl_inst = bip;
	blp->bl_mh = mh;
	blp->bl_linkid = linkid;
	blp->bl_maxsdu = maxsdu;
	cv_init(&blp->bl_trillwait, NULL, CV_DRIVER, NULL);
	mutex_init(&blp->bl_trilllock, NULL, MUTEX_DRIVER, NULL);
	/* Start with every bit of the appointed-forwarder array set. */
	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));

	/* Each BLF_* flag records one setup step that has been completed. */
	err = mac_client_open(mh, &blp->bl_mch, kstatname, 0);
	if (err != 0)
		goto fail;
	blp->bl_flags |= BLF_CLIENT_OPEN;

	err = mac_margin_add(mh, &blp->bl_margin, B_TRUE);
	if (err != 0)
		goto fail;
	blp->bl_flags |= BLF_MARGIN_ADDED;

	blp->bl_mnh = mac_notify_add(mh, bridge_notify_cb, blp);

	err = mac_bridge_set(mh, (mac_handle_t)blp);
	if (err != 0)
		goto fail;
	blp->bl_flags |= BLF_SET_BRIDGE;

	err = mac_promisc_add(blp->bl_mch, MAC_CLIENT_PROMISC_ALL, NULL,
	    blp, &blp->bl_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
	if (err != 0)
		goto fail;
	blp->bl_flags |= BLF_PROM_ADDED;

	bridge_new_unicst(blp);

	blp->bl_ksp = kstat_setup((kstat_named_t *)&blp->bl_kstats,
	    link_kstats_list, Dim(link_kstats_list), kstatname);

	/*
	 * The link holds a reference to the bridge instance, so that the
	 * instance can't go away before the link is freed.  The insertion into
	 * bi_links holds a reference on the link.  When marking as removed
	 * from bi_links (BLF_DELETED), drop the reference on the link.  When
	 * freeing the link, drop the reference on the instance.
	 */
	rw_enter(&bip->bi_rwlock, RW_WRITER);
	list_insert_tail(&bip->bi_links, blp);
	atomic_inc_uint(&blp->bl_refs);

	/*
	 * If the new link is no good on this bridge, then let the daemon know
	 * about the problem.
	 */
	mlist = NULL;
	if (maxsdu != bmp->bm_maxsdu)
		link_sdu_fail(blp, B_TRUE, &mlist);
	rw_exit(&bip->bi_rwlock);
	send_up_messages(bip, mlist);

	/*
	 * Trigger a link state update so that if this link is the first one
	 * "up" in the bridge, then we notify everyone.  This triggers a trip
	 * through bridge_ls_cb.
	 */
	linkstate = mac_stat_get(mh, MAC_STAT_LOWLINK_STATE);
	blp->bl_linkstate = LINK_STATE_DOWN;
	mac_link_update(mh, linkstate);

	/*
	 * We now need to report back to the stream that invoked us, and then
	 * drop the reference on the stream that we're holding.
	 */
	miocack(bsp->bs_wq, mp, 0, 0);
	stream_unref(bsp);
	return;

fail:
	if (blp == NULL) {
		/* No link structure exists; only the mac handle may be open. */
		if (macopen)
			mac_close(mh);
	} else {
		link_shutdown(blp);
		link_free(blp);
	}
	miocnak(bsp->bs_wq, mp, 0, err);
	stream_unref(bsp);
}
/*
 * Taskq handler for BRIOC_REMLINK: detach a datalink from the bridge instance
 * owned by the requesting stream.
 *
 * arg is the M_IOCTL message.  bridge_ioctl() stored the requesting
 * bridge_stream_t pointer in b_next and took a reference on that stream; the
 * ioctl payload starts with the datalink_id_t of the link to remove.
 *
 * The link is only marked BLF_DELETED here; link_shutdown() is dispatched to
 * the taskq to do the teardown.  The ioctl is acked if a live link with that
 * ID was found and nak'd with ENOENT otherwise, and the stream reference is
 * dropped in both cases.
 */
static void
bridge_rem_link(void *arg)
{
	mblk_t *mp = arg;
	bridge_stream_t *bsp;
	bridge_inst_t *bip;
	bridge_mac_t *bmp;
	datalink_id_t linkid;
	bridge_link_t *blp, *victim, *survivor;
	mblk_t *mlist;

	bsp = (bridge_stream_t *)mp->b_next;
	mp->b_next = NULL;
	bip = bsp->bs_inst;
	/* LINTED: alignment */
	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;

	/*
	 * We become reader here so that we can loop over the other links and
	 * deliver link up/down notification.
	 */
	rw_enter(&bip->bi_rwlock, RW_READER);

	/* Find the live link with this ID and start tearing it down. */
	victim = NULL;
	for (blp = list_head(&bip->bi_links); blp != NULL;
	    blp = list_next(&bip->bi_links, blp)) {
		if (blp->bl_linkid != linkid || (blp->bl_flags & BLF_DELETED))
			continue;
		blp->bl_flags |= BLF_DELETED;
		(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
		    blp, DDI_SLEEP);
		victim = blp;
		break;
	}

	/*
	 * If the departing link was up, look for any usable link that is
	 * still up.  If there is none, the bridge as a whole has gone down.
	 */
	if (victim != NULL && victim->bl_linkstate != LINK_STATE_DOWN) {
		for (blp = list_head(&bip->bi_links); blp != NULL;
		    blp = list_next(&bip->bi_links, blp)) {
			if (blp->bl_linkstate == LINK_STATE_DOWN ||
			    (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)))
				continue;
			break;
		}
		if (blp == NULL) {
			/* Tell the remaining links and the bridge mac. */
			for (blp = list_head(&bip->bi_links); blp != NULL;
			    blp = list_next(&bip->bi_links, blp)) {
				if (blp->bl_flags & BLF_DELETED)
					continue;
				mac_link_redo(blp->bl_mh, LINK_STATE_DOWN);
			}
			bmp = bip->bi_mac;
			bmp->bm_linkstate = LINK_STATE_DOWN;
			mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
		}
	}

	/*
	 * Check if there's just one working link left on the bridge.  If so,
	 * then that link is now authoritative for bridge MTU.  The walk stops
	 * early (blp != NULL) as soon as a second undeleted link is seen.
	 */
	survivor = NULL;
	for (blp = list_head(&bip->bi_links); blp != NULL;
	    blp = list_next(&bip->bi_links, blp)) {
		if (blp->bl_flags & BLF_DELETED)
			continue;
		if (survivor != NULL)
			break;
		survivor = blp;
	}

	mlist = NULL;
	bmp = bip->bi_mac;
	if (survivor != NULL && blp == NULL &&
	    survivor->bl_maxsdu != bmp->bm_maxsdu) {
		bmp->bm_maxsdu = survivor->bl_maxsdu;
		(void) mac_maxsdu_update(bmp->bm_mh, survivor->bl_maxsdu);
		link_sdu_fail(survivor, B_FALSE, &mlist);
	}
	rw_exit(&bip->bi_rwlock);
	send_up_messages(bip, mlist);

	if (victim != NULL)
		miocack(bsp->bs_wq, mp, 0, 0);
	else
		miocnak(bsp->bs_wq, mp, 0, ENOENT);
	stream_unref(bsp);
}
/*
 * Look up the undeleted link with the given ID on a bridge instance.
 *
 * This function intentionally returns with bi_rwlock held as reader, whether
 * or not a link was found; it is intended for quick checks and updates.  The
 * caller must rw_exit() the lock.  Returns NULL when no such link exists.
 */
static bridge_link_t *
enter_link(bridge_inst_t *bip, datalink_id_t linkid)
{
	bridge_link_t *blp;

	rw_enter(&bip->bi_rwlock, RW_READER);
	blp = list_head(&bip->bi_links);
	while (blp != NULL) {
		if (blp->bl_linkid == linkid && !(blp->bl_flags & BLF_DELETED))
			return (blp);
		blp = list_next(&bip->bi_links, blp);
	}
	return (NULL);
}
/*
 * Process an M_IOCTL message arriving on a bridge control stream.
 *
 * Every ioctl requires network configuration privilege whenever credentials
 * are available for the message.  Except for BRIOC_ADDLINK and BRIOC_REMLINK,
 * which are handed to the taskq and answered there, the request is answered
 * before returning: miocack() when rc is 0, miocnak() with rc otherwise.
 *
 * rc starts out as EINVAL.  That is the answer for an unrecognized command
 * and for a request arriving on a stream in the wrong state (a bridge already
 * bound for BRIOC_NEWBRIDGE, none bound for the others).  Note that a
 * successful miocpullup() overwrites rc with 0, so validation failures after
 * that point must set rc explicitly.
 */
static void
bridge_ioctl(queue_t *wq, mblk_t *mp)
{
	bridge_stream_t *bsp = wq->q_ptr;
	bridge_inst_t *bip;
	struct iocblk *iop;
	int rc = EINVAL;
	int len = 0;
	bridge_link_t *blp;
	cred_t *cr;

	/* LINTED: alignment */
	iop = (struct iocblk *)mp->b_rptr;

	/*
	 * For now, all of the bridge ioctls are privileged.
	 */
	if ((cr = msg_getcred(mp, NULL)) == NULL)
		cr = iop->ioc_cr;
	if (cr != NULL && secpolicy_net_config(cr, B_FALSE) != 0) {
		miocnak(wq, mp, 0, EPERM);
		return;
	}

	switch (iop->ioc_cmd) {
	case BRIOC_NEWBRIDGE: {
		bridge_newbridge_t *bnb;

		if (bsp->bs_inst != NULL ||
		    (rc = miocpullup(mp, sizeof (bridge_newbridge_t))) != 0)
			break;
		/* LINTED: alignment */
		bnb = (bridge_newbridge_t *)mp->b_cont->b_rptr;
		/* Never trust the caller to have terminated the name. */
		bnb->bnb_name[MAXNAMELEN-1] = '\0';
		if ((rc = bridge_create(bnb->bnb_linkid,
		    bnb->bnb_name, &bip)) != 0)
			break;

		rw_enter(&bip->bi_rwlock, RW_WRITER);
		if (bip->bi_control != NULL) {
			/* Another stream already controls this bridge. */
			rw_exit(&bip->bi_rwlock);
			bridge_unref(bip);
			rc = EBUSY;
		} else {
			atomic_inc_uint(&bip->bi_refs);
			bsp->bs_inst = bip;	/* stream holds reference */
			bip->bi_control = bsp;
			rw_exit(&bip->bi_rwlock);
			rc = 0;
		}
		break;
	}

	case BRIOC_ADDLINK:
		if ((bip = bsp->bs_inst) == NULL ||
		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
			break;
		/*
		 * We cannot perform the action in this thread, because we're
		 * not in process context, and we may already be holding
		 * MAC-related locks.  Place the request on taskq.  The
		 * handler answers the ioctl and drops the stream reference.
		 */
		mp->b_next = (mblk_t *)bsp;
		stream_ref(bsp);
		(void) ddi_taskq_dispatch(bridge_taskq, bridge_add_link, mp,
		    DDI_SLEEP);
		return;

	case BRIOC_REMLINK:
		if ((bip = bsp->bs_inst) == NULL ||
		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
			break;
		/*
		 * We cannot perform the action in this thread, because we're
		 * not in process context, and we may already be holding
		 * MAC-related locks.  Place the request on taskq.  The
		 * handler answers the ioctl and drops the stream reference.
		 */
		mp->b_next = (mblk_t *)bsp;
		stream_ref(bsp);
		(void) ddi_taskq_dispatch(bridge_taskq, bridge_rem_link, mp,
		    DDI_SLEEP);
		return;

	case BRIOC_SETSTATE: {
		bridge_setstate_t *bss;

		if ((bip = bsp->bs_inst) == NULL ||
		    (rc = miocpullup(mp, sizeof (*bss))) != 0)
			break;
		/* LINTED: alignment */
		bss = (bridge_setstate_t *)mp->b_cont->b_rptr;
		/* enter_link() returns with bi_rwlock held either way. */
		if ((blp = enter_link(bip, bss->bss_linkid)) == NULL) {
			rc = ENOENT;
		} else {
			rc = 0;
			blp->bl_state = bss->bss_state;
		}
		rw_exit(&bip->bi_rwlock);
		break;
	}

	case BRIOC_SETPVID: {
		bridge_setpvid_t *bsv;

		if ((bip = bsp->bs_inst) == NULL ||
		    (rc = miocpullup(mp, sizeof (*bsv))) != 0)
			break;
		/* LINTED: alignment */
		bsv = (bridge_setpvid_t *)mp->b_cont->b_rptr;
		if (bsv->bsv_vlan > VLAN_ID_MAX) {
			/* miocpullup() left rc at 0; don't ack a bad VLAN. */
			rc = EINVAL;
			break;
		}
		if ((blp = enter_link(bip, bsv->bsv_linkid)) == NULL) {
			rc = ENOENT;
		} else if (blp->bl_pvid == bsv->bsv_vlan) {
			rc = 0;
		} else {
			/* Move the enabled bit from the old PVID to the new. */
			rc = 0;
			BRIDGE_VLAN_CLR(blp, blp->bl_pvid);
			blp->bl_pvid = bsv->bsv_vlan;
			if (blp->bl_pvid != 0)
				BRIDGE_VLAN_SET(blp, blp->bl_pvid);
		}
		rw_exit(&bip->bi_rwlock);
		break;
	}

	case BRIOC_VLANENAB: {
		bridge_vlanenab_t *bve;

		if ((bip = bsp->bs_inst) == NULL ||
		    (rc = miocpullup(mp, sizeof (*bve))) != 0)
			break;
		/* LINTED: alignment */
		bve = (bridge_vlanenab_t *)mp->b_cont->b_rptr;
		if (bve->bve_vlan > VLAN_ID_MAX) {
			/* miocpullup() left rc at 0; don't ack a bad VLAN. */
			rc = EINVAL;
			break;
		}
		if ((blp = enter_link(bip, bve->bve_linkid)) == NULL) {
			rc = ENOENT;
		} else {
			rc = 0;
			/* special case: vlan 0 means "all" */
			if (bve->bve_vlan == 0) {
				(void) memset(blp->bl_vlans,
				    bve->bve_onoff ? ~0 : 0,
				    sizeof (blp->bl_vlans));
				/* VLAN 0 stays off; the PVID stays on. */
				BRIDGE_VLAN_CLR(blp, 0);
				if (blp->bl_pvid != 0)
					BRIDGE_VLAN_SET(blp, blp->bl_pvid);
			} else if (bve->bve_vlan == blp->bl_pvid) {
				/* The PVID cannot be toggled this way. */
				rc = EINVAL;
			} else if (bve->bve_onoff) {
				BRIDGE_VLAN_SET(blp, bve->bve_vlan);
			} else {
				BRIDGE_VLAN_CLR(blp, bve->bve_vlan);
			}
		}
		rw_exit(&bip->bi_rwlock);
		break;
	}

	case BRIOC_FLUSHFWD: {
		bridge_flushfwd_t *bff;
		bridge_fwd_t *bfp, *bfnext;
		avl_tree_t fwd_scavenge;
		int i;

		if ((bip = bsp->bs_inst) == NULL ||
		    (rc = miocpullup(mp, sizeof (*bff))) != 0)
			break;
		/* LINTED: alignment */
		bff = (bridge_flushfwd_t *)mp->b_cont->b_rptr;
		rw_enter(&bip->bi_rwlock, RW_WRITER);
		/* This case means "all" */
		if (bff->bff_linkid == DATALINK_INVALID_LINKID) {
			blp = NULL;
		} else {
			for (blp = list_head(&bip->bi_links); blp != NULL;
			    blp = list_next(&bip->bi_links, blp)) {
				if (blp->bl_linkid == bff->bff_linkid &&
				    !(blp->bl_flags & BLF_DELETED))
					break;
			}
			if (blp == NULL) {
				rc = ENOENT;
				rw_exit(&bip->bi_rwlock);
				break;
			}
		}

		/*
		 * Collect the doomed entries in a private tree while holding
		 * the lock; the references are dropped after it is released.
		 */
		avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
		    offsetof(bridge_fwd_t, bf_node));
		bfnext = avl_first(&bip->bi_fwd);
		while ((bfp = bfnext) != NULL) {
			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
			if (bfp->bf_flags & BFF_LOCALADDR)
				continue;
			if (blp != NULL) {
				/*
				 * NOTE(review): this scans bf_maxlinks slots,
				 * whereas bridge_trill_flush() scans only
				 * bf_nlinks.  Confirm which bound is
				 * intended.
				 */
				for (i = 0; i < bfp->bf_maxlinks; i++) {
					if (bfp->bf_links[i] == blp)
						break;
				}
				/*
				 * If the link is there and we're excluding,
				 * then skip.  If the link is not there and
				 * we're doing only that link, then skip.
				 */
				if ((i < bfp->bf_maxlinks) == bff->bff_exclude)
					continue;
			}
			ASSERT(bfp->bf_flags & BFF_INTREE);
			avl_remove(&bip->bi_fwd, bfp);
			bfp->bf_flags &= ~BFF_INTREE;
			avl_add(&fwd_scavenge, bfp);
		}
		rw_exit(&bip->bi_rwlock);
		bfnext = avl_first(&fwd_scavenge);
		while ((bfp = bfnext) != NULL) {
			bfnext = AVL_NEXT(&fwd_scavenge, bfp);
			avl_remove(&fwd_scavenge, bfp);
			fwd_unref(bfp);	/* drop tree reference */
		}
		avl_destroy(&fwd_scavenge);
		break;
	}

	case BRIOC_TABLEMAX:
		if ((bip = bsp->bs_inst) == NULL ||
		    (rc = miocpullup(mp, sizeof (uint32_t))) != 0)
			break;
		/* LINTED: alignment */
		bip->bi_tablemax = *(uint32_t *)mp->b_cont->b_rptr;
		break;
	}

	if (rc == 0)
		miocack(wq, mp, len, 0);
	else
		miocnak(wq, mp, 0, rc);
}
/*
 * Write-side put procedure for the bridge control stream.  M_IOCTL messages
 * go to bridge_ioctl().  For M_FLUSH, the FLUSHW bit is cleared and the
 * message is sent back up the read side if FLUSHR is set, or freed if not.
 * Every other message type is freed.
 */
static void
bridge_wput(queue_t *wq, mblk_t *mp)
{
	int msgtype = DB_TYPE(mp);

	if (msgtype == M_IOCTL) {
		bridge_ioctl(wq, mp);
		return;
	}

	if (msgtype != M_FLUSH) {
		freemsg(mp);
		return;
	}

	if (*mp->b_rptr & FLUSHW)
		*mp->b_rptr &= ~FLUSHW;

	if (*mp->b_rptr & FLUSHR) {
		qreply(wq, mp);
		return;
	}
	freemsg(mp);
}
/*
* This function allocates the main data structures for the bridge driver and
* connects us into devfs.
*/
static void
bridge_inst_init(void)
{
bridge_scan_interval = 5 * drv_usectohz(1000000);
bridge_fwd_age = 25 * drv_usectohz(1000000);
rw_init(&bmac_rwlock, NULL, RW_DRIVER, NULL);
list_create(&bmac_list, sizeof (bridge_mac_t),
offsetof(bridge_mac_t, bm_node));
list_create(&inst_list, sizeof (bridge_inst_t),
offsetof(bridge_inst_t, bi_node));
cv_init(&inst_cv, NULL, CV_DRIVER, NULL);
mutex_init(&inst_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&stream_ref_cv, NULL, CV_DRIVER, NULL);
mutex_init(&stream_ref_lock, NULL, MUTEX_DRIVER, NULL);
mac_bridge_vectors(bridge_xmit_cb, bridge_recv_cb, bridge_ref_cb,
bridge_ls_cb);
}
/*
 * Undo bridge_inst_init() in preparation for unload: remove the bridge
 * callbacks from the mac layer, cancel the timer if one is outstanding, and
 * destroy the global lists, locks and condition variables.  It's assumed
 * that there are no active bridge references left at this point.
 */
static void
bridge_inst_fini(void)
{
	mac_bridge_vectors(NULL, NULL, NULL, NULL);

	if (bridge_timerid != 0)
		(void) untimeout(bridge_timerid);

	/* bridge mac (snoop) instances */
	rw_destroy(&bmac_rwlock);
	list_destroy(&bmac_list);

	/* bridge instances */
	list_destroy(&inst_list);
	cv_destroy(&inst_cv);
	mutex_destroy(&inst_lock);

	/* stream references */
	cv_destroy(&stream_ref_cv);
	mutex_destroy(&stream_ref_lock);
}
/*
 * bridge_attach()
 *
 * Description:
 *	Attach bridge driver to the system.  Creates the BRIDGE_CTL clone
 *	minor node, registers the BRIDGE_IOC ioctl table with dld, and
 *	creates the driver's taskq.  Only DDI_ATTACH is supported.
 *
 *	Returns DDI_SUCCESS, or DDI_FAILURE with every completed step undone.
 */
static int
bridge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	if (ddi_create_minor_node(dip, BRIDGE_CTL, S_IFCHR, 0, DDI_PSEUDO,
	    CLONE_DEV) == DDI_FAILURE) {
		return (DDI_FAILURE);
	}

	if (dld_ioc_register(BRIDGE_IOC, bridge_ioc_list,
	    DLDIOCCNT(bridge_ioc_list)) != 0) {
		ddi_remove_minor_node(dip, BRIDGE_CTL);
		return (DDI_FAILURE);
	}

	/*
	 * The taskq has exactly one thread; bridge_add_link() relies on that
	 * for serialization.  Creation can fail, and a NULL taskq would later
	 * be handed to ddi_taskq_dispatch(), so back out instead.
	 */
	bridge_taskq = ddi_taskq_create(dip, "bridge", 1, TASKQ_DEFAULTPRI, 0);
	if (bridge_taskq == NULL) {
		dld_ioc_unregister(BRIDGE_IOC);
		ddi_remove_minor_node(dip, BRIDGE_CTL);
		return (DDI_FAILURE);
	}

	bridge_dev_info = dip;
	bridge_major = ddi_driver_major(dip);
	return (DDI_SUCCESS);
}
/*
 * bridge_detach()
 *
 * Description:
 *	Detach the bridge driver from the system, undoing bridge_attach():
 *	unregister the BRIDGE_IOC ioctl table, remove the minor nodes, and
 *	destroy the taskq.  Only DDI_DETACH is supported.
 */
static int
bridge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/*
	 * bridge_attach() registered bridge_ioc_list with dld.  Without this
	 * call the registration would outlive the driver.
	 */
	dld_ioc_unregister(BRIDGE_IOC);
	ddi_remove_minor_node(dip, NULL);
	ddi_taskq_destroy(bridge_taskq);
	bridge_dev_info = NULL;
	return (DDI_SUCCESS);
}
/*
 * bridge_info()
 *
 * Description:
 *	Translate "dev_t" to a pointer to the associated "dev_info_t".
 *	DDI_INFO_DEVT2DEVINFO yields the saved dev_info pointer and fails if
 *	none is saved; DDI_INFO_DEVT2INSTANCE always yields NULL (instance
 *	zero).  Any other command fails.
 */
/* ARGSUSED */
static int
bridge_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
    void **result)
{
	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
		*result = NULL;
		return (DDI_SUCCESS);
	}

	if (infocmd != DDI_INFO_DEVT2DEVINFO || bridge_dev_info == NULL)
		return (DDI_FAILURE);

	*result = (void *)bridge_dev_info;
	return (DDI_SUCCESS);
}
/*
 * STREAMS module information for the bridge control device.  The same
 * structure is referenced by both the read-side and the write-side qinit
 * below.
 */
static struct module_info bridge_modinfo = {
2105, /* mi_idnum */
"bridge", /* mi_idname */
0, /* mi_minpsz */
16384, /* mi_maxpsz */
65536, /* mi_hiwat */
128 /* mi_lowat */
};
/*
 * Read-side queue initialization.  No put or service procedure is supplied
 * on this side; the stream open and close entry points are registered here.
 */
static struct qinit bridge_rinit = {
NULL, /* qi_putp */
NULL, /* qi_srvp */
bridge_open, /* qi_qopen */
bridge_close, /* qi_qclose */
NULL, /* qi_qadmin */
&bridge_modinfo, /* qi_minfo */
NULL /* qi_mstat */
};
/*
 * Write-side queue initialization.  Messages are handled synchronously by
 * bridge_wput(); there is no service procedure.  bridge_wput() returns void,
 * hence the cast to the put-procedure pointer type.
 */
static struct qinit bridge_winit = {
(int (*)())bridge_wput, /* qi_putp */
NULL, /* qi_srvp */
NULL, /* qi_qopen */
NULL, /* qi_qclose */
NULL, /* qi_qadmin */
&bridge_modinfo, /* qi_minfo */
NULL /* qi_mstat */
};
/* STREAMS table tying the read-side and write-side qinit structures together */
static struct streamtab bridge_tab = {
&bridge_rinit, /* st_rdinit */
&bridge_winit /* st_wrinit */
};
/*
 * Device operations for the driver: attach, detach and getinfo are the
 * bridge_* functions above, and the STREAMS entry points come from
 * bridge_tab.  Quiesce is not supported.
 *
 * No STREAMS perimeters; we do all our own locking.
 */
DDI_DEFINE_STREAM_OPS(bridge_ops, nulldev, nulldev, bridge_attach,
bridge_detach, nodev, bridge_info, D_NEW | D_MP, &bridge_tab,
ddi_quiesce_not_supported);
/* Loadable-module linkage element describing this module as a device driver */
static struct modldrv modldrv = {
&mod_driverops,
"bridging driver",
&bridge_ops
};
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info(); the
 * driver element above is its only entry.
 */
static struct modlinkage modlinkage = {
MODREV_1,
(void *)&modldrv,
NULL
};
/*
 * Module load entry point.  Global driver state is set up first; if the
 * module then fails to install, that state is torn down again.  Returns the
 * result of mod_install().
 */
int
_init(void)
{
	int retv;

	bridge_inst_init();
	retv = mod_install(&modlinkage);
	if (retv != 0)
		bridge_inst_fini();
	return (retv);
}
/*
 * Module unload entry point.  Unloading is refused with EBUSY while any
 * bridge mac instance remains on bmac_list.  Otherwise the module is removed
 * and, if that succeeds, the global driver state is torn down.
 */
int
_fini(void)
{
	int idle;
	int retv;

	rw_enter(&bmac_rwlock, RW_READER);
	idle = list_is_empty(&bmac_list);
	rw_exit(&bmac_rwlock);

	if (!idle)
		return (EBUSY);

	retv = mod_remove(&modlinkage);
	if (retv == 0)
		bridge_inst_fini();
	return (retv);
}
/*
 * Module information entry point; reports on this module through
 * mod_info() using the driver's linkage structure.
 */
int
_info(struct modinfo *modinfop)
{
	int retv;

	retv = mod_info(&modlinkage, modinfop);
	return (retv);
}