/* ipnet.c -- revision 7b57f05abb8796d3c91c8d4d4c75dcafb5af6b69 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * The ipnet device defined here provides access to packets at the IP layer.
 * To provide access to packets at this layer, it registers a callback
 * function with the ip module; when there are open instances of the device,
 * ip passes packets into the device. Packets from ip are passed on the
 * input, output and loopback paths. Internally, the module returns to ip as
 * soon as possible by deferring processing using a taskq.
 *
 * Management of the devices in /dev/ipnet/ is handled by the devname
 * filesystem and use of the neti interfaces. This module registers for NIC
 * events using the neti framework so that when IP interfaces are brought up,
 * taken down, etc., the ipnet module is notified and its view of the
 * interfaces configured on the system is adjusted. On attach, the module
 * gets an initial view of the system, again using the neti framework, but as
 * it has already registered for IP interface events, it stays up-to-date
 * with any changes.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/stat.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/dlpi.h>
#include <sys/strsun.h>
#include <sys/id_space.h>
#include <sys/kmem.h>
#include <sys/mkdev.h>
#include <sys/neti.h>
#include <net/if.h>
#include <sys/errno.h>
#include <sys/list.h>
#include <sys/ksynch.h>
#include <sys/hook_event.h>
#include <sys/sdt.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/ip_multi.h>
#include <inet/ip6.h>
#include <inet/ipnet.h>
#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <net/dlt.h>

static struct module_info ipnet_minfo = {
	1,		/* mi_idnum */
	"ipnet",	/* mi_idname */
	0,		/* mi_minpsz */
	INFPSZ,		/* mi_maxpsz */
	2048,		/* mi_hiwat */
	0		/* mi_lowat */
};

/*
 * List to hold static view of ipnetif_t's on the system. This is needed to
 * avoid holding the lock protecting the avl tree of ipnetif's over the
 * callback into the dev filesystem.
 */
typedef struct ipnetif_cbdata {
	char		ic_ifname[LIFNAMSIZ];
	dev_t		ic_dev;
	list_node_t	ic_next;
} ipnetif_cbdata_t;

/*
 * Convenience enumerated type for ipnet_accept(). It describes the
 * properties of a given ipnet_addrp_t relative to a single ipnet_t
 * client stream. The values represent whether the address is ...
 */
typedef enum {
	IPNETADDR_MYADDR,	/* an address on my ipnetif_t. */
	IPNETADDR_MBCAST,	/* a multicast or broadcast address. */
	IPNETADDR_UNKNOWN	/* none of the above. */
} ipnet_addrtype_t;

/* Argument used for the ipnet_nicevent_taskq callback. */
typedef struct ipnet_nicevent_s {
	nic_event_t		ipne_event;
	net_handle_t		ipne_protocol;
	netstackid_t		ipne_stackid;
	uint64_t		ipne_ifindex;
	uint64_t		ipne_lifindex;
	char			ipne_ifname[LIFNAMSIZ];
} ipnet_nicevent_t;

static dev_info_t	*ipnet_dip;
static major_t		ipnet_major;
static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
static id_space_t	*ipnet_minor_space;
static const int	IPNET_MINOR_LO = 1;	/* minor number for /dev/lo0 */
static const int	IPNET_MINOR_MIN = 2;	/* start of dynamic minors */
static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
static bpf_itap_fn_t	ipnet_itap;

static void	ipnet_input(mblk_t *);
static int	ipnet_wput(queue_t *, mblk_t *);
static int	ipnet_rsrv(queue_t *);
static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
static int	ipnet_close(queue_t *);
static void	ipnet_ioctl(queue_t *, mblk_t *);
static void	ipnet_iocdata(queue_t *, mblk_t *);
static void	ipnet_wputnondata(queue_t *, mblk_t *);
static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
static void	ipnet_nicevent_task(void *);
static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
    uint64_t);
static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
static int	ipnetif_compare_name(const void *, const void *);
static int	ipnetif_compare_name_zone(const void *, const void *);
static int	ipnetif_compare_index(const void *, const void *);
static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
static void	ipnetif_refhold(ipnetif_t *);
static void	ipnetif_refrele(ipnetif_t *);
static void	ipnet_walkers_inc(ipnet_stack_t *);
static void	ipnet_walkers_dec(ipnet_stack_t *);
static void	ipnet_register_netihook(ipnet_stack_t *);
static void	*ipnet_stack_init(netstackid_t, netstack_t *);
static void	ipnet_stack_fini(netstackid_t, void *);
static void	ipnet_dispatch(void *);
static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
static void	ipnet_bpfattach(ipnetif_t *);
static void	ipnet_bpfdetach(ipnetif_t *);
static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
static void	ipnet_bpf_probe_shared(ipnet_stack_t *);
static void	ipnet_bpf_release_shared(ipnet_stack_t *);
static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
static void	ipnetif_clone_release(ipnetif_t *);

static struct qinit ipnet_rinit = {
	NULL,		/* qi_putp */
	ipnet_rsrv,	/* qi_srvp */
	ipnet_open,	/* qi_qopen */
	ipnet_close,	/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};

static struct qinit ipnet_winit = {
	ipnet_wput,	/* qi_putp */
	NULL,		/* qi_srvp */
	NULL,		/* qi_qopen */
	NULL,		/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};

static struct streamtab ipnet_info = {
	&ipnet_rinit, &ipnet_winit
};

DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
    ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
    ddi_quiesce_not_supported);

static struct modldrv modldrv = {
	&mod_driverops,
	"STREAMS ipnet driver",
	&ipnet_ops
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

/*
 * This structure contains the template data (names and type) that is
 * copied, in bulk, into the new kstats structure created by net_kstat_create.
 * No actual statistical information is stored in this instance of the
 * ipnet_kstats_t structure.
 */
static ipnet_kstats_t stats_template = {
	{ "duplicationFail",	KSTAT_DATA_UINT64 },
	{ "dispatchOk",		KSTAT_DATA_UINT64 },
	{ "dispatchFail",	KSTAT_DATA_UINT64 },
	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
	{ "acceptOk",		KSTAT_DATA_UINT64 },
	{ "acceptFail",		KSTAT_DATA_UINT64 }
};

/*
 * Walk the list of physical interfaces on the machine, for each
 * interface create a new ipnetif_t and add any addresses to it. We
 * need to do the walk twice, once for IPv4 and once for IPv6.
 *
 * The interfaces are destroyed as part of ipnet_stack_fini() for each
 * stack. Note that we cannot do this initialization in
 * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
 */
static int
ipnetif_init(void)
{
	netstack_handle_t	nh;
	netstack_t		*ns;
	ipnet_stack_t		*ips;
	int			ret = 0;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		ips = ns->netstack_ipnet;
		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
		netstack_rele(ns);
		if (ret != 0)
			break;
	}
	netstack_next_fini(&nh);
	return (ret);
}

/*
 * Standard module entry points.
 */
int
_init(void)
{
	int		ret;
	boolean_t	netstack_registered = B_FALSE;

	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
		return (ENODEV);
	ipnet_minor_space = id_space_create("ipnet_minor_space",
	    IPNET_MINOR_MIN, MAXMIN32);

	/*
	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
	 * delivery of packets to clients.  Note that we need to create the
	 * taskqs before calling netstack_register() since ipnet_stack_init()
	 * registers callbacks that use them.
	 */
	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
	    1, TASKQ_DEFAULTPRI, 0);
	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
		ret = ENOMEM;
		goto done;
	}

	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
	netstack_registered = B_TRUE;

	if ((ret = ipnetif_init()) == 0)
		ret = mod_install(&modlinkage);
done:
	if (ret != 0) {
		if (ipnet_taskq != NULL)
			ddi_taskq_destroy(ipnet_taskq);
		if (ipnet_nicevent_taskq != NULL)
			ddi_taskq_destroy(ipnet_nicevent_taskq);
		if (netstack_registered)
			netstack_unregister(NS_IPNET);
		id_space_destroy(ipnet_minor_space);
	}
	return (ret);
}

int
_fini(void)
{
	int	err;

	if ((err = mod_remove(&modlinkage)) != 0)
		return (err);

	netstack_unregister(NS_IPNET);
	ddi_taskq_destroy(ipnet_nicevent_taskq);
	ddi_taskq_destroy(ipnet_taskq);
	id_space_destroy(ipnet_minor_space);
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

static void
ipnet_register_netihook(ipnet_stack_t *ips)
{
	int		ret;
	zoneid_t	zoneid;
	netid_t		netid;

	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
	    ips);

	/*
	 * It is possible for an exclusive stack to be in the process of
	 * shutting down here, and the netid and protocol lookups could fail
	 * in that case.
	 */
	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
	if ((netid = net_zoneidtonetid(zoneid)) == -1)
		return;

	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
		    ips->ips_nicevents)) != 0) {
			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
			ips->ips_ndv4 = NULL;
			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
			    " in zone %d: %d", zoneid, ret);
		}
	}
	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
		    ips->ips_nicevents)) != 0) {
			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
			ips->ips_ndv6 = NULL;
			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
			    " in zone %d: %d", zoneid, ret);
		}
	}

	/*
	 * Create a local set of kstats for each zone.
	 */
	ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
	if (ips->ips_kstatp != NULL) {
		bcopy(&stats_template, &ips->ips_stats,
		    sizeof (ips->ips_stats));
		ips->ips_kstatp->ks_data = &ips->ips_stats;
		ips->ips_kstatp->ks_private =
		    (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
		kstat_install(ips->ips_kstatp);
	} else {
		cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
		    "ipnet", "ipnet_stats", "misc");
	}
}

/*
 * This function is called on attach to build an initial view of the
 * interfaces on the system. It will be called once for IPv4 and once
 * for IPv6; although there is only one ipnet interface for both IPv4
 * and IPv6, there are separate address lists.
 */
static int
ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
{
	phy_if_t	phyif;
	lif_if_t	lif;
	ipnetif_t	*ipnetif;
	char		name[LIFNAMSIZ];
	boolean_t	new_if = B_FALSE;
	uint64_t	ifflags;
	int		ret = 0;

	/*
	 * If ipnet_register_netihook() was unable to initialize this
	 * stack's net_handle_t, then we cannot populate any interface
	 * information.  This usually happens when we attempted to
	 * grab a net_handle_t as a stack was shutting down.  We don't
	 * want to fail the entire _init() operation because of a
	 * stack shutdown (other stacks will continue to work just
	 * fine), so we silently return success here.
	 */
	if (nd == NULL)
		return (0);

	/*
	 * Make sure we're not processing NIC events during the
	 * population of our interfaces and address lists.
	 */
	mutex_enter(&ips->ips_event_lock);

	for (phyif = net_phygetnext(nd, 0); phyif != 0;
	    phyif = net_phygetnext(nd, phyif)) {
		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
			continue;
		ifflags = 0;
		(void) net_getlifflags(nd, phyif, 0, &ifflags);
		if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
			ipnetif = ipnetif_create(name, phyif, ips, ifflags);
			if (ipnetif == NULL) {
				ret = ENOMEM;
				goto done;
			}
			new_if = B_TRUE;
		}
		ipnetif->if_flags |=
		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;

		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
		    lif = net_lifgetnext(nd, phyif, lif)) {
			/*
			 * Skip addresses that aren't up.  We'll add
			 * them when we receive an NE_LIF_UP event.
			 */
			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
			    !(ifflags & IFF_UP))
				continue;
			/* Don't add it if we already have it. */
			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
				continue;
			ipnet_add_ifaddr(lif, ipnetif, nd);
		}
		if (!new_if)
			ipnetif_refrele(ipnetif);
	}

done:
	mutex_exit(&ips->ips_event_lock);
	return (ret);
}

static int
ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
	    DDI_PSEUDO, 0) == DDI_FAILURE)
		return (DDI_FAILURE);

	ipnet_dip = dip;
	return (DDI_SUCCESS);
}

static int
ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ASSERT(dip == ipnet_dip);
	ddi_remove_minor_node(ipnet_dip, NULL);
	ipnet_dip = NULL;
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error = DDI_FAILURE;

	switch (infocmd) {
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2DEVINFO:
		if (ipnet_dip != NULL) {
			*result = ipnet_dip;
			error = DDI_SUCCESS;
		}
		break;
	}
	return (error);
}

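/*
 * Open routine for the ipnet driver.  The driver is self-cloning: each open
 * allocates a new ipnet_t client, binds it to the ipnetif_t named by the
 * minor node (or to loopback mode for the IPNET_MINOR_LO node), and
 * registers the IP observability hook when the first client on the stack
 * opens the device.
 */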
/* ARGSUSED */
static int
ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
{
	ipnet_t		*ipnet;
	netstack_t	*ns = NULL;
	ipnet_stack_t	*ips;
	int		err = 0;
	zoneid_t	zoneid = crgetzoneid(crp);

	/*
	 * If the system is labeled, only the global zone is allowed to open
	 * IP observability nodes.
	 */
	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
		return (EACCES);

	/* We don't support open as a module */
	if (sflag & MODOPEN)
		return (ENOTSUP);

	/* This driver is self-cloning, we don't support re-open. */
	if (rq->q_ptr != NULL)
		return (EBUSY);

	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
		return (ENOMEM);

	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
	ips = ns->netstack_ipnet;

	rq->q_ptr = WR(rq)->q_ptr = ipnet;
	ipnet->ipnet_rq = rq;
	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
	ipnet->ipnet_zoneid = zoneid;
	ipnet->ipnet_dlstate = DL_UNBOUND;
	ipnet->ipnet_ns = ns;

	/*
	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
	 * to be processed after ipnet_if is set and the ipnet_t has been
	 * inserted in the ips_str_list.
	 */
	mutex_enter(&ips->ips_event_lock);
	if (getminor(*dev) == IPNET_MINOR_LO) {
		ipnet->ipnet_flags |= IPNET_LOMODE;
		ipnet->ipnet_acceptfn = ipnet_loaccept;
	} else {
		ipnet->ipnet_acceptfn = ipnet_accept;
		ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
		if (ipnet->ipnet_if == NULL ||
		    !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
			err = ENODEV;
			goto done;
		}
	}

	mutex_enter(&ips->ips_walkers_lock);
	while (ips->ips_walkers_cnt != 0)
		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
	list_insert_head(&ips->ips_str_list, ipnet);
	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
	qprocson(rq);

	/*
	 * Only register our callback if we're the first open client; we call
	 * unregister in close() for the last open client.
	 */
	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
		ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
	mutex_exit(&ips->ips_walkers_lock);

done:
	mutex_exit(&ips->ips_event_lock);
	if (err != 0) {
		netstack_rele(ns);
		id_free(ipnet_minor_space, ipnet->ipnet_minor);
		if (ipnet->ipnet_if != NULL)
			ipnetif_refrele(ipnet->ipnet_if);
		kmem_free(ipnet, sizeof (*ipnet));
	}
	return (err);
}

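/*
 * Close routine: undo any promiscuous state held by this client, remove the
 * ipnet_t from the per-stack stream list, and unregister the observability
 * hook when the last client on the stack closes.
 */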
static int
ipnet_close(queue_t *rq)
{
	ipnet_t		*ipnet = rq->q_ptr;
	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;

	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
		ipnet_leave_allmulti(ipnet->ipnet_if, ips);

	mutex_enter(&ips->ips_walkers_lock);
	while (ips->ips_walkers_cnt != 0)
		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);

	qprocsoff(rq);

	list_remove(&ips->ips_str_list, ipnet);
	if (ipnet->ipnet_if != NULL)
		ipnetif_refrele(ipnet->ipnet_if);
	id_free(ipnet_minor_space, ipnet->ipnet_minor);

	if (list_is_empty(&ips->ips_str_list)) {
		ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
		ips->ips_hook = NULL;
	}

	kmem_free(ipnet, sizeof (*ipnet));

	mutex_exit(&ips->ips_walkers_lock);
	netstack_rele(ips->ips_netstack);
	return (0);
}

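/*
 * STREAMS write-side put and read-side service routines.  Write-side
 * messages are flushed, handled as DLPI primitives, or handled as ioctls;
 * queued read-side data is drained to the client by ipnet_rsrv().
 */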
static int
ipnet_wput(queue_t *q, mblk_t *mp)
{
	switch (mp->b_datap->db_type) {
	case M_FLUSH:
		if (*mp->b_rptr & FLUSHW) {
			flushq(q, FLUSHDATA);
			*mp->b_rptr &= ~FLUSHW;
		}
		if (*mp->b_rptr & FLUSHR)
			qreply(q, mp);
		else
			freemsg(mp);
		break;
	case M_PROTO:
	case M_PCPROTO:
		ipnet_wputnondata(q, mp);
		break;
	case M_IOCTL:
		ipnet_ioctl(q, mp);
		break;
	case M_IOCDATA:
		ipnet_iocdata(q, mp);
		break;
	default:
		freemsg(mp);
		break;
	}
	return (0);
}

static int
ipnet_rsrv(queue_t *q)
{
	mblk_t	*mp;

	while ((mp = getq(q)) != NULL) {
		ASSERT(DB_TYPE(mp) == M_DATA);
		if (canputnext(q)) {
			putnext(q, mp);
		} else {
			(void) putbq(q, mp);
			break;
		}
	}
	return (0);
}

static void
ipnet_ioctl(queue_t *q, mblk_t *mp)
{
	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;

	switch (iocp->ioc_cmd) {
	case DLIOCRAW:
		miocack(q, mp, 0, 0);
		break;
	case DLIOCIPNETINFO:
		if (iocp->ioc_count == TRANSPARENT) {
			mcopyin(mp, NULL, sizeof (uint_t), NULL);
			qreply(q, mp);
			break;
		}
		/* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
	default:
		miocnak(q, mp, 0, EINVAL);
		break;
	}
}

static void
ipnet_iocdata(queue_t *q, mblk_t *mp)
{
	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
	ipnet_t		*ipnet = q->q_ptr;

	switch (iocp->ioc_cmd) {
	case DLIOCIPNETINFO:
		if (*(int *)mp->b_cont->b_rptr == 1)
			ipnet->ipnet_flags |= IPNET_INFO;
		else if (*(int *)mp->b_cont->b_rptr == 0)
			ipnet->ipnet_flags &= ~IPNET_INFO;
		else
			goto iocnak;
		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
		break;
	default:
iocnak:
		miocnak(q, mp, 0, EINVAL);
		break;
	}
}

static void
ipnet_wputnondata(queue_t *q, mblk_t *mp)
{
	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
	t_uscalar_t		prim = dlp->dl_primitive;

	switch (prim) {
	case DL_INFO_REQ:
		ipnet_inforeq(q, mp);
		break;
	case DL_UNBIND_REQ:
		ipnet_unbindreq(q, mp);
		break;
	case DL_BIND_REQ:
		ipnet_bindreq(q, mp);
		break;
	case DL_PROMISCON_REQ:
		ipnet_dlpromisconreq(q, mp);
		break;
	case DL_PROMISCOFF_REQ:
		ipnet_dlpromiscoffreq(q, mp);
		break;
	case DL_UNITDATA_REQ:
	case DL_DETACH_REQ:
	case DL_PHYS_ADDR_REQ:
	case DL_SET_PHYS_ADDR_REQ:
	case DL_ENABMULTI_REQ:
	case DL_DISABMULTI_REQ:
	case DL_ATTACH_REQ:
		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
		break;
	default:
		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
		break;
	}
}

static void
ipnet_inforeq(queue_t *q, mblk_t *mp)
{
	dl_info_ack_t	*dlip;
	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);

	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
		return;
	}

	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
		return;

	dlip = (dl_info_ack_t *)mp->b_rptr;
	*dlip = ipnet_infoack;
	qreply(q, mp);
}

static void
ipnet_bindreq(queue_t *q, mblk_t *mp)
{
	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
	ipnet_t			*ipnet = q->q_ptr;

	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
		return;
	}

	switch (dlp->bind_req.dl_sap) {
	case 0 :
		ipnet->ipnet_family = AF_UNSPEC;
		break;
	case IPV4_VERSION :
		ipnet->ipnet_family = AF_INET;
		break;
	case IPV6_VERSION :
		ipnet->ipnet_family = AF_INET6;
		break;
	default :
		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
		return;
		/*NOTREACHED*/
	}

	ipnet->ipnet_dlstate = DL_IDLE;
	dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
}

static void
ipnet_unbindreq(queue_t *q, mblk_t *mp)
{
	ipnet_t	*ipnet = q->q_ptr;

	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
		return;
	}

	if (ipnet->ipnet_dlstate != DL_IDLE) {
		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
	} else {
		ipnet->ipnet_dlstate = DL_UNBOUND;
		ipnet->ipnet_family = AF_UNSPEC;
		dlokack(q, mp, DL_UNBIND_REQ);
	}
}

static void
ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
{
	ipnet_t		*ipnet = q->q_ptr;
	t_uscalar_t	level;
	int		err;

	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
		return;
	}

	if (ipnet->ipnet_flags & IPNET_LOMODE) {
		dlokack(q, mp, DL_PROMISCON_REQ);
		return;
	}

	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
			return;
		}
	}

	switch (level) {
	case DL_PROMISC_PHYS:
		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
		break;
	case DL_PROMISC_SAP:
		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
		break;
	case DL_PROMISC_MULTI:
		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
		break;
	default:
		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
		return;
	}

	dlokack(q, mp, DL_PROMISCON_REQ);
}

static void
ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
{
	ipnet_t		*ipnet = q->q_ptr;
	t_uscalar_t	level;
	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;

	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
		return;
	}

	if (ipnet->ipnet_flags & IPNET_LOMODE) {
		dlokack(q, mp, DL_PROMISCOFF_REQ);
		return;
	}

	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
	switch (level) {
	case DL_PROMISC_PHYS:
		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
		break;
	case DL_PROMISC_SAP:
		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
		break;
	case DL_PROMISC_MULTI:
		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
		break;
	default:
		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
		return;
	}

	if (orig_ipnet_flags == ipnet->ipnet_flags) {
		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
		return;
	}

	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
		ipnet_leave_allmulti(ipnet->ipnet_if,
		    ipnet->ipnet_ns->netstack_ipnet);
	}

	dlokack(q, mp, DL_PROMISCOFF_REQ);
}

static int
ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
{
	int		err = 0;
	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
	uint64_t	index = ipnetif->if_index;

	mutex_enter(&ips->ips_event_lock);
	if (ipnetif->if_multicnt == 0) {
		ASSERT((ipnetif->if_flags &
		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
			err = ip_join_allmulti(index, B_FALSE, ipst);
			if (err != 0)
				goto done;
			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
		}
		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
			err = ip_join_allmulti(index, B_TRUE, ipst);
			if (err != 0 &&
			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
				(void) ip_leave_allmulti(index, B_FALSE, ipst);
				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
				goto done;
			}
			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
		}
	}
	ipnetif->if_multicnt++;

done:
	mutex_exit(&ips->ips_event_lock);
	return (err);
}
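
/*
 * Undo a previous ipnet_join_allmulti() call: when the last promiscuous
 * client drops its reference, leave the IPv4/IPv6 all-multicast groups that
 * were joined on the interface's behalf.
 */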
static void
ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
{
	int		err;
	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
	uint64_t	index = ipnetif->if_index;

	mutex_enter(&ips->ips_event_lock);
	ASSERT(ipnetif->if_multicnt != 0);
	if (--ipnetif->if_multicnt == 0) {
		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
			err = ip_leave_allmulti(index, B_FALSE, ipst);
			ASSERT(err == 0 || err == ENODEV);
			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
		}
		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
			err = ip_leave_allmulti(index, B_TRUE, ipst);
			ASSERT(err == 0 || err == ENODEV);
			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
		}
	}
	mutex_exit(&ips->ips_event_lock);
}

/*
 * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
 * The structure it copies the header information from,
 * hook_pkt_observe_t, is constructed using network byte
 * order in ipobs_hook(), so there is no conversion here.
 */
static mblk_t *
ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
{
	mblk_t		*dlhdr;
	dl_ipnetinfo_t	*dl;

	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
		freemsg(mp);
		return (NULL);
	}
	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
	dl->dli_version = DL_IPNETINFO_VERSION;
	dl->dli_family = hdr->hpo_family;
	dl->dli_htype = hdr->hpo_htype;
	dl->dli_pktlen = hdr->hpo_pktlen;
	dl->dli_ifindex = hdr->hpo_ifindex;
	dl->dli_grifindex = hdr->hpo_grifindex;
	dl->dli_zsrc = hdr->hpo_zsrc;
	dl->dli_zdst = hdr->hpo_zdst;
	dlhdr->b_wptr += sizeof (*dl);
	dlhdr->b_cont = mp;
	return (dlhdr);
}

static ipnet_addrtype_t
ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
{
	list_t			*list;
	ipnetif_t		*ipnetif = ipnet->ipnet_if;
	ipnetif_addr_t		*ifaddr;
	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;

	/* First check if the address is multicast or limited broadcast. */
	switch (addr->iap_family) {
	case AF_INET:
		if (CLASSD(*(addr->iap_addr4)) ||
		    *(addr->iap_addr4) == INADDR_BROADCAST)
			return (IPNETADDR_MBCAST);
		break;
	case AF_INET6:
		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
			return (IPNETADDR_MBCAST);
		break;
	}

	/*
	 * Walk the address list to see if the address belongs to our
	 * interface or is one of our subnet broadcast addresses.
	 */
	mutex_enter(&ipnetif->if_addr_lock);
	list = (addr->iap_family == AF_INET) ?
	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
	for (ifaddr = list_head(list);
	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
	    ifaddr = list_next(list, ifaddr)) {
		/*
		 * If we're not in the global zone, then only look at
		 * addresses in our zone.
		 */
		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
			continue;
		switch (addr->iap_family) {
		case AF_INET:
			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
				addrtype = IPNETADDR_MYADDR;
			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
				addrtype = IPNETADDR_MBCAST;
			break;
		case AF_INET6:
			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
			    &ifaddr->ifa_ip6addr))
				addrtype = IPNETADDR_MYADDR;
			break;
		}
	}
	mutex_exit(&ipnetif->if_addr_lock);

	return (addrtype);
}

/*
 * Verify if the packet contained in hdr should be passed up to the
 * ipnet client stream.
 */
static boolean_t
ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
    ipnet_addrp_t *dst)
{
	boolean_t		obsif;
	uint64_t		ifindex = ipnet->ipnet_if->if_index;
	ipnet_addrtype_t	srctype;
	ipnet_addrtype_t	dsttype;

	srctype = ipnet_get_addrtype(ipnet, src);
	dsttype = ipnet_get_addrtype(ipnet, dst);

	/*
	 * If the packet's ifindex matches ours, or the packet's group ifindex
	 * matches ours, it's on the interface we're observing.  (Thus,
	 * observing on the group ifindex matches all ifindexes in the group.)
	 */
	obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
	    ntohl(hdr->hpo_grifindex) == ifindex);

	DTRACE_PROBE5(ipnet_accept__addr,
	    ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
	    ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
	    boolean_t, obsif);

	/*
	 * Do not allow an ipnet stream to see packets that are not from or to
	 * its zone.  The exception is when zones are using the shared stack
	 * model.  In this case, streams in the global zone have visibility
	 * into other shared-stack zones, and broadcast and multicast traffic
	 * is visible by all zones in the stack.
	 */
	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
	    dsttype != IPNETADDR_MBCAST) {
		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
			return (B_FALSE);
	}

	/*
	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
	 * packet's IP version.
	 */
	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
	    ipnet->ipnet_family != hdr->hpo_family)
		return (B_FALSE);

	/* If the destination address is ours, then accept the packet. */
	if (dsttype == IPNETADDR_MYADDR)
		return (B_TRUE);

	/*
	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
	 * sent or received on the interface we're observing, or packets that
	 * have our source address (this allows us to see packets we send).
	 */
	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
		if (srctype == IPNETADDR_MYADDR || obsif)
			return (B_TRUE);
	}

	/*
	 * We accept multicast and broadcast packets transmitted or received
	 * on the interface we're observing.
	 */
	if (dsttype == IPNETADDR_MBCAST && obsif)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Verify if the packet contained in hdr should be passed up to the ipnet
 * client stream that's in IPNET_LOMODE.
 */
/* ARGSUSED */
static boolean_t
ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
    ipnet_addrp_t *dst)
{
	if (hdr->hpo_htype != IPOBS_HOOK_LOCAL) {
		/*
		 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
		 */
		if (ipnet->ipnet_if == NULL)
			return (B_FALSE);
	}

	/*
	 * An ipnet stream must not see packets that are not from/to its zone.
	 */
	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
			return (B_FALSE);
	}

	return (ipnet->ipnet_family == AF_UNSPEC ||
	    ipnet->ipnet_family == hdr->hpo_family);
}
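
/*
 * Taskq callback that delivers an observed packet to every open ipnet
 * client stream whose accept function agrees to take it.  The packet is
 * duplicated for all but the last interested client, optionally prepended
 * with a dl_ipnetinfo_t header, and queued or put directly upstream.
 */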
static void
ipnet_dispatch(void *arg)
{
	mblk_t			*mp = arg;
	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
	ipnet_t			*ipnet;
	mblk_t			*netmp;
	list_t			*list;
	ipnet_stack_t		*ips;
	ipnet_addrp_t		src;
	ipnet_addrp_t		dst;

	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;

	netmp = hdr->hpo_pkt->b_cont;
	src.iap_family = hdr->hpo_family;
	dst.iap_family = hdr->hpo_family;

	if (hdr->hpo_family == AF_INET) {
		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
	} else {
		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
	}

	ipnet_walkers_inc(ips);

	list = &ips->ips_str_list;
	for (ipnet = list_head(list); ipnet != NULL;
	    ipnet = list_next(list, ipnet)) {
		if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
			IPSK_BUMP(ips, ik_acceptFail);
			continue;
		}
		IPSK_BUMP(ips, ik_acceptOk);

		if (list_next(list, ipnet) == NULL) {
			netmp = hdr->hpo_pkt->b_cont;
			hdr->hpo_pkt->b_cont = NULL;
		} else {
			if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
			    (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
				IPSK_BUMP(ips, ik_duplicationFail);
				continue;
			}
		}

		if (ipnet->ipnet_flags & IPNET_INFO) {
			if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
				IPSK_BUMP(ips, ik_dispatchHeaderDrop);
				continue;
			}
		}

		if (ipnet->ipnet_rq->q_first == NULL &&
		    canputnext(ipnet->ipnet_rq)) {
			putnext(ipnet->ipnet_rq, netmp);
			IPSK_BUMP(ips, ik_dispatchDeliver);
		} else if (canput(ipnet->ipnet_rq)) {
			(void) putq(ipnet->ipnet_rq, netmp);
			IPSK_BUMP(ips, ik_dispatchDeliver);
		} else {
			freemsg(netmp);
			IPSK_BUMP(ips, ik_dispatchPutDrop);
		}
	}

	ipnet_walkers_dec(ips);

	freemsg(mp);
}
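
/*
 * Entry point called from the IP observability hook.  Processing is
 * deferred to ipnet_taskq so that the ip module can continue immediately.
 */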
static void
ipnet_input(mblk_t *mp)
{
	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
	ipnet_stack_t		*ips;

	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;

	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
	    DDI_SUCCESS) {
		IPSK_BUMP(ips, ik_dispatchFail);
		freemsg(mp);
	} else {
		IPSK_BUMP(ips, ik_dispatchOk);
	}
}
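
/*
 * Allocate and initialize a bare ipnetif_t (locks and address lists only);
 * callers fill in the name, index and device information.
 */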
static ipnetif_t *
ipnet_alloc_if(ipnet_stack_t *ips)
{
	ipnetif_t	*ipnetif;

	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
		return (NULL);

	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
	    offsetof(ipnetif_addr_t, ifa_link));
	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
	    offsetof(ipnetif_addr_t, ifa_link));
	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);

	ipnetif->if_stackp = ips;

	return (ipnetif);
}

/*
 * Create a new ipnetif_t and new minor node for it.  If creation is
 * successful the new ipnetif_t is inserted into an avl_tree
 * containing ipnetif's for this stack instance.
 */
static ipnetif_t *
ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
    uint64_t ifflags)
{
	ipnetif_t	*ipnetif;
	avl_index_t	where = 0;
	minor_t		ifminor;

	/*
	 * Because ipnetif_create() can be called from a NIC event
	 * callback, it should not block.
	 */
	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
	if (ifminor == (minor_t)-1)
		return (NULL);
	if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
		id_free(ipnet_minor_space, ifminor);
		return (NULL);
	}

	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
	ipnetif->if_index = (uint_t)index;
	ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
	ipnetif->if_refcnt = 1;
	if ((ifflags & IFF_LOOPBACK) != 0)
		ipnetif->if_flags = IPNETIF_LOOPBACK;

	mutex_enter(&ips->ips_avl_lock);
	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
	mutex_exit(&ips->ips_avl_lock);

	/*
	 * Now that the interface can be found by lookups back into ipnet,
	 * allowing for sanity checking, call the BPF attach.
	 */
	ipnet_bpfattach(ipnetif);

	return (ipnetif);
}

static void
ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
{
	ipnet_t	*ipnet;

	ipnet_walkers_inc(ips);
	/* Send a SIGHUP to all open streams associated with this ipnetif. */
	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
		if (ipnet->ipnet_if == ipnetif)
			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
	}
	ipnet_walkers_dec(ips);

	mutex_enter(&ips->ips_avl_lock);
	avl_remove(&ips->ips_avl_by_index, ipnetif);
	avl_remove(&ips->ips_avl_by_name, ipnetif);
	mutex_exit(&ips->ips_avl_lock);

	/*
	 * Now that the interface can't be found, do a BPF detach
	 */
	ipnet_bpfdetach(ipnetif);

	/*
	 * Release the reference we implicitly held in ipnetif_create().
	 */
	ipnetif_refrele(ipnetif);
}
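
/*
 * Helpers used when tearing down an ipnetif_t: release every address on a
 * list, and free the interface itself once its reference count has dropped
 * to zero.
 */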
static void
ipnet_purge_addrlist(list_t *addrlist)
{
	ipnetif_addr_t	*ifa;

	while ((ifa = list_head(addrlist)) != NULL) {
		list_remove(addrlist, ifa);
		if (ifa->ifa_shared != NULL)
			ipnetif_clone_release(ifa->ifa_shared);
		kmem_free(ifa, sizeof (*ifa));
	}
}

static void
ipnetif_free(ipnetif_t *ipnetif)
{
	ASSERT(ipnetif->if_refcnt == 0);
	ASSERT(ipnetif->if_sharecnt == 0);

	/* Remove IPv4/v6 address lists from the ipnetif */
	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
	list_destroy(&ipnetif->if_ip4addr_list);
	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
	list_destroy(&ipnetif->if_ip6addr_list);
	mutex_destroy(&ipnetif->if_addr_lock);
	mutex_destroy(&ipnetif->if_reflock);
	if (ipnetif->if_dev != 0)
		id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
	kmem_free(ipnetif, sizeof (*ipnetif));
}

/*
 * Create an ipnetif_addr_t with the given logical interface id (lif)
 * and add it to the supplied ipnetif.  The lif is the netinfo
 * representation of logical interface id, and we use this id to match
 * incoming netinfo events against our lists of addresses.
 */
static void
ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
{
	ipnetif_addr_t		*ifaddr;
	zoneid_t		zoneid;
	struct sockaddr_in	bcast;
	struct sockaddr_storage	addr;
	net_ifaddr_t		type = NA_ADDRESS;
	uint64_t		phyif = ipnetif->if_index;

	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
		return;

	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
		return;
	ifaddr->ifa_zone = zoneid;
	ifaddr->ifa_id = lif;
	ifaddr->ifa_shared = NULL;

	switch (addr.ss_family) {
	case AF_INET:
		ifaddr->ifa_ip4addr =
		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
		/*
		 * Try and get the broadcast address.  Note that it's okay for
		 * an interface to not have a broadcast address, so we don't
		 * fail the entire operation if net_getlifaddr() fails here.
		 */
		type = NA_BROADCAST;
		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
		break;
	case AF_INET6:
		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
		break;
	}

	/*
	 * The zoneid stored in ipnetif_t needs to correspond to the actual
	 * zone the address is being used in.  This facilitates finding the
	 * correct netstack_t pointer, amongst other things, later.
	 */
	if (zoneid == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;

	mutex_enter(&ipnetif->if_addr_lock);
	if (zoneid != ipnetif->if_zoneid) {
		ipnetif_t *ifp2;

		ifp2 = ipnetif_clone_create(ipnetif, zoneid);
		ifaddr->ifa_shared = ifp2;
	}
	list_insert_tail(addr.ss_family == AF_INET ?
	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
	mutex_exit(&ipnetif->if_addr_lock);
}

static void
ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
{
	mutex_enter(&ipnetif->if_addr_lock);
	if (ifaddr->ifa_shared != NULL)
		ipnetif_clone_release(ifaddr->ifa_shared);

	list_remove(isv6 ?
	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
	mutex_exit(&ipnetif->if_addr_lock);

	kmem_free(ifaddr, sizeof (*ifaddr));
}
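
/*
 * Handlers for the NIC events dispatched by ipnet_nicevent_task():
 * NE_PLUMB, NE_UNPLUMB, NE_LIF_UP and NE_LIF_DOWN keep the per-stack view
 * of interfaces and their address lists in sync with IP.
 */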
static void
ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
{
	ipnetif_t	*ipnetif;
	boolean_t	refrele_needed = B_TRUE;
	uint64_t	ifflags;
	uint64_t	ifindex;
	char		*ifname;

	ifflags = 0;
	ifname = ipne->ipne_ifname;
	ifindex = ipne->ipne_ifindex;

	(void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);

	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
		ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
		refrele_needed = B_FALSE;
	}
	if (ipnetif != NULL) {
		ipnetif->if_flags |=
		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
	}

	if (ipnetif->if_multicnt != 0) {
		if (ip_join_allmulti(ifindex, isv6,
		    ips->ips_netstack->netstack_ip) == 0) {
			ipnetif->if_flags |=
			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
		}
	}

	if (refrele_needed)
		ipnetif_refrele(ipnetif);
}

static void
ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
{
	ipnetif_t	*ipnetif;

	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
		return;

	mutex_enter(&ipnetif->if_addr_lock);
	ipnet_purge_addrlist(isv6 ?
	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
	mutex_exit(&ipnetif->if_addr_lock);

	/*
	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
	 * if both IPv4 and IPv6 interfaces have been unplumbed.
	 */
	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
		ipnetif_remove(ipnetif, ips);
	ipnetif_refrele(ipnetif);
}

static void
ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
    ipnet_stack_t *ips, boolean_t isv6)
{
	ipnetif_t	*ipnetif;
	ipnetif_addr_t	*ifaddr;

	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
		return;
	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
		/*
		 * We must have missed a NE_LIF_DOWN event.  Delete this
		 * ifaddr and re-create it.
		 */
		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
	}

	ipnet_add_ifaddr(lifindex, ipnetif, nd);
	ipnetif_refrele(ipnetif);
}

static void
ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
    boolean_t isv6)
{
	ipnetif_t	*ipnetif;
	ipnetif_addr_t	*ifaddr;

	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
		return;
	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
	ipnetif_refrele(ipnetif);

	/*
	 * Make sure that open streams on this ipnetif are still allowed to
	 * have it open.
	 */
	ipnetif_zonecheck(ipnetif, ips);
}

/*
 * This callback from the NIC event framework dispatches a taskq as the event
 * handlers may block.
 */
/* ARGSUSED */
static int
ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
{
	ipnet_stack_t		*ips = arg;
	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
	ipnet_nicevent_t	*ipne;

	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
		return (0);
	ipne->ipne_event = hn->hne_event;
	ipne->ipne_protocol = hn->hne_protocol;
	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
	ipne->ipne_ifindex = hn->hne_nic;
	ipne->ipne_lifindex = hn->hne_lif;
	if (hn->hne_datalen != 0) {
		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
		    sizeof (ipne->ipne_ifname));
	}
	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
	    ipne, DDI_NOSLEEP);
	return (0);
}
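
/*
 * Taskq callback for NIC events queued by ipnet_nicevent_cb().  It looks up
 * the netstack by stack id and applies the event under ips_event_lock.
 */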
static void
ipnet_nicevent_task(void *arg)
{
	ipnet_nicevent_t	*ipne = arg;
	netstack_t		*ns;
	ipnet_stack_t		*ips;
	boolean_t		isv6;

	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
		goto done;
	ips = ns->netstack_ipnet;
	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);

	mutex_enter(&ips->ips_event_lock);
	switch (ipne->ipne_event) {
	case NE_PLUMB:
		ipnet_plumb_ev(ipne, ips, isv6);
		break;
	case NE_UNPLUMB:
		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
		break;
	case NE_LIF_UP:
		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
		    ipne->ipne_protocol, ips, isv6);
		break;
	case NE_LIF_DOWN:
		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
		    isv6);
		break;
	default:
		break;
	}
	mutex_exit(&ips->ips_event_lock);
done:
	if (ns != NULL)
		netstack_rele(ns);
	kmem_free(ipne, sizeof (ipnet_nicevent_t));
}
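
/*
 * Return the dev_t of the ipnet minor node for the named interface, as seen
 * from the given zone, or (dev_t)-1 if it is not visible there.
 */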
dev_t
ipnet_if_getdev(char *name, zoneid_t zoneid)
{
	netstack_t	*ns;
	ipnet_stack_t	*ips;
	ipnetif_t	*ipnetif;
	dev_t		dev = (dev_t)-1;

	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
		return (dev);
	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
		return (dev);

	ips = ns->netstack_ipnet;
	mutex_enter(&ips->ips_avl_lock);
	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
		if (ipnetif_in_zone(ipnetif, zoneid, ips))
			dev = ipnetif->if_dev;
	}
	mutex_exit(&ips->ips_avl_lock);

	netstack_rele(ns);
	return (dev);
}
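
/*
 * Lookup routines: find an ipnetif_t by interface index or by dev_t.  A
 * reference is held on the returned interface; callers must release it with
 * ipnetif_refrele().
 */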
static ipnetif_t *
ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
{
	ipnetif_t	*ipnetif;

	mutex_enter(&ips->ips_avl_lock);
	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
		ipnetif_refhold(ipnetif);
	mutex_exit(&ips->ips_avl_lock);
	return (ipnetif);
}

static ipnetif_t *
ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
{
	ipnetif_t	*ipnetif;
	avl_tree_t	*tree;

	mutex_enter(&ips->ips_avl_lock);
	tree = &ips->ips_avl_by_index;
	for (ipnetif = avl_first(tree); ipnetif != NULL;
	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
		if (ipnetif->if_dev == dev) {
			ipnetif_refhold(ipnetif);
			break;
		}
	}
	mutex_exit(&ips->ips_avl_lock);
	return (ipnetif);
}

static ipnetif_addr_t *
ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
{
	ipnetif_addr_t	*ifaddr;
	list_t		*list;

	mutex_enter(&ipnetif->if_addr_lock);
	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
	for (ifaddr = list_head(list); ifaddr != NULL;
	    ifaddr = list_next(list, ifaddr)) {
		if (lid == ifaddr->ifa_id)
			break;
	}
	mutex_exit(&ipnetif->if_addr_lock);
	return (ifaddr);
}
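
/*
 * Per-netstack constructor and destructor registered with
 * netstack_register() in _init().
 */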
/* ARGSUSED */
static void *
ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
{
	ipnet_stack_t	*ips;

	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
	ips->ips_netstack = ns;

	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
	avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
	avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
	avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));

	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
	list_create(&ips->ips_str_list, sizeof (ipnet_t),
	    offsetof(ipnet_t, ipnet_next));

	ipnet_register_netihook(ips);
	return (ips);
}

/* ARGSUSED */
static void
ipnet_stack_fini(netstackid_t stackid, void *arg)
{
	ipnet_stack_t	*ips = arg;
	ipnetif_t	*ipnetif, *nipnetif;

	if (ips->ips_kstatp != NULL) {
		zoneid_t zoneid;

		zoneid = netstackid_to_zoneid(stackid);
		net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
	}
	if (ips->ips_ndv4 != NULL) {
		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
		    ips->ips_nicevents) == 0);
		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
	}
	if (ips->ips_ndv6 != NULL) {
		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
		    ips->ips_nicevents) == 0);
		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
	}
	hook_free(ips->ips_nicevents);

	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
	    ipnetif = nipnetif) {
		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
		ipnetif_remove(ipnetif, ips);
	}
	avl_destroy(&ips->ips_avl_by_shared);
	avl_destroy(&ips->ips_avl_by_index);
	avl_destroy(&ips->ips_avl_by_name);
	mutex_destroy(&ips->ips_avl_lock);
	mutex_destroy(&ips->ips_walkers_lock);
	cv_destroy(&ips->ips_walkers_cv);
	list_destroy(&ips->ips_str_list);
	kmem_free(ips, sizeof (*ips));
}

/* Do any of the addresses in addrlist belong to the supplied zoneid? */
static boolean_t
ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
{
	ipnetif_addr_t	*ifa;

	for (ifa = list_head(addrlist); ifa != NULL;
	    ifa = list_next(addrlist, ifa)) {
		if (ifa->ifa_zone == zoneid)
			return (B_TRUE);
	}
	return (B_FALSE);
}

/* Should the supplied ipnetif be visible from the supplied zoneid? */
static boolean_t
ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
{
	int	ret;

	/*
	 * The global zone has visibility into all interfaces in the global
	 * stack, and exclusive stack zones have visibility into all
	 * interfaces in their stack.
	 */
	if (zoneid == GLOBAL_ZONEID ||
	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
		return (B_TRUE);

	/*
	 * Shared-stack zones only have visibility for interfaces that have
	 * addresses in their zone.
	 */
	mutex_enter(&ipnetif->if_addr_lock);
	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
	mutex_exit(&ipnetif->if_addr_lock);
	return (ret);
}

/*
 * Verify that any ipnet_t that has a reference to the supplied ipnetif should
 * still be allowed to have it open.  A given ipnet_t may no longer be allowed
 * to have an ipnetif open if there are no longer any addresses that belong to
 * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
 * case, send the ipnet_t an M_HANGUP.
 */
static void
ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
{
	list_t	*strlist = &ips->ips_str_list;
	ipnet_t	*ipnet;

	ipnet_walkers_inc(ips);
	for (ipnet = list_head(strlist); ipnet != NULL;
	    ipnet = list_next(strlist, ipnet)) {
		if (ipnet->ipnet_if != ipnetif)
			continue;
		if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
	}
	ipnet_walkers_dec(ips);
}

void
ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
{
	ipnetif_t		*ipnetif;
	list_t			cbdata;
	ipnetif_cbdata_t	*cbnode;
	netstack_t		*ns;
	ipnet_stack_t		*ips;

	/*
	 * On labeled systems, non-global zones shouldn't see anything
	 * in /dev/ipnet.
	 */
	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
		return;

	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
		return;

	ips = ns->netstack_ipnet;
	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
	    offsetof(ipnetif_cbdata_t, ic_next));

	mutex_enter(&ips->ips_avl_lock);
	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
		if (!ipnetif_in_zone(ipnetif, zoneid, ips))
			continue;
		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
		cbnode->ic_dev = ipnetif->if_dev;
		list_insert_head(&cbdata, cbnode);
	}
	mutex_exit(&ips->ips_avl_lock);

	while ((cbnode = list_head(&cbdata)) != NULL) {
		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
		list_remove(&cbdata, cbnode);
		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
	}
	list_destroy(&cbdata);
	netstack_rele(ns);
}

static int
ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
{
	int64_t	index1 = *((int64_t *)index_ptr);
	int64_t	index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;

	return (SIGNOF(index2 - index1));
}

static int
ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
{
	int	res;

	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
	return (SIGNOF(res));
}

static int
ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
{
	const uintptr_t	*ptr = key_ptr;
	const ipnetif_t	*ifp;
	int		res;

	ifp = ipnetifp;
	res = ifp->if_zoneid - ptr[0];
	if (res != 0)
		return (SIGNOF(res));
	res = strcmp(ifp->if_name, (char *)ptr[1]);
	return (SIGNOF(res));
}
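
/*
 * Reference counting for ipnetif_t structures, and the walker counters that
 * serialize walks of the per-stack client stream list against open/close.
 */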
static void
ipnetif_refhold(ipnetif_t *ipnetif)
{
	mutex_enter(&ipnetif->if_reflock);
	ipnetif->if_refcnt++;
	mutex_exit(&ipnetif->if_reflock);
}

static void
ipnetif_refrele(ipnetif_t *ipnetif)
{
	mutex_enter(&ipnetif->if_reflock);
	ASSERT(ipnetif->if_refcnt > 0);
	if (--ipnetif->if_refcnt == 0)
		ipnetif_free(ipnetif);
	else
		mutex_exit(&ipnetif->if_reflock);
}

static void
ipnet_walkers_inc(ipnet_stack_t *ips)
{
	mutex_enter(&ips->ips_walkers_lock);
	ips->ips_walkers_cnt++;
	mutex_exit(&ips->ips_walkers_lock);
}

static void
ipnet_walkers_dec(ipnet_stack_t *ips)
{
	mutex_enter(&ips->ips_walkers_lock);
	ASSERT(ips->ips_walkers_cnt != 0);
	if (--ips->ips_walkers_cnt == 0)
		cv_broadcast(&ips->ips_walkers_cv);
	mutex_exit(&ips->ips_walkers_lock);
}
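
/*
 * Observability hook callback: duplicate the observed packet and hand the
 * copy to the function registered by ipobs_register_hook() (ipnet_input).
 */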
/*ARGSUSED*/
static int
ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
{
	hook_pkt_observe_t	*hdr;
	pfv_t			func = (pfv_t)arg;
	mblk_t			*mp;

	hdr = (hook_pkt_observe_t *)info;
	mp = dupmsg(hdr->hpo_pkt);
	if (mp == NULL) {
		mp = copymsg(hdr->hpo_pkt);
		if (mp == NULL) {
			netstack_t *ns = hdr->hpo_ctx;
			ipnet_stack_t *ips = ns->netstack_ipnet;

			IPSK_BUMP(ips, ik_dispatchDupDrop);
			return (0);
		}
	}
	hdr = (hook_pkt_observe_t *)mp->b_rptr;
	hdr->hpo_pkt = mp;

	func(mp);

	return (0);
}

hook_t *
ipobs_register_hook(netstack_t *ns, pfv_t func)
{
	ip_stack_t	*ipst = ns->netstack_ip;
	char		name[32];
	hook_t		*hook;

	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
	VERIFY(hook != NULL);

	/*
	 * To register multiple hooks with the same callback function,
	 * a unique name is needed.
	 */
	(void) snprintf(name, sizeof (name), "ipobserve_%p", hook);
	hook->h_name = strdup(name);

	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
	return (hook);
}

void
ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
{
	ip_stack_t	*ipst = ns->netstack_ip;

	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
	strfree(hook->h_name);
	hook_free(hook);
}
/* ******************************************************************** */
/* BPF Functions below */
/* ******************************************************************** */
/*
* Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
*/
static ipnet_stack_t *
ipnet_find_by_zoneid(zoneid_t zoneid)
{
netstack_t *ns;
VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
return (ns->netstack_ipnet);
}

/*
 * Rather than weave the complexity of what needs to be done for a BPF
 * device attach or detach into the code paths where they're used, it is
 * presented here in a couple of simple functions, along with other
 * similar code.
 *
 * The refrele/refhold here provide the means by which it is known when
 * the clone structures can be freed.
 */
static void
ipnet_bpfdetach(ipnetif_t *ifp)
{
if (ifp->if_stackp->ips_bpfdetach_fn != NULL) {
ifp->if_stackp->ips_bpfdetach_fn((uintptr_t)ifp);
ipnetif_refrele(ifp);
}
}

static void
ipnet_bpfattach(ipnetif_t *ifp)
{
if (ifp->if_stackp->ips_bpfattach_fn != NULL) {
ipnetif_refhold(ifp);
ifp->if_stackp->ips_bpfattach_fn((uintptr_t)ifp, DL_IPNET,
ifp->if_zoneid, BPR_IPNET);
}
}

/*
* Set the functions to call back to when adding or removing an interface so
* that BPF can keep its internal list of these up to date.
*/
void
ipnet_set_bpfattach(bpf_attach_fn_t attach, bpf_detach_fn_t detach,
zoneid_t zoneid, bpf_itap_fn_t tapfunc, bpf_provider_reg_fn_t provider)
{
ipnet_stack_t *ips;
ipnetif_t *ipnetif;
avl_tree_t *tree;
ipnetif_t *next;
if (zoneid == GLOBAL_ZONEID) {
ipnet_itap = tapfunc;
}
VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
/*
* If we're setting a new attach function, call it for every
* mac that has already been attached.
*/
if (attach != NULL && ips->ips_bpfattach_fn == NULL) {
ASSERT(detach != NULL);
if (provider != NULL) {
(void) provider(&bpf_ipnet);
}
/*
* The call to ipnet_bpfattach() calls into bpf`bpfattach
* which then wants to resolve the link name into a link id.
* For ipnet, this results in a call back to
* ipnet_get_linkid_byname which also needs to lock and walk
* the AVL tree. Thus the call to ipnet_bpfattach needs to
* be made without the avl_lock held.
*/
mutex_enter(&ips->ips_event_lock);
ips->ips_bpfattach_fn = attach;
ips->ips_bpfdetach_fn = detach;
mutex_enter(&ips->ips_avl_lock);
tree = &ips->ips_avl_by_index;
for (ipnetif = avl_first(tree); ipnetif != NULL;
ipnetif = next) {
ipnetif_refhold(ipnetif);
mutex_exit(&ips->ips_avl_lock);
ipnet_bpfattach(ipnetif);
mutex_enter(&ips->ips_avl_lock);
next = avl_walk(tree, ipnetif, AVL_AFTER);
ipnetif_refrele(ipnetif);
}
mutex_exit(&ips->ips_avl_lock);
ipnet_bpf_probe_shared(ips);
mutex_exit(&ips->ips_event_lock);
} else if (attach == NULL && ips->ips_bpfattach_fn != NULL) {
ASSERT(ips->ips_bpfdetach_fn != NULL);
mutex_enter(&ips->ips_event_lock);
ips->ips_bpfattach_fn = NULL;
mutex_enter(&ips->ips_avl_lock);
tree = &ips->ips_avl_by_index;
for (ipnetif = avl_first(tree); ipnetif != NULL;
ipnetif = next) {
ipnetif_refhold(ipnetif);
mutex_exit(&ips->ips_avl_lock);
			ipnet_bpfdetach(ipnetif);
mutex_enter(&ips->ips_avl_lock);
next = avl_walk(tree, ipnetif, AVL_AFTER);
ipnetif_refrele(ipnetif);
}
mutex_exit(&ips->ips_avl_lock);
ipnet_bpf_release_shared(ips);
ips->ips_bpfdetach_fn = NULL;
mutex_exit(&ips->ips_event_lock);
if (provider != NULL) {
(void) provider(&bpf_ipnet);
}
}
}

/*
* The list of interfaces available via ipnet is private for each zone,
* so the AVL tree of each zone must be searched for a given name, even
* if all names are unique.
*/
int
ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
{
ipnet_stack_t *ips;
ipnetif_t *ipnetif;
ASSERT(ptr != NULL);
VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
mutex_enter(&ips->ips_avl_lock);
ipnetif = avl_find(&ips->ips_avl_by_name, (char *)name, NULL);
if (ipnetif != NULL) {
ipnetif_refhold(ipnetif);
}
mutex_exit(&ips->ips_avl_lock);
*ptr = ipnetif;
if (ipnetif == NULL)
return (ESRCH);
return (0);
}
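
/*
 * Release the hold taken by ipnet_open_byname().
 */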
void
ipnet_close_byhandle(ipnetif_t *ifp)
{
ASSERT(ifp != NULL);
ipnetif_refrele(ifp);
}
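
/*
 * Return the name of the given ipnet interface.
 */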
const char *
ipnet_name(ipnetif_t *ifp)
{
ASSERT(ifp != NULL);
return (ifp->if_name);
}

/*
 * To find the linkid for a given name, it is necessary to know which zone
 * the interface name belongs to and to search the avl tree for that zone:
 * there is no master list of all interfaces and the zones they belong to.
 * The caller is assumed to already be working with the ipnet interfaces
 * and to therefore hold ips_event_lock.  When BPF calls into this function
 * it does so in response to an event in ipnet, so ipnet already holds the
 * lock; the datalink id value returned is therefore valid without taking
 * a hold on the owning structure.
 */
int
ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
{
ipnet_stack_t *ips;
ipnetif_t *ifp;
VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
ASSERT(mutex_owned(&ips->ips_event_lock));
mutex_enter(&ips->ips_avl_lock);
ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
if (ifp != NULL)
*idp = (uint_t)ifp->if_index;
/*
* Shared instance zone?
*/
if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
uintptr_t key[2] = { zoneid, (uintptr_t)name };
ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
if (ifp != NULL)
*idp = (uint_t)ifp->if_index;
}
mutex_exit(&ips->ips_avl_lock);
if (ifp == NULL)
return (ESRCH);
return (0);
}

/*
 * Strictly speaking, there is no such thing as a "client" in ipnet, like
 * there is in mac.  BPF only needs this because it is required for
 * interfacing correctly with mac.  Reusing the original ipnetif_t as a
 * client poses no danger, so long as it is done with its own ref-counted
 * hold that is given up on close.
 */
int
ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
{
ASSERT(ptr != NULL);
ASSERT(result != NULL);
ipnetif_refhold(ptr);
*result = ptr;
return (0);
}
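
/*
 * Release the hold taken in ipnet_client_open().
 */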
void
ipnet_client_close(ipnetif_t *ptr)
{
ASSERT(ptr != NULL);
ipnetif_refrele(ptr);
}

/*
* This is called from BPF when it needs to start receiving packets
* from ipnet.
*
* The use of the ipnet_t structure here is somewhat lightweight when
* compared to how it is used elsewhere but it already has all of the
* right fields in it, so reuse here doesn't seem out of order. Its
* primary purpose here is to provide the means to store pointers for
* use when ipnet_promisc_remove() needs to be called.
*
* This should never be called for the IPNET_MINOR_LO device as it is
* never created via ipnetif_create.
*/
/*ARGSUSED*/
int
ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
int flags)
{
ip_stack_t *ipst;
netstack_t *ns;
ipnetif_t *ifp;
ipnet_t *ipnet;
char name[32];
int error;
ifp = (ipnetif_t *)handle;
ns = netstack_find_by_zoneid(ifp->if_zoneid);
if ((how == DL_PROMISC_PHYS) || (how == DL_PROMISC_MULTI)) {
error = ipnet_join_allmulti(ifp, ns->netstack_ipnet);
if (error != 0)
return (error);
} else {
return (EINVAL);
}
ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
ipnet->ipnet_if = ifp;
ipnet->ipnet_ns = ns;
ipnet->ipnet_flags = flags;
if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
ipnet->ipnet_acceptfn = ipnet_loaccept;
} else {
ipnet->ipnet_acceptfn = ipnet_accept;
}
/*
* To register multiple hooks with the same callback function,
* a unique name is needed.
*/
HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
ipnet->ipnet_hook);
ipnet->ipnet_hook->h_name = strdup(name);
ipnet->ipnet_data = data;
ipnet->ipnet_zoneid = ifp->if_zoneid;
ipst = ns->netstack_ip;
error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
ipnet->ipnet_hook);
if (error != 0)
goto regfail;
error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
ipnet->ipnet_hook);
if (error != 0) {
(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
NH_OBSERVE, ipnet->ipnet_hook);
goto regfail;
}
*mhandle = (uintptr_t)ipnet;
return (0);
regfail:
cmn_err(CE_WARN, "net_hook_register failed: %d", error);
strfree(ipnet->ipnet_hook->h_name);
hook_free(ipnet->ipnet_hook);
return (error);
}
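
/*
 * Undo ipnet_promisc_add(): unregister the per-open observe hooks and
 * free the hook and the ipnet_t that were allocated for this
 * promiscuous session.
 */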
void
ipnet_promisc_remove(void *data)
{
ip_stack_t *ipst;
ipnet_t *ipnet;
hook_t *hook;
ipnet = data;
ipst = ipnet->ipnet_ns->netstack_ip;
hook = ipnet->ipnet_hook;
VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
hook) == 0);
VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
hook) == 0);
strfree(hook->h_name);
hook_free(hook);
kmem_free(ipnet, sizeof (*ipnet));
}

/*
* arg here comes from the ipnet_t allocated in ipnet_promisc_add.
* An important field from that structure is "ipnet_data" that
* contains the "data" pointer passed into ipnet_promisc_add: it needs
* to be passed back to bpf when we call into ipnet_itap.
*
* ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
* from BPF.
*/
/*ARGSUSED*/
static int
ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
{
hook_pkt_observe_t *hdr;
ipnet_addrp_t src;
ipnet_addrp_t dst;
ipnet_stack_t *ips;
ipnet_t *ipnet;
mblk_t *netmp;
mblk_t *mp;
hdr = (hook_pkt_observe_t *)info;
mp = hdr->hpo_pkt;
ipnet = (ipnet_t *)arg;
ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
netmp = hdr->hpo_pkt->b_cont;
src.iap_family = hdr->hpo_family;
dst.iap_family = hdr->hpo_family;
if (hdr->hpo_family == AF_INET) {
src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
} else {
src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
}
if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
IPSK_BUMP(ips, ik_acceptFail);
return (0);
}
IPSK_BUMP(ips, ik_acceptOk);
ipnet_itap(ipnet->ipnet_data, mp,
hdr->hpo_htype == IPOBS_HOOK_OUTBOUND,
ntohs(hdr->hpo_pktlen) + (mp->b_wptr - mp->b_rptr));
return (0);
}

/*
 * Cloned ipnetif_t's are created when a shared IP instance zone comes
 * to life and configures an IP address.  The model that BPF uses is that
 * each interface must have a unique pointer and each interface must be
 * representative of what it can capture.  They are limited to one DLT
 * per interface and one zone per interface.  Thus every interface that
 * can be seen in a zone must be announced via an attach to bpf.  For
 * shared instance zones, this means the ipnet driver needs to detect
 * when an address is added to an interface in a zone for the first
 * time (and also when the last address is removed).
 */
static ipnetif_t *
ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
{
uintptr_t key[2] = { zoneid, (uintptr_t)ifp->if_name };
ipnet_stack_t *ips = ifp->if_stackp;
avl_index_t where = 0;
ipnetif_t *newif;
mutex_enter(&ips->ips_avl_lock);
newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
if (newif != NULL) {
ipnetif_refhold(newif);
newif->if_sharecnt++;
mutex_exit(&ips->ips_avl_lock);
return (newif);
}
newif = ipnet_alloc_if(ips);
if (newif == NULL) {
mutex_exit(&ips->ips_avl_lock);
return (NULL);
}
newif->if_refcnt = 1;
newif->if_sharecnt = 1;
newif->if_zoneid = zoneid;
(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
newif->if_index = ifp->if_index;
avl_insert(&ips->ips_avl_by_shared, newif, where);
mutex_exit(&ips->ips_avl_lock);
ipnet_bpfattach(newif);
return (newif);
}
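
/*
 * Drop both the regular reference and the share count on a cloned
 * ipnetif_t.  When the last share goes away the interface is removed
 * from the shared AVL tree and detached from BPF; when the last
 * reference goes away the structure itself is freed.
 */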
static void
ipnetif_clone_release(ipnetif_t *ipnetif)
{
boolean_t dofree = B_FALSE;
boolean_t doremove = B_FALSE;
ipnet_stack_t *ips = ipnetif->if_stackp;
mutex_enter(&ipnetif->if_reflock);
ASSERT(ipnetif->if_refcnt > 0);
if (--ipnetif->if_refcnt == 0)
dofree = B_TRUE;
ASSERT(ipnetif->if_sharecnt > 0);
if (--ipnetif->if_sharecnt == 0)
doremove = B_TRUE;
mutex_exit(&ipnetif->if_reflock);
if (doremove) {
mutex_enter(&ips->ips_avl_lock);
avl_remove(&ips->ips_avl_by_shared, ipnetif);
mutex_exit(&ips->ips_avl_lock);
ipnet_bpfdetach(ipnetif);
}
if (dofree) {
ASSERT(ipnetif->if_sharecnt == 0);
ipnetif_free(ipnetif);
}
}

/*
 * Called when BPF loads; the goal is to tell BPF about all of the
 * interfaces in use by zones that have a shared IP stack.  These
 * interfaces are stored in the ips_avl_by_shared tree.  Note that if
 * there are 1000 bge0's in use as bge0:1 through to bge0:1000, then this
 * would be represented by a single bge0 on that AVL tree.
 */
static void
ipnet_bpf_probe_shared(ipnet_stack_t *ips)
{
ipnetif_t *next;
ipnetif_t *ifp;
mutex_enter(&ips->ips_avl_lock);
for (ifp = avl_first(&ips->ips_avl_by_shared); ifp != NULL;
ifp = next) {
ipnetif_refhold(ifp);
mutex_exit(&ips->ips_avl_lock);
ipnet_bpfattach(ifp);
mutex_enter(&ips->ips_avl_lock);
next = avl_walk(&ips->ips_avl_by_shared, ifp, AVL_AFTER);
ipnetif_refrele(ifp);
}
mutex_exit(&ips->ips_avl_lock);
}
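
/*
 * Counterpart to ipnet_bpf_probe_shared(): called when BPF detaches,
 * walk the shared-interface tree and detach each interface from BPF.
 */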
static void
ipnet_bpf_release_shared(ipnet_stack_t *ips)
{
ipnetif_t *next;
ipnetif_t *ifp;
mutex_enter(&ips->ips_avl_lock);
for (ifp = avl_first(&ips->ips_avl_by_shared); ifp != NULL;
ifp = next) {
ipnetif_refhold(ifp);
mutex_exit(&ips->ips_avl_lock);
ipnet_bpfdetach(ifp);
mutex_enter(&ips->ips_avl_lock);
next = avl_walk(&ips->ips_avl_by_shared, ifp, AVL_AFTER);
ipnetif_refrele(ifp);
}
mutex_exit(&ips->ips_avl_lock);
}