/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2016, Joyent, Inc. All rights reserved.
*/
/*
* The ipnet device defined here provides access to packets at the IP layer. To
* provide access to packets at this layer it registers a callback function in
* the ip module and when there are open instances of the device ip will pass
* packets into the device. Packets from ip are passed on the input, output and
* loopback paths. Internally the module returns to ip as soon as possible by
* deferring processing using a taskq.
*
* Device nodes are managed through the dev filesystem and use of the neti
* interfaces. This module registers for NIC events using the neti framework
* so that when IP interfaces are brought up,
* taken down etc. the ipnet module is notified and its view of the interfaces
* configured on the system adjusted. On attach, the module gets an initial
* view of the system again using the neti framework but as it has already
* registered for IP interface events, it is still up-to-date with any changes.
*/
#include <sys/id_space.h>
#include <sys/hook_event.h>
#include <sys/sysmacros.h>
#include <inet/ip_multi.h>
1, /* mi_idnum */
"ipnet", /* mi_idname */
0, /* mi_minpsz */
INFPSZ, /* mi_maxpsz */
2048, /* mi_hiwat */
0 /* mi_lowat */
};
/*
* List to hold static view of ipnetif_t's on the system. This is needed to
* avoid holding the lock protecting the avl tree of ipnetif's over the
* callback into the dev filesystem.
*/
typedef struct ipnetif_cbdata {
/*
* Convenience enumerated type for ipnet_accept(). It describes the
* properties of a given ipnet_addrp_t relative to a single ipnet_t
* client stream. The values represent whether the address is ...
*/
typedef enum {
/* Argument used for the ipnet_nicevent_taskq callback. */
typedef struct ipnet_nicevent_s {
static void ipnet_input(mblk_t *);
static int ipnet_rsrv(queue_t *);
static int ipnet_close(queue_t *);
static void ipnet_nicevent_task(void *);
uint64_t);
static int ipnetif_compare_name(const void *, const void *);
static int ipnetif_compare_name_zone(const void *, const void *);
static int ipnetif_compare_index(const void *, const void *);
static void ipnetif_refhold(ipnetif_t *);
static void ipnetif_refrele(ipnetif_t *);
static void ipnet_walkers_inc(ipnet_stack_t *);
static void ipnet_walkers_dec(ipnet_stack_t *);
static void ipnet_register_netihook(ipnet_stack_t *);
static void ipnet_stack_fini(netstackid_t, void *);
static void ipnet_dispatch(void *);
static void ipnetif_clone_release(ipnetif_t *);
NULL, /* qi_putp */
ipnet_rsrv, /* qi_srvp */
ipnet_open, /* qi_qopen */
ipnet_close, /* qi_qclose */
NULL, /* qi_qadmin */
&ipnet_minfo, /* qi_minfo */
};
ipnet_wput, /* qi_putp */
NULL, /* qi_srvp */
NULL, /* qi_qopen */
NULL, /* qi_qclose */
NULL, /* qi_qadmin */
&ipnet_minfo, /* qi_minfo */
};
};
"STREAMS ipnet driver",
};
};
/*
* This structure contains the template data (names and type) that is
* copied, in bulk, into the new kstats structure created by net_kstat_create.
* No actual statistical information is stored in this instance of the
* ipnet_kstats_t structure.
*/
{ "duplicationFail", KSTAT_DATA_UINT64 },
{ "dispatchOk", KSTAT_DATA_UINT64 },
{ "dispatchFail", KSTAT_DATA_UINT64 },
{ "dispatchHeaderDrop", KSTAT_DATA_UINT64 },
{ "dispatchDupDrop", KSTAT_DATA_UINT64 },
{ "dispatchDeliver", KSTAT_DATA_UINT64 },
{ "acceptOk", KSTAT_DATA_UINT64 },
{ "acceptFail", KSTAT_DATA_UINT64 }
};
/*
* Walk the list of physical interfaces on the machine; for each
* interface create a new ipnetif_t and add any addresses to it. We
* need to do the walk twice, once for IPv4 and once for IPv6.
*
* The interfaces are destroyed as part of ipnet_stack_fini() for each
* stack. Note that we cannot do this initialization in
* ipnet_stack_init(), since ipnet_stack_init() cannot fail.
*/
static int
ipnetif_init(void)
{
/* Status returned to the caller; initialized to 0. */
int ret = 0;
/* A non-zero ret ends the walk early and is returned as-is. */
if (ret != 0)
break;
}
return (ret);
}
/*
* Standard module entry points.
*/
/*
* Module load entry point. Returns 0 on success or a non-zero error
* value (ENODEV is one of them). Failures after the early return jump
* to the done label, where partially completed setup is checked.
*/
int
_init(void)
{
int ret;
return (ENODEV);
/*
* We call ddi_taskq_create() with nthread == 1 to ensure in-order
* delivery of packets to clients. Note that we need to create the
* taskqs before calling netstack_register() since ipnet_stack_init()
* registers callbacks that use them.
*/
1, TASKQ_DEFAULTPRI, 0);
goto done;
}
/* Populate the initial interface view; 0 means it succeeded. */
if ((ret = ipnetif_init()) == 0)
done:
/*
* Failure path: each piece of state that may have been set up (the
* two taskqs and the netstack registration) is tested individually.
* NOTE(review): confirm each guarded resource is released here.
*/
if (ret != 0) {
if (ipnet_taskq != NULL)
if (ipnet_nicevent_taskq != NULL)
if (netstack_registered)
}
return (ret);
}
/*
* Module unload entry point. Returns err when a step fails, otherwise 0.
*/
int
_fini(void)
{
/* Error value handed back on the early-return path. */
int err;
return (err);
return (0);
}
int
{
}
static void
{
int ret;
ips);
/*
* It is possible for an exclusive stack to be in the process of
* shutting down here, and the netid and protocol lookups could fail
* in that case.
*/
return;
ips->ips_nicevents)) != 0) {
}
}
ips->ips_nicevents)) != 0) {
}
}
/*
* Create a local set of kstats for each zone.
*/
"misc", KSTAT_TYPE_NAMED,
sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
} else {
"ipnet", "ipnet_stats", "misc");
}
}
/*
* This function is called on attach to build an initial view of the
* interfaces on the system. It will be called once for IPv4 and once
* for IPv6: although there is only one ipnet interface for both IPv4
* and IPv6, there are separate address lists.
*/
static int
{
int ret = 0;
/*
* If ipnet_register_netihook() was unable to initialize this
* stack's net_handle_t, then we cannot populate any interface
* information. This usually happens when we attempted to
* grab a net_handle_t as a stack was shutting down. We don't
* want to fail the entire _init() operation because of a
* stack shutdown (other stacks will continue to work just
* fine), so we silently return success here.
*/
return (0);
/*
* Make sure we're not processing NIC events during the
* population of our interfaces and address lists.
*/
continue;
ifflags = 0;
goto done;
}
}
/*
* Skip addresses that aren't up. We'll add
* them when we receive an NE_LIF_UP event.
*/
continue;
/* Don't add it if we already have it. */
continue;
}
if (!new_if)
}
done:
return (ret);
}
static int
{
if (cmd != DDI_ATTACH)
return (DDI_FAILURE);
DDI_PSEUDO, 0) == DDI_FAILURE)
return (DDI_FAILURE);
return (DDI_SUCCESS);
}
static int
{
if (cmd != DDI_DETACH)
return (DDI_FAILURE);
return (DDI_SUCCESS);
}
/* ARGSUSED */
static int
{
switch (infocmd) {
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
error = DDI_SUCCESS;
break;
case DDI_INFO_DEVT2DEVINFO:
error = DDI_SUCCESS;
}
break;
}
return (error);
}
/* ARGSUSED */
static int
{
int err = 0;
/*
* If the system is labeled, only the global zone is allowed to open
* IP observability nodes.
*/
return (EACCES);
/* We don't support open as a module */
return (ENOTSUP);
/* This driver is self-cloning, we don't support re-open. */
return (EBUSY);
return (ENOMEM);
/*
* We need to hold ips_event_lock here as any NE_LIF_DOWN events need
* to be processed after ipnet_if is set and the ipnet_t has been
* inserted in the ips_str_list.
*/
} else {
goto done;
}
}
while (ips->ips_walkers_cnt != 0)
/*
* Only register our callback if we're the first open client; we call
* unregister in close() for the last open client.
*/
done:
if (err != 0) {
}
return (err);
}
static int
{
while (ips->ips_walkers_cnt != 0)
}
return (0);
}
static int
{
case M_FLUSH:
}
else
break;
case M_PROTO:
case M_PCPROTO:
ipnet_wputnondata(q, mp);
break;
case M_IOCTL:
ipnet_ioctl(q, mp);
break;
case M_IOCDATA:
ipnet_iocdata(q, mp);
break;
default:
break;
}
return (0);
}
static int
{
if (canputnext(q)) {
} else {
break;
}
}
return (0);
}
static void
{
case DLIOCRAW:
break;
case DLIOCIPNETINFO:
break;
}
/* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
default:
break;
}
}
static void
{
case DLIOCIPNETINFO:
else
goto iocnak;
break;
default:
break;
}
}
static void
{
switch (prim) {
case DL_INFO_REQ:
ipnet_inforeq(q, mp);
break;
case DL_UNBIND_REQ:
ipnet_unbindreq(q, mp);
break;
case DL_BIND_REQ:
ipnet_bindreq(q, mp);
break;
case DL_PROMISCON_REQ:
ipnet_dlpromisconreq(q, mp);
break;
case DL_PROMISCOFF_REQ:
ipnet_dlpromiscoffreq(q, mp);
break;
case DL_UNITDATA_REQ:
case DL_DETACH_REQ:
case DL_PHYS_ADDR_REQ:
case DL_SET_PHYS_ADDR_REQ:
case DL_ENABMULTI_REQ:
case DL_DISABMULTI_REQ:
case DL_ATTACH_REQ:
break;
default:
break;
}
}
static void
{
return;
}
return;
*dlip = ipnet_infoack;
}
static void
{
return;
}
case 0 :
break;
case IPV4_VERSION :
break;
case IPV6_VERSION :
break;
default :
return;
/*NOTREACHED*/
}
}
static void
{
return;
}
} else {
}
}
static void
{
int err;
return;
}
return;
}
return;
}
}
switch (level) {
case DL_PROMISC_PHYS:
break;
case DL_PROMISC_SAP:
break;
case DL_PROMISC_MULTI:
break;
default:
return;
}
}
static void
{
return;
}
return;
}
switch (level) {
case DL_PROMISC_PHYS:
break;
case DL_PROMISC_SAP:
break;
case DL_PROMISC_MULTI:
break;
default:
return;
}
return;
}
}
}
static int
{
int err = 0;
if (ipnetif->if_multicnt == 0) {
(IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
if (err != 0)
goto done;
}
if (err != 0 &&
goto done;
}
}
}
ipnetif->if_multicnt++;
done:
return (err);
}
static void
{
int err;
if (--ipnetif->if_multicnt == 0) {
}
}
}
}
/*
* Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
* The structure it copies the header information from,
* hook_pkt_observe_t, is constructed using network byte
* order in ipobs_hook(), so there is no conversion here.
*/
static mblk_t *
{
return (NULL);
}
return (dlhdr);
}
static ipnet_addrtype_t
{
/* First check if the address is multicast or limited broadcast. */
switch (addr->iap_family) {
case AF_INET:
return (IPNETADDR_MBCAST);
break;
case AF_INET6:
return (IPNETADDR_MBCAST);
break;
}
/*
* Walk the address list to see if the address belongs to our
* interface or is one of our subnet broadcast addresses.
*/
/*
* If we're not in the global zone, then only look at
* addresses in our zone.
*/
continue;
switch (addr->iap_family) {
case AF_INET:
break;
case AF_INET6:
&ifaddr->ifa_ip6addr))
break;
}
}
return (addrtype);
}
/*
* Verify if the packet contained in hdr should be passed up to the
* ipnet client stream.
*/
static boolean_t
{
/*
* If the packet's ifindex matches ours, or the packet's group ifindex
* matches ours, it's on the interface we're observing. (Thus,
* observing on the group ifindex matches all ifindexes in the group.)
*/
/*
* Do not allow an ipnet stream to see packets that are not from or to
* its zone. The exception is when zones are using the shared stack
* model. In this case, streams in the global zone have visibility
* into other shared-stack zones, and broadcast and multicast traffic
* is visible by all zones in the stack.
*/
dsttype != IPNETADDR_MBCAST) {
return (B_FALSE);
}
/*
* If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
* packet's IP version.
*/
return (B_FALSE);
/* If the destination address is ours, then accept the packet. */
if (dsttype == IPNETADDR_MYADDR)
return (B_TRUE);
/*
* If DL_PROMISC_PHYS is enabled, then we can see all packets that are
* sent or received on the interface we're observing, or packets that
* have our source address (this allows us to see packets we send).
*/
return (B_TRUE);
}
/*
* We accept multicast and broadcast packets transmitted or received
* on the interface we're observing.
*/
return (B_TRUE);
return (B_FALSE);
}
/*
* Verify if the packet contained in hdr should be passed up to the ipnet
* client stream that's in IPNET_LOMODE.
*/
/* ARGSUSED */
static boolean_t
{
/*
* ipnet_if is only NULL for IPNET_MINOR_LO devices.
*/
return (B_FALSE);
}
/*
*/
return (B_FALSE);
}
}
static void
{
} else {
}
continue;
}
} else {
continue;
}
}
continue;
}
}
} else {
}
}
}
static void
{
DDI_SUCCESS) {
} else {
}
}
static ipnetif_t *
{
return (NULL);
return (ipnetif);
}
/*
* Create a new ipnetif_t and new minor node for it. If creation is
* successful the new ipnetif_t is inserted into an avl_tree
* containing ipnetif's for this stack instance.
*/
static ipnetif_t *
{
/*
* Because ipnetif_create() can be called from a NIC event
* callback, it should not block.
*/
return (NULL);
return (NULL);
}
if ((ifflags & IFF_LOOPBACK) != 0)
return (ipnetif);
}
static void
{
/* Send a SIGHUP to all open streams associated with this ipnetif. */
}
/*
* Release the reference we implicitly held in ipnetif_create().
*/
}
static void
{
}
}
static void
{
}
/*
* Create an ipnetif_addr_t with the given logical interface id (lif)
* and add it to the supplied ipnetif. The lif is the netinfo
* representation of logical interface id, and we use this id to match
* incoming netinfo events against our lists of addresses.
*/
static void
{
return;
return;
case AF_INET:
/*
* Try and get the broadcast address. Note that it's okay for
* an interface to not have a broadcast address, so we don't
* fail the entire operation if net_getlifaddr() fails here.
*/
type = NA_BROADCAST;
break;
case AF_INET6:
break;
}
/*
* The zoneid stored in ipnetif_t needs to correspond to the actual
* zone the address is being used in. This facilitates finding the
* correct netstack_t pointer, amongst other things, later.
*/
}
}
static void
{
}
static void
{
char *ifname;
ifflags = 0;
}
}
if (ipnetif->if_multicnt != 0) {
}
}
if (refrele_needed)
}
static void
{
return;
/*
* Note that we have one ipnetif for both IPv4 and IPv6, but we receive
* separate NE_UNPLUMB events for IPv4 and IPv6. We remove the ipnetif
* if both IPv4 and IPv6 interfaces have been unplumbed.
*/
}
static void
{
return;
/*
* We must have missed an NE_LIF_DOWN event. Delete this
* ifaddr and re-create it.
*/
}
}
static void
{
return;
/*
* Make sure that open streams on this ipnetif are still allowed to
* have it open.
*/
}
/*
* This callback from the NIC event framework dispatches a taskq as the event
* handlers may block.
*/
/* ARGSUSED */
static int
{
return (0);
if (hn->hne_datalen != 0) {
sizeof (ipne->ipne_ifname));
}
ipne, DDI_NOSLEEP);
return (0);
}
static void
{
goto done;
switch (ipne->ipne_event) {
case NE_PLUMB:
break;
case NE_UNPLUMB:
break;
case NE_LIF_UP:
break;
case NE_LIF_DOWN:
isv6);
break;
default:
break;
}
done:
}
{
return (dev);
return (dev);
}
return (dev);
}
static ipnetif_t *
{
return (ipnetif);
}
static ipnetif_t *
{
break;
}
}
return (ipnetif);
}
static ipnetif_addr_t *
{
break;
}
return (ifaddr);
}
/* ARGSUSED */
static void *
{
return (ips);
}
/* ARGSUSED */
static void
{
}
ips->ips_nicevents) == 0);
}
ips->ips_nicevents) == 0);
}
}
}
/* Do any of the addresses in addrlist belong to the supplied zoneid? */
static boolean_t
{
return (B_TRUE);
}
return (B_FALSE);
}
/* Should the supplied ipnetif be visible from the supplied zoneid? */
static boolean_t
{
int ret;
/*
* The global zone has visibility into all interfaces in the global
* stack, and exclusive stack zones have visibility into all
* interfaces in their stack.
*/
if (zoneid == GLOBAL_ZONEID ||
return (B_TRUE);
/*
* Shared-stack zones only have visibility for interfaces that have
* addresses in their zone.
*/
return (ret);
}
/*
* Verify that any ipnet_t that has a reference to the supplied ipnetif should
* still be allowed to have it open. A given ipnet_t may no longer be allowed
* to have an ipnetif open if there are no longer any addresses that belong to
* the ipnetif in the ipnet_t's non-global shared-stack zoneid. If that's the
* case, send the ipnet_t an M_HANGUP.
*/
static void
{
continue;
}
}
void
{
/*
* On labeled systems, non-global zones shouldn't see anything
*/
return;
return;
continue;
}
}
}
static int
{
}
static int
{
int res;
}
static int
{
int res;
if (res != 0)
}
static void
{
}
static void
{
else
}
static void
{
ips->ips_walkers_cnt++;
}
static void
{
if (--ips->ips_walkers_cnt == 0)
}
/*ARGSUSED*/
static int
{
/*
* Code in ip_input() expects that it is the only one accessing the
* packet.
*/
return (0);
}
return (0);
}
hook_t *
{
/*
* To register multiple hooks with the same callback function,
* a unique name is needed.
*/
return (hook);
}
void
{
}
/* ******************************************************************** */
/* BPF Functions below */
/* ******************************************************************** */
/*
* Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
*/
{
return (ns->netstack_ipnet);
}
/*
* Functions, such as the above ipnet_find_by_zoneid(), will return a
* pointer to ipnet_stack_t by calling a netstack lookup function.
* The netstack_find_*() functions return a pointer after doing a "hold"
* on the data structure and thereby require a "release" when the caller
* is finished with it. We need to mirror that API here and thus a caller
* of ipnet_find_by_zoneid() is required to call ipnet_rele().
*/
void
{
}
/*
*/
void
{
}
/*
* The list of interfaces available via ipnet is private for each zone,
* so the AVL tree of each zone must be searched for a given name, even
* if all names are unique.
*/
int
{
/*
* Shared instance zone?
*/
} else {
}
return (ESRCH);
return (0);
}
void
{
}
const char *
{
}
/*
* To find the linkid for a given name, it is necessary to know which zone
* the interface name belongs to and to search the avl tree for that zone
* as there is no master list of all interfaces and which zone they belong
* to. It is assumed that the caller of this function is somehow already
* working with the ipnet interfaces and hence the ips_event_lock is held.
* When BPF calls into this function, it is doing so because of an event
* in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
* value returned has meaning without the need for grabbing a hold on the
* owning structure.
*/
int
{
/*
* Shared instance zone?
*/
}
return (ESRCH);
return (0);
}
/*
* Strictly speaking, there is no such thing as a "client" in ipnet, like
* there is in mac. BPF only needs to have this because it is required as
* part of interfacing correctly with mac. The reuse of the original
* ipnetif_t as a client poses no danger, so long as it is done with its
* own ref-count'd hold that is given up on close.
*/
int
{
return (0);
}
void
{
}
/*
* This is called from BPF when it needs to start receiving packets
* from ipnet.
*
* The use of the ipnet_t structure here is somewhat lightweight when
* compared to how it is used elsewhere but it already has all of the
* right fields in it, so reuse here doesn't seem out of order. Its
* primary purpose here is to provide the means to store pointers for
* use when ipnet_promisc_remove() needs to be called.
*
* This should never be called for the IPNET_MINOR_LO device as it is
* never created via ipnetif_create.
*/
/*ARGSUSED*/
int
int flags)
{
int error;
return (EINVAL);
return (error);
}
} else {
}
/*
* To register multiple hooks with the same callback function,
* a unique name is needed.
*/
(void *)ipnet->ipnet_hook);
ipnet->ipnet_hook);
if (error != 0)
goto regfail;
ipnet->ipnet_hook);
if (error != 0) {
goto regfail;
}
return (0);
return (error);
}
void
{
hook) == 0);
hook) == 0);
}
/*
* arg here comes from the ipnet_t allocated in ipnet_promisc_add.
* An important field from that structure is "ipnet_data" that
* contains the "data" pointer passed into ipnet_promisc_add: it needs
* to be passed back to bpf when we call into ipnet_itap.
*
* ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
* from BPF.
*/
/*ARGSUSED*/
static int
{
} else {
}
return (0);
}
return (0);
}
/*
* clone'd ipnetif_t's are created when a shared IP instance zone comes
* to life and configures an IP address. The model that BPF uses is that
* each interface must have a unique pointer and each interface must be
* representative of what it can capture. They are limited to one DLT
* per interface and one zone per interface. Thus every interface that
* can be seen in a zone must be announced via an attach to bpf. For
* shared instance zones, this means the ipnet driver needs to detect
* when an address is added to an interface in a zone for the first
* time (and also when the last address is removed.)
*/
static ipnetif_t *
{
newif->if_sharecnt++;
return (newif);
}
return (NULL);
}
return (newif);
}
static void
{
if (--ipnetif->if_sharecnt == 0)
if (doremove) {
}
if (dofree) {
}
}