/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/* Copyright (c) 1990 Mentat Inc. */
/*
* Procedures for the kernel part of DVMRP,
* a Distance-Vector Multicast Routing Protocol.
* (See RFC-1075)
* Written by David Waitzman, BBN Labs, August 1988.
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Mark J. Steiglitz, Stanford, May, 1991
* Modified by Van Jacobson, LBL, January 1993
* Modified by Ajit Thyagarajan, PARC, August 1993
* Modified by Bill Fenner, PARC, April 1995
*
* MROUTING 3.5
*/
/*
* TODO
* - function pointer field in vif, void *vif_sendit()
*/
#include <inet/ipsec_impl.h>
#include <inet/tunables.h>
#include <netinet/igmp_var.h>
#include <netinet/ip_mroute.h>
#include <inet/ip_multi.h>
#include <inet/ipclassifier.h>
/*
* MT Design:
*
* There are three main data structures viftable, mfctable and tbftable that
* need to be protected against MT races.
*
* viftable is a fixed length array of vif structs. There is no lock to protect
* the whole array, instead each struct is protected by its own individual lock.
* The value of v_marks in conjunction with the value of v_refcnt determines the
* current state of a vif structure. One special state that needs mention
* is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
* that vif is being initialized.
* Each structure is freed when the refcnt goes down to zero. If a delete comes
* in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
* which prevents the struct from further use. When the refcnt goes to zero
* the struct is freed and is marked VIF_MARK_NOTINUSE.
* To prevent the ipif referenced by a vif from going away, a refhold is put on
* the ipif before using it; see lock_good_vif() and unlock_good_vif().
*
* VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
* of the vif struct.
*
* tbftable is also a fixed length array of tbf structs and is only accessed
* via v_tbf. It is protected by its own lock tbf_lock.
*
* Lock Ordering is
* v_lock --> tbf_lock
* v_lock --> ill_lock
*
* mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
* Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
* it also maintains a state. These fields are protected by a lock (mfcb_lock).
* mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
* protect the struct elements.
*
* mfc structs are dynamically allocated and are singly linked
* at the head of the chain. When an mfc structure is to be deleted
* it is marked condemned and so is the state in the bucket struct.
* When the last walker of the hash bucket exits all the mfc structs
* marked condemned are freed.
*
* Locking Hierarchy:
* The bucket lock should be acquired before the mfc struct lock.
* MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
* operations on the bucket struct.
*
* last_encap_lock and numvifs_mutex should be acquired after
* acquiring vif or mfc locks. These locks protect some global variables.
*
* The statistics are not currently protected by a lock,
* causing the stats to be approximate, not exact.
*/
/*
* Timeouts:
* Upcall timeouts - BSD uses boolean_t mfc->expire and
* nexpire[MFCTBLSIZE], the number of times expire has been called.
* SunOS 5.x uses mfc->timeout for each mfc.
* Some Unixes are limited in the number of simultaneous timeouts
* that can be run, SunOS 5.x does not have this restriction.
*/
/*
* In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
* UPCALL_EXPIRE is the number of timeouts before a particular upcall
* expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
*/
/*
* Hash function for a source, group entry
*/
((g) >> 20) ^ ((g) >> 10) ^ (g))
/* Identify PIM packet that came on a Register interface */
/* Function declarations */
static void expire_upcalls(void *);
static void free_queue(struct mfc *);
static int get_version(uchar_t *);
static int set_assert(int *, ip_stack_t *);
/*
* Token Bucket Filter functions
*/
static void tbf_process_q(struct vif *);
static void tbf_reprocess_q(void *);
static void tbf_update_tokens(struct vif *);
static void release_mfc(struct mfcb *);
/*
* Encapsulation packets
*/
/* prototype IP hdr for encapsulated packets */
0, /* tos */
sizeof (ipha_t), /* total length */
0, /* id */
0, /* frag offset */
0, /* checksum */
};
/*
* Rate limit for assert notification messages, in nsec.
*/
}
} else { \
} \
}
} else { \
} \
}
(mfcb)->mfcb_refcnt++; \
}
if (--(mfcb)->mfcb_refcnt == 0 && \
release_mfc(mfcb); \
} \
}
/*
* MFCFIND:
* Find a route for a given origin IP address and multicast group address.
* Skip entries with pending upcalls.
* Type of service parameter to be added in the future!
*/
while (_mb_rt) { \
break; \
} \
} \
}
/*
* BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
* are inefficient. We use gethrestime() which returns a timespec_t with
* sec and nsec, the resolution is machine dependent.
* The following 2 macros have been changed to use nsec instead of usec.
*/
/*
* Macros to compute elapsed time efficiently.
* Borrowed from Van Jacobson's scheduling code.
* Delta should be a hrtime_t.
*/
int xxs; \
\
switch (xxs) { \
case 2: \
delta += 1000000000; \
/*FALLTHROUGH*/ \
case 1: \
delta += 1000000000; \
break; \
default: \
} \
} \
}
/*
* Handle MRT setsockopt commands to modify the multicast routing tables.
*/
int
int datalen)
{
return (EACCES);
}
if (checkonly) {
/*
* do not do operation, just pretend to - new T_CHECK
* Note: Even routines further on can probably fail but
* this T_CHECK stuff is only to please XTI so it is not
* necessary to be perfect.
*/
switch (cmd) {
case MRT_INIT:
case MRT_DONE:
case MRT_ADD_VIF:
case MRT_DEL_VIF:
case MRT_ADD_MFC:
case MRT_DEL_MFC:
case MRT_ASSERT:
return (0);
default:
return (EOPNOTSUPP);
}
}
/*
* make sure no command is issued after multicast routing has been
* turned off.
*/
if (is_mrouter_off(ipst))
return (EINVAL);
}
switch (cmd) {
default: return (EOPNOTSUPP);
}
}
/*
* Handle MRT getsockopt commands
*/
int
{
return (EACCES);
switch (cmd) {
default: return (EOPNOTSUPP);
}
}
/*
* Handle ioctl commands to obtain information from the cache.
* Called with shared access to IP. These are read_only ioctls.
*/
/* ARGSUSED */
int
{
/* Existence verified in ip_wput_nondata */
case (SIOCGETVIFCNT):
case (SIOCGETSGCNT):
case (SIOCGETLSGCNT):
default:
return (EINVAL);
}
}
/*
* Returns the packet, byte, rpf-failure count for the source, group provided.
*/
static int
{
} else
return (0);
}
/*
* Returns the packet, byte, rpf-failure count for the source, group provided.
* Uses larger counters and IPv6 addresses.
*/
/* ARGSUSED XXX until implemented */
static int
{
/* XXX TODO SIOCGETLSGCNT */
return (ENXIO);
}
/*
* Returns the input and output packet and byte counts on the vif provided.
*/
static int
{
return (EINVAL);
/*
* No locks here, an approximation is fine.
*/
return (0);
}
/*
 * Report the multicast routing version.
 * Stores the constant 0x0305 through data (viewed as an int *) and
 * always returns 0.
 * NOTE(review): 0x0305 presumably encodes the "MROUTING 3.5" release named
 * in the file header comment -- confirm.
 */
static int
{
int *v = (int *)data;
*v = 0x0305; /* XXX !!!! */
return (0);
}
/*
* Set PIM assert processing global.
*/
static int
{
/* Only the values 0 and 1 are accepted; anything else fails with EINVAL. */
if ((*i != 1) && (*i != 0))
return (EINVAL);
/* Record the validated setting in ipst->ips_pim_assert and report success. */
ipst->ips_pim_assert = *i;
return (0);
}
/*
* Get PIM assert processing global.
*/
static int
{
int *i = (int *)data;
/* Copy the current ips_pim_assert value out through data; always returns 0. */
*i = ipst->ips_pim_assert;
return (0);
}
/*
* Enable multicast routing.
*/
static int
{
int *v;
return (ENOPROTOOPT);
v = (int *)data;
if (*v != 1)
return (ENOPROTOOPT);
return (EADDRINUSE);
}
/*
* MRT_INIT should only be allowed for RAW sockets, but we double
* check.
*/
if (!IPCL_IS_RAWIP(connp)) {
return (EINVAL);
}
/* In order for tunnels to work we have to turn ip_g_forward on */
if (!WE_ARE_FORWARDING(ipst)) {
"ip_mrouter_init: turning on forwarding");
}
}
return (0);
}
void
{
KM_SLEEP);
/*
* mfctable:
* Includes all mfcs, including waiting upcalls.
* Multiple mfcs per bucket.
*/
KM_SLEEP);
/*
* Define the token bucket filter structures.
* tbftable -> each vif has one of these for storing info.
*/
}
/*
* Disable multicast routing.
* Didn't use global timeout_val (BSD version), instead check the mfctable.
*/
int
{
int i;
return (EINVAL);
}
"ip_mrouter_done: turning off forwarding");
}
}
/*
* Always clear cache when vifs change.
* No need to get ipst->ips_last_encap_lock since we are running as
* a writer.
*/
ipst->ips_last_encap_src = 0;
mrouter->conn_multi_router = 0;
/*
* For each phyint in use,
* disable promiscuous reception of all IP multicasts.
*/
/*
* if the vif is active mark it condemned.
*/
/* Phyint only */
(void) ip_delmulti(ilm);
}
}
/*
* decreases the refcnt added in add_vif.
* and release v_lock.
*/
} else {
continue;
}
}
ipst->ips_numvifs = 0;
ipst->ips_pim_assert = 0;
/*
* Free upcall msgs.
* Go through mfctable and stop any outstanding upcall
* timeouts remaining on mfcs.
*/
for (i = 0; i < MFCTBLSIZ; i++) {
while (mfc_rt) {
/* Free upcalls */
if (mfc_rt->mfc_timeout_id != 0) {
/*
* OK to drop the lock as we have
* a refcnt on the bucket. timeout
* can fire but it will see that
* mfc_timeout_id == 0 and not do
* anything. see expire_upcalls().
*/
mfc_rt->mfc_timeout_id = 0;
(void) untimeout(
mfc_rt->mfc_timeout_id = 0;
/*
* all queued upcall packets
* and mblk will be freed in
* release_mfc().
*/
}
}
}
}
return (0);
}
void
{
int i;
for (i = 0; i < MFCTBLSIZ; i++) {
(void) printf("ip_mrouter_stack_destroy: free for %d\n",
i);
free_queue(rt);
}
}
}
static boolean_t
{
return (B_TRUE);
}
if (mrouter->conn_multi_router == 0) {
return (B_TRUE);
}
return (B_FALSE);
}
static void
{
}
static boolean_t
{
return (B_FALSE);
}
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Add a vif to the vif table.
*/
static int
{
int error = 0;
return (EINVAL);
if (is_mrouter_off(ipst))
return (EINVAL);
/*
* Viftable entry should be 0.
* if v_marks == 0 but v_refcnt != 0 means struct is being
* initialized.
*
* Also note that it is very unlikely that we will get a MRT_ADD_VIF
* request while the delete is in progress, mrouted only sends add
* requests when a new interface is added and the new interface cannot
* have the same vifi as an existing interface. We make sure that
* ill_delete will block till the vif is deleted by adding a refcnt
* to ipif in del_vif().
*/
return (EADDRINUSE);
}
/* Incoming vif should not be 0 */
return (EINVAL);
}
/* Find the interface with the local address */
return (EADDRNOTAVAIL);
}
"add_vif: src 0x%x enter",
}
/*
* Always clear cache when vifs change.
* Needed to ensure that src isn't left over from before vif was added.
* No need to get last_encap_lock, since we are running as a writer.
*/
ipst->ips_last_encap_src = 0;
"add_vif: source route tunnels not supported\n");
return (EOPNOTSUPP);
}
} else {
/* Phyint or Register vif */
/*
* Note: Since all IPPROTO_IP level options (including
* MRT_ADD_VIF) are done exclusively via
* ip_optmgmt_writer(), a lock is not necessary to
* protect reg_vif_num.
*/
} else {
return (EADDRINUSE);
}
}
/* Make sure the interface supports multicast */
}
return (EOPNOTSUPP);
}
/* Enable promiscuous reception of all IP mcasts from the if */
if (IS_UNDER_IPMP(ill))
} else {
}
}
/*
* Since we released the lock, let's make sure that
* ip_mrouter_done() has not been called.
*/
(void) ip_delmulti(ilm);
}
}
}
}
/* Define parameters for the tbf structure */
/* Scaling up here, allows division by 1024 in critical code. */
vifp->v_timeout_id = 0;
/* initialize per vif pkt counters */
vifp->v_bytes_in = 0;
vifp->v_bytes_out = 0;
/* Adjust numvifs up, if the vifi is higher than numvifs */
"add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
}
return (0);
}
/* Delete a vif from the vif table. */
static void
{
}
if (vifp->v_timeout_id != 0) {
vifp->v_timeout_id = 0;
}
/*
* Free packets queued at the interface.
* Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
*/
mutex_enter(&t->tbf_lock);
}
mutex_exit(&t->tbf_lock);
/*
* Always clear cache when vifs change.
* No need to get last_encap_lock since we are running as a writer.
*/
ipst->ips_last_encap_src = 0;
}
mutex_destroy(&t->tbf_lock);
/* Adjust numvifs down */
break;
}
static int
{
return (EINVAL);
/*
* Not initialized
* Here we are not looking at the vif that is being initialized
* i.e vifp->v_marks == 0 and refcnt > 0.
*/
return (EADDRNOTAVAIL);
}
/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
/* Phyint only */
/*
* should be OK to drop the lock as we
* have marked this as CONDEMNED.
*/
(void) ip_delmulti(ilm);
}
}
}
/*
* decreases the refcnt added in add_vif.
*/
return (0);
}
/*
* Add an mfc entry.
*/
static int
{
int i;
/*
* The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
* did not have a real route for pkt.
* We want this pkt without rt installed in the mfctable to prevent
* multiple tries, so go ahead and put it in mfctable, it will
* be discarded later in ip_mdq() because the child is NULL.
*/
/* Error checking, out of bounds? */
ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
(int)mfccp->mfcc_parent));
return (EINVAL);
}
ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
(int)mfccp->mfcc_parent));
return (EINVAL);
}
if (is_mrouter_off(ipst)) {
return (EINVAL);
}
/* If an entry already exists, just update the fields */
if (rt) {
"add_mfc: update o %x grp %x parent %x",
mfccp->mfcc_parent);
}
for (i = 0; i < (int)ipst->ips_numvifs; i++)
return (0);
}
/*
* Find the entry for which the upcall was made and update.
*/
if (nstl++ != 0)
"add_mfc: %s o %x g %x p %x",
"multiple kernel entries",
mfccp->mfcc_parent);
"add_mfc: o %x g %x p %x",
mfccp->mfcc_parent);
}
/*
* Prevent cleanup of cache entry.
* Timer starts in ip_mforward.
*/
if (rt->mfc_timeout_id != 0) {
/*
* Setting id to zero will prevent this
* entry from being cleaned up in
* expire_upcalls().
*/
rt->mfc_timeout_id = 0;
/*
* dropping the lock is fine as we
* have a refhold on the bucket.
* so mfc cannot be freed.
* The timeout can fire but it will see
* that mfc_timeout_id == 0 and not cleanup.
*/
}
/*
* Send all pkts that are queued waiting for the upcall.
* ip_mdq param tun set to 0 -
* the return value of ip_mdq() isn't used here,
* so value we send doesn't matter.
*/
}
}
}
/*
* It is possible that an entry is being inserted without an upcall
*/
if (nstl == 0) {
"add_mfc: no upcall o %x g %x p %x",
mfccp->mfcc_parent);
}
if (is_mrouter_off(ipst)) {
return (EINVAL);
}
break;
}
}
/* No upcall, so make a new entry into mfctable */
ip1dbg(("add_mfc: out of memory\n"));
return (ENOBUFS);
}
/* Insert new entry at head of hash chain */
/* Link into table */
}
}
return (0);
}
/*
* Fills in mfc structure from mrouted mfcctl.
*/
static void
{
int i;
for (i = 0; i < (int)ipst->ips_numvifs; i++) {
}
/* Initialize pkt counters per src-grp */
rt->mfc_pkt_cnt = 0;
rt->mfc_byte_cnt = 0;
rt->mfc_wrong_if = 0;
}
static void
{
/*
* Drop all queued upcall packets.
* Free the mbuf with the pkt.
*/
}
}
/*
* Go through the hash bucket and free all the entries marked condemned.
*/
void
{
while (current_mfcp != NULL) {
continue;
}
current_mfcp = NULL;
} else {
}
}
}
/*
* Delete an mfc entry.
*/
static int
{
"del_mfc: o %x g %x",
}
/* Find mfc in mfctable, finds only entries without upcalls */
break;
}
/*
* Return if there was an upcall (mfc_rte != NULL),
* or rt not in mfctable.
*/
return (EADDRNOTAVAIL);
}
/*
* no need to hold lock as we have a reference.
*/
/* error checking */
if (rt->mfc_timeout_id != 0) {
ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
/*
* It's OK to drop the lock, the struct cannot be freed
* since we have a ref on the hash bucket.
*/
rt->mfc_timeout_id = 0;
}
/*
* Delete the entry from the cache
*/
return (0);
}
/*
* IP multicast forwarding function. This function assumes that the packet
* pointed to by ipha has arrived on (or is about to be sent to) the interface
* pointed to by "ill", and the packet is to be relayed to other networks
* that have members of the packet's destination IP multicast group.
*
* The packet is returned unscathed to the caller, unless it is
* erroneous, in which case a -1 value tells the caller (IP)
* to discard it.
*
* Unlike BSD, SunOS 5.x needs to return to IP info about
* whether pkt came in thru a tunnel, so it can be discarded, unless
* it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
* to be delivered.
* Return values are 0 - pkt is okay and phyint
* -1 - pkt is malformed and to be tossed
* 1 - pkt came in on tunnel
*/
int
{
static int srctun = 0;
"ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
}
/*
* Don't forward a packet with time-to-live of zero or one,
* or a packet destined to a local-only group.
*/
"ip_mforward: not forwarded ttl %d,"
" dst 0x%x ill %s",
}
if (tunnel_src != 0)
return (1);
else
return (0);
}
if ((tunnel_src != 0) || pim_reg_packet) {
/*
* Packet arrived over an encapsulated tunnel or via a PIM
* register message.
*/
if (tunnel_src != 0) {
"ip_mforward: ill %s arrived via ENCAP TUN",
} else if (pim_reg_packet) {
"ip_mforward: ill %s arrived via"
" REGISTER VIF",
}
}
/* Packet arrived via a physical interface. */
"ip_mforward: ill %s arrived via PHYINT",
}
} else {
/*
* Packet arrived through a SRCRT tunnel.
* Source-route tunnels are no longer supported.
* Error message printed every 1000 times.
*/
if ((srctun++ % 1000) == 0) {
"ip_mforward: received source-routed pkt from %x",
}
return (-1);
}
/* Find route in cache, return NULL if not there or upcalls q'ed. */
/*
* Lock the mfctable against changes made by ip_mforward.
* Note that only add_mfc and del_mfc can remove entries and
* they run with exclusive access to IP. So we do not need to
* guard against the rt being deleted, so release lock after reading.
*/
if (is_mrouter_off(ipst))
return (-1);
/* Entry exists, so forward if necessary */
int ret = 0;
if (pim_reg_packet) {
0, rt);
} else {
}
return (ret);
/*
* Don't forward if we don't have a cache entry. Mrouted will
* always provide a cache entry in response to an upcall.
*/
} else {
/*
* If we don't have a route for packet's origin, make a copy
* of the packet and send message to routing daemon.
*/
int npkts;
/* BSD uses mrts_no_route++ */
"ip_mforward: no rte ill %s src %x g %x misses %d",
}
/*
* The order of the following code differs from the BSD code.
* Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
* code works, so SunOS 5.x wasn't changed to conform to the
* BSD version.
*/
/* Lock mfctable. */
/*
* If we are turning off mrouted return an error
*/
if (is_mrouter_off(ipst)) {
return (-1);
}
/* Is there an upcall waiting for this packet? */
"ip_mforward: MFCTAB hash %d o 0x%x"
" g 0x%x\n",
}
/* There is an upcall */
break;
}
}
/* No upcall, so make a new entry into mfctable */
ip1dbg(("ip_mforward: out of memory "
"for mfc, mfc_rt\n"));
goto error_return;
} else
/* Get resources */
/* TODO could copy header and dup rest */
ip1dbg(("ip_mforward: out of memory for "
"mblk, mp_copy\n"));
goto error_return;
}
}
/* Get resources for rte, whether first rte or not first. */
/* Add this packet into rtdetq */
ip1dbg(("ip_mforward: out of memory for"
" rtdetq, rte\n"));
goto error_return;
}
ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
goto error_return;
}
if (pim_reg_packet) {
} else {
}
/*
* Determine if upcall q (rtdetq) has overflowed.
* mfc_rt->mfc_rte is null by mi_zalloc
* if it is the first message.
*/
npkts++;
"ip_mforward: upcalls %d\n", npkts);
}
goto error_return;
}
if (npkts == 0) { /* first upcall */
int i = 0;
/*
* Now finish installing the new mfc! Now that we have
* resources! Insert new entry at head of hash chain.
* Use src and dst which are ipaddr_t's.
*/
for (i = 0; i < (int)ipst->ips_numvifs; i++)
/* Link into table */
"ip_mforward: NEW MFCTAB hash %d o 0x%x "
"g 0x%x\n", hash,
}
}
/* Link in the upcall */
/* First upcall */
else {
/* not the first upcall */
;
}
/*
* No upcalls waiting, this is first one, so send a message to
* routing daemon to install a route into kernel table.
*/
if (npkts == 0) {
/* ipha_protocol is 0, for upcall */
if (pim_reg_packet) {
} else {
/*
* XXX do we need to hold locks here ?
*/
for (vifi = 0;
vifi++) {
continue;
break;
}
}
}
/* Timer to discard upcalls if mrouted is too slow */
/* Pass to RAWIP */
} else {
ip_drop_input("ip_mforward - upcall already waiting",
}
if (tunnel_src != 0)
return (1);
else
return (0);
}
return (-1);
}
}
/*
* Clean up the mfctable cache entry if upcall is not serviced.
* SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
*/
static void
{
return;
}
"expire_upcalls: hash %d s %x g %x",
}
/*
* If timeout has been set to zero, then the
* entry has been filled, no need to delete it.
*/
if (mfc_rt->mfc_timeout_id == 0)
goto done;
mfc_rt->mfc_timeout_id = 0;
/* Determine entry to be cleaned up in cache table. */
break;
/* del_mfc takes care of gone mfcs */
/*
* Delete the entry from the cache
*/
/*
* release_mfc will drop all queued upcall packets
* and will free the mbuf with the pkt, if, timing info.
*/
done:
}
/*
* Packet forwarding routine once entry in the cache is made.
*/
static int
{
"ip_mdq: SEND src %x, ipha_dst %x, ill %s",
}
/* Macro to send packet on vif */
else \
}
/*
* The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
* Mrouted had no route.
* We wanted the route installed in the mfctable to prevent multiple
* tries, so it passed add_mfc(), but is discarded here. The v_ipif is
* NULL so we don't want to check the ill. Still needed as of Mrouted
* 3.6.
*/
ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
}
return (-1); /* drop pkt */
}
return (-1);
/*
* The MFC entries are not cleaned up when an ipif goes
* away thus this code has to guard against an MFC referencing
* an ipif that has been closed. Note: reset_mrt_vif_ipif
* sets the v_ipif to NULL when the ipif disappears.
*/
"%d ill %s viftable ill %s\n",
return (-1);
}
/*
* Don't forward if it didn't arrive from the parent vif for its
* origin.
*/
/* Came in the wrong interface */
ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
"numvifs %d ill %s viftable ill %s\n",
"ip_mdq: arrived wrong if, vifi %d ill "
"%s viftable ill %s\n",
}
rt->mfc_wrong_if++;
/*
* If we are doing PIM assert processing and we are forwarding
* packets on this interface, and it is a broadcast medium
* interface (and not a tunnel), send a message to the routing daemon.
*
* We use the first ipif on the list, since it's all we have.
* Chances are the ipif_flags are the same for ipifs on the ill.
*/
/* TODO could copy header and dup rest */
ip1dbg(("ip_mdq: out of memory "
"for mblk, mp_copy\n"));
return (-1);
}
/* Pass to RAWIP */
}
if (tunnel_src != 0)
return (1);
else
return (0);
}
/*
* If I sourced this packet, it counts as output, else it was input.
*/
} else {
}
rt->mfc_pkt_cnt++;
/*
* For each vif, decide if a copy of the packet should be forwarded.
* Forward if:
* - the vif threshold ttl is non-zero AND
* - the pkt ttl exceeds the vif's threshold
* A non-zero mfc_ttl indicates that the vif is part of
* the output set for the mfc entry.
*/
vifi < num_of_vifs;
if (!lock_good_vif(vifp))
continue;
/*
* lock_good_vif should not have succeeded if
* v_ipif is null.
*/
}
}
if (tunnel_src != 0)
return (1);
else
return (0);
}
/*
* Send the packet on physical interface.
* Caller assumes can continue to use mp on return.
*/
/* ARGSUSED */
static void
{
/* Make a new reference to the packet */
ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
return;
}
if (vifp->v_rate_limit <= 0)
else {
"phyint_send: tbf_contr rate %d "
"vifp 0x%p mp 0x%p dst 0x%x",
}
}
}
/*
* Send the whole packet for REGISTER encapsulation to PIM daemon
* Caller assumes it can continue to use mp on return.
*/
/* ARGSUSED */
static void
{
"register_send: src %x, dst %x\n",
}
/*
* Copy the old packet & pullup its IP header into the new mblk_t so we
* can modify it. Try to fill the new mblk_t since if we don't the
* ethernet driver will.
*/
"register_send: allocb failure.");
}
return;
}
/*
* Bump write pointer to account for igmpmsg being added.
*/
/*
* Chain packet to new mblk_t.
*/
"register_send: copymsg failure.");
}
return;
}
/*
* icmp_input() asserts that IP version field is set to an
* appropriate version. Hence, the struct igmpmsg that this really
* becomes, needs to have the correct IP version field.
*/
/*
* The kernel uses the struct igmpmsg header to encode the messages to
* the multicast routing daemon. Fill in the fields in the header
* starting with the message type which is IGMPMSG_WHOLEPKT
*/
/*
* Must Be Zero. This is because the struct igmpmsg is really an IP
* header with renamed fields and the multicast routing daemon uses
* an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
*/
"register_send: register upcall failure.");
}
} else {
/* Pass to RAWIP */
}
}
/*
* pim_validate_cksum handles verification of the checksum in the
* pim header. For PIM Register packets, the checksum is calculated
* across the PIM header only. For all other packets, the checksum
* is for the PIM header and remainder of the packet.
*
* returns: B_TRUE, if checksum is okay.
* B_FALSE, if checksum is not valid.
*/
static boolean_t
{
return (B_FALSE);
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Process PIM protocol packets i.e. IP Protocol 103.
* Register messages are decapsulated and sent onto multicast forwarding.
*
* Return NULL for a bad packet that is discarded here.
* Return mp if the message is OK and should be handed to "raw" receivers.
* Callers of pim_input() may need to reinitialize variables that were copied
* from the mblk as this calls pullupmsg().
*/
mblk_t *
{
/*
* Pullup the msg for PIM protocol processing.
*/
return (NULL);
}
/*
* Validate lengths
*/
if (pimlen < PIM_MINLEN) {
"pim_input: length not at least minlen");
}
return (NULL);
}
/*
* Point to the PIM header.
*/
/*
* Check the version number.
*/
"pim_input: unknown version of PIM");
}
return (NULL);
}
/*
* Validate the checksum
*/
"pim_input: invalid checksum");
}
return (NULL);
}
return (mp);
/*
* check if the inner packet is destined to mcast group
*/
"pim_input: Inner pkt not mcast .. !");
}
return (NULL);
}
"register from %x, to %x, len %d",
}
/*
* If the null register bit is not set, decapsulate
* the packet before forwarding it.
* Avoid this in no register vif
*/
/* Copy the message */
return (NULL);
}
/*
* Decapsulate the packet and give it to
* register_mforward.
*/
/* register_mforward already called ip_drop_input */
return (NULL);
}
}
/*
* Pass all valid PIM packets up to any process(es) listening on a raw
* PIM socket. For Solaris it is done right after pim_input() is
* called.
*/
return (mp);
}
/*
* PIM sparse mode hook. Called by pim_input after decapsulating
* the packet. Loop back the packet, as if we have received it.
* In pim_input() we have to check if the destination is a multicast address.
*/
static int
{
"register_mforward: src %x, dst %x\n",
}
/*
* Need to pass in to ip_mforward() the information that the
* packet has arrived on the register_vif. We mark it with
* the IRAF_PIM_REGISTER attribute.
* pim_input verified that the (inner) destination is multicast,
* hence we skip the generic code in ip_input.
*/
} else {
}
/* Normally this will return the IRE_MULTICAST */
return (-1);
}
return (0);
}
/*
* Send an encapsulated packet.
* Caller assumes can continue to use mp when routine returns.
*/
/* ARGSUSED */
static void
{
"encap_send: vif %ld enter",
}
/*
* Copy the old packet & pullup its IP header into the
* new mbuf so we can modify it. Try to fill the new
* mbuf since if we don't the ethernet driver will.
*/
return;
return;
}
/*
* Fill in the encapsulating IP header.
* Remote tunnel dst in rmt_addr, from add_vif().
*/
/* Turn the encapsulated IP header back into a valid one. */
ipha->ipha_hdr_checksum = 0;
}
if (vifp->v_rate_limit <= 0)
else
/* ipha is from the original header */
}
/*
* De-encapsulate a packet and feed it back through IP input if it
* matches one of our multicast tunnels.
*
* This routine is called whenever IP gets a packet with prototype
* IPPROTO_ENCAP and a local destination address and the packet didn't
* match one of our configured IP-in-IP tunnels.
*/
void
{
int hlen_encap;
/* Make sure we have all of the inner header */
return;
}
}
return;
}
}
/*
* Dump the packet if it's not to a multicast destination or if
* we don't have an encapsulating tunnel with the source.
* Note: This code assumes that the remote site IP address
* uniquely identifies the tunnel (i.e., that this site has
* at most one tunnel with the remote site).
*/
ip1dbg(("ip_mroute_decap: bad tunnel\n"));
return;
}
ipst->ips_last_encap_vif = 0;
if (!lock_good_vif(vifp))
continue;
1, SL_TRACE,
"ip_mroute_decap: good tun "
"vif %ld with %x",
}
break;
}
}
}
ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
return;
}
/*
* Need to pass in the tunnel source to ip_mforward (so that it can
* verify that the packet arrived over the correct vif.)
*/
/*
* We don't redo any of the filtering in ill_input_full_v4 and we
* have checked that all of ipha_encap and any IP options are
* pulled up. Hence we call ire_recv_multicast_v4 directly.
* However, we have to check for RSVP as in ip_input_full_v4
* and if so we pass it to ire_recv_broadcast_v4 for local delivery
* to the rsvpd.
*/
} else {
}
/* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
return;
}
ire->ire_ib_pkt_count++;
}
/*
* Remove all records with v_ipif == ipif. Called when an interface goes away
* (stream closed). Called as writer.
*/
void
{
/* Can't check vifi >= 0 since vifi_t is unsigned! */
}
}
}
/* Remove pending upcall msgs when ill goes away. Called by ill_delete. */
void
{
int i;
for (i = 0; i < MFCTBLSIZ; i++) {
"reset_mrt_ill: mfctable [%d]", i);
}
/*
* It's OK to drop the lock, the
* struct cannot be freed since
* we have a ref on the hash
* bucket.
*/
}
(void) mi_strlog(
1, SL_TRACE,
"reset_mrt_ill: "
"ill 0x%p", (void *)ill);
}
}
}
}
}
}
}
/*
* Token bucket filter module.
* The ipha is for mcastgrp destination for phyint and encap.
*/
static void
{
/* Drop if packet is too large */
if (p_len > MAX_BKT_SIZE) {
return;
}
"tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
}
mutex_enter(&t->tbf_lock);
/*
* If there are enough tokens,
* and the queue is empty, send this packet out.
*/
"tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d",
t->tbf_q_len);
}
/* No packets are queued */
if (t->tbf_q_len == 0) {
/* queue empty, send packet if enough tokens */
mutex_exit(&t->tbf_lock);
return;
} else {
/* Queue packet and timeout till later */
}
} else if (t->tbf_q_len < t->tbf_max_q_len) {
/* Finite queue length, so queue pkts and process queue */
} else {
/* Check that we have UDP header with IP header */
sizeof (struct udphdr);
ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
"vif %ld src 0x%x dst 0x%x\n",
return;
} else
/* Have to reassign ipha after pullupmsg */
}
/*
* Queue length too much,
* try to selectively dq, or queue and process
*/
} else {
}
}
if (t->tbf_q_len == 0) {
vifp->v_timeout_id = 0;
}
if (id != 0)
}
/*
* Adds a packet to the tbf queue at the interface.
* The ipha is for mcastgrp destination for phyint and encap.
*/
static void
{
}
/* Queue was empty */
} else {
/* Insert at tail */
}
/* set new tail pointer */
t->tbf_q_len++;
}
/*
* Process the queue at the vif interface.
* Drops the tbf_lock when sending packets.
*
* NOTE : The caller should quntimeout if the queue length is 0.
*/
static void
{
"tbf_process_q 1: vif %ld qlen = %d",
}
/*
* Loop through the queue at the interface and send
* as many packets as possible.
*/
while (t->tbf_q_len > 0) {
/* Determine if the packet can be sent */
/*
* If so, reduce no. of tokens, dequeue the packet,
* send the packet.
*/
if (--t->tbf_q_len == 0) {
}
/* Exit mutex before sending packet, then re-enter */
mutex_exit(&t->tbf_lock);
mutex_enter(&t->tbf_lock);
} else
break;
}
}
/* Called at tbf timeout to update tokens, process q and reset timer. */
static void
{
vifp->v_timeout_id = 0;
}
"tbf_reprcess_q: vif %ld timeout id = %p",
}
}
/*
* Function that will selectively discard a member of the tbf queue,
* based on the precedence value and the priority.
*
* NOTE : The caller should quntimeout if the queue length is 0.
*/
static int
{
uint_t p;
"dq_sel: vif %ld dst 0x%x",
}
/* If removing the last packet, fix the tail pointer */
/*
* It's impossible for the queue to be empty, but
* we check anyway.
*/
if (--t->tbf_q_len == 0) {
}
return (1);
}
}
return (0);
}
/* Sends packet, 2 cases - encap tunnel, phyint. */
static void
{
/* If encap tunnel options */
"tbf_send_packet: ENCAP tunnel vif %ld",
}
ixas.ixa_ifindex = 0;
/*
* Feed into ip_output_simple which will set the ident field
* and checksum the encapsulating header.
* BSD gets the cached route vifp->v_route from ip_output()
* to speed up route table lookups. Not necessary in SunOS 5.x.
* One could make multicast forwarding faster by putting an
*/
ixa_cleanup(&ixas);
return;
/* phyint */
} else {
/* Need to loop back to members on the outgoing interface. */
}
"tbf_send_pkt: phyint forward vif %ld dst = 0x%x",
}
/*
* Find an NCE which matches the nexthop.
* For a pt-pt interface we use the other end of the pt-pt
* link.
*/
} else {
}
return;
}
/*
* We don't remember the incoming ill. Thus we
* pretend the packet arrived on the outbound ill. This means
* statistics for input errors will be increased on the wrong
* ill but that isn't a big deal.
*/
0);
}
}
/*
* Determine the current time and then the elapsed time (between the last time
* and time now). Update the no. of tokens in the bucket.
*/
static void
{
gethrestime(&tp);
/*LINTED*/
/*
* This formula is actually
* (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
*
* The (1000/1024) was introduced in add_vif to optimize
* this divide into a shift.
*/
t->tbf_last_pkt_t = tp;
if (t->tbf_n_tok > MAX_BKT_SIZE)
t->tbf_n_tok = MAX_BKT_SIZE;
"tbf_update_tok: tm %lld tok %d vif %ld",
}
}
/*
* Priority currently is based on port nos.
* Different forwarding mechanisms have different ways
* of obtaining the port no. Hence, the vif must be
* given along with the packet itself.
*
*/
/*
 * Map a packet to a queueing priority and return it in prio.
 * NOTE(review): the switch head is not visible in this view; the case labels
 * below equal the lower bounds of the UDP port ranges listed in the comment,
 * so the selector is presumably derived from the UDP port -- confirm.
 */
static int
{
int prio;
/* Temporary hack; may add general packet classifier some day */
/*
* The UDP port space is divided up into four priority ranges:
* [0, 16384) : unclassified - lowest priority
* [16384, 32768) : audio - highest priority
* [32768, 49152) : whiteboard - medium priority
* [49152, 65536) : video - low priority
*/
/* 0x4000 == 16384: start of the audio range, highest priority value. */
case 0x4000:
prio = 70;
break;
/* 0x8000 == 32768: start of the whiteboard range. */
case 0x8000:
prio = 60;
break;
/* 0xc000 == 49152: start of the video range. */
case 0xc000:
prio = 55;
break;
/* Everything else gets the lowest priority value. */
default:
prio = 50;
break;
}
"priority: port %x prio %d\n",
}
} else
return (prio);
}
/*
* End of token bucket filter modifications
*/
/*
* Produces data for netstat -M.
*/
int
{
sizeof (struct mrtstat))) {
ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
return (0);
}
return (1);
}
/*
* Sends info for SNMP's MIB.
*/
int
{
continue;
/*
* No locks here, an approximation is fine.
*/
ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
return (0);
}
}
return (1);
}
/*
* Called by ip_snmp_get to send up multicast routing table.
*/
int
{
int i, j;
/*
* Make sure multicast has not been turned off.
*/
if (is_mrouter_off(ipst))
return (1);
/* Loop over all hash buckets and their chains */
for (i = 0; i < MFCTBLSIZ; i++) {
continue;
}
for (j = 0; j < (int)ipst->ips_numvifs; j++)
sizeof (mfcc))) {
ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
return (0);
}
}
}
return (1);
}