inet/ip/ip_mroute.c

	ip_mroute.c revision 7ba7860f5af89005c23337fb7cdc48145cc6b8ac
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.
 * All rights reserved.  Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * Procedures for the kernel part of DVMRP,
 * a Distance-Vector Multicast Routing Protocol.
 * (See RFC-1075)
 * Written by David Waitzman, BBN Labs, August 1988.
 * Modified by Steve Deering, Stanford, February 1989.
 * Modified by Mark J. Steiglitz, Stanford, May, 1991
 * Modified by Van Jacobson, LBL, January 1993
 * Modified by Ajit Thyagarajan, PARC, August 1993
 * Modified by Bill Fenner, PARC, April 1995
 *
 * MROUTING 3.5
 */

/*
 * TODO
 * - function pointer field in vif, void *vif_sendit()
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/zone.h>

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <net/if.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <net/if_dl.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/mib2.h>
#include <netinet/ip6.h>
#include <inet/ip.h>
#include <inet/snmpcom.h>

#include <netinet/igmp.h>
#include <netinet/igmp_var.h>
#include <netinet/udp.h>
#include <netinet/ip_mroute.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <inet/ipclassifier.h>

#include <netinet/pim.h>


/*
 * MT Design:
 *
 * There are three main data structures viftable, mfctable and tbftable that
 * need to be protected against MT races.
 *
 * viftable is a fixed length array of vif structs. There is no lock to protect
 * the whole array, instead each struct is protected by its own individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines the
 * current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that the vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use.  When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 * from  going away a refhold is put on the ipif before using it. see
 * lock_good_vif() and unlock_good_vif().
 *
 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 * of the vif struct.
 *
 * tbftable is also a fixed length array of tbf structs and is only accessed
 * via v_tbf.  It is protected by its own lock tbf_lock.
 *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
 *
 * Locking Hierarchy:
 * The bucket lock should be acquired before the mfc struct lock.
 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 * operations on the bucket struct.
 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock,
 * causing the stats to be approximate, not exact.
 */

#define NO_VIF  MAXVIFS     /* from mrouted, no route for src */

/*
 * Timeouts:
 *  Upcall timeouts - BSD uses boolean_t mfc->expire and
 *  nexpire[MFCTBLSIZE], the number of times expire has been called.
 *  SunOS 5.x uses mfc->timeout for each mfc.
 *  Some Unixes are limited in the number of simultaneous timeouts
 *  that can be run, SunOS 5.x does not have this restriction.
 */

/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE.
 * EXPIRE_TIMEOUT is expressed in clock ticks (hz ticks per second).
 */
#define     EXPIRE_TIMEOUT  (hz/4)  /* 4x / second  */
#define     UPCALL_EXPIRE   6   /* number of timeouts   */

/*
 * Hash function for a source, group entry.
 * Folds the origin address (a) and the group address (g) together by
 * XOR-ing each value with itself shifted right by 10 and by 20 bits, then
 * hands the result to MFCHASHMOD (defined outside this file; presumably it
 * reduces the value to an mfctable bucket index -- confirm in the header).
 */
#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
    ((g) >> 20) ^ ((g) >> 10) ^ (g))

/* Token bucket filter reprocessing interval, in clock ticks. */
#define         TBF_REPROCESS   (hz / 100)  /* 100x /second */

/* Identify PIM packet that came on a Register interface */
#define PIM_REGISTER_MARKER 0xffffffff

/* Function declarations */
static int  add_mfc(struct mfcctl *, ip_stack_t *);
static int  add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *);
static int  del_mfc(struct mfcctl *, ip_stack_t *);
static int  del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *);
static void del_vifp(struct vif *);
static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void expire_upcalls(void *);
static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
static void free_queue(struct mfc *);
static int  get_assert(uchar_t *, ip_stack_t *);
static int  get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
static int  get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
static int  get_version(uchar_t *);
static int  get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
static int  ip_mdq(mblk_t *, ipha_t *, ill_t *,
            ipaddr_t, struct mfc *);
static int  ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int  register_mforward(queue_t *, mblk_t *, ill_t *);
static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int  set_assert(int *, ip_stack_t *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
static void release_mfc(struct mfcb *);

static boolean_t is_mrouter_off(ip_stack_t *);
/*
 * Encapsulation packets
 */

/* TTL placed in the prototype outer header below. */
#define ENCAP_TTL   64

/*
 * prototype IP hdr for encapsulated packets
 * Fields are given in ipha_t declaration order; members after the checksum
 * are not listed and are therefore zero-initialized.
 */
static ipha_t multicast_encap_iphdr = {
    IP_SIMPLE_HDR_VERSION,
    0,              /* tos */
    sizeof (ipha_t),        /* total length */
    0,              /* id */
    0,              /* frag offset */
    ENCAP_TTL, IPPROTO_ENCAP,
    0,              /* checksum */
};

/*
 * Rate limit for assert notification messages, in nsec
 * (3000000000 nsec, i.e. 3 seconds).
 */
#define ASSERT_MSG_TIME     3000000000


/*
 * Reference counting on a vif.
 *
 * VIF_REFHOLD takes v_lock, bumps v_refcnt and drops the lock again.
 */
#define VIF_REFHOLD(vifp) {         \
    mutex_enter(&(vifp)->v_lock);       \
    ++(vifp)->v_refcnt;         \
    mutex_exit(&(vifp)->v_lock);        \
}

/*
 * Drop one reference; the caller already holds v_lock.  When the count
 * reaches zero on a condemned vif, del_vifp() is called and v_lock is not
 * released here; in every other case v_lock is released.
 */
#define VIF_REFRELE_LOCKED(vifp) {              \
    if (--(vifp)->v_refcnt != 0 ||              \
        !((vifp)->v_marks & VIF_MARK_CONDEMNED)) {      \
        mutex_exit(&(vifp)->v_lock);            \
    } else {                        \
        del_vifp(vifp);                 \
    }                           \
}

/*
 * Same as VIF_REFRELE_LOCKED, but acquires v_lock first.
 */
#define VIF_REFRELE(vifp) {                 \
    mutex_enter(&(vifp)->v_lock);               \
    VIF_REFRELE_LOCKED(vifp)                \
}

/*
 * Walker reference counting on an mfc hash bucket.
 *
 * MFCB_REFHOLD registers a walker: the count is bumped under mfcb_lock.
 */
#define MFCB_REFHOLD(mfcb) {                \
    mutex_enter(&(mfcb)->mfcb_lock);        \
    ++(mfcb)->mfcb_refcnt;              \
    ASSERT((mfcb)->mfcb_refcnt != 0);       \
    mutex_exit(&(mfcb)->mfcb_lock);         \
}

/*
 * MFCB_REFRELE unregisters a walker.  The walker that brings the count to
 * zero on a bucket marked MFCB_MARK_CONDEMNED calls release_mfc() while
 * still holding mfcb_lock.
 */
#define MFCB_REFRELE(mfcb) {                    \
    mutex_enter(&(mfcb)->mfcb_lock);            \
    ASSERT((mfcb)->mfcb_refcnt != 0);           \
    (mfcb)->mfcb_refcnt--;                  \
    if ((mfcb)->mfcb_refcnt == 0) {             \
        if ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED) { \
            release_mfc(mfcb);          \
        }                       \
    }                           \
    mutex_exit(&(mfcb)->mfcb_lock);             \
}

/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that are
 * marked condemned.
 * Type of service parameter to be added in the future!
 *
 *  mfcbp   pointer to the hash bucket whose chain is searched
 *  o, g    origin and group addresses compared against each entry
 *  rt      lvalue that receives the matching entry, or NULL if none
 *
 * Every macro argument is parenthesized so that callers may pass arbitrary
 * expressions (e.g. "&bucket" or "a | b") without precedence surprises.
 * The macro itself takes no locks.
 */
#define MFCFIND(mfcbp, o, g, rt) { \
    struct mfc *_mb_rt = NULL; \
    (rt) = NULL; \
    _mb_rt = (mfcbp)->mfcb_mfc; \
    while (_mb_rt) { \
        if ((_mb_rt->mfc_origin.s_addr == (o)) && \
            (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
            (_mb_rt->mfc_rte == NULL) && \
            (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
            (rt) = _mb_rt; \
            break; \
        } \
    _mb_rt = _mb_rt->mfc_next; \
    } \
}

/*
 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
 * are inefficient. We use gethrestime() which returns a timespec_t with
 * sec and nsec, the resolution is machine dependent.
 * The following 2 macros have been changed to use nsec instead of usec.
 */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * TV_DELTA(a, b, delta) stores (a - b), in nanoseconds, into delta.
 * a and b are timespec-like values (tv_sec / tv_nsec); delta should be a
 * hrtime_t (64-bit).  The one- and two-second gaps are special-cased to
 * avoid a multiply.  For any other gap the multiplication is carried out
 * in 64 bits: an int product overflows as soon as the gap is 3 seconds or
 * more.
 */
#define TV_DELTA(a, b, delta) { \
    int xxs; \
 \
    delta = (a).tv_nsec - (b).tv_nsec; \
    if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
        switch (xxs) { \
        case 2: \
            delta += 1000000000; \
            /*FALLTHROUGH*/ \
        case 1: \
            delta += 1000000000; \
            break; \
        default: \
            delta += (1000000000LL * xxs); \
        } \
    } \
}

/*
 * TV_LT(a, b) is true when time a is strictly earlier than time b:
 * either the seconds are smaller, or the seconds are not larger and the
 * nanoseconds are smaller.
 */
#define TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
    ((a).tv_sec <= (b).tv_sec && (a).tv_nsec < (b).tv_nsec))

/*
 * Entry point for the MRT_* setsockopt commands that modify the multicast
 * routing tables.
 *
 * Except for MRT_INIT, the request must come from the conn currently
 * registered as the multicast router, otherwise EACCES is returned.
 * With checkonly set nothing is changed: 0 is returned for a known command
 * and EOPNOTSUPP for anything else.  Commands other than MRT_INIT and
 * MRT_DONE fail with EINVAL once multicast routing has been turned off.
 * Otherwise the return value is that of the per-command handler.
 */
int
ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
    int datalen, mblk_t *first_mp)
{
    conn_t      *connp = Q_TO_CONN(q);
    ip_stack_t  *ipst = connp->conn_netstack->netstack_ip;
    boolean_t   not_mrouter = B_FALSE;
    int     err;

    /* Decide under the mutex, act on the decision after dropping it. */
    mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    if (connp != ipst->ips_ip_g_mrouter && cmd != MRT_INIT)
        not_mrouter = B_TRUE;
    mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    if (not_mrouter)
        return (EACCES);

    if (checkonly) {
        /*
         * T_CHECK: only pretend to do the operation.  The handlers
         * further on could still fail, but this exists merely to
         * please XTI, so it does not have to be perfect.
         */
        if (cmd == MRT_INIT || cmd == MRT_DONE ||
            cmd == MRT_ADD_VIF || cmd == MRT_DEL_VIF ||
            cmd == MRT_ADD_MFC || cmd == MRT_DEL_MFC ||
            cmd == MRT_ASSERT)
            return (0);
        return (EOPNOTSUPP);
    }

    /*
     * Refuse every command except init/done once multicast routing
     * has been turned off.
     */
    if (cmd != MRT_INIT && cmd != MRT_DONE && is_mrouter_off(ipst))
        return (EINVAL);

    switch (cmd) {
    case MRT_INIT:
        err = ip_mrouter_init(connp, data, datalen, ipst);
        break;
    case MRT_DONE:
        err = ip_mrouter_done(first_mp, ipst);
        break;
    case MRT_ADD_VIF:
        err = add_vif((struct vifctl *)data, connp, first_mp, ipst);
        break;
    case MRT_DEL_VIF:
        err = del_vif((vifi_t *)data, connp, first_mp, ipst);
        break;
    case MRT_ADD_MFC:
        err = add_mfc((struct mfcctl *)data, ipst);
        break;
    case MRT_DEL_MFC:
        err = del_mfc((struct mfcctl *)data, ipst);
        break;
    case MRT_ASSERT:
        err = set_assert((int *)data, ipst);
        break;
    default:
        err = EOPNOTSUPP;
        break;
    }

    return (err);
}

/*
 * Service the MRT getsockopt commands.
 * Only the conn registered as the multicast router may ask; any other
 * caller gets EACCES.  Unknown commands yield EOPNOTSUPP.
 */
int
ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
{
    conn_t      *connp = Q_TO_CONN(q);
    ip_stack_t  *ipst = connp->conn_netstack->netstack_ip;
    int     err;

    if (ipst->ips_ip_g_mrouter != connp)
        return (EACCES);

    if (cmd == MRT_VERSION)
        err = get_version(data);
    else if (cmd == MRT_ASSERT)
        err = get_assert(data, ipst);
    else
        err = EOPNOTSUPP;

    return (err);
}

/*
 * Read-only ioctls that fetch counters from the multicast routing cache.
 * Called with shared access to IP.
 * The request structure lives in the second continuation block of mp.
 * Returns the handler's result, or EINVAL for an unrecognized ioctl.
 */
/* ARGSUSED */
int
mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
    conn_t      *connp = Q_TO_CONN(q);
    ip_stack_t  *ipst = connp->conn_netstack->netstack_ip;
    struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
    /* Existence verified in ip_wput_nondata */
    mblk_t      *datamp = mp->b_cont->b_cont;
    int     cmd = iocp->ioc_cmd;

    if (cmd == SIOCGETVIFCNT) {
        return (get_vif_cnt((struct sioc_vif_req *)datamp->b_rptr,
            ipst));
    }
    if (cmd == SIOCGETSGCNT) {
        return (get_sg_cnt((struct sioc_sg_req *)datamp->b_rptr,
            ipst));
    }
    if (cmd == SIOCGETLSGCNT) {
        return (get_lsg_cnt((struct sioc_lsg_req *)datamp->b_rptr,
            ipst));
    }
    return (EINVAL);
}

/*
 * Report the packet, byte and wrong-interface (rpf-failure) counters of the
 * cache entry for the (source, group) pair named in req.
 * If no usable entry is found, all three counters are set to 0xffffffff.
 * Always returns 0.
 */
static int
get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
{
    struct mfcb *bucket;
    struct mfc  *entry;

    bucket = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];

    /* Hold the bucket as a walker while the chain is searched and read. */
    MFCB_REFHOLD(bucket);
    MFCFIND(bucket, req->src.s_addr, req->grp.s_addr, entry);

    if (entry == NULL) {
        req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
    } else {
        mutex_enter(&entry->mfc_mutex);
        req->pktcnt   = entry->mfc_pkt_cnt;
        req->bytecnt  = entry->mfc_byte_cnt;
        req->wrong_if = entry->mfc_wrong_if;
        mutex_exit(&entry->mfc_mutex);
    }

    MFCB_REFRELE(bucket);
    return (0);
}

/*
 * Variant of get_sg_cnt() meant to use larger counters and IPv6 addresses.
 * Not implemented yet (SIOCGETLSGCNT): always fails with ENXIO.
 */
/* ARGSUSED XXX until implemented */
static int
get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
{
    /* XXX TODO: implement SIOCGETLSGCNT */
    return (ENXIO);
}

/*
 * Copy the input/output packet and byte counters of vif req->vifi into req.
 * Returns EINVAL when the index is not below the current number of vifs,
 * 0 otherwise.
 */
static int
get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
{
    const vifi_t    idx = req->vifi;
    struct vif  *vifp;

    if (idx >= ipst->ips_numvifs)
        return (EINVAL);

    /* Deliberately lock-free: the counters only need to be approximate. */
    vifp = &ipst->ips_vifs[idx];
    req->icount = vifp->v_pkt_in;
    req->ocount = vifp->v_pkt_out;
    req->ibytes = vifp->v_bytes_in;
    req->obytes = vifp->v_bytes_out;

    return (0);
}

/*
 * Store the multicast routing version (0x0305) as an int into the caller's
 * buffer.  Always returns 0.
 */
static int
get_version(uchar_t *data)
{
    /* XXX !!!! */
    *(int *)data = 0x0305;

    return (0);
}

/*
 * Set the per-stack PIM assert processing flag.
 * Only the values 0 and 1 are accepted; anything else yields EINVAL.
 */
static int
set_assert(int *i, ip_stack_t *ipst)
{
    switch (*i) {
    case 0:
    case 1:
        ipst->ips_pim_assert = *i;
        return (0);
    default:
        return (EINVAL);
    }
}

/*
 * Fetch the per-stack PIM assert processing flag into the caller's buffer,
 * which is written as an int.  Always returns 0.
 */
static int
get_assert(uchar_t *data, ip_stack_t *ipst)
{
    *(int *)data = ipst->ips_pim_assert;

    return (0);
}

/*
 * Enable multicast routing and register connp as the multicast router.
 *
 * data must point at an int holding 1 and datalen must be sizeof (int),
 * otherwise ENOPROTOOPT is returned.  EADDRINUSE is returned when a router
 * is already registered, EINVAL when connp is not a raw IP conn.
 * On success IP forwarding is switched on if it was off, with the previous
 * setting saved in ips_saved_ip_g_forward.
 */
static int
ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
{
    int err = 0;

    if (data == NULL || datalen != sizeof (int) || *(int *)data != 1)
        return (ENOPROTOOPT);

    mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    if (ipst->ips_ip_g_mrouter != NULL) {
        err = EADDRINUSE;
    } else if (!IPCL_IS_RAWIP(connp)) {
        /*
         * MRT_INIT should only ever arrive on RAW sockets; this is
         * just a double check.
         */
        err = EINVAL;
    } else {
        ipst->ips_ip_g_mrouter = connp;
        connp->conn_multi_router = 1;

        /* Tunnels only work with ip_g_forward turned on. */
        if (!WE_ARE_FORWARDING(ipst)) {
            if (ipst->ips_ip_mrtdebug > 1) {
                (void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
                    "ip_mrouter_init: turning on forwarding");
            }
            ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
            ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
        }
    }
    mutex_exit(&ipst->ips_ip_g_mrouter_mutex);

    return (err);
}

/*
 * Per-stack setup for multicast routing: initializes the mrouter and
 * last-encap mutexes and allocates the zero-filled vif, mfc, tbf and
 * statistics tables.  All allocations use KM_SLEEP.
 */
void
ip_mrouter_stack_init(ip_stack_t *ipst)
{
    struct mrtstat  *stats;

    mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
    mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);

    /* viftable: MAXVIFS + 1 entries. */
    ipst->ips_vifs = kmem_zalloc((MAXVIFS + 1) * sizeof (struct vif),
        KM_SLEEP);

    /* Statistics, including the control structure sizes. */
    stats = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
    stats->mrts_vifctlSize = sizeof (struct vifctl);
    stats->mrts_mfcctlSize = sizeof (struct mfcctl);
    ipst->ips_mrtstat = stats;

    /*
     * mfctable: holds all mfcs, including the ones waiting on an
     * upcall; a bucket may chain several mfcs.
     */
    ipst->ips_mfcs = kmem_zalloc(MFCTBLSIZ * sizeof (struct mfcb),
        KM_SLEEP);

    /*
     * tbftable: one token bucket filter structure per vif.
     */
    ipst->ips_tbfs = kmem_zalloc(MAXVIFS * sizeof (struct tbf), KM_SLEEP);
}

/*
 * Disable multicast routing.
 * Didn't use global timeout_val (BSD version), instead check the mfctable.
 *
 * mp is the message used to restart the operation if exclusive access to
 * an ill cannot be obtained immediately; mp == NULL means we are being
 * called from ip_close and must proceed synchronously.
 *
 * Returns EINVAL if no multicast router is registered, EINPROGRESS if the
 * operation was queued behind another exclusive operation (it is restarted
 * through ip_restart_optmgmt), and 0 on success.
 */
int
ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
{
    conn_t      *mrouter;
    vifi_t      vifi;
    struct mfc  *mfc_rt;
    timeout_id_t    id;
    int     i;

    mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    if (ipst->ips_ip_g_mrouter == NULL) {
        mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
        return (EINVAL);
    }

    mrouter = ipst->ips_ip_g_mrouter;

    /* Restore the forwarding setting saved by ip_mrouter_init(). */
    if (ipst->ips_saved_ip_g_forward != -1) {
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "ip_mrouter_done: turning off forwarding");
        }
        ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
        ipst->ips_saved_ip_g_forward = -1;
    }

    /*
     * Always clear cache when vifs change.
     * The last-encap cache is reset under ips_last_encap_lock.
     */
    mutex_enter(&ipst->ips_last_encap_lock);
    ipst->ips_last_encap_src = 0;
    ipst->ips_last_encap_vif = NULL;
    mutex_exit(&ipst->ips_last_encap_lock);
    mrouter->conn_multi_router = 0;

    mutex_exit(&ipst->ips_ip_g_mrouter_mutex);

    /*
     * For each phyint in use,
     * disable promiscuous reception of all IP multicasts.
     */
    for (vifi = 0; vifi < MAXVIFS; vifi++) {
        struct vif *vifp = ipst->ips_vifs + vifi;

        mutex_enter(&vifp->v_lock);
        /*
         * if the vif is active mark it condemned.
         * NOTE(review): only the phyint branch below changes v_marks;
         * tunnel and register vifs keep VIF_MARK_GOOD here -- confirm
         * that this is intended.
         */
        if (vifp->v_marks & VIF_MARK_GOOD) {
            ASSERT(vifp->v_ipif != NULL);
            ipif_refhold(vifp->v_ipif);
            /* Phyint only */
            if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
                ipif_t *ipif = vifp->v_ipif;
                ipsq_t  *ipsq;
                boolean_t suc;
                ill_t *ill;

                ill = ipif->ipif_ill;
                suc = B_FALSE;
                if (mp == NULL) {
                    /*
                     * being called from ip_close,
                     * lets do it synchronously.
                     * Clear VIF_MARK_GOOD and
                     * set VIF_MARK_CONDEMNED.
                     */
                    vifp->v_marks &= ~VIF_MARK_GOOD;
                    vifp->v_marks |= VIF_MARK_CONDEMNED;
                    mutex_exit(&(vifp)->v_lock);
                    suc = ipsq_enter(ill, B_FALSE, NEW_OP);
                    ipsq = ill->ill_phyint->phyint_ipsq;
                } else {
                    ipsq = ipsq_try_enter(ipif, NULL,
                        mrouter->conn_wq, mp,
                        ip_restart_optmgmt, NEW_OP, B_TRUE);
                    if (ipsq == NULL) {
                        mutex_exit(&(vifp)->v_lock);
                        ipif_refrele(ipif);
                        return (EINPROGRESS);
                    }
                    /*
                     * Clear VIF_MARK_GOOD and
                     * set VIF_MARK_CONDEMNED.
                     */
                    vifp->v_marks &= ~VIF_MARK_GOOD;
                    vifp->v_marks |= VIF_MARK_CONDEMNED;
                    mutex_exit(&(vifp)->v_lock);
                    suc = B_TRUE;
                }

                if (suc) {
                    (void) ip_delmulti(INADDR_ANY, ipif,
                        B_TRUE, B_TRUE);
                    ipsq_exit(ipsq);
                }
                mutex_enter(&vifp->v_lock);
            }
            ipif_refrele(vifp->v_ipif);
            /*
             * decreases the refcnt added in add_vif.
             * and release v_lock.
             */
            VIF_REFRELE_LOCKED(vifp);
        } else {
            mutex_exit(&vifp->v_lock);
            continue;
        }
    }

    mutex_enter(&ipst->ips_numvifs_mutex);
    ipst->ips_numvifs = 0;
    ipst->ips_pim_assert = 0;
    ipst->ips_reg_vif_num = ALL_VIFS;
    mutex_exit(&ipst->ips_numvifs_mutex);

    /*
     * Free upcall msgs.
     * Go through mfctable and stop any outstanding upcall
     * timeouts remaining on mfcs.
     */
    for (i = 0; i < MFCTBLSIZ; i++) {
        /* Become a walker of the bucket and condemn it. */
        mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
        ipst->ips_mfcs[i].mfcb_refcnt++;
        ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
        mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
        mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
        while (mfc_rt) {
            /* Free upcalls */
            mutex_enter(&mfc_rt->mfc_mutex);
            if (mfc_rt->mfc_rte != NULL) {
                if (mfc_rt->mfc_timeout_id != 0) {
                    /*
                     * OK to drop the lock as we have
                     * a refcnt on the bucket. timeout
                     * can fire but it will see that
                     * mfc_timeout_id == 0 and not do
                     * anything. see expire_upcalls().
                     *
                     * The id must be saved before the
                     * field is cleared; otherwise
                     * untimeout() is handed 0 and the
                     * pending timeout is never
                     * cancelled.
                     */
                    id = mfc_rt->mfc_timeout_id;
                    mfc_rt->mfc_timeout_id = 0;
                    mutex_exit(&mfc_rt->mfc_mutex);
                    (void) untimeout(id);
                    mutex_enter(&mfc_rt->mfc_mutex);

                    /*
                     * all queued upcall packets
                     * and mblk will be freed in
                     * release_mfc().
                     */
                }
            }

            mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;

            mutex_exit(&mfc_rt->mfc_mutex);
            mfc_rt = mfc_rt->mfc_next;
        }
        /* Last walker out frees the condemned entries. */
        MFCB_REFRELE(&ipst->ips_mfcs[i]);
    }

    mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    ipst->ips_ip_g_mrouter = NULL;
    mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    return (0);
}

/*
 * Per-stack teardown for multicast routing: frees any mfc entries still
 * chained in the hash table (logging each one), releases the tables
 * allocated by ip_mrouter_stack_init() and destroys the two mutexes.
 */
void
ip_mrouter_stack_destroy(ip_stack_t *ipst)
{
    int bucket;

    for (bucket = 0; bucket < MFCTBLSIZ; bucket++) {
        struct mfcb *mfcbp = &ipst->ips_mfcs[bucket];
        struct mfc  *rt = mfcbp->mfcb_mfc;

        /* Pop entries off the head of the chain until it is empty. */
        while (rt != NULL) {
            (void) printf("ip_mrouter_stack_destroy: free for %d\n",
                bucket);

            mfcbp->mfcb_mfc = rt->mfc_next;
            free_queue(rt);
            mi_free(rt);
            rt = mfcbp->mfcb_mfc;
        }
    }

    kmem_free(ipst->ips_vifs, (MAXVIFS + 1) * sizeof (struct vif));
    ipst->ips_vifs = NULL;

    kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
    ipst->ips_mrtstat = NULL;

    kmem_free(ipst->ips_mfcs, MFCTBLSIZ * sizeof (struct mfcb));
    ipst->ips_mfcs = NULL;

    kmem_free(ipst->ips_tbfs, MAXVIFS * sizeof (struct tbf));
    ipst->ips_tbfs = NULL;

    mutex_destroy(&ipst->ips_last_encap_lock);
    mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
}

/*
 * Returns B_TRUE when multicast routing is off for this stack: either no
 * multicast router conn is registered, or the registered conn no longer
 * has conn_multi_router set.  The check is made under
 * ips_ip_g_mrouter_mutex.
 */
static boolean_t
is_mrouter_off(ip_stack_t *ipst)
{
    boolean_t   off = B_FALSE;
    conn_t      *mrouter;

    mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    mrouter = ipst->ips_ip_g_mrouter;
    if (mrouter == NULL || mrouter->conn_multi_router == 0)
        off = B_TRUE;
    mutex_exit(&ipst->ips_ip_g_mrouter_mutex);

    return (off);
}

static void
unlock_good_vif(struct vif *vifp)
{
    ASSERT(vifp->v_ipif != NULL);
    ipif_refrele(vifp->v_ipif);
    VIF_REFRELE(vifp);
}

static boolean_t
lock_good_vif(struct vif *vifp)
{
    mutex_enter(&vifp->v_lock);
    if (!(vifp->v_marks & VIF_MARK_GOOD)) {
        mutex_exit(&vifp->v_lock);
        return (B_FALSE);
    }

    ASSERT(vifp->v_ipif != NULL);
    mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
    if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
        mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
        mutex_exit(&vifp->v_lock);
        return (B_FALSE);
    }
    ipif_refhold_locked(vifp->v_ipif);
    mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
    vifp->v_refcnt++;
    mutex_exit(&vifp->v_lock);
    return (B_TRUE);
}

/*
 * Add a vif to the vif table.
 *
 * vifcp describes the new vif (index, flags, addresses, threshold and rate
 * limit).  connp and first_mp are used to look up the local interface and,
 * if needed, to queue the request for a later restart.
 *
 * Returns 0 on success, or:
 *  EINVAL      index out of range, zero local address, or multicast
 *              routing turned off
 *  EADDRINUSE  the slot is in use or being initialized, or a second
 *              register vif was requested
 *  EADDRNOTAVAIL   no interface has the given local address
 *  EINPROGRESS the request was queued and will be restarted
 *  EOPNOTSUPP  source-route tunnel requested, or the interface does
 *              not support multicast
 * or the error returned by ip_addmulti().
 */
static int
add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
{
    struct vif  *vifp;
    ipif_t      *ipif;
    int     error;
    struct tbf  *v_tbf;
    ipsq_t      *ipsq;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    ASSERT(connp != NULL);

    if (vifcp->vifc_vifi >= MAXVIFS)
        return (EINVAL);

    /* Only form the table pointers once the index is known to be valid. */
    vifp = ipst->ips_vifs + vifcp->vifc_vifi;
    v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;

    if (is_mrouter_off(ipst))
        return (EINVAL);

    mutex_enter(&vifp->v_lock);
    /*
     * Viftable entry should be 0.
     * if v_marks == 0 but v_refcnt != 0 means struct is being
     * initialized.
     *
     * Also note that it is very unlikely that we will get a MRT_ADD_VIF
     * request while the delete is in progress, mrouted only sends add
     * requests when a new interface is added and the new interface cannot
     * have the same vifi as an existing interface. We make sure that
     * ill_delete will block till the vif is deleted by adding a refcnt
     * to ipif in del_vif().
     */
    if (vifp->v_lcl_addr.s_addr != 0 ||
        vifp->v_marks != 0 ||
        vifp->v_refcnt != 0) {
        mutex_exit(&vifp->v_lock);
        return (EADDRINUSE);
    }

    /* Incoming vif should not be 0 */
    if (vifcp->vifc_lcl_addr.s_addr == 0) {
        mutex_exit(&vifp->v_lock);
        return (EINVAL);
    }

    /* Claim the slot: v_marks == 0 with v_refcnt != 0 means "initializing". */
    vifp->v_refcnt++;
    mutex_exit(&vifp->v_lock);
    /* Find the interface with the local address */
    ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
        connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
        ip_restart_optmgmt, &error, ipst);
    if (ipif == NULL) {
        VIF_REFRELE(vifp);
        if (error == EINPROGRESS)
            return (error);
        return (EADDRNOTAVAIL);
    }

    /*
     * We have to be exclusive as we have to call ip_addmulti()
     * This is the best position to try to be exclusive in case
     * we have to wait.
     */
    ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
        ip_restart_optmgmt, NEW_OP, B_TRUE);
    if ((ipsq) == NULL) {
        VIF_REFRELE(vifp);
        ipif_refrele(ipif);
        return (EINPROGRESS);
    }

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "add_vif: src 0x%x enter",
            vifcp->vifc_lcl_addr.s_addr);
    }

    mutex_enter(&vifp->v_lock);
    /*
     * Always clear cache when vifs change.
     * Needed to ensure that src isn't left over from before vif was added.
     * The cache is reset under ips_last_encap_lock.
     */

    mutex_enter(&ipst->ips_last_encap_lock);
    ipst->ips_last_encap_src = 0;
    ipst->ips_last_encap_vif = NULL;
    mutex_exit(&ipst->ips_last_encap_lock);

    if (vifcp->vifc_flags & VIFF_TUNNEL) {
        if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
            cmn_err(CE_WARN,
                "add_vif: source route tunnels not supported\n");
            VIF_REFRELE_LOCKED(vifp);
            ipif_refrele(ipif);
            ipsq_exit(ipsq);
            return (EOPNOTSUPP);
        }
        vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;

    } else {
        /* Phyint or Register vif */
        if (vifcp->vifc_flags & VIFF_REGISTER) {
            /*
             * Only one register vif may exist at a time;
             * reg_vif_num is claimed under ips_numvifs_mutex.
             */
            mutex_enter(&ipst->ips_numvifs_mutex);
            if (ipst->ips_reg_vif_num == ALL_VIFS) {
                ipst->ips_reg_vif_num = vifcp->vifc_vifi;
                mutex_exit(&ipst->ips_numvifs_mutex);
            } else {
                mutex_exit(&ipst->ips_numvifs_mutex);
                VIF_REFRELE_LOCKED(vifp);
                ipif_refrele(ipif);
                ipsq_exit(ipsq);
                return (EADDRINUSE);
            }
        }

        /* Make sure the interface supports multicast */
        if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
            VIF_REFRELE_LOCKED(vifp);
            ipif_refrele(ipif);
            if (vifcp->vifc_flags & VIFF_REGISTER) {
                mutex_enter(&ipst->ips_numvifs_mutex);
                ipst->ips_reg_vif_num = ALL_VIFS;
                mutex_exit(&ipst->ips_numvifs_mutex);
            }
            ipsq_exit(ipsq);
            return (EOPNOTSUPP);
        }
        /* Enable promiscuous reception of all IP mcasts from the if */
        mutex_exit(&vifp->v_lock);
        error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
            MODE_IS_EXCLUDE, NULL);
        mutex_enter(&vifp->v_lock);
        /*
         * since we released the lock lets make sure that
         * ip_mrouter_done() has not been called.
         */
        if (error != 0 || is_mrouter_off(ipst)) {
            if (error == 0)
                (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
                    B_TRUE);
            if (vifcp->vifc_flags & VIFF_REGISTER) {
                mutex_enter(&ipst->ips_numvifs_mutex);
                ipst->ips_reg_vif_num = ALL_VIFS;
                mutex_exit(&ipst->ips_numvifs_mutex);
            }
            VIF_REFRELE_LOCKED(vifp);
            ipif_refrele(ipif);
            ipsq_exit(ipsq);
            return (error?error:EINVAL);
        }
    }
    /* Define parameters for the tbf structure */
    vifp->v_tbf = v_tbf;
    gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
    vifp->v_tbf->tbf_n_tok = 0;
    vifp->v_tbf->tbf_q_len = 0;
    vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
    vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;

    vifp->v_flags = vifcp->vifc_flags;
    vifp->v_threshold = vifcp->vifc_threshold;
    vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
    vifp->v_ipif = ipif;
    ipif_refrele(ipif);
    /*
     * Scaling up here, allows division by 1024 in critical code.
     * Multiply before dividing: "(1024/1000)" is integer division and
     * evaluates to 1, which would leave the rate unscaled.  The product
     * is formed in 64 bits so large rate limits cannot wrap.
     */
    vifp->v_rate_limit = ((uint64_t)vifcp->vifc_rate_limit * 1024) / 1000;
    vifp->v_timeout_id = 0;
    /* initialize per vif pkt counters */
    vifp->v_pkt_in = 0;
    vifp->v_pkt_out = 0;
    vifp->v_bytes_in = 0;
    vifp->v_bytes_out = 0;
    mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);

    /* Adjust numvifs up, if the vifi is higher than numvifs */
    mutex_enter(&ipst->ips_numvifs_mutex);
    if (ipst->ips_numvifs <= vifcp->vifc_vifi)
        ipst->ips_numvifs = vifcp->vifc_vifi + 1;
    mutex_exit(&ipst->ips_numvifs_mutex);

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
            vifcp->vifc_vifi,
            ntohl(vifcp->vifc_lcl_addr.s_addr),
            (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
            ntohl(vifcp->vifc_rmt_addr.s_addr),
            vifcp->vifc_threshold, vifcp->vifc_rate_limit);
    }

    /* The vif is fully set up; publish it. */
    vifp->v_marks = VIF_MARK_GOOD;
    mutex_exit(&vifp->v_lock);
    ipsq_exit(ipsq);
    return (0);
}


/*
 * Delete a vif from the vif table.
 *
 * Invoked from VIF_REFRELE / VIF_REFRELE_LOCKED, with v_lock held, once the
 * last reference on a condemned vif is dropped.  Cancels the vif's pending
 * timeout, frees the packets queued on its token bucket filter, invalidates
 * the last-encap cache if it points at this vif, shrinks numvifs and
 * finally zeroes both the tbf and the vif slot.
 *
 * NOTE(review): the final bzero() also wipes v_lock, which the callers do
 * not release on this path; presumably a zeroed mutex counts as released --
 * confirm.
 */
static void
del_vifp(struct vif *vifp)
{
    struct tbf  *t = vifp->v_tbf;
    mblk_t  *mp0;
    vifi_t  vifi;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
    ASSERT(t != NULL);

    /*
     * ip_mrouter_done() sets ips_ip_g_mrouter to NULL, and the last
     * reference on a vif may be dropped after that, so the conn cannot
     * be assumed to exist here.
     */
    if (ipst->ips_ip_mrtdebug > 1 && mrouter != NULL) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
    }

    if (vifp->v_timeout_id != 0) {
        (void) untimeout(vifp->v_timeout_id);
        vifp->v_timeout_id = 0;
    }

    /*
     * Free packets queued at the interface.
     * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
     */
    mutex_enter(&t->tbf_lock);
    while (t->tbf_q != NULL) {
        mp0 = t->tbf_q;
        t->tbf_q = t->tbf_q->b_next;
        mp0->b_prev = mp0->b_next = NULL;
        freemsg(mp0);
    }
    mutex_exit(&t->tbf_lock);

    /*
     * Always clear cache when vifs change.
     * The cache is reset under ips_last_encap_lock.
     */
    mutex_enter(&ipst->ips_last_encap_lock);
    if (vifp == ipst->ips_last_encap_vif) {
        ipst->ips_last_encap_vif = NULL;
        ipst->ips_last_encap_src = 0;
    }
    mutex_exit(&ipst->ips_last_encap_lock);

    mutex_destroy(&t->tbf_lock);

    bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));

    /*
     * Adjust numvifs down.
     * The scan treats a non-zero local address as "slot in use", so
     * this vif's address has to be cleared first; otherwise deleting
     * the highest-numbered vif would never lower numvifs.
     */
    mutex_enter(&ipst->ips_numvifs_mutex);
    vifp->v_lcl_addr.s_addr = 0;
    for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
        if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
            break;
    ipst->ips_numvifs = vifi;
    mutex_exit(&ipst->ips_numvifs_mutex);

    bzero(vifp, sizeof (*vifp));
}

/*
 * Delete the vif whose index is *vifip.
 *
 * connp and first_mp are only used to become exclusive on the ipsq; when
 * first_mp is NULL the caller (reset_mrt_vif_ipif()) is expected to hold
 * exclusive access already.
 *
 * Returns:
 *  0             - the vif was marked condemned and its reference dropped
 *  EINVAL        - *vifip is not below ips_numvifs
 *  EADDRNOTAVAIL - the vif is not initialized or is not marked good
 *  EINPROGRESS   - ipsq_try_enter() returned NULL; it was handed
 *                  ip_restart_optmgmt as the restart function
 */
static int
del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
{
    struct vif  *vifp;
    ipsq_t      *ipsq = NULL;

    if (*vifip >= ipst->ips_numvifs)
        return (EINVAL);

    /* Only form the slot pointer once the index is known to be valid. */
    vifp = ipst->ips_vifs + *vifip;

    mutex_enter(&vifp->v_lock);
    /*
     * Not initialized
     * Here we are not looking at the vif that is being initialized
     * i.e vifp->v_marks == 0 and refcnt > 0.
     */
    if (vifp->v_lcl_addr.s_addr == 0 ||
        !(vifp->v_marks & VIF_MARK_GOOD)) {
        mutex_exit(&vifp->v_lock);
        return (EADDRNOTAVAIL);
    }

    /*
     * This is an optimization, if first_mp == NULL
     * then we are being called from reset_mrt_vif_ipif()
     * so we already have exclusive access to the ipsq.
     * The ASSERT further down is a check for this condition.
     */
    if (first_mp != NULL &&
        !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
        ASSERT(connp != NULL);
        /*
         * We have to be exclusive as we have to call ip_delmulti()
         * This is the best position to try to be exclusive in case
         * we have to wait.
         */
        ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
            first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
        if (ipsq == NULL) {
            mutex_exit(&vifp->v_lock);
            return (EINPROGRESS);
        }
        /*
         * Recheck after being exclusive.  The parentheses matter:
         * "!v_marks & VIF_MARK_GOOD" would negate v_marks first and
         * then mask the 0/1 result, instead of testing the GOOD bit.
         */
        if (vifp->v_lcl_addr.s_addr == 0 ||
            !(vifp->v_marks & VIF_MARK_GOOD)) {
            /*
             * someone beat us.
             */
            mutex_exit(&vifp->v_lock);
            ipsq_exit(ipsq);
            return (EADDRNOTAVAIL);
        }
    }


    ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));

    /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
    vifp->v_marks &= ~VIF_MARK_GOOD;
    vifp->v_marks |= VIF_MARK_CONDEMNED;

    /* Phyint only */
    if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
        ipif_t *ipif = vifp->v_ipif;
        ASSERT(ipif != NULL);
        /*
         * should be OK to drop the lock as we
         * have marked this as CONDEMNED.
         */
        mutex_exit(&(vifp)->v_lock);
        (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
        /* ipsq was only entered above when first_mp != NULL. */
        if (first_mp != NULL)
            ipsq_exit(ipsq);
        mutex_enter(&(vifp)->v_lock);
    }

    /*
     * decreases the refcnt added in add_vif.
     */
    VIF_REFRELE_LOCKED(vifp);
    return (0);
}

/*
 * Add an mfc entry, or update the one that already exists for the
 * (origin, group) pair described by mfccp.
 *
 * Three cases are handled, in this order:
 *  1. MFCFIND yields an entry: only its parent and ttls are refreshed.
 *  2. One or more non-condemned entries with queued upcall packets match:
 *     each is filled in via fill_route(), its pending timeout is
 *     cancelled, and the queued packets are handed to ip_mdq() and freed.
 *  3. No such entry was found (nstl == 0): a matching non-condemned entry
 *     is refilled, or a new entry is allocated and linked at the head of
 *     the hash chain.
 *
 * Returns 0 on success; EINVAL if mfcc_parent is out of range, the parent
 * vif has no ipif, or is_mrouter_off() reports true; ENOBUFS if a new
 * entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
    struct mfc *rt;
    struct rtdetq *rte;
    ushort_t nstl;
    int i;
    struct mfcb *mfcbp;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    /*
     * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
     * did not have a real route for pkt.
     * We want this pkt without rt installed in the mfctable to prevent
     * multiple tries, so go ahead and put it in mfctable, it will
     * be discarded later in ip_mdq() because the child is NULL.
     */

    /*
     * Error checking, out of bounds?
     * The comparison is '>' rather than '>=' so that NO_VIF, described
     * above as equal to MAXVIFS, is let through.
     */
    if (mfccp->mfcc_parent > MAXVIFS) {
        ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
            (int)mfccp->mfcc_parent));
        return (EINVAL);
    }

    /* A real parent vif must have an ipif attached. */
    if ((mfccp->mfcc_parent != NO_VIF) &&
        (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
        ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
            (int)mfccp->mfcc_parent));
        return (EINVAL);
    }

    if (is_mrouter_off(ipst)) {
        return (EINVAL);
    }

    /*
     * Take a reference on the hash bucket; every return path below
     * drops it again with MFCB_REFRELE.
     */
    mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
        mfccp->mfcc_mcastgrp.s_addr)];
    MFCB_REFHOLD(mfcbp);
    MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
        mfccp->mfcc_mcastgrp.s_addr, rt);

    /* If an entry already exists, just update the fields */
    if (rt) {
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "add_mfc: update o %x grp %x parent %x",
                ntohl(mfccp->mfcc_origin.s_addr),
                ntohl(mfccp->mfcc_mcastgrp.s_addr),
                mfccp->mfcc_parent);
        }
        mutex_enter(&rt->mfc_mutex);
        rt->mfc_parent = mfccp->mfcc_parent;

        /* Copy one ttl per vif slot currently counted in numvifs. */
        mutex_enter(&ipst->ips_numvifs_mutex);
        for (i = 0; i < (int)ipst->ips_numvifs; i++)
            rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
        mutex_exit(&ipst->ips_numvifs_mutex);
        mutex_exit(&rt->mfc_mutex);

        MFCB_REFRELE(mfcbp);
        return (0);
    }

    /*
     * Find the entry for which the upcall was made and update.
     * nstl counts the matching entries; more than one is reported as
     * "multiple kernel entries".
     */
    for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
        mutex_enter(&rt->mfc_mutex);
        if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
            (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
            (rt->mfc_rte != NULL) &&
            !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
            if (nstl++ != 0)
                cmn_err(CE_WARN,
                    "add_mfc: %s o %x g %x p %x",
                    "multiple kernel entries",
                    ntohl(mfccp->mfcc_origin.s_addr),
                    ntohl(mfccp->mfcc_mcastgrp.s_addr),
                    mfccp->mfcc_parent);

            if (ipst->ips_ip_mrtdebug > 1) {
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "add_mfc: o %x g %x p %x",
                    ntohl(mfccp->mfcc_origin.s_addr),
                    ntohl(mfccp->mfcc_mcastgrp.s_addr),
                    mfccp->mfcc_parent);
            }
            fill_route(rt, mfccp, ipst);

            /*
             * Prevent cleanup of cache entry.
             * Timer starts in ip_mforward.
             */
            if (rt->mfc_timeout_id != 0) {
                timeout_id_t id;
                /* Save the id: the field is cleared before untimeout. */
                id = rt->mfc_timeout_id;
                /*
                 * Setting the id to zero keeps this
                 * entry from being cleaned up in
                 * expire_upcalls().
                 */
                rt->mfc_timeout_id = 0;
                /*
                 * dropping the lock is fine as we
                 * have a refhold on the bucket.
                 * so mfc cannot be freed.
                 * The timeout can fire but it will see
                 * that mfc_timeout_id == 0 and not cleanup.
                 */
                mutex_exit(&rt->mfc_mutex);
                (void) untimeout(id);
                mutex_enter(&rt->mfc_mutex);
            }

            /*
             * Send all pkts that are queued waiting for the upcall.
             * ip_mdq param tun set to 0 -
             * the return value of ip_mdq() isn't used here,
             * so value we send doesn't matter.
             * Each rte is unlinked under mfc_mutex, and the mutex
             * is dropped around the ip_mdq() call itself.
             */
            while (rt->mfc_rte != NULL) {
                rte = rt->mfc_rte;
                rt->mfc_rte = rte->rte_next;
                mutex_exit(&rt->mfc_mutex);
                (void) ip_mdq(rte->mp, (ipha_t *)
                    rte->mp->b_rptr, rte->ill, 0, rt);
                freemsg(rte->mp);
                mi_free((char *)rte);
                mutex_enter(&rt->mfc_mutex);
            }
        }
        mutex_exit(&rt->mfc_mutex);
    }


    /*
     * It is possible that an entry is being inserted without an upcall
     */
    if (nstl == 0) {
        mutex_enter(&(mfcbp->mfcb_lock));
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "add_mfc: no upcall o %x g %x p %x",
                ntohl(mfccp->mfcc_origin.s_addr),
                ntohl(mfccp->mfcc_mcastgrp.s_addr),
                mfccp->mfcc_parent);
        }
        /* Re-check now that the bucket lock is held. */
        if (is_mrouter_off(ipst)) {
            mutex_exit(&mfcbp->mfcb_lock);
            MFCB_REFRELE(mfcbp);
            return (EINVAL);
        }

        /* Reuse a matching, non-condemned entry if the chain has one. */
        for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

            mutex_enter(&rt->mfc_mutex);
            if ((rt->mfc_origin.s_addr ==
                mfccp->mfcc_origin.s_addr) &&
                (rt->mfc_mcastgrp.s_addr ==
                mfccp->mfcc_mcastgrp.s_addr) &&
                (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
                fill_route(rt, mfccp, ipst);
                mutex_exit(&rt->mfc_mutex);
                break;
            }
            mutex_exit(&rt->mfc_mutex);
        }

        /* No upcall, so make a new entry into mfctable */
        if (rt == NULL) {
            rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
            if (rt == NULL) {
                ip1dbg(("add_mfc: out of memory\n"));
                mutex_exit(&mfcbp->mfcb_lock);
                MFCB_REFRELE(mfcbp);
                return (ENOBUFS);
            }

            /* Insert new entry at head of hash chain */
            mutex_enter(&rt->mfc_mutex);
            fill_route(rt, mfccp, ipst);

            /* Link into table */
            rt->mfc_next   = mfcbp->mfcb_mfc;
            mfcbp->mfcb_mfc = rt;
            mutex_exit(&rt->mfc_mutex);
        }
        mutex_exit(&mfcbp->mfcb_lock);
    }

    MFCB_REFRELE(mfcbp);
    return (0);
}

/*
 * Populate an mfc entry from the mfcctl request supplied by mrouted.
 *
 * The origin, group and parent are copied over, one ttl is copied for
 * each vif slot counted in ips_numvifs (under ips_numvifs_mutex), and all
 * per source-group statistics are reset to zero.
 */
static void
fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
{
    int vif = 0;

    /* Identity of the route. */
    rt->mfc_origin = mfccp->mfcc_origin;
    rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
    rt->mfc_parent = mfccp->mfcc_parent;

    /* Per-vif ttl thresholds; numvifs is re-read on every iteration. */
    mutex_enter(&ipst->ips_numvifs_mutex);
    while (vif < (int)ipst->ips_numvifs) {
        rt->mfc_ttls[vif] = mfccp->mfcc_ttls[vif];
        vif++;
    }
    mutex_exit(&ipst->ips_numvifs_mutex);

    /* Start the src-grp packet accounting from scratch. */
    rt->mfc_pkt_cnt = 0;
    rt->mfc_byte_cnt = 0;
    rt->mfc_wrong_if = 0;
    rt->mfc_last_assert.tv_nsec = 0;
    rt->mfc_last_assert.tv_sec = 0;
}

/*
 * Throw away every upcall packet still queued on an mfc entry: each
 * rtdetq is unlinked from the head of mfc_rte, its message is freed and
 * then the rtdetq itself is released.  mfc_rte is NULL on return.
 */
static void
free_queue(struct mfc *mfcp)
{
    struct rtdetq *pending;

    for (pending = mfcp->mfc_rte; pending != NULL;
        pending = mfcp->mfc_rte) {
        /* Unlink first so the list head stays valid throughout. */
        mfcp->mfc_rte = pending->rte_next;
        freemsg(pending->mp);
        mi_free((char *)pending);
    }
}
/*
 * Go through the hash bucket and free all the entries marked condemned,
 * together with any upcall packets still queued on them, then clear the
 * bucket's own condemned mark.
 */
void
release_mfc(struct mfcb *mfcbp)
{
    /*
     * linkp always addresses the pointer that leads to the entry under
     * inspection (the bucket head or the previous survivor's mfc_next),
     * so unlinking is the same operation wherever the entry sits.
     */
    struct mfc **linkp = &mfcbp->mfcb_mfc;
    struct mfc *entry;

    while ((entry = *linkp) != NULL) {
        if (!(entry->mfc_marks & MFCB_MARK_CONDEMNED)) {
            /* Survivor: step past it. */
            linkp = &entry->mfc_next;
            continue;
        }

        /* Condemned: splice it out, then release it. */
        *linkp = entry->mfc_next;
        free_queue(entry);
        mi_free(entry);
    }

    mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
    ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
}

/*
 * Delete an mfc entry.
 *
 * Looks up the non-condemned entry for the (origin, group) pair in mfccp
 * that has no upcall packets queued, cancels its timeout if one is still
 * set, and marks both the entry and its hash bucket condemned.  Nothing
 * is freed or unlinked in this function itself.
 *
 * Returns 0 on success, or EADDRNOTAVAIL if no such entry exists (either
 * it is not in the table or it still has an upcall pending).
 */
static int
del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
    struct in_addr  origin;
    struct in_addr  mcastgrp;
    struct mfc  *rt;
    uint_t      hash;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    origin = mfccp->mfcc_origin;
    mcastgrp = mfccp->mfcc_mcastgrp;
    hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "del_mfc: o %x g %x",
            ntohl(origin.s_addr),
            ntohl(mcastgrp.s_addr));
    }

    MFCB_REFHOLD(&ipst->ips_mfcs[hash]);

    /*
     * Find mfc in mfctable, finds only entries without upcalls.
     * On a match the loop exits with rt->mfc_mutex still held.
     */
    for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
        mutex_enter(&rt->mfc_mutex);
        if (origin.s_addr == rt->mfc_origin.s_addr &&
            mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
            rt->mfc_rte == NULL &&
            !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
            break;
        mutex_exit(&rt->mfc_mutex);
    }

    /*
     * Return if there was an upcall (mfc_rte != NULL),
     * or rt not in mfctable.
     */
    if (rt == NULL) {
        MFCB_REFRELE(&ipst->ips_mfcs[hash]);
        return (EADDRNOTAVAIL);
    }


    /*
     * no need to hold lock as we have a reference.
     */
    ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
    /* error checking */
    if (rt->mfc_timeout_id != 0) {
        /*
         * Remember the id before clearing the field; otherwise
         * untimeout() would be handed 0 and the pending timeout
         * would never be cancelled.
         */
        timeout_id_t id = rt->mfc_timeout_id;

        ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
        /*
         * Its ok to drop the lock,  the struct cannot be freed
         * since we have a ref on the hash bucket.
         * If the timeout fires anyway, expire_upcalls() sees
         * mfc_timeout_id == 0 and leaves the entry alone.
         */
        rt->mfc_timeout_id = 0;
        mutex_exit(&rt->mfc_mutex);
        (void) untimeout(id);
        mutex_enter(&rt->mfc_mutex);
    }

    ASSERT(rt->mfc_rte == NULL);


    /*
     * Delete the entry from the cache
     */
    rt->mfc_marks |= MFCB_MARK_CONDEMNED;
    mutex_exit(&rt->mfc_mutex);

    MFCB_REFRELE(&ipst->ips_mfcs[hash]);

    return (0);
}

#define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * On entry mp->b_prev carries either PIM_REGISTER_MARKER or a tunnel
 * source address (0 when neither applies); it is cleared here once that
 * information has been picked up.
 *
 * If a cache entry is found the packet is passed to ip_mdq() and its
 * return value is returned.  Otherwise a copy of the packet is queued on
 * an mfc entry and, for the first such packet, an IGMPMSG_NOCACHE upcall
 * is passed to the routing daemon and an expiry timeout is started.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 * Return values are:
 *   0 - pkt is okay and phyint
 *  -1 - pkt is malformed and to be tossed
 *   1 - pkt came in on tunnel
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
    struct mfc  *rt;
    ipaddr_t    src, dst, tunnel_src = 0;
    static int  srctun = 0;
    vifi_t      vifi;
    boolean_t   pim_reg_packet = B_FALSE;
    struct mfcb *mfcbp;
    ip_stack_t  *ipst = ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
            ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
            ill->ill_name);
    }

    /*
     * b_prev is either the PIM register marker or the tunnel source
     * address; decode it into pim_reg_packet / tunnel_src.
     */
    dst = ipha->ipha_dst;
    if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
        pim_reg_packet = B_TRUE;
    else
        tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

    /*
     * Don't forward a packet with time-to-live of zero or one,
     * or a packet destined to a local-only group.
     */
    if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
        (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "ip_mforward: not forwarded ttl %d,"
                " dst 0x%x ill %s",
                ipha->ipha_ttl, ntohl(dst), ill->ill_name);
        }
        mp->b_prev = NULL;
        if (tunnel_src != 0)
            return (1);
        else
            return (0);
    }

    if ((tunnel_src != 0) || pim_reg_packet) {
        /*
         * Packet arrived over an encapsulated tunnel or via a PIM
         * register message. Both ip_mroute_decap() and pim_input()
         * encode information in mp->b_prev.
         */
        mp->b_prev = NULL;
        if (ipst->ips_ip_mrtdebug > 1) {
            if (tunnel_src != 0) {
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "ip_mforward: ill %s arrived via ENCAP TUN",
                    ill->ill_name);
            } else if (pim_reg_packet) {
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "ip_mforward: ill %s arrived via"
                    "  REGISTER VIF",
                    ill->ill_name);
            }
        }
    } else if ((ipha->ipha_version_and_hdr_length & 0xf) <
        (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
        ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
        /*
         * Packet arrived via a physical interface: either the header
         * length field is below (IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2,
         * or the second byte after the basic header is not IPOPT_LSRR.
         */
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "ip_mforward: ill %s arrived via PHYINT",
                ill->ill_name);
        }

    } else {
        /*
         * Packet arrived through a SRCRT tunnel.
         * Source-route tunnels are no longer supported.
         * Error message printed every 1000 times.
         */
        if ((srctun++ % 1000) == 0) {
            cmn_err(CE_WARN,
                "ip_mforward: received source-routed pkt from %x",
                ntohl(ipha->ipha_src));
        }
        return (-1);
    }

    ipst->ips_mrtstat->mrts_fwd_in++;
    src = ipha->ipha_src;

    /* Find route in cache, return NULL if not there or upcalls q'ed. */

    /*
     * Lock the mfctable against changes made by ip_mforward.
     * Note that only add_mfc and del_mfc can remove entries and
     * they run with exclusive access to IP. So we do not need to
     * guard against the rt being deleted, so release lock after reading.
     */

    if (is_mrouter_off(ipst))
        return (-1);

    /*
     * Hold a reference on the hash bucket for the rest of the function;
     * every return path below drops it with MFCB_REFRELE.
     */
    mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
    MFCB_REFHOLD(mfcbp);
    MFCFIND(mfcbp, src, dst, rt);

    /* Entry exists, so forward if necessary */
    if (rt != NULL) {
        int ret = 0;
        ipst->ips_mrtstat->mrts_mfc_hits++;
        if (pim_reg_packet) {
            /* PIM register packets are forwarded as if from the reg vif. */
            ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
            ret = ip_mdq(mp, ipha,
                ipst->ips_vifs[ipst->ips_reg_vif_num].
                v_ipif->ipif_ill,
                0, rt);
        } else {
            ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
        }

        MFCB_REFRELE(mfcbp);
        return (ret);

        /*
         * Don't forward if we don't have a cache entry.  Mrouted will
         * always provide a cache entry in response to an upcall.
         */
    } else {
        /*
         * If we don't have a route for packet's origin, make a copy
         * of the packet and send message to routing daemon.
         */
        struct mfc  *mfc_rt  = NULL;
        mblk_t      *mp0     = NULL;
        mblk_t      *mp_copy = NULL;
        struct rtdetq   *rte     = NULL;
        struct rtdetq   *rte_m, *rte1, *prev_rte;
        uint_t      hash;
        int     npkts;
        boolean_t   new_mfc = B_FALSE;
        ipst->ips_mrtstat->mrts_mfc_misses++;
        /* BSD uses mrts_no_route++ */
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "ip_mforward: no rte ill %s src %x g %x misses %d",
                ill->ill_name, ntohl(src), ntohl(dst),
                (int)ipst->ips_mrtstat->mrts_mfc_misses);
        }
        /*
         * The order of the following code differs from the BSD code.
         * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
         * code works, so SunOS 5.x wasn't changed to conform to the
         * BSD version.
         */

        /*
         * Lock mfctable.  ips_mfcs[hash] is the same bucket as mfcbp,
         * since both are indexed by MFCHASH(src, dst).
         */
        hash = MFCHASH(src, dst);
        mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

        /*
         * If we are turning off mrouted return an error
         */
        if (is_mrouter_off(ipst)) {
            mutex_exit(&mfcbp->mfcb_lock);
            MFCB_REFRELE(mfcbp);
            return (-1);
        }

        /*
         * Is there an upcall waiting for this packet?
         * On a match the loop exits with mfc_rt->mfc_mutex held.
         */
        for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
            mfc_rt = mfc_rt->mfc_next) {
            mutex_enter(&mfc_rt->mfc_mutex);
            if (ipst->ips_ip_mrtdebug > 1) {
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "ip_mforward: MFCTAB hash %d o 0x%x"
                    " g 0x%x\n",
                    hash, ntohl(mfc_rt->mfc_origin.s_addr),
                    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
            }
            /* There is an upcall */
            if ((src == mfc_rt->mfc_origin.s_addr) &&
                (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
                (mfc_rt->mfc_rte != NULL) &&
                !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
                break;
            }
            mutex_exit(&mfc_rt->mfc_mutex);
        }
        /* No upcall, so make a new entry into mfctable */
        if (mfc_rt == NULL) {
            mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
            if (mfc_rt == NULL) {
                ipst->ips_mrtstat->mrts_fwd_drop++;
                ip1dbg(("ip_mforward: out of memory "
                    "for mfc, mfc_rt\n"));
                goto error_return;
            } else
                new_mfc = B_TRUE;
            /* Get resources */
            /* TODO could copy header and dup rest */
            /* mp_copy becomes the upcall message sent to the daemon. */
            mp_copy = copymsg(mp);
            if (mp_copy == NULL) {
                ipst->ips_mrtstat->mrts_fwd_drop++;
                ip1dbg(("ip_mforward: out of memory for "
                    "mblk, mp_copy\n"));
                goto error_return;
            }
            /* From here on mfc_mutex is held in both the new and found case. */
            mutex_enter(&mfc_rt->mfc_mutex);
        }
        /* Get resources for rte, whether first rte or not first. */
        /* Add this packet into rtdetq */
        rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
        if (rte == NULL) {
            ipst->ips_mrtstat->mrts_fwd_drop++;
            mutex_exit(&mfc_rt->mfc_mutex);
            ip1dbg(("ip_mforward: out of memory for"
                " rtdetq, rte\n"));
            goto error_return;
        }

        /* mp0 is the copy that waits on the upcall queue. */
        mp0 = copymsg(mp);
        if (mp0 == NULL) {
            ipst->ips_mrtstat->mrts_fwd_drop++;
            ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
            mutex_exit(&mfc_rt->mfc_mutex);
            goto error_return;
        }
        rte->mp     = mp0;
        if (pim_reg_packet) {
            ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
            rte->ill =
                ipst->ips_vifs[ipst->ips_reg_vif_num].
                v_ipif->ipif_ill;
        } else {
            rte->ill = ill;
        }
        rte->rte_next   = NULL;

        /*
         * Determine if upcall q (rtdetq) has overflowed.
         * mfc_rt->mfc_rte is null by mi_zalloc
         * if it is the first message.
         */
        for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
            rte_m = rte_m->rte_next)
            npkts++;
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "ip_mforward: upcalls %d\n", npkts);
        }
        if (npkts > MAX_UPQ) {
            ipst->ips_mrtstat->mrts_upq_ovflw++;
            mutex_exit(&mfc_rt->mfc_mutex);
            goto error_return;
        }

        if (npkts == 0) {   /* first upcall */
            int i = 0;
            /*
             * Now finish installing the new mfc! Now that we have
             * resources!  Insert new entry at head of hash chain.
             * Use src and dst which are ipaddr_t's.
             */
            mfc_rt->mfc_origin.s_addr = src;
            mfc_rt->mfc_mcastgrp.s_addr = dst;

            mutex_enter(&ipst->ips_numvifs_mutex);
            for (i = 0; i < (int)ipst->ips_numvifs; i++)
                mfc_rt->mfc_ttls[i] = 0;
            mutex_exit(&ipst->ips_numvifs_mutex);
            mfc_rt->mfc_parent = ALL_VIFS;

            /* Link into table */
            if (ipst->ips_ip_mrtdebug > 1) {
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
                    "g 0x%x\n", hash,
                    ntohl(mfc_rt->mfc_origin.s_addr),
                    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
            }
            mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
            ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
            mfc_rt->mfc_rte = NULL;
        }

        /* Link in the upcall */
        /* First upcall */
        if (mfc_rt->mfc_rte == NULL)
            mfc_rt->mfc_rte = rte;
        else {
            /* not the first upcall: walk to the tail and append */
            prev_rte = mfc_rt->mfc_rte;
            for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
                prev_rte = rte1, rte1 = rte1->rte_next)
                ;
            prev_rte->rte_next = rte;
        }

        /*
         * No upcalls waiting, this is first one, so send a message to
         * routing daemon to install a route into kernel table.
         */
        if (npkts == 0) {
            struct igmpmsg  *im;
            /* ipha_protocol is 0, for upcall */
            ASSERT(mp_copy != NULL);
            /* The upcall header is written over the start of the copy. */
            im = (struct igmpmsg *)mp_copy->b_rptr;
            im->im_msgtype  = IGMPMSG_NOCACHE;
            im->im_mbz = 0;
            mutex_enter(&ipst->ips_numvifs_mutex);
            if (pim_reg_packet) {
                im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
                mutex_exit(&ipst->ips_numvifs_mutex);
            } else {
                /*
                 * XXX do we need to hold locks here ?
                 * Report the first vif whose ipif sits on the
                 * arrival ill.
                 */
                for (vifi = 0;
                    vifi < ipst->ips_numvifs;
                    vifi++) {
                    if (ipst->ips_vifs[vifi].v_ipif == NULL)
                        continue;
                    if (ipst->ips_vifs[vifi].
                        v_ipif->ipif_ill == ill) {
                        im->im_vif = (uchar_t)vifi;
                        break;
                    }
                }
                mutex_exit(&ipst->ips_numvifs_mutex);
                ASSERT(vifi < ipst->ips_numvifs);
            }

            ipst->ips_mrtstat->mrts_upcalls++;
            /* Timer to discard upcalls if mrouted is too slow */
            mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
                mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
            /* Both locks are dropped before the message is handed up. */
            mutex_exit(&mfc_rt->mfc_mutex);
            mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
            /* Pass to RAWIP */
            (mrouter->conn_recv)(mrouter, mp_copy, NULL);
        } else {
            mutex_exit(&mfc_rt->mfc_mutex);
            mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
            /*
             * mp_copy is only allocated when a new mfc was created,
             * so it is NULL on this path unless that changes.
             */
            freemsg(mp_copy);
        }

        MFCB_REFRELE(mfcbp);
        if (tunnel_src != 0)
            return (1);
        else
            return (0);
        /*
         * Common failure exit.  Callers have already dropped
         * mfc_mutex (if they held it); the bucket lock and reference
         * are released here, and whatever was allocated is freed.
         * mfc_rt is freed only when it was allocated by this call.
         */
    error_return:
        mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
        MFCB_REFRELE(mfcbp);
        if (mfc_rt != NULL && (new_mfc == B_TRUE))
            mi_free((char *)mfc_rt);
        if (rte != NULL)
            mi_free((char *)rte);
        if (mp_copy != NULL)
            freemsg(mp_copy);
        if (mp0 != NULL)
            freemsg(mp0);
        return (-1);
    }
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
 *
 * Timeout callback; arg is the struct mfc whose upcall timer fired.  The
 * ip_stack_t is reached through the ill recorded on the first queued
 * upcall packet, so the entry must still have a queued rte with a
 * non-NULL ill; otherwise a warning is logged and nothing is done.
 *
 * If mfc_timeout_id has been reset to 0 in the meantime the entry is left
 * alone.  Otherwise the entry and its hash bucket are marked condemned.
 */
static void
expire_upcalls(void *arg)
{
    struct mfc *mfc_rt = arg;
    uint_t hash;
    struct mfc *prev_mfc, *mfc0;
    ip_stack_t  *ipst;
    conn_t      *mrouter;

    /*
     * Bail out when there is no ill to work from.  The test has to be
     * "== NULL": the ill is dereferenced immediately below.
     */
    if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill == NULL) {
        cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
        return;
    }
    ipst = mfc_rt->mfc_rte->ill->ill_ipst;
    mrouter = ipst->ips_ip_g_mrouter;

    hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "expire_upcalls: hash %d s %x g %x",
            hash, ntohl(mfc_rt->mfc_origin.s_addr),
            ntohl(mfc_rt->mfc_mcastgrp.s_addr));
    }
    MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
    mutex_enter(&mfc_rt->mfc_mutex);
    /*
     * if timeout has been set to zero, then the
     * entry has been filled, no need to delete it.
     */
    if (mfc_rt->mfc_timeout_id == 0)
        goto done;
    ipst->ips_mrtstat->mrts_cache_cleanups++;
    mfc_rt->mfc_timeout_id = 0;

    /*
     * Determine entry to be cleaned up in cache table.
     * The walk only feeds the ASSERTs below, which check that the
     * entry is still linked into its bucket.
     */
    for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
        prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
        if (mfc0 == mfc_rt)
            break;

    /* del_mfc takes care of gone mfcs */
    ASSERT(prev_mfc != NULL);
    ASSERT(mfc0 != NULL);

    /*
     * Delete the entry from the cache
     */
    ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
    mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;

    /*
     * release_mfc will drop all queued upcall packets.
     * and will free the mbuf with the pkt, if, timing info.
     */
done:
    mutex_exit(&mfc_rt->mfc_mutex);
    MFCB_REFRELE(&ipst->ips_mfcs[hash]);
}

/*
 * Packet forwarding routine once entry in the cache is made.
 *
 * mp/ipha	the packet and its IP header
 * ill		compared against the ill of the entry's parent vif
 * tunnel_src	compared against the parent vif's v_rmt_addr
 * rt		the forwarding cache entry that matched the packet
 *
 * Returns -1 when the packet is rejected here (parent is NO_VIF, the parent
 * vif cannot be locked, vifi is not below ips_numvifs, or copymsg fails on
 * the wrong-vif upcall path).  Otherwise returns 1 when tunnel_src is
 * non-zero and 0 when it is zero; this holds both for the "arrived on the
 * wrong interface" case and after the forwarding loop has run.
 *
 * mp is not freed in this routine; the wrong-vif upcall and the send
 * helpers in this file all operate on copies of it.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
    ill_t *vill;
    vifi_t vifi;
    struct vif *vifp;
    ipaddr_t dst = ipha->ipha_dst;
    size_t  plen = msgdsize(mp);
    vifi_t num_of_vifs;
    ip_stack_t  *ipst = ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
            ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
            ill->ill_name);
    }

    /*
     * Macro to send packet on vif: dispatches on the vif flags to the
     * tunnel, register or physical-interface send routine.
     */
#define MC_SEND(ipha, mp, vifp, dst) { \
    if ((vifp)->v_flags & VIFF_TUNNEL) \
        encap_send((ipha), (mp), (vifp), (dst)); \
    else if ((vifp)->v_flags & VIFF_REGISTER) \
        register_send((ipha), (mp), (vifp), (dst)); \
    else \
        phyint_send((ipha), (mp), (vifp), (dst)); \
}

    vifi = rt->mfc_parent;

    /*
     * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
     * Mrouted had no route.
     * We wanted the route installed in the mfctable to prevent multiple
     * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
     * NULL so we don't want to check the ill. Still needed as of Mrouted
     * 3.6.
     *
     * NOTE(review): the text above says MAXVIFS but the test below is
     * against NO_VIF -- confirm which value the daemon actually installs.
     */
    if (vifi == NO_VIF) {
        ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
            ill->ill_name));
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
        }
        return (-1);    /* drop pkt */
    }

    /* Take a hold on the parent vif; it is released on every path below. */
    if (!lock_good_vif(&ipst->ips_vifs[vifi]))
        return (-1);
    /*
     * The MFC entries are not cleaned up when an ipif goes
     * away thus this code has to guard against an MFC referencing
     * an ipif that has been closed. Note: reset_mrt_vif_ipif
     * sets the v_ipif to NULL when the ipif disappears.
     */
    ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

    /*
     * NOTE(review): this range check runs only after ips_vifs[vifi] has
     * already been indexed and locked above, and ips_numvifs is read here
     * without ips_numvifs_mutex (the forwarding loop below takes it).
     * Confirm that mfc_parent is bounded where the entry is installed.
     */
    if (vifi >= ipst->ips_numvifs) {
        cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
            "%d ill %s viftable ill %s\n",
            (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
            ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
        unlock_good_vif(&ipst->ips_vifs[vifi]);
        return (-1);
    }
    /*
     * Don't forward if it didn't arrive from the parent vif for its
     * origin.  The packet is accepted only if the arrival ill is the
     * parent vif's ill (or in the same ill group) AND tunnel_src equals
     * the parent vif's remote address.
     */
    vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill;
    if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) ||
        (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
        /* Came in the wrong interface */
        ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
            "numvifs %d ill %s viftable ill %s\n",
            (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
            vill->ill_name));
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "ip_mdq: arrived wrong if, vifi %d ill "
                "%s viftable ill %s\n",
                (int)vifi, ill->ill_name, vill->ill_name);
        }
        ipst->ips_mrtstat->mrts_wrong_if++;
        rt->mfc_wrong_if++;

        /*
         * If we are doing PIM assert processing and we are forwarding
         * packets on this interface, and it is a broadcast medium
         * interface (and not a tunnel), send a message to the routing.
         *
         * We use the first ipif on the list, since it's all we have.
         * Chances are the ipif_flags are the same for ipifs on the ill.
         */
        if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
            (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
            !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
            mblk_t      *mp_copy;
            struct igmpmsg  *im;

            /* TODO could copy header and dup rest */
            mp_copy = copymsg(mp);
            if (mp_copy == NULL) {
                ipst->ips_mrtstat->mrts_fwd_drop++;
                ip1dbg(("ip_mdq: out of memory "
                    "for mblk, mp_copy\n"));
                unlock_good_vif(&ipst->ips_vifs[vifi]);
                return (-1);
            }

            /*
             * The upcall header is written in place over the start
             * of the copied packet.
             */
            im = (struct igmpmsg *)mp_copy->b_rptr;
            im->im_msgtype = IGMPMSG_WRONGVIF;
            im->im_mbz = 0;
            im->im_vif = (ushort_t)vifi;
            /* Pass to RAWIP */
            (mrouter->conn_recv)(mrouter, mp_copy, NULL);
        }
        unlock_good_vif(&ipst->ips_vifs[vifi]);
        if (tunnel_src != 0)
            return (1);
        else
            return (0);
    }
    /*
     * If I sourced this packet, it counts as output, else it was input.
     */
    if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
        ipst->ips_vifs[vifi].v_pkt_out++;
        ipst->ips_vifs[vifi].v_bytes_out += plen;
    } else {
        ipst->ips_vifs[vifi].v_pkt_in++;
        ipst->ips_vifs[vifi].v_bytes_in += plen;
    }
    /* Per-entry counters are updated under the entry's own mutex. */
    mutex_enter(&rt->mfc_mutex);
    rt->mfc_pkt_cnt++;
    rt->mfc_byte_cnt += plen;
    mutex_exit(&rt->mfc_mutex);
    unlock_good_vif(&ipst->ips_vifs[vifi]);
    /*
     * For each vif, decide if a copy of the packet should be forwarded.
     * Forward if:
     *      - the vif threshold ttl is non-zero AND
     *      - the pkt ttl exceeds the vif's threshold
     * A non-zero mfc_ttl indicates that the vif is part of
     * the output set for the mfc entry.
     */
    /* Snapshot the vif count under its mutex; the loop uses the copy. */
    mutex_enter(&ipst->ips_numvifs_mutex);
    num_of_vifs = ipst->ips_numvifs;
    mutex_exit(&ipst->ips_numvifs_mutex);
    /* vifi is reused here as the loop index over the vif table. */
    for (vifp = ipst->ips_vifs, vifi = 0;
        vifi < num_of_vifs;
        vifp++, vifi++) {
        if (!lock_good_vif(vifp))
            continue;
        if ((rt->mfc_ttls[vifi] > 0) &&
            (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
            /*
             * lock_good_vif should not have succeeded if
             * v_ipif is null.
             */
            ASSERT(vifp->v_ipif != NULL);
            vifp->v_pkt_out++;
            vifp->v_bytes_out += plen;
            MC_SEND(ipha, mp, vifp, dst);
            ipst->ips_mrtstat->mrts_fwd_out++;
        }
        unlock_good_vif(vifp);
    }
    if (tunnel_src != 0)
        return (1);
    else
        return (0);
}

/*
 * Forward a packet out of a physical-interface vif.
 *
 * The packet is duplicated with copymsg() and only the duplicate is handed
 * on, so the caller may keep using mp after this returns.  If the copy
 * cannot be made the packet is counted in mrts_fwd_drop and nothing is sent.
 * ipha is not used by this routine; dst is only used for tracing.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;
    mblk_t      *pkt;

    /* TODO could copy header and dup rest */
    if ((pkt = copymsg(mp)) == NULL) {
        ipst->ips_mrtstat->mrts_fwd_drop++;
        ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
        return;
    }

    /* No rate limit on this vif: skip the token bucket filter. */
    if (vifp->v_rate_limit <= 0) {
        tbf_send_packet(vifp, pkt);
        return;
    }

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "phyint_send: tbf_contr rate %d "
            "vifp 0x%p mp 0x%p dst 0x%x",
            vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
    }
    /* The IP header passed along is the one inside the copy. */
    tbf_control(vifp, pkt, (ipha_t *)pkt->b_rptr);
}

/*
 * Hand a whole packet up to the multicast routing daemon so that it can be
 * REGISTER-encapsulated there.
 *
 * A fresh mblk carrying a struct igmpmsg (type IGMPMSG_WHOLEPKT) is placed
 * in front of a copy of the packet and delivered through the mrouter's
 * conn_recv.  The caller's mp is left untouched.  Allocation failures are
 * counted in mrts_pim_nomemory; a flow-controlled mrouter queue is counted
 * in mrts_pim_regsend_drops.  dst is unused.
 */
/* ARGSUSED */
static void
register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;
    mblk_t      *upcall_mp;
    struct igmpmsg  *im;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "register_send: src %x, dst %x\n",
            ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
    }

    /*
     * Leading mblk for the upcall header.  It is sized for an igmpmsg
     * plus an IP header, but only the igmpmsg portion is marked as data.
     */
    upcall_mp = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
    if (upcall_mp == NULL) {
        ++ipst->ips_mrtstat->mrts_pim_nomemory;
        if (ipst->ips_ip_mrtdebug > 3) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "register_send: allocb failure.");
        }
        return;
    }
    upcall_mp->b_wptr = upcall_mp->b_rptr + sizeof (struct igmpmsg);

    /* Hang a private copy of the packet behind the header mblk. */
    upcall_mp->b_cont = copymsg(mp);
    if (upcall_mp->b_cont == NULL) {
        ++ipst->ips_mrtstat->mrts_pim_nomemory;
        if (ipst->ips_ip_mrtdebug > 3) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "register_send: copymsg failure.");
        }
        freeb(upcall_mp);
        return;
    }

    /*
     * icmp_input() asserts that the IP version field holds an
     * appropriate version, so seed the header area from the template IP
     * header before overlaying the igmpmsg fields on it.
     */
    *(ipha_t *)upcall_mp->b_rptr = multicast_encap_iphdr;

    /*
     * Fill in the igmpmsg that the daemon decodes.  im_mbz must be zero:
     * the igmpmsg is an IP header with renamed fields and the daemon
     * recognises these messages by an ipha_protocol (aka im_mbz) of 0.
     */
    im = (struct igmpmsg *)upcall_mp->b_rptr;
    im->im_msgtype = IGMPMSG_WHOLEPKT;
    im->im_src.s_addr = ipha->ipha_src;
    im->im_dst.s_addr = ipha->ipha_dst;
    im->im_mbz = 0;

    ++ipst->ips_mrtstat->mrts_upcalls;
    if (canputnext(mrouter->conn_rq)) {
        /* Pass to RAWIP */
        (mrouter->conn_recv)(mrouter, upcall_mp, NULL);
        return;
    }

    /* The mrouter's read side is flow controlled; drop the upcall. */
    ++ipst->ips_mrtstat->mrts_pim_regsend_drops;
    if (ipst->ips_ip_mrtdebug > 3) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "register_send: register upcall failure.");
    }
    freemsg(upcall_mp);
}

/*
 * Verify the checksum carried in a PIM header.
 *
 * For PIM Register packets the checksum covers only the first PIM_MINLEN
 * bytes following the IP header; for every other PIM type it covers the
 * PIM header and the rest of the packet.
 *
 * The check is run on a dupmsg() of mp whose read/write pointers are
 * adjusted, so mp itself is not modified.
 *
 * returns: B_TRUE, if checksum is okay.
 *          B_FALSE, if checksum is not valid or the message could not be
 *          duplicated.
 */
static boolean_t
pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
{
    mblk_t      *cksum_mp;
    boolean_t   valid = B_TRUE;

    cksum_mp = dupmsg(mp);
    if (cksum_mp == NULL)
        return (B_FALSE);

    /* Skip the IP header so the data starts at the PIM header. */
    cksum_mp->b_rptr += IPH_HDR_LENGTH(ip);

    /* Registers are checksummed over the fixed-length prefix only. */
    if (pimp->pim_type == PIM_REGISTER)
        cksum_mp->b_wptr = cksum_mp->b_rptr + PIM_MINLEN;

    if (IP_CSUM(cksum_mp, 0, 0))
        valid = B_FALSE;

    freemsg(cksum_mp);
    return (valid);
}

/*
 * int
 * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
 *  IP Protocol 103. Register messages are decapsulated and sent
 *  onto multicast forwarding.
 *
 * Returns 0 when the packet passed validation; mp is then left intact so
 * that the caller can deliver it to raw PIM listeners.  Returns -1 when
 * the packet was rejected (pullup failure, bad length, bad version, bad
 * checksum, malformed Register, or copy failure); in that case mp has
 * already been freed here.
 *
 * For a Register message the inner IP header is inspected, so the packet
 * must be long enough to hold the PIM header, the 32-bit register word and
 * a minimal IP header; shorter Registers are rejected and counted in
 * mrts_pim_badregisters rather than being read past the end of the data.
 */
int
pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
    ipha_t      *eip, *ip;
    int     iplen, pimlen, iphlen;
    int     reg_minlen;
    struct pim  *pimp;  /* pointer to a pim struct */
    uint32_t    *reghdr;
    ip_stack_t  *ipst = ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    /*
     * Pullup the msg for PIM protocol processing.
     */
    if (pullupmsg(mp, -1) == 0) {
        ++ipst->ips_mrtstat->mrts_pim_nomemory;
        freemsg(mp);
        return (-1);
    }

    ip = (ipha_t *)mp->b_rptr;
    iplen = ip->ipha_length;    /* still in network byte order */
    iphlen = IPH_HDR_LENGTH(ip);
    pimlen = ntohs(iplen) - iphlen;

    /*
     * Validate lengths
     */
    if (pimlen < PIM_MINLEN) {
        ++ipst->ips_mrtstat->mrts_pim_malformed;
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "pim_input: length not at least minlen");
        }
        freemsg(mp);
        return (-1);
    }

    /*
     * Point to the PIM header.
     */
    pimp = (struct pim *)((caddr_t)ip + iphlen);

    /*
     * Check the version number.
     */
    if (pimp->pim_vers != PIM_VERSION) {
        ++ipst->ips_mrtstat->mrts_pim_badversion;
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "pim_input: unknown version of PIM");
        }
        freemsg(mp);
        return (-1);
    }

    /*
     * Validate the checksum
     */
    if (!pim_validate_cksum(mp, ip, pimp)) {
        ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "pim_input: invalid checksum");
        }
        freemsg(mp);
        return (-1);
    }

    /* Only Register messages need further processing here. */
    if (pimp->pim_type != PIM_REGISTER)
        return (0);

    /*
     * A Register carries a 32-bit register word followed by the
     * encapsulated IP packet.  Make sure both the length claimed by the
     * outer IP header and the data actually present (the message is a
     * single mblk after the pullup above) cover the inner IP header
     * before any of its fields are read.
     */
    reg_minlen = (int)(sizeof (pim_t) + sizeof (*reghdr) +
        sizeof (ipha_t));
    if (pimlen < reg_minlen ||
        (mp->b_wptr - mp->b_rptr) < iphlen + reg_minlen) {
        ++ipst->ips_mrtstat->mrts_pim_badregisters;
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "pim_input: register too short for inner pkt");
        }
        freemsg(mp);
        return (-1);
    }

    reghdr = (uint32_t *)(pimp + 1);
    eip = (ipha_t *)(reghdr + 1);

    /*
     * check if the inner packet is destined to mcast group
     */
    if (!CLASSD(eip->ipha_dst)) {
        ++ipst->ips_mrtstat->mrts_pim_badregisters;
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "pim_input: Inner pkt not mcast .. !");
        }
        freemsg(mp);
        return (-1);
    }
    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "register from %x, to %x, len %d",
            ntohl(eip->ipha_src),
            ntohl(eip->ipha_dst),
            ntohs(eip->ipha_length));
    }
    /*
     * If the null register bit is not set, decapsulate
     * the packet before forwarding it.
     */
    if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
        mblk_t *mp_copy;

        /* Copy the message */
        if ((mp_copy = copymsg(mp)) == NULL) {
            ++ipst->ips_mrtstat->mrts_pim_nomemory;
            freemsg(mp);
            return (-1);
        }

        /*
         * Decapsulate the packet and give it to
         * register_mforward.  The copy is advanced past the outer IP
         * header, the PIM header and the register word so that it
         * starts at the inner IP header.
         */
        mp_copy->b_rptr += iphlen + sizeof (pim_t) +
            sizeof (*reghdr);
        if (register_mforward(q, mp_copy, ill) != 0) {
            freemsg(mp);
            return (-1);
        }
    }

    /*
     * Pass all valid PIM packets up to any process(es) listening on a raw
     * PIM socket. For Solaris it is done right after pim_input() is
     * called.
     */
    return (0);
}

/*
 * PIM sparse mode hook.  pim_input() calls this with the decapsulated
 * inner packet, which is then looped back into IP input as though it had
 * just been received.  pim_input() is responsible for checking that the
 * inner destination is a multicast address.
 *
 * Always returns 0.  mp is handed to ip_rput() and must not be used by the
 * caller afterwards.
 */
/* ARGSUSED */
static int
register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
{
    ip_stack_t  *ipst = ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);

    if (ipst->ips_ip_mrtdebug > 3) {
        ipha_t *inner = (ipha_t *)mp->b_rptr;

        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "register_mforward: src %x, dst %x\n",
            ntohl(inner->ipha_src), ntohl(inner->ipha_dst));
    }

    ++ipst->ips_mrtstat->mrts_pim_regforwards;

    /*
     * ip_mforward() has to learn that this packet arrived on the
     * register vif.  The same channel that ip_mroute_decap() uses is
     * borrowed: mp->b_prev.  There a non-zero value is the address of the
     * far end of the tunnel; here the reserved value
     * PIM_REGISTER_MARKER = 0xffffffff stands for "arrived on the register
     * vif".  This is safe because ip_rput() either frees the packet or
     * passes it to ip_mforward().  Should more than one register vif ever
     * be supported, this scheme needs re-examination.
     */
    mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
    ip_rput(q, mp);
    return (0);
}

/*
 * Forward a packet over a tunnel vif by wrapping it in an outer IP header.
 *
 * A new mblk holding the encapsulating header is chained in front of a
 * copy of the packet; the caller's mp is not modified and may still be used
 * on return.  If either allocation fails the packet is silently not sent.
 * dst is unused.
 */
/* ARGSUSED */
static void
encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;
    mblk_t      *outer_mp;
    ipha_t      *outer;
    ipha_t      *inner;
    size_t      len;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "encap_send: vif %ld enter",
            (ptrdiff_t)(vifp - ipst->ips_vifs));
    }

    /* Length of the packet being encapsulated, from its own header. */
    len = ntohs(ipha->ipha_length);

    /*
     * Allocate the mblk for the outer header, leaving 32 bytes of
     * headroom in front of it, then chain a copy of the packet behind.
     */
    outer_mp = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
    if (outer_mp == NULL)
        return;
    outer_mp->b_rptr += 32;
    outer_mp->b_wptr = outer_mp->b_rptr + sizeof (multicast_encap_iphdr);

    outer_mp->b_cont = copymsg(mp);
    if (outer_mp->b_cont == NULL) {
        freeb(outer_mp);
        return;
    }

    /*
     * Build the outer header from the template.  The tunnel endpoints
     * come from the vif: local address as source, remote address (set up
     * by add_vif()) as destination.
     */
    outer = (ipha_t *)outer_mp->b_rptr;
    *outer = multicast_encap_iphdr;
    ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
    outer->ipha_length = htons(len + sizeof (ipha_t));
    outer->ipha_src = vifp->v_lcl_addr.s_addr;
    outer->ipha_dst = vifp->v_rmt_addr.s_addr;
    ASSERT(outer->ipha_ident == 0);

    /*
     * Turn the encapsulated IP header back into a valid one: the copy's
     * TTL is decremented and its header checksum recomputed.
     */
    inner = (ipha_t *)outer_mp->b_cont->b_rptr;
    inner->ipha_ttl--;
    inner->ipha_hdr_checksum = 0;
    inner->ipha_hdr_checksum = ip_csum_hdr(inner);

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "encap_send: group 0x%x", ntohl(inner->ipha_dst));
    }

    if (vifp->v_rate_limit <= 0) {
        /* No rate limit on this vif: send straight away. */
        tbf_send_packet(vifp, outer_mp);
    } else {
        /* The header given to the filter is the inner (copied) one. */
        tbf_control(vifp, outer_mp, inner);
    }
}

/*
 * De-encapsulate a packet and feed it back through IP input.
 * This routine is called whenever IP gets a packet with prototype
 * IPPROTO_ENCAP and a local destination address.
 *
 * The packet is dropped (freed and counted in mrts_bad_tunnel) when the
 * inner destination is not a multicast address or when no tunnel vif has
 * the outer source as its remote address.  Otherwise the outer header is
 * stripped, the outer source is recorded in mp->b_prev, and the message is
 * passed to ip_rput().  In every case mp is consumed.
 *
 * The result of the vif search is cached in ips_last_encap_src and
 * ips_last_encap_vif under ips_last_encap_lock, so consecutive packets
 * from the same source (whether or not a tunnel was found) skip the scan.
 */
void
ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
{
    ipha_t      *ipha = (ipha_t *)mp->b_rptr;
    ipha_t      *ipha_encap;
    int     hlen = IPH_HDR_LENGTH(ipha);
    ipaddr_t    src;
    struct vif  *vifp;
    ip_stack_t  *ipst = ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    /*
     * Dump the packet if it's not to a multicast destination or if
     * we don't have an encapsulating tunnel with the source.
     * Note:  This code assumes that the remote site IP address
     * uniquely identifies the tunnel (i.e., that this site has
     * at most one tunnel with the remote site).
     */
    ipha_encap = (ipha_t *)((char *)ipha + hlen);
    if (!CLASSD(ipha_encap->ipha_dst)) {
        ipst->ips_mrtstat->mrts_bad_tunnel++;
        ip1dbg(("ip_mroute_decap: bad tunnel\n"));
        freemsg(mp);
        return;
    }
    src = (ipaddr_t)ipha->ipha_src;
    mutex_enter(&ipst->ips_last_encap_lock);
    if (src != ipst->ips_last_encap_src) {
        struct vif *vife;

        /* Cache miss: rescan the vif table for this source. */
        vifp = ipst->ips_vifs;
        vife = vifp + ipst->ips_numvifs;
        ipst->ips_last_encap_src = src;
        ipst->ips_last_encap_vif = 0;
        for (; vifp < vife; ++vifp) {
            if (!lock_good_vif(vifp))
                continue;
            if (vifp->v_rmt_addr.s_addr == src) {
                /*
                 * The search stops at the first vif whose
                 * remote address matches, but only a tunnel
                 * vif is remembered as a hit.
                 */
                if (vifp->v_flags & VIFF_TUNNEL)
                    ipst->ips_last_encap_vif = vifp;
                if (ipst->ips_ip_mrtdebug > 1) {
                    (void) mi_strlog(mrouter->conn_rq,
                        1, SL_TRACE,
                        "ip_mroute_decap: good tun "
                        "vif %ld with %x",
                        (ptrdiff_t)(vifp - ipst->ips_vifs),
                        ntohl(src));
                }
                unlock_good_vif(vifp);
                break;
            }
            unlock_good_vif(vifp);
        }
    }
    if (ipst->ips_last_encap_vif == NULL) {
        mutex_exit(&ipst->ips_last_encap_lock);
        ipst->ips_mrtstat->mrts_bad_tunnel++;
        freemsg(mp);
        /*
         * There is no vif to report here, so the message carries only
         * the source; computing an index from a NULL vif pointer would
         * be meaningless.
         */
        ip1dbg(("ip_mroute_decap: no tunnel with %x\n", ntohl(src)));
        return;
    }
    mutex_exit(&ipst->ips_last_encap_lock);

    /*
     * Need to pass in the tunnel source to ip_mforward (so that it can
     * verify that the packet arrived over the correct vif.)  We use b_prev
     * to pass this information. This is safe since the ip_rput either
     * frees the packet or passes it to ip_mforward.
     */
    mp->b_prev = (mblk_t *)(uintptr_t)src;
    mp->b_rptr += hlen;
    /* Feed back into ip_rput as an M_DATA. */
    ip_rput(q, mp);
}

/*
 * Delete every vif whose v_ipif is the given ipif.  Called when an
 * interface goes away (stream closed).  Called as writer.
 *
 * The table is walked from the highest index down to 0 using a count that
 * was sampled under ips_numvifs_mutex.
 */
void
reset_mrt_vif_ipif(ipif_t *ipif)
{
    ip_stack_t  *ipst = ipif->ipif_ill->ill_ipst;
    vifi_t      remaining;
    vifi_t      idx;

    mutex_enter(&ipst->ips_numvifs_mutex);
    remaining = ipst->ips_numvifs;
    mutex_exit(&ipst->ips_numvifs_mutex);

    /*
     * vifi_t is unsigned, so a "down to >= 0" loop cannot be written
     * directly; count the entries still to visit instead.
     */
    while (remaining != 0) {
        idx = remaining - 1;
        if (ipst->ips_vifs[idx].v_ipif == ipif)
            (void) del_vif(&idx, NULL, NULL, ipst);
        remaining--;
    }
}

/*
 * Remove pending upcall msgs when ill goes away.  Called by ill_delete.
 *
 * Every bucket of the forwarding cache is visited with a reference held on
 * the bucket.  For each cache entry, the list of queued upcall records
 * (mfc_rte) is walked under the entry's mfc_mutex and every record that
 * refers to the departing ill is unlinked, its message freed and the record
 * itself released.  Records belonging to other ills are left in place, in
 * their original order.
 */
void
reset_mrt_ill(ill_t *ill)
{
    struct mfc      *rt;
    struct rtdetq   *rte;
    struct rtdetq   **rtep;
    int         i;
    ip_stack_t  *ipst = ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    for (i = 0; i < MFCTBLSIZ; i++) {
        MFCB_REFHOLD(&ipst->ips_mfcs[i]);
        if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
            if (ipst->ips_ip_mrtdebug > 1) {
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "reset_mrt_ill: mfctable [%d]", i);
            }
            while (rt != NULL) {
                mutex_enter(&rt->mfc_mutex);
                /*
                 * Walk with a pointer to the link being
                 * examined so that a record can be removed
                 * from anywhere in the list, and so that the
                 * walk always makes progress past records
                 * that belong to a different ill.
                 */
                rtep = &rt->mfc_rte;
                while ((rte = *rtep) != NULL) {
                    if (rte->ill != ill) {
                        rtep = &rte->rte_next;
                        continue;
                    }
                    if (ipst->ips_ip_mrtdebug > 1) {
                        (void) mi_strlog(
                            mrouter->conn_rq,
                            1, SL_TRACE,
                            "reset_mrt_ill: "
                            "ill 0x%p", (void *)ill);
                    }
                    /* Unlink, then release pkt and record. */
                    *rtep = rte->rte_next;
                    freemsg(rte->mp);
                    mi_free((char *)rte);
                }
                mutex_exit(&rt->mfc_mutex);
                rt = rt->mfc_next;
            }
        }
        MFCB_REFRELE(&ipst->ips_mfcs[i]);
    }
}

/*
 * Token bucket filter module.
 * The ipha is for mcastgrp destination for phyint and encap.
 *
 * Decides what happens to one outgoing packet on a rate-limited vif:
 *  - packets larger than MAX_BKT_SIZE are dropped (mrts_pkt2large);
 *  - with an empty queue the packet is sent at once if enough tokens are
 *    available, otherwise it is queued and a tbf_reprocess_q timeout is
 *    armed;
 *  - with a partly filled queue the packet is queued and the queue is
 *    processed;
 *  - with a full queue a lower-priority queued packet is discarded to make
 *    room if one exists, otherwise this packet is dropped (mrts_q_overflow).
 *
 * mp is always consumed: sent, queued or freed.  The tbf lock is taken and
 * released inside this routine; it is not held across tbf_send_packet() on
 * the direct-send path, and any pending timeout is cancelled only after
 * the lock has been dropped.
 */
static void
tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
{
    size_t  p_len =  msgdsize(mp);
    struct tbf  *t    = vifp->v_tbf;
    timeout_id_t id = 0;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    /* Drop if packet is too large */
    if (p_len > MAX_BKT_SIZE) {
        ipst->ips_mrtstat->mrts_pkt2large++;
        freemsg(mp);
        return;
    }
    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
            (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
            ntohl(ipha->ipha_dst));
    }

    mutex_enter(&t->tbf_lock);

    tbf_update_tokens(vifp);

    /*
     * If there are enough tokens,
     * and the queue is empty, send this packet out.
     */
    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
            (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
            t->tbf_q_len);
    }
    /* No packets are queued */
    if (t->tbf_q_len == 0) {
        /* queue empty, send packet if enough tokens */
        if (p_len <= t->tbf_n_tok) {
            t->tbf_n_tok -= p_len;
            mutex_exit(&t->tbf_lock);
            tbf_send_packet(vifp, mp);
            return;
        } else {
            /* Queue packet and timeout till later */
            tbf_queue(vifp, mp);
            ASSERT(vifp->v_timeout_id == 0);
            vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
                TBF_REPROCESS);
        }
    } else if (t->tbf_q_len < t->tbf_max_q_len) {
        /* Finite queue length, so queue pkts and process queue */
        tbf_queue(vifp, mp);
        tbf_process_q(vifp);
    } else {
        /* Check that we have UDP header with IP header */
        size_t hdr_length = IPH_HDR_LENGTH(ipha) +
            sizeof (struct udphdr);

        if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
            if (!pullupmsg(mp, hdr_length)) {
                /*
                 * ipha may point into mp, so report the
                 * failure before the message is freed.
                 */
                ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
                    "vif %ld src 0x%x dst 0x%x\n",
                    (ptrdiff_t)(vifp - ipst->ips_vifs),
                    ntohl(ipha->ipha_src),
                    ntohl(ipha->ipha_dst)));
                freemsg(mp);
                mutex_exit(&t->tbf_lock);
                return;
            } else
                /* Have to reassign ipha after pullupmsg */
                ipha = (ipha_t *)mp->b_rptr;
        }
        /*
         * Queue length too much,
         * try to selectively dq, or queue and process
         */
        if (!tbf_dq_sel(vifp, ipha)) {
            ipst->ips_mrtstat->mrts_q_overflow++;
            freemsg(mp);
        } else {
            tbf_queue(vifp, mp);
            tbf_process_q(vifp);
        }
    }
    /*
     * If the queue drained, the pending timeout is no longer needed.
     * Pick up its id under the lock and cancel it after the lock is
     * released.
     */
    if (t->tbf_q_len == 0) {
        id = vifp->v_timeout_id;
        vifp->v_timeout_id = 0;
    }
    mutex_exit(&t->tbf_lock);
    if (id != 0)
        (void) untimeout(id);
}

/*
 * Append a packet to the tail of the vif's token bucket queue and bump the
 * queue length.  The caller must hold the tbf lock.
 * The ipha is for mcastgrp destination for phyint and encap.
 */
static void
tbf_queue(struct vif *vifp, mblk_t *mp)
{
    struct tbf  *t = vifp->v_tbf;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;
    mblk_t      **linkp;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
    }
    ASSERT(MUTEX_HELD(&t->tbf_lock));

    /*
     * Pick the link that must point at the new packet: the queue head
     * when there is no tail yet, otherwise the current tail's b_next.
     */
    linkp = (t->tbf_t == NULL) ? &t->tbf_q : &t->tbf_t->b_next;
    *linkp = mp;

    /* The new packet becomes the tail and carries no stale links. */
    t->tbf_t = mp;
    mp->b_prev = NULL;
    mp->b_next = NULL;

    t->tbf_q_len++;
}

/*
 * Drain the vif's token bucket queue from the head for as long as the
 * bucket holds enough tokens for the packet at the head.
 *
 * Must be entered with the tbf lock held.  The lock is dropped around each
 * tbf_send_packet() call and re-acquired afterwards, and is held again on
 * return.
 *
 * NOTE : The caller should quntimeout if the queue length is 0.
 */
static void
tbf_process_q(struct vif *vifp)
{
    struct tbf  *t = vifp->v_tbf;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;
    mblk_t      *head;
    size_t      pktlen;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "tbf_process_q 1: vif %ld qlen = %d",
            (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
    }

    ASSERT(MUTEX_HELD(&t->tbf_lock));

    while (t->tbf_q_len > 0) {
        head = t->tbf_q;
        pktlen = (size_t)msgdsize(head);    /* length of ip pkt */

        /* Stop as soon as the head packet cannot be afforded. */
        if (pktlen > t->tbf_n_tok)
            break;

        /* Pay for the packet and take it off the queue. */
        t->tbf_n_tok -= pktlen;
        t->tbf_q = head->b_next;
        t->tbf_q_len--;
        if (t->tbf_q_len == 0)
            t->tbf_t = NULL;
        head->b_next = NULL;

        /* Exit mutex before sending packet, then re-enter */
        mutex_exit(&t->tbf_lock);
        tbf_send_packet(vifp, head);
        mutex_enter(&t->tbf_lock);
    }
}

/* Called at tbf timeout to update tokens, process q and reset timer.  */
static void
tbf_reprocess_q(void *arg)
{
    struct vif *vifp = arg;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    mutex_enter(&vifp->v_tbf->tbf_lock);
    vifp->v_timeout_id = 0;
    tbf_update_tokens(vifp);

    tbf_process_q(vifp);

    if (vifp->v_tbf->tbf_q_len > 0) {
        vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
            TBF_REPROCESS);
    }
    mutex_exit(&vifp->v_tbf->tbf_lock);

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "tbf_reprcess_q: vif %ld timeout id = %p",
            (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
    }
}

/*
 * Selectively discard one packet from the vif's token bucket queue.
 *
 * The queue is searched from the head for the first packet whose priority
 * is lower than that of the packet described by ipha.  If one is found it
 * is unlinked and freed, mrts_drop_sel is bumped and 1 is returned;
 * otherwise the queue is left alone and 0 is returned.  The caller must
 * hold the tbf lock.
 *
 * NOTE : The caller should quntimeout if the queue length is 0.
 */
static int
tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
{
    struct tbf  *t = vifp->v_tbf;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;
    uint_t      p;
    mblk_t      *prev;
    mblk_t      *cur;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "dq_sel: vif %ld dst 0x%x",
            (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
    }

    ASSERT(MUTEX_HELD(&t->tbf_lock));
    p = priority(vifp, ipha);

    prev = NULL;
    for (cur = t->tbf_q; cur != NULL; prev = cur, cur = cur->b_next) {
        /* Keep looking unless this queued packet ranks lower. */
        if (p <= (priority(vifp, (ipha_t *)cur->b_rptr)))
            continue;

        /* Unlink the victim from its predecessor (or the head). */
        if (prev == NULL)
            t->tbf_q = cur->b_next;
        else
            prev->b_next = cur->b_next;

        /* If removing the last packet, fix the tail pointer */
        if (cur == t->tbf_t)
            t->tbf_t = prev;

        cur->b_prev = cur->b_next = NULL;
        freemsg(cur);

        /*
         * It's impossible for the queue to be empty, but
         * we check anyway.
         */
        if (--t->tbf_q_len == 0)
            t->tbf_t = NULL;

        ipst->ips_mrtstat->mrts_drop_sel++;
        return (1);
    }
    return (0);
}

/*
 * Sends packet, 2 cases - encap tunnel, phyint.
 *
 * Tunnel vif: mp is put on the write queue of the vif's ipif.
 * Any other vif: if ilm_lookup_ipif() finds the destination group on the
 * outgoing ipif, a copy of the packet is first looped back through
 * IP_RPUT_LOCAL; mp itself is then passed to ip_rput_forward_multicast().
 *
 * mp is handed off in both cases.  The loopback copy is freed here when it
 * cannot be delivered because no IRE was found.
 */
static void
tbf_send_packet(struct vif *vifp, mblk_t *mp)
{
    ipif_t  *ipif;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;

    /* If encap tunnel options */
    if (vifp->v_flags & VIFF_TUNNEL)  {
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "tbf_send_pkt: ENCAP tunnel vif %ld",
                (ptrdiff_t)(vifp - ipst->ips_vifs));
        }

        /*
         * Feed into ip_wput which will set the ident field and
         * checksum the encapsulating header.
         * BSD gets the cached route vifp->v_route from ip_output()
         * to speed up route table lookups. Not necessary in SunOS 5.x.
         */
        put(vifp->v_ipif->ipif_wq, mp);
        return;

        /* phyint */
    } else {
        /* Need to loop back to members on the outgoing interface. */
        ipha_t  *ipha;
        ipaddr_t    dst;
        ipha  = (ipha_t *)mp->b_rptr;
        dst  = ipha->ipha_dst;
        ipif = vifp->v_ipif;

        if (ilm_lookup_ipif(ipif, dst) != NULL) {
            /*
             * The packet is not yet reassembled, thus we need to
             * pass it to ip_rput_local for checksum verification
             * and reassembly (and fanout the user stream).
             */
            mblk_t  *mp_loop;
            ire_t   *ire;

            if (ipst->ips_ip_mrtdebug > 1) {
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "tbf_send_pkt: loopback vif %ld",
                    (ptrdiff_t)(vifp - ipst->ips_vifs));
            }
            mp_loop = copymsg(mp);
            ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
                ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

            if (mp_loop != NULL && ire != NULL) {
                IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
                    ((ipha_t *)mp_loop->b_rptr),
                    ire, (ill_t *)ipif->ipif_rq->q_ptr);
            } else {
                /* Either copymsg failed or no ire */
                (void) mi_strlog(mrouter->conn_rq, 1,
                    SL_TRACE,
                    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
                    "vif %ld\n", (void *)mp_loop, (void *)ire,
                    (ptrdiff_t)(vifp - ipst->ips_vifs));
                /*
                 * The copy may exist even though the IRE lookup
                 * failed; nobody else will free it.
                 */
                if (mp_loop != NULL)
                    freemsg(mp_loop);
            }
            if (ire != NULL)
                ire_refrele(ire);
        }
        if (ipst->ips_ip_mrtdebug > 1) {
            (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
                "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
                (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
        }
        ip_rput_forward_multicast(dst, mp, ipif);
    }
}

/*
 * Refill the vif's token bucket.
 *
 * Reads the current time, works out how much time has passed since the
 * bucket was last updated, credits the corresponding number of tokens and
 * caps the bucket at MAX_BKT_SIZE.  The caller must hold the tbf lock.
 */
static void
tbf_update_tokens(struct vif *vifp)
{
    struct tbf  *t = vifp->v_tbf;
    ip_stack_t  *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t      *mrouter = ipst->ips_ip_g_mrouter;
    timespec_t  now;
    hrtime_t    elapsed;

    ASSERT(MUTEX_HELD(&t->tbf_lock));

    /* Time in secs and nsecs, rate limit in kbits/sec */
    gethrestime(&now);

    /*LINTED*/
    TV_DELTA(now, t->tbf_last_pkt_t, elapsed);

    /* From here on, "last update" means now. */
    t->tbf_last_pkt_t = now;

    /*
     * Tokens earned = "time in seconds" * "bytes/second", scaled for
     * nsec:
     * (elapsed/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
     *
     * The (1000/1024) factor was introduced in add_vif so that this
     * divide can be optimized into a shift.
     */
    t->tbf_n_tok += (elapsed/1000) * vifp->v_rate_limit / 1024 / 8;

    /* The bucket never holds more than MAX_BKT_SIZE tokens. */
    if (t->tbf_n_tok > MAX_BKT_SIZE)
        t->tbf_n_tok = MAX_BKT_SIZE;

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "tbf_update_tok: tm %lld tok %d vif %ld",
            elapsed, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
    }
}

/*
 * Return the queueing priority of a packet about to be sent on a vif.
 *
 * Priority is currently derived from the UDP destination port only.
 * Different forwarding mechanisms have different ways of obtaining the
 * port number, which is why the vif is passed along with the packet.
 * Anything that is not UDP gets the default priority of 50.
 *
 * The caller must hold the tbf_lock of the vif's token bucket.
 */
static int
priority(struct vif *vifp, ipha_t *ipha)
{
    /*
     * Temporary hack; may add general packet classifier some day.
     *
     * The UDP port space is divided up into four priority ranges,
     * selected by the top two bits of the destination port:
     * [0, 16384)       : unclassified - lowest priority (50)
     * [16384, 32768)   : audio - highest priority (70)
     * [32768, 49152)   : whiteboard - medium priority (60)
     * [49152, 65536)   : video - low priority (55)
     */
    static const int    band_prio[] = { 50, 70, 60, 55 };
    ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
    conn_t          *mrouter = ipst->ips_ip_g_mrouter;
    struct udphdr   *udp;
    int             prio;

    ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));

    if (ipha->ipha_protocol != IPPROTO_UDP) {
        return (50);    /* default priority */
    }

    /* The UDP header starts right after the IP header. */
    udp = (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
    prio = band_prio[(ntohs(udp->uh_dport) & 0xc000) >> 14];

    if (ipst->ips_ip_mrtdebug > 1) {
        (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
            "priority: port %x prio %d\n",
            ntohs(udp->uh_dport), prio);
    }
    return (prio);
}

/*
 * End of token bucket filter modifications
 */


/*
 * Produces data for netstat -M.
 *
 * Stamps the sizes of struct vifctl and struct mfcctl into the stack's
 * multicast routing statistics, then appends those statistics to mp.
 * Returns 1 on success and 0 if the data could not be appended.
 */
int
ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
{
    ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
    ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);

    if (snmp_append_data(mp, (char *)ipst->ips_mrtstat,
        sizeof (struct mrtstat))) {
        return (1);
    }

    ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
        (size_t)sizeof (struct mrtstat)));
    return (0);
}

/*
 * Sends info for SNMP's MIB.
 *
 * Appends one struct vifctl to mp for every vif below ips_numvifs whose
 * local address is non-zero.  ips_numvifs_mutex is held for the whole walk.
 * Returns 1 on success and 0 as soon as one record cannot be appended.
 */
int
ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
{
    struct vifctl   rec;
    struct vif      *vifp;
    vifi_t          vifi;
    int             ok = 1;

    mutex_enter(&ipst->ips_numvifs_mutex);
    for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
        vifp = &ipst->ips_vifs[vifi];

        /* A zero local address means there is nothing to report. */
        if (vifp->v_lcl_addr.s_addr == 0)
            continue;

        /*
         * The fields are copied without taking any lock other than
         * ips_numvifs_mutex; an approximation is fine.
         */
        rec.vifc_vifi = vifi;
        rec.vifc_flags = vifp->v_flags;
        rec.vifc_threshold = vifp->v_threshold;
        rec.vifc_rate_limit = vifp->v_rate_limit;
        rec.vifc_lcl_addr = vifp->v_lcl_addr;
        rec.vifc_rmt_addr = vifp->v_rmt_addr;
        rec.vifc_pkt_in = vifp->v_pkt_in;
        rec.vifc_pkt_out = vifp->v_pkt_out;

        if (!snmp_append_data(mp, (char *)&rec, sizeof (rec))) {
            ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
                (size_t)sizeof (rec)));
            ok = 0;
            break;
        }
    }
    mutex_exit(&ipst->ips_numvifs_mutex);
    return (ok);
}

/*
 * Called by ip_snmp_get to send up multicast routing table.
 *
 * Walks every bucket of the multicast forwarding cache and appends one
 * struct mfcctl to mp for each reportable entry.  Entries that have a
 * non-NULL mfc_rte or are marked MFCB_MARK_CONDEMNED are skipped.  TTL
 * slots at or beyond ips_numvifs are reported as 0.
 *
 * Returns 1 on success (also when multicast routing is off, in which case
 * nothing is appended) and 0 if a record could not be appended to mp.
 */
int
ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
{
    int         i, j;
    struct mfc      *rt;
    struct mfcctl   mfcc;

    /*
     * Make sure multicast has not been turned off.
     */
    if (is_mrouter_off(ipst))
        return (1);

    /* Loop over all hash buckets and their chains */
    for (i = 0; i < MFCTBLSIZ; i++) {
        MFCB_REFHOLD(&ipst->ips_mfcs[i]);
        for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
            mutex_enter(&rt->mfc_mutex);
            if (rt->mfc_rte != NULL ||
                (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
                mutex_exit(&rt->mfc_mutex);
                continue;
            }

            /*
             * The record is appended to mp byte-for-byte, so start
             * from an all-zero image: any compiler-inserted padding
             * in struct mfcctl must not carry stale stack contents
             * out of the kernel.  This also leaves the TTL slots of
             * unused vifs at 0.
             */
            bzero(&mfcc, sizeof (mfcc));
            mfcc.mfcc_origin = rt->mfc_origin;
            mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
            mfcc.mfcc_parent = rt->mfc_parent;
            mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;

            /* Only the first ips_numvifs TTL slots are meaningful. */
            mutex_enter(&ipst->ips_numvifs_mutex);
            for (j = 0; j < (int)ipst->ips_numvifs; j++)
                mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
            mutex_exit(&ipst->ips_numvifs_mutex);

            mutex_exit(&rt->mfc_mutex);
            if (!snmp_append_data(mp, (char *)&mfcc,
                sizeof (mfcc))) {
                /* Drop the bucket hold before bailing out. */
                MFCB_REFRELE(&ipst->ips_mfcs[i]);
                ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
                    (size_t)sizeof (mfcc)));
                return (0);
            }
        }
        MFCB_REFRELE(&ipst->ips_mfcs[i]);
    }
    return (1);
}