ip_ndp.c revision 309fa1740d090ebe5cd183c34c7bc282f94856fa
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/dlpi.h>
#include <sys/socket.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/zone.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_rts.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/ipsec_impl.h>
#include <inet/ipsec_info.h>
/*
* Function names with nce_ prefix are static while function
* names with ndp_ prefix are used by the rest of IP.
*/
static boolean_t nce_cmp_ll_addr(nce_t *nce, char *new_ll_addr,
uint32_t ll_addr_len);
static void nce_fastpath(nce_t *nce);
static void nce_ire_delete(nce_t *nce);
static void nce_ire_delete1(ire_t *ire, char *nce_arg);
static void nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static nce_t *nce_lookup_addr(ill_t *ill, const in6_addr_t *addr);
static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static void nce_make_mapping(nce_t *nce, uchar_t *addrpos,
uchar_t *addr);
static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static void nce_queue_mp(nce_t *nce, mblk_t *mp);
static void nce_report1(nce_t *nce, uchar_t *mp_arg);
static mblk_t *nce_udreq_alloc(ill_t *ill);
static void nce_update(nce_t *nce, uint16_t new_state,
uchar_t *new_ll_addr);
static uint32_t nce_solicit(nce_t *nce, mblk_t *mp);
static boolean_t nce_xmit(ill_t *ill, uint32_t operation,
ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
const in6_addr_t *target, int flag);
static void lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf);
extern void th_trace_rrecord(th_trace_t *);
#ifdef NCE_DEBUG
void nce_trace_inactive(nce_t *);
#endif
/*
* NDP Cache Entry Hash Table.
* ndp_add() links entries created with NCE_F_MAPPING onto the
* nce_mask_entries list; all other entries are linked into the
* nce_hash_tbl bucket selected by NCE_HASH_PTR() on their address.
*/
#define NCE_TABLE_SIZE 256
static nce_t *nce_hash_tbl[NCE_TABLE_SIZE];
static nce_t *nce_mask_entries; /* mask not all ones */
/*
* Number of threads currently walking the nce lists (see ndp_walk_impl).
* While it is non-zero ndp_delete() does not unlink entries.
*/
static int ndp_g_walker = 0;
/*
* ndp_g_walker_cleanup is set to B_TRUE when a deletion has to be deferred
* because a walker was active; the last walker then does the unlinking.
*/
static boolean_t ndp_g_walker_cleanup = B_FALSE;
/* Address of the nce_hash_tbl bucket head that addr hashes to */
#define NCE_HASH_PTR(addr) \
(&(nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
/*
 * NDP Cache Entry creation routine.
 * Mapped entries will never do NUD.
 * This routine must always be called with ndp_g_lock held.
 * Prior to return, nce_refcnt is incremented.
 *
 * ill			interface the entry belongs to
 * hw_addr		link layer address, or NULL if not yet known
 * addr, mask		address (and mask) the entry describes
 * extract_mask,
 * hw_extract_start	mapping parameters; extract_mask must not be
 *			unspecified when NCE_F_MAPPING is set
 * flags, state		initial nce_flags and nce_state
 * newnce		receives the new entry (held for the caller)
 *
 * Returns 0 on success, EINVAL for an unspecified address, flags outside
 * NCE_EXTERNAL_FLAGS_MASK, a mapping without an extract mask or an ill
 * that is ILL_CONDEMNED, and ENOMEM when an mblk cannot be allocated.
 *
 * NOTE: when NCE_F_UNSOL_ADV is set, ndp_g_lock is dropped around the
 * transmit of the advertisement and reacquired before returning.
 */
int
ndp_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
    const in6_addr_t *mask, const in6_addr_t *extract_mask,
    uint32_t hw_extract_start, uint16_t flags, uint16_t state,
    nce_t **newnce)
{
	static nce_t	nce_nil;
	nce_t		*nce;
	mblk_t		*mp;
	mblk_t		*template;
	nce_t		**ncep;
	boolean_t	dropped = B_FALSE;

	ASSERT(MUTEX_HELD(&ndp_g_lock));
	ASSERT(ill != NULL);
	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
		ip0dbg(("ndp_add: no addr\n"));
		return (EINVAL);
	}
	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
		return (EINVAL);
	}
	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
	    (flags & NCE_F_MAPPING)) {
		ip0dbg(("ndp_add: extract mask zero for mapping"));
		return (EINVAL);
	}
	/*
	 * Allocate the mblk to hold the nce.
	 *
	 * XXX This can come out of a separate cache - nce_cache.
	 * We don't need the mp anymore as there are no more
	 * "qwriter"s
	 */
	mp = allocb(sizeof (nce_t), BPRI_MED);
	if (mp == NULL)
		return (ENOMEM);

	nce = (nce_t *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&nce[1];
	*nce = nce_nil;

	/*
	 * This one holds link layer address
	 */
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		template = nce_udreq_alloc(ill);
	} else {
		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
		ASSERT((ill->ill_resolver_mp != NULL));
		template = copyb(ill->ill_resolver_mp);
	}
	if (template == NULL) {
		freeb(mp);
		return (ENOMEM);
	}
	nce->nce_ill = ill;
	nce->nce_flags = flags;
	nce->nce_state = state;
	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
	nce->nce_rcnt = ill->ill_xmit_count;
	nce->nce_addr = *addr;
	nce->nce_mask = *mask;
	nce->nce_extract_mask = *extract_mask;
	nce->nce_ll_extract_start = hw_extract_start;
	nce->nce_fp_mp = NULL;
	nce->nce_res_mp = template;
	if (state == ND_REACHABLE)
		nce->nce_last = TICK_TO_MSEC(lbolt64);
	else
		nce->nce_last = 0;
	nce->nce_qd_mp = NULL;
	nce->nce_mp = mp;
	if (hw_addr != NULL)
		nce_set_ll(nce, hw_addr);
	/* This one is for nce getting created */
	nce->nce_refcnt = 1;
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	if (nce->nce_flags & NCE_F_MAPPING) {
		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
		ncep = &nce_mask_entries;
	} else {
		ncep = ((nce_t **)NCE_HASH_PTR(*addr));
	}

#ifdef NCE_DEBUG
	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
#endif

	/*
	 * Atomically ensure that the ill is not CONDEMNED, before
	 * adding the NCE.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		/*
		 * The nce was never linked in, so nobody else can see it.
		 * Undo everything set up above: the lock, the link layer
		 * template and finally the mblk the nce itself lives in.
		 */
		mutex_destroy(&nce->nce_lock);
		freemsg(template);
		freeb(mp);
		return (EINVAL);
	}
	/* Insert at the head of the chosen list */
	if ((nce->nce_next = *ncep) != NULL)
		nce->nce_next->nce_ptpn = &nce->nce_next;
	*ncep = nce;
	nce->nce_ptpn = ncep;
	*newnce = nce;
	/* This one is for nce being used by an active thread */
	NCE_REFHOLD(*newnce);

	/* Bump up the number of nce's referencing this ill */
	ill->ill_nce_cnt++;
	mutex_exit(&ill->ill_lock);

	/*
	 * Honor the UNSOL_ADV flag.
	 * We cannot hold the ndp_g_lock and call nce_xmit
	 * which does a putnext.
	 */
	if (flags & NCE_F_UNSOL_ADV) {
		flags |= NDP_ORIDE;
		/*
		 * We account for the transmit below by assigning one
		 * less than the ndd variable. Subsequent decrements
		 * are done in ndp_timer.
		 */
		mutex_enter(&nce->nce_lock);
		mutex_exit(&ndp_g_lock);
		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
		mutex_exit(&nce->nce_lock);
		dropped = nce_xmit(ill,
		    ND_NEIGHBOR_ADVERT,
		    ill,	/* ill to be used for extracting ill_nd_lla */
		    B_TRUE,	/* use ill_nd_lla */
		    addr,	/* Source and target of the advertisement pkt */
		    &ipv6_all_hosts_mcast, /* Destination of the packet */
		    flags);
		mutex_enter(&nce->nce_lock);
		/* A dropped transmit does not count against the budget */
		if (dropped)
			nce->nce_unsolicit_count++;
		if (nce->nce_unsolicit_count != 0) {
			nce->nce_timeout_id = timeout(ndp_timer, nce,
			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
		}
		mutex_exit(&nce->nce_lock);
		mutex_enter(&ndp_g_lock);
	}
	/*
	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
	 * We call nce_fastpath from nce_update if the link layer address of
	 * the peer changes from nce_update
	 */
	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
		nce_fastpath(nce);
	return (0);
}
/*
 * Look for an exact (non-mapping) entry for addr on ill and, when there is
 * none, create one through ndp_add(). Both steps run under ndp_g_lock.
 *
 * Returns EEXIST with *newnce set to the (held) existing entry, or whatever
 * ndp_add() returns for a newly created one.
 */
int
ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
    const in6_addr_t *mask, const in6_addr_t *extract_mask,
    uint32_t hw_extract_start, uint16_t flags, uint16_t state,
    nce_t **newnce)
{
	nce_t	*existing;
	int	rv;

	mutex_enter(&ndp_g_lock);
	existing = nce_lookup_addr(ill, addr);
	if (existing != NULL) {
		*newnce = existing;
		rv = EEXIST;
	} else {
		rv = ndp_add(ill, hw_addr, addr, mask, extract_mask,
		    hw_extract_start, flags, state, newnce);
	}
	mutex_exit(&ndp_g_lock);
	return (rv);
}
/*
 * Unlink every CONDEMNED nce found on the list starting at nce and push it
 * onto the caller's private *free_nce_list (chained through nce_next).
 * The unlinked NCEs may still have ires pointing to them, so the private
 * list is later handed to a routine that cleans up the dependent ires and
 * only then does the NCE_REFRELE which can make the NCE inactive.
 *
 * Called with ndp_g_lock held and no walker active.
 */
static void
nce_remove(nce_t *nce, nce_t **free_nce_list)
{
	nce_t	*succ;
	nce_t	**backp;

	ASSERT(MUTEX_HELD(&ndp_g_lock));
	ASSERT(ndp_g_walker == 0);

	while (nce != NULL) {
		/* Remember the successor before nce may be unlinked */
		succ = nce->nce_next;
		mutex_enter(&nce->nce_lock);
		if (nce->nce_flags & NCE_F_CONDEMNED) {
			backp = nce->nce_ptpn;
			if (succ != NULL)
				succ->nce_ptpn = backp;
			*backp = succ;
			nce->nce_ptpn = NULL;
			/* Push onto the head of the private list */
			nce->nce_next = *free_nce_list;
			*free_nce_list = nce;
		}
		mutex_exit(&nce->nce_lock);
		nce = succ;
	}
}
/*
* Delete an nce. The caller must hold a reference on the nce.
*
* 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
* will return this NCE. Also no new IREs will be created that
* point to this NCE (See ire_add_v6). Also no new timeouts will
* be started (See NDP_RESTART_TIMER).
* 2. Cancel any currently running timeouts.
* 3. If there is an ndp walker, return. The walker will do the cleanup.
* This ensures that walkers see a consistent list of NCEs while walking.
* 4. Otherwise remove the NCE from the list of NCEs
* 5. Delete all IREs pointing to this NCE.
*/
void
ndp_delete(nce_t *nce)
{
nce_t **ptpn;
nce_t *nce1;
/* Serialize deletes */
mutex_enter(&nce->nce_lock);
if (nce->nce_flags & NCE_F_CONDEMNED) {
/* Some other thread is doing the delete */
mutex_exit(&nce->nce_lock);
return;
}
/*
* Caller has a refhold. Also 1 ref for being in the list. Thus
* refcnt has to be >= 2
*/
ASSERT(nce->nce_refcnt >= 2);
nce->nce_flags |= NCE_F_CONDEMNED;
mutex_exit(&nce->nce_lock);
nce_fastpath_list_delete(nce);
/*
* Cancel any running timer. Timeout can't be restarted
* since CONDEMNED is set. Can't hold nce_lock across untimeout.
* Passing invalid timeout id is fine.
*/
if (nce->nce_timeout_id != 0) {
(void) untimeout(nce->nce_timeout_id);
nce->nce_timeout_id = 0;
}
mutex_enter(&ndp_g_lock);
if (nce->nce_ptpn == NULL) {
/*
* The last ndp walker has already removed this nce from
* the list after we marked the nce CONDEMNED and before
* we grabbed the ndp_g_lock.
*/
mutex_exit(&ndp_g_lock);
return;
}
if (ndp_g_walker > 0) {
/*
* Can't unlink. The walker will clean up
*/
ndp_g_walker_cleanup = B_TRUE;
mutex_exit(&ndp_g_lock);
return;
}
/*
* Now remove the nce from the list. NDP_RESTART_TIMER won't restart
* the timer since it is marked CONDEMNED.
*/
ptpn = nce->nce_ptpn;
nce1 = nce->nce_next;
if (nce1 != NULL)
nce1->nce_ptpn = ptpn;
*ptpn = nce1;
nce->nce_ptpn = NULL;
nce->nce_next = NULL;
mutex_exit(&ndp_g_lock);
/*
* Delete the IRE_CACHE entries that point at this nce; nce_ire_delete
* also drops one reference on the nce when it is done.
*/
nce_ire_delete(nce);
}
/*
 * Final teardown of an nce whose reference count has reached zero.
 * Called with nce_lock held. Frees every message chain kept in the
 * nce_first_mp_to_free .. nce_last_mp_to_free slots, drops the ill's count
 * of nces and then frees the nce itself.
 */
void
ndp_inactive(nce_t *nce)
{
	mblk_t	**slot;
	mblk_t	*cur;
	ill_t	*ill;

	ASSERT(nce->nce_refcnt == 0);
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ASSERT(nce->nce_fastpath == NULL);

	/* Free all nce allocated messages, one slot at a time */
	for (slot = &nce->nce_first_mp_to_free; ; slot++) {
		while ((cur = *slot) != NULL) {
			*slot = cur->b_next;
			cur->b_next = NULL;
			cur->b_prev = NULL;
			freemsg(cur);
		}
		if (slot == &nce->nce_last_mp_to_free)
			break;
	}

#ifdef NCE_DEBUG
	nce_trace_inactive(nce);
#endif

	ill = nce->nce_ill;
	mutex_enter(&ill->ill_lock);
	/*
	 * If the number of nce's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (--ill->ill_nce_cnt == 0) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}
	mutex_destroy(&nce->nce_lock);
	freeb(nce->nce_mp);
}
/*
 * ndp_walk routine. Delete the nce if it is associated with the ill
 * that is going away (passed in arg). Always called as a writer.
 */
void
ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
{
	ill_t	*departing_ill = (ill_t *)arg;

	if (nce == NULL || nce->nce_ill != departing_ill)
		return;
	ndp_delete(nce);
}
/*
 * Walk a private list of NCEs that are about to become inactive (chained
 * through nce_next) and, for each one, blow away all the ires that point
 * to it before dropping a reference on it.
 */
static void
nce_ire_delete_list(nce_t *nce)
{
	nce_t	*rest;

	ASSERT(nce != NULL);
	for (; nce != NULL; nce = rest) {
		/* Detach this entry from the private list first */
		rest = nce->nce_next;
		nce->nce_next = NULL;

		/*
		 * It is possible for the last ndp walker (this thread)
		 * to come here after ndp_delete has marked the nce CONDEMNED
		 * and before it has removed the nce from the fastpath list
		 * or called untimeout. So we need to do it here. It is safe
		 * for both ndp_delete and this thread to do it twice or
		 * even simultaneously since each of the threads has a
		 * reference on the nce.
		 */
		nce_fastpath_list_delete(nce);
		/*
		 * Cancel any running timer. Timeout can't be restarted
		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
		 * Passing invalid timeout id is fine.
		 */
		if (nce->nce_timeout_id != 0) {
			(void) untimeout(nce->nce_timeout_id);
			nce->nce_timeout_id = 0;
		}

		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
		    nce_ire_delete1, (char *)nce, nce->nce_ill);
		NCE_REFRELE_NOTR(nce);
	}
}
/*
 * Delete the IRE_CACHE ires on the nce's ill that point at this nce (via
 * nce_ire_delete1), then drop one reference on the nce.
 */
static void
nce_ire_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
	    nce_ire_delete1, (char *)nce, ill);
	NCE_REFRELE_NOTR(nce);
}
/*
 * ire_walk routine used to delete every IRE that shares this nce.
 * nce_arg is the nce being torn down; ires using another nce are left alone.
 */
static void
nce_ire_delete1(ire_t *ire, char *nce_arg)
{
	ASSERT(ire->ire_type == IRE_CACHE);

	if (ire->ire_nce != (nce_t *)nce_arg)
		return;
	ire_delete(ire);
}
/*
 * Cache entry lookup. Try to find an nce matching the parameters passed:
 * first an exact entry, then a mapping entry.
 * If one is found, the refcnt on the nce will be incremented.
 * ndp_g_lock is taken here unless caller_holds_lock says the caller
 * already has it.
 */
nce_t *
ndp_lookup(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
{
	nce_t		*found;
	boolean_t	take_lock = !caller_holds_lock;

	if (take_lock)
		mutex_enter(&ndp_g_lock);

	found = nce_lookup_addr(ill, addr);
	if (found == NULL)
		found = nce_lookup_mapping(ill, addr);

	if (take_lock)
		mutex_exit(&ndp_g_lock);
	return (found);
}
/*
 * Cache entry lookup. Try to find an nce matching the parameters passed.
 * Look only for exact entries (no mappings): same ill, same address and an
 * all-ones mask. CONDEMNED entries are skipped. If an nce is found, the
 * hold count on that nce is incremented before it is returned.
 * Returns NULL for the unspecified address or when nothing matches.
 */
static nce_t *
nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
{
	nce_t	*cand;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ndp_g_lock));
	if (IN6_IS_ADDR_UNSPECIFIED(addr))
		return (NULL);

	for (cand = *((nce_t **)NCE_HASH_PTR(*addr)); cand != NULL;
	    cand = cand->nce_next) {
		if (cand->nce_ill != ill)
			continue;
		if (!(IN6_ARE_ADDR_EQUAL(&cand->nce_addr, addr)) ||
		    !(IN6_ARE_ADDR_EQUAL(&cand->nce_mask, &ipv6_all_ones)))
			continue;

		mutex_enter(&cand->nce_lock);
		if (!(cand->nce_flags & NCE_F_CONDEMNED)) {
			NCE_REFHOLD_LOCKED(cand);
			mutex_exit(&cand->nce_lock);
			return (cand);
		}
		mutex_exit(&cand->nce_lock);
	}
	return (NULL);
}
/*
 * Cache entry lookup. Try to find an nce matching the parameters passed.
 * Look only for mappings: entries on nce_mask_entries for this ill whose
 * masked address matches addr. Only multicast addresses can match.
 * CONDEMNED entries are skipped; a returned entry has been held.
 */
static nce_t *
nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
{
	nce_t	*cand;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ndp_g_lock));
	if (!IN6_IS_ADDR_MULTICAST(addr))
		return (NULL);

	for (cand = nce_mask_entries; cand != NULL; cand = cand->nce_next) {
		if (cand->nce_ill != ill)
			continue;
		if (!(V6_MASK_EQ(*addr, cand->nce_mask, cand->nce_addr)))
			continue;

		mutex_enter(&cand->nce_lock);
		if (!(cand->nce_flags & NCE_F_CONDEMNED)) {
			NCE_REFHOLD_LOCKED(cand);
			mutex_exit(&cand->nce_lock);
			return (cand);
		}
		mutex_exit(&cand->nce_lock);
	}
	return (NULL);
}
/*
 * Process passed in parameters either from an incoming packet or via
 * user ioctl.
 *
 * nce		cache entry to update
 * hw_addr	link layer address carried by the message, or NULL
 * flag		ND_NA_FLAG_* bits from the message
 * is_adv	B_TRUE for an advertisement, B_FALSE for a solicitation
 *
 * For an ND_INCOMPLETE entry the address is recorded, the state is moved
 * to REACHABLE or STALE and every packet queued on the nce is sent back
 * into IP. For any other state the entry is updated according to the
 * override/solicited/router flags; an entry that stops being a router is
 * deleted together with any default route through it.
 */
void
ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
	ill_t		*ill = nce->nce_ill;
	uint32_t	hw_addr_len = ill->ill_nd_lla_len;
	mblk_t		*mp;
	boolean_t	ll_updated = B_FALSE;
	boolean_t	ll_changed;

	/*
	 * No updates of link layer address or the neighbor state is
	 * allowed, when the cache is in NONUD state. This still
	 * allows for responding to reachability solicitation.
	 * NOTE(review): no NONUD check is visible in this routine;
	 * presumably it is enforced by the callers or by nce_update.
	 */
	mutex_enter(&nce->nce_lock);
	if (nce->nce_state == ND_INCOMPLETE) {
		if (hw_addr == NULL) {
			mutex_exit(&nce->nce_lock);
			return;
		}
		nce_set_ll(nce, hw_addr);
		/*
		 * Update nce state and send the queued packets
		 * back to ip this time ire will be added.
		 */
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(nce, ND_REACHABLE, NULL);
		} else {
			nce_update(nce, ND_STALE, NULL);
		}
		mutex_exit(&nce->nce_lock);
		nce_fastpath(nce);
		/* Detach the whole queue; from here on we own the chain */
		mutex_enter(&nce->nce_lock);
		mp = nce->nce_qd_mp;
		nce->nce_qd_mp = NULL;
		mutex_exit(&nce->nce_lock);
		while (mp != NULL) {
			mblk_t *nxt_mp;

			nxt_mp = mp->b_next;
			mp->b_next = NULL;
			if (mp->b_prev != NULL) {
				ill_t	*inbound_ill;
				queue_t	*fwdq = NULL;
				uint_t	ifindex;

				/* b_prev carries the inbound ifindex */
				ifindex = (uint_t)(uintptr_t)mp->b_prev;
				inbound_ill = ill_lookup_on_ifindex(ifindex,
				    B_TRUE, NULL, NULL, NULL, NULL);
				mp->b_prev = NULL;
				if (inbound_ill == NULL) {
					/*
					 * The inbound interface is gone, so
					 * this packet is dropped. Keep going:
					 * the rest of the chain is no longer
					 * reachable from the nce and would be
					 * leaked if we returned here.
					 */
					freemsg(mp);
					mp = nxt_mp;
					continue;
				}
				fwdq = inbound_ill->ill_rq;
				/*
				 * Send a forwarded packet back into ip_rput_v6
				 * just as in ire_send_v6().
				 * Extract the queue from b_prev (set in
				 * ip_rput_data_v6).
				 */
				if (fwdq != NULL) {
					/*
					 * Forwarded packets hop count will
					 * get decremented in ip_rput_data_v6
					 */
					put(fwdq, mp);
				} else {
					/*
					 * Send locally originated packets back
					 * into ip_wput_v6.
					 */
					put(ill->ill_wq, mp);
				}
				ill_refrele(inbound_ill);
			} else {
				put(ill->ill_wq, mp);
			}
			mp = nxt_mp;
		}
		return;
	}
	ll_changed = nce_cmp_ll_addr(nce, (char *)hw_addr, hw_addr_len);
	if (!is_adv) {
		/* If this is a SOLICITATION request only */
		if (ll_changed)
			nce_update(nce, ND_STALE, hw_addr);
		mutex_exit(&nce->nce_lock);
		return;
	}
	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
		/* If in any other state than REACHABLE, ignore */
		if (nce->nce_state == ND_REACHABLE) {
			nce_update(nce, ND_STALE, NULL);
		}
		mutex_exit(&nce->nce_lock);
		return;
	} else {
		if (ll_changed) {
			nce_update(nce, ND_UNCHANGED, hw_addr);
			ll_updated = B_TRUE;
		}
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(nce, ND_REACHABLE, NULL);
		} else {
			if (ll_updated) {
				nce_update(nce, ND_STALE, NULL);
			}
		}
		mutex_exit(&nce->nce_lock);
		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
		    NCE_F_ISROUTER)) {
			ire_t *ire;

			/*
			 * Router turned to host. We need to remove the
			 * entry as well as any default route that may be
			 * using this as a next hop. This is required by
			 * section 7.2.5 of RFC 2461.
			 */
			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
			    MATCH_IRE_DEFAULT);
			if (ire != NULL) {
				ip_rts_rtmsg(RTM_DELETE, ire, 0);
				ire_delete(ire);
				ire_refrele(ire);
			}
			ndp_delete(nce);
		}
	}
}
/*
* Pass arg1 to the pfi supplied, along with each nce in existence.
* When ill is non-NULL only the nces on that ill are visited.
* ndp_walk() places a REFHOLD on the nce and drops the lock when
* walking the hash list. trace selects between the traced
* (NCE_REFHOLD/NCE_REFRELE) and untraced (_NOTR) hold macros.
*
* While a walk is in progress ndp_g_walker is non-zero, which makes
* ndp_delete() leave CONDEMNED entries on the lists. The last walker to
* finish unlinks those entries and cleans up the ires that use them.
*/
void
ndp_walk_impl(ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace)
{
nce_t *nce;
nce_t *nce1;
nce_t **ncep;
nce_t *free_nce_list = NULL;
mutex_enter(&ndp_g_lock);
ndp_g_walker++; /* Prevent ndp_delete from unlink and free of NCE */
mutex_exit(&ndp_g_lock);
/* First pass: every bucket of the hash table */
for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
for (nce = *ncep; nce; nce = nce1) {
/* Fetch the successor before calling out to pfi */
nce1 = nce->nce_next;
if (ill == NULL || nce->nce_ill == ill) {
if (trace) {
NCE_REFHOLD(nce);
(*pfi)(nce, arg1);
NCE_REFRELE(nce);
} else {
NCE_REFHOLD_NOTR(nce);
(*pfi)(nce, arg1);
NCE_REFRELE_NOTR(nce);
}
}
}
}
/* Second pass: the list of mapping entries */
for (nce = nce_mask_entries; nce; nce = nce1) {
nce1 = nce->nce_next;
if (ill == NULL || nce->nce_ill == ill) {
if (trace) {
NCE_REFHOLD(nce);
(*pfi)(nce, arg1);
NCE_REFRELE(nce);
} else {
NCE_REFHOLD_NOTR(nce);
(*pfi)(nce, arg1);
NCE_REFRELE_NOTR(nce);
}
}
}
mutex_enter(&ndp_g_lock);
ndp_g_walker--;
/*
* While NCE's are removed from global list they are placed
* in a private list, to be passed to nce_ire_delete_list().
* The reason is, there may be ires pointing to this nce
* which needs to cleaned up.
*/
if (ndp_g_walker_cleanup && ndp_g_walker == 0) {
/* Time to delete condemned entries */
for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
nce = *ncep;
if (nce != NULL) {
nce_remove(nce, &free_nce_list);
}
}
nce = nce_mask_entries;
if (nce != NULL) {
nce_remove(nce, &free_nce_list);
}
ndp_g_walker_cleanup = B_FALSE;
}
mutex_exit(&ndp_g_lock);
/* The ire cleanup is done after ndp_g_lock has been dropped */
if (free_nce_list != NULL) {
nce_ire_delete_list(free_nce_list);
}
}
/*
* Walk the nces (all of them, or only those on ill when it is non-NULL),
* calling pfi with each nce and arg1. This is ndp_walk_impl() with the
* traced hold/release variant selected.
*/
void
ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
{
ndp_walk_impl(ill, pfi, arg1, B_TRUE);
}
/*
 * Prepend the zoneid using an ipsec_out_t for later use by functions like
 * ip_rput_v6() after neighbor discovery has taken place. If the message
 * block already has a M_CTL at the front of it, then simply set the zoneid
 * appropriately.
 *
 * Returns the head of the resulting message (mp itself when it already
 * started with an M_CTL), or NULL if the ipsec_out could not be allocated,
 * in which case mp is left untouched.
 */
static mblk_t *
ndp_prepend_zone(mblk_t *mp, zoneid_t zoneid)
{
	mblk_t		*ctl_mp;
	ipsec_out_t	*io;

	ASSERT(zoneid != ALL_ZONES);

	if (mp->b_datap->db_type != M_CTL) {
		/* No control block yet: allocate one and chain mp to it */
		ctl_mp = ipsec_alloc_ipsec_out();
		if (ctl_mp == NULL)
			return (NULL);
		io = (ipsec_out_t *)ctl_mp->b_rptr;
		/* This is not a secure packet */
		io->ipsec_out_secure = B_FALSE;
		io->ipsec_out_zoneid = zoneid;
		ctl_mp->b_cont = mp;
		return (ctl_mp);
	}

	io = (ipsec_out_t *)mp->b_rptr;
	ASSERT(io->ipsec_out_type == IPSEC_OUT);
	io->ipsec_out_zoneid = zoneid;
	return (mp);
}
/*
* Process resolve requests. Handles both mapped entries
* as well as cases that needs to be send out on the wire.
* Lookup a NCE for a given IRE. Regardless of whether one exists
* or one is created, we defer making ire point to nce until the
* ire is actually added at which point the nce_refcnt on the nce is
* incremented. This is done primarily to have symmetry between ire_add()
* and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
*
* Return values, as produced by the code below:
* 0 - a usable nce exists (or the ill is XRESOLV); mp was not queued.
* EINPROGRESS - resolution is under way and mp has been queued on the nce.
* ENOMEM - the zone M_CTL could not be prepended (or nce creation failed).
* EBUSY - nce_solicit() returned 0; the new nce has been deleted.
* For a multicast dst the result of nce_set_multicast() is returned, and
* any other error from ndp_lookup_then_add() is passed through.
*/
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
nce_t *nce;
int err = 0;
uint32_t ms;
mblk_t *mp_nce = NULL;
ASSERT(ill != NULL);
if (IN6_IS_ADDR_MULTICAST(dst)) {
err = nce_set_multicast(ill, dst);
return (err);
}
err = ndp_lookup_then_add(ill,
NULL, /* No hardware address */
dst,
&ipv6_all_ones,
&ipv6_all_zeros,
0,
(ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
ND_INCOMPLETE,
&nce);
switch (err) {
case 0:
/*
* New cache entry was created. Make sure that the state
* is not ND_INCOMPLETE. It can be in some other state
* even before we send out the solicitation as we could
* get un-solicited advertisements.
*
* If this is an XRESOLV interface, simply return 0,
* since we don't want to solicit just yet.
*/
if (ill->ill_flags & ILLF_XRESOLV) {
NCE_REFRELE(nce);
return (0);
}
/* nce_solicit() expects ill_g_lock (reader) and nce_lock held */
rw_enter(&ill_g_lock, RW_READER);
mutex_enter(&nce->nce_lock);
if (nce->nce_state != ND_INCOMPLETE) {
mutex_exit(&nce->nce_lock);
rw_exit(&ill_g_lock);
NCE_REFRELE(nce);
return (0);
}
mp_nce = ndp_prepend_zone(mp, zoneid);
if (mp_nce == NULL) {
/* The caller will free mp */
mutex_exit(&nce->nce_lock);
rw_exit(&ill_g_lock);
ndp_delete(nce);
NCE_REFRELE(nce);
return (ENOMEM);
}
ms = nce_solicit(nce, mp_nce);
rw_exit(&ill_g_lock);
if (ms == 0) {
/* The caller will free mp */
/* Only free the M_CTL that ndp_prepend_zone() allocated */
if (mp_nce != mp)
freeb(mp_nce);
mutex_exit(&nce->nce_lock);
ndp_delete(nce);
NCE_REFRELE(nce);
return (EBUSY);
}
mutex_exit(&nce->nce_lock);
/* Arm the timer for the retransmit interval nce_solicit() returned */
NDP_RESTART_TIMER(nce, (clock_t)ms);
NCE_REFRELE(nce);
return (EINPROGRESS);
case EEXIST:
/* Resolution in progress just queue the packet */
mutex_enter(&nce->nce_lock);
if (nce->nce_state == ND_INCOMPLETE) {
mp_nce = ndp_prepend_zone(mp, zoneid);
if (mp_nce == NULL) {
err = ENOMEM;
} else {
nce_queue_mp(nce, mp_nce);
err = EINPROGRESS;
}
} else {
/*
* Any other state implies we have
* a nce but IRE needs to be added ...
* ire_add_v6() will take care of the
* the case when the nce becomes CONDEMNED
* before the ire is added to the table.
*/
err = 0;
}
mutex_exit(&nce->nce_lock);
NCE_REFRELE(nce);
break;
default:
ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
break;
}
return (err);
}
/*
 * When there is no resolver, the link layer template is passed in
 * the IRE.
 * Lookup a NCE for a given IRE. Regardless of whether one exists
 * or one is created, we defer making ire point to nce until the
 * ire is actually added at which point the nce_refcnt on the nce is
 * incremented. This is done primarily to have symmetry between ire_add()
 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
 *
 * Returns 0 when an entry exists or was created (the hold taken by the
 * lookup/add is dropped again), otherwise the error from the add. For a
 * multicast dst the result of nce_set_multicast() is returned.
 */
int
ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
{
	nce_t		*nce;
	int		err;
	uint16_t	nce_flags;

	ASSERT(ill != NULL);
	if (IN6_IS_ADDR_MULTICAST(dst))
		return (nce_set_multicast(ill, dst));

	nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
	err = ndp_lookup_then_add(ill,
	    NULL,	/* hardware address */
	    dst, &ipv6_all_ones, &ipv6_all_zeros, 0,
	    nce_flags, ND_REACHABLE, &nce);

	/*
	 * Either a cache entry with a proper resolver cookie was created
	 * or one was already there; both count as success.
	 */
	if (err == 0 || err == EEXIST) {
		NCE_REFRELE(nce);
		return (0);
	}

	ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
	return (err);
}
/*
 * For each interface an entry is added for the unspecified multicast group.
 * Here that mapping is used to form the multicast cache entry for a particular
 * multicast destination.
 *
 * Returns 0 if an entry for dst already exists or was created, ESRCH if
 * the ill has no mapping entry, ENOMEM if the scratch buffer for the link
 * layer address cannot be allocated, or the error from ndp_add().
 */
static int
nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
{
	nce_t	*map_nce;	/* Multicast mapping entry */
	nce_t	*nce;
	uchar_t	*lla = NULL;
	int	rv;

	ASSERT(ill != NULL);
	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));

	mutex_enter(&ndp_g_lock);
	nce = nce_lookup_addr(ill, dst);
	if (nce != NULL) {
		mutex_exit(&ndp_g_lock);
		NCE_REFRELE(nce);
		return (0);
	}

	/* No entry, now lookup for a mapping this should never fail */
	map_nce = nce_lookup_mapping(ill, dst);
	if (map_nce == NULL) {
		/* Something broken for the interface. */
		mutex_exit(&ndp_g_lock);
		return (ESRCH);
	}
	ASSERT(map_nce->nce_flags & NCE_F_MAPPING);

	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated, for IRE_IF_NORESOLVER, resolution cookie
		 * in the ill is copied in ndp_add().
		 */
		lla = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
		if (lla == NULL) {
			mutex_exit(&ndp_g_lock);
			NCE_REFRELE(map_nce);
			return (ENOMEM);
		}
		nce_make_mapping(map_nce, lla, (uchar_t *)dst);
	}
	NCE_REFRELE(map_nce);

	/*
	 * IRE_IF_NORESOLVER type simply copies the resolution
	 * cookie passed in. So no hw_addr is needed.
	 */
	rv = ndp_add(ill, lla, dst, &ipv6_all_ones, &ipv6_all_zeros, 0,
	    NCE_F_NONUD, ND_REACHABLE, &nce);
	mutex_exit(&ndp_g_lock);

	/* The scratch address buffer is not needed after ndp_add() */
	if (lla != NULL)
		kmem_free(lla, ill->ill_nd_lla_len);

	if (rv == 0) {
		NCE_REFRELE(nce);
		return (0);
	}
	ip1dbg(("nce_set_multicast: create failed" "%d\n", rv));
	return (rv);
}
/*
 * Return the link layer address, and any flags of a nce.
 *
 * The address to look up is taken from lnr->lnr_addr. On success the link
 * layer address, its length and the NDF_* flags are stored in lnr.
 * Returns ESRCH if no entry exists for the address, 0 otherwise. When the
 * entry is still ND_INCOMPLETE no link layer address is available and lnr
 * is left unmodified.
 */
int
ndp_query(ill_t *ill, struct lif_nd_req *lnr)
{
	nce_t			*nce;
	in6_addr_t		*addr;
	sin6_t			*sin6;
	dl_unitdata_req_t	*dl;

	ASSERT(ill != NULL);
	sin6 = (sin6_t *)&lnr->lnr_addr;
	addr = &sin6->sin6_addr;
	nce = ndp_lookup(ill, addr, B_FALSE);
	if (nce == NULL)
		return (ESRCH);
	/* If in INCOMPLETE state, no link layer address is available yet */
	if (nce->nce_state == ND_INCOMPLETE)
		goto done;
	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
	if (ill->ill_flags & ILLF_XRESOLV)
		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
	else
		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
	    sizeof (lnr->lnr_hdw_addr));
	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
	/*
	 * Build the reported flags from scratch so that the result depends
	 * only on the nce and not on whatever the caller left in lnr_flags.
	 */
	lnr->lnr_flags = 0;
	if (nce->nce_flags & NCE_F_ISROUTER)
		lnr->lnr_flags |= NDF_ISROUTER_ON;
	if (nce->nce_flags & NCE_F_PROXY)
		lnr->lnr_flags |= NDF_PROXY_ON;
	if (nce->nce_flags & NCE_F_ANYCAST)
		lnr->lnr_flags |= NDF_ANYCAST_ON;
done:
	NCE_REFRELE(nce);
	return (0);
}
/*
 * Send Enable/Disable multicast reqs to driver.
 *
 * mp holds a DL_ENABMULTI_REQ or DL_DISABMULTI_REQ whose address field
 * (hw_addr_len bytes at hw_addr_offset) is filled in from the ill's
 * multicast mapping for addr before the message is passed downstream.
 *
 * mp is always consumed: it is sent with putnext() on success and freed
 * on every error return. Returns 0 on success, EINVAL for a bad address
 * field, a non-multicast addr or an unexpected primitive, and ESRCH if the
 * ill has no mapping entry.
 */
int
ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
    uint32_t hw_addr_offset, mblk_t *mp)
{
	nce_t	*nce;
	uchar_t	*hw_addr;

	ASSERT(ill != NULL);
	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
		freemsg(mp);
		return (EINVAL);
	}
	mutex_enter(&ndp_g_lock);
	nce = nce_lookup_mapping(ill, addr);
	if (nce == NULL) {
		mutex_exit(&ndp_g_lock);
		freemsg(mp);
		return (ESRCH);
	}
	mutex_exit(&ndp_g_lock);
	/*
	 * Update dl_addr_length and dl_addr_offset for primitives that
	 * have physical addresses as opposed to full saps
	 */
	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
	case DL_ENABMULTI_REQ:
		/* Track the state if this is the first enabmulti */
		if (ill->ill_dlpi_multicast_state == IDMS_UNKNOWN)
			ill->ill_dlpi_multicast_state = IDMS_INPROGRESS;
		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
		break;
	case DL_DISABMULTI_REQ:
		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
		break;
	default:
		NCE_REFRELE(nce);
		ip1dbg(("ndp_mcastreq: default\n"));
		/*
		 * Free mp like the other error paths do; the caller cannot
		 * tell the EINVAL returns apart, so it cannot free it.
		 */
		freemsg(mp);
		return (EINVAL);
	}
	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
	NCE_REFRELE(nce);
	putnext(ill->ill_wq, mp);
	return (0);
}
/*
* Send a neighbor solicitation.
* Returns number of milliseconds after which we should either rexmit or abort.
* Return of zero means we should abort.
* The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
* The caller also holds ill_g_lock as reader.
*
* When mp is NULL the packet already queued on the nce (nce_qd_mp) is used
* to pick the source address; otherwise mp is first handed to
* nce_queue_mp().
*
* NOTE: This routine drops nce_lock and ill_g_lock (and later reacquires
* them) when sending the packet.
* NOTE: This routine does not consume mp.
* NOTE(review): a non-NULL mp is passed to nce_queue_mp() below, which
* presumably leaves it queued on the nce - confirm what "does not
* consume" is meant to cover.
*/
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
ill_t *ill;
ill_t *src_ill;
ip6_t *ip6h;
in6_addr_t src;
in6_addr_t dst;
ipif_t *ipif;
ip6i_t *ip6i;
boolean_t dropped = B_FALSE;
ASSERT(RW_READ_HELD(&ill_g_lock));
ASSERT(MUTEX_HELD(&nce->nce_lock));
ill = nce->nce_ill;
ASSERT(ill != NULL);
/* No retransmissions left: tell the caller to abort */
if (nce->nce_rcnt == 0) {
return (0);
}
if (mp == NULL) {
ASSERT(nce->nce_qd_mp != NULL);
mp = nce->nce_qd_mp;
} else {
nce_queue_mp(nce, mp);
}
/* Handle ip_newroute_v6 giving us IPSEC packets */
if (mp->b_datap->db_type == M_CTL)
mp = mp->b_cont;
ip6h = (ip6_t *)mp->b_rptr;
if (ip6h->ip6_nxt == IPPROTO_RAW) {
/*
* This message should have been pulled up already in
* ip_wput_v6. We can't do pullups here because the message
* could be from the nce_qd_mp which could have b_next/b_prev
* non-NULL.
*/
ip6i = (ip6i_t *)ip6h;
ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
sizeof (ip6i_t) + IPV6_HDR_LEN);
/* The real IPv6 header follows the ip6i_t */
ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
}
src = ip6h->ip6_src;
/*
* If the src of outgoing packet is one of the assigned interface
* addresses use it, otherwise we will pick the source address below.
*/
src_ill = ill;
if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
/* Search every ill in the group when the ill belongs to one */
if (ill->ill_group != NULL)
src_ill = ill->ill_group->illgrp_ill;
for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
for (ipif = src_ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (IN6_ARE_ADDR_EQUAL(&src,
&ipif->ipif_v6lcl_addr)) {
break;
}
}
if (ipif != NULL)
break;
}
if (src_ill == NULL) {
/* May be a forwarding packet */
src_ill = ill;
src = ipv6_all_zeros;
}
}
dst = nce->nce_addr;
/*
* If source address is unspecified, nce_xmit will choose
* one for us and initialize the hardware address also
* appropriately.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&src))
src_ill = NULL;
/* Charge this transmit against the retransmit count up front */
nce->nce_rcnt--;
mutex_exit(&nce->nce_lock);
rw_exit(&ill_g_lock);
dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
&dst, 0);
rw_enter(&ill_g_lock, RW_READER);
mutex_enter(&nce->nce_lock);
/* A dropped solicitation does not use up a retransmission */
if (dropped)
nce->nce_rcnt++;
return (ill->ill_reachable_retrans_time);
}
/*
 * Process a received Neighbor Solicitation.
 *
 * ill	interface the solicitation arrived on.
 * mp	pulled-up message starting at the IPv6 header; ndp_input() has
 *	already checked the hop limit, ICMPv6 code and minimum length.
 *	The message is not freed here; ndp_input() frees it on return.
 *
 * A solicitation is counted in ipv6IfIcmpInBadNeighborSolicitations and
 * ignored when its target is multicast, its options are malformed, it is
 * a DAD probe (unspecified source) not sent to a solicited-node multicast
 * address or carrying a source link-layer address option, or its target
 * is not one of our permanent cache entries.  Otherwise the sender's
 * cache entry is created/updated and a Neighbor Advertisement is sent.
 */
void
ndp_input_solicit(ill_t *ill, mblk_t *mp)
{
	nd_neighbor_solicit_t *ns;
	uint32_t	hlen = ill->ill_nd_lla_len;
	uchar_t		*haddr = NULL;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	nce_t		*our_nce = NULL;
	in6_addr_t	target;
	in6_addr_t	src;
	int		len;
	int		flag = 0;
	nd_opt_hdr_t	*opt = NULL;
	boolean_t	bad_solicit = B_FALSE;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;

	ip6h = (ip6_t *)mp->b_rptr;
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	src = ip6h->ip6_src;
	ns = (nd_neighbor_solicit_t *)icmp_nd;
	target = ns->nd_ns_target;
	if (IN6_IS_ADDR_MULTICAST(&target)) {
		if (ip_debug > 2) {
			/* ip1dbg */
			pr_addr_dbg("ndp_input_solicit: Target is"
			    " multicast! %s\n", AF_INET6, &target);
		}
		bad_solicit = B_TRUE;
		goto done;
	}
	if (len > sizeof (nd_neighbor_solicit_t)) {
		/* Options present */
		opt = (nd_opt_hdr_t *)&ns[1];
		len -= sizeof (nd_neighbor_solicit_t);
		if (!ndp_verify_optlen(opt, len)) {
			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
			bad_solicit = B_TRUE;
			goto done;
		}
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Check to see if this is a valid DAD solicitation */
		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("ndp_input_solicit: IPv6 "
				    "Destination is not solicited node "
				    "multicast %s\n", AF_INET6,
				    &ip6h->ip6_dst);
			}
			bad_solicit = B_TRUE;
			goto done;
		}
	}

	our_nce = ndp_lookup(ill, &target, B_FALSE);
	/*
	 * If this is a valid Solicitation, a permanent
	 * entry should exist in the cache
	 */
	if (our_nce == NULL ||
	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
		    "ifname=%s ", ill->ill_name));
		if (ip_debug > 2) {
			/* ip1dbg */
			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
		}
		bad_solicit = B_TRUE;
		goto done;
	}

	/* At this point we should have a verified NS per spec */
	if (opt != NULL) {
		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
		if (opt != NULL) {
			/*
			 * No source link layer address option should
			 * be present in a valid DAD request.
			 */
			if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
				ip1dbg(("ndp_input_solicit: source link-layer "
				    "address option present with an "
				    "unspecified source. \n"));
				bad_solicit = B_TRUE;
				goto done;
			}
			haddr = (uchar_t *)&opt[1];
			/*
			 * nd_opt_len counts 8-byte units and includes the
			 * option header itself, so the address that follows
			 * the header must fit in what is left.  Comparing
			 * hlen against the whole option would let later
			 * code read past the end of the option.
			 */
			if (hlen == 0 ||
			    hlen + sizeof (nd_opt_hdr_t) >
			    opt->nd_opt_len * 8) {
				bad_solicit = B_TRUE;
				goto done;
			}
		}
	}

	/*
	 * haddr can be NULL if no options are present,
	 * or no Source link layer address is present in,
	 * recvd NDP options of solicitation message.
	 *
	 * NOTE(review): a DAD probe (unspecified source) always reaches
	 * this point with haddr == NULL, so the lookup below is done on
	 * the unspecified address -- confirm that is intended.
	 */
	if (haddr == NULL) {
		nce_t *nnce;

		mutex_enter(&ndp_g_lock);
		nnce = nce_lookup_addr(ill, &src);
		mutex_exit(&ndp_g_lock);
		if (nnce == NULL) {
			in6_addr_t dst = ipv6_solicited_node_mcast;

			/* Form solicited node multicast address */
			dst.s6_addr32[3] |= src.s6_addr32[3];
			(void) nce_xmit(ill,
			    ND_NEIGHBOR_SOLICIT,
			    ill,
			    B_TRUE,
			    &target,
			    &dst,
			    flag);
			bad_solicit = B_TRUE;
			goto done;
		}
		/*
		 * The lookup was only an existence check, but it returns
		 * the entry held; drop that hold so the nce is not leaked.
		 */
		NCE_REFRELE(nnce);
	}

	/* Set override flag, it will be reset later if need be. */
	flag |= NDP_ORIDE;
	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
		flag |= NDP_UNICAST;
	}

	/*
	 * Create/update the entry for the soliciting node.
	 * or respond to outstanding queries, don't if
	 * the source is unspecified address.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		int	err = 0;
		nce_t	*nnce;

		err = ndp_lookup_then_add(ill,
		    haddr,
		    &src,	/* Soliciting nodes address */
		    &ipv6_all_ones,
		    &ipv6_all_zeros,
		    0,
		    0,
		    ND_STALE,
		    &nnce);
		switch (err) {
		case 0:
			/* done with this entry */
			NCE_REFRELE(nnce);
			break;
		case EEXIST:
			/*
			 * B_FALSE indicates this is not an
			 * an advertisement.
			 */
			ndp_process(nnce, haddr, 0, B_FALSE);
			NCE_REFRELE(nnce);
			break;
		default:
			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
			    err));
			goto done;
		}
		flag |= NDP_SOLICITED;
	} else {
		/*
		 * This is a DAD req, multicast the advertisement
		 * to the all-nodes address.
		 */
		src = ipv6_all_hosts_mcast;
	}
	if (our_nce->nce_flags & NCE_F_ISROUTER)
		flag |= NDP_ISROUTER;
	if (our_nce->nce_flags & NCE_F_PROXY)
		flag &= ~NDP_ORIDE;

	/* Response to a solicitation */
	(void) nce_xmit(ill,
	    ND_NEIGHBOR_ADVERT,
	    ill,	/* ill to be used for extracting ill_nd_lla */
	    B_TRUE,	/* use ill_nd_lla */
	    &target,	/* Source and target of the advertisement pkt */
	    &src,	/* IP Destination (source of original pkt) */
	    flag);
done:
	if (bad_solicit)
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
	if (our_nce != NULL)
		NCE_REFRELE(our_nce);
}
/*
 * Process a received Neighbor Advertisement.
 *
 * ill	interface the advertisement arrived on.
 * mp	pulled-up message starting at the IPv6 header; ndp_input() has
 *	already checked the hop limit, ICMPv6 code and minimum length.
 *	The message is not freed here; ndp_input() frees it on return.
 *
 * Malformed advertisements are counted in
 * ipv6IfIcmpInBadNeighborAdvertisements and dropped.  A valid one is
 * applied, via ndp_process(), to the matching cache entry of every ill
 * in the receiving ill's group (or of just this ill if it is not in one).
 */
void
ndp_input_advert(ill_t *ill, mblk_t *mp)
{
	nd_neighbor_advert_t *na;
	uint32_t	hlen = ill->ill_nd_lla_len;
	uchar_t		*haddr = NULL;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	nce_t		*dst_nce = NULL;
	in6_addr_t	target;
	nd_opt_hdr_t	*opt = NULL;
	int		len;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;

	ip6h = (ip6_t *)mp->b_rptr;
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	na = (nd_neighbor_advert_t *)icmp_nd;
	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
		ip1dbg(("ndp_input_advert: Target is multicast but the "
		    "solicited flag is not zero\n"));
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
		return;
	}
	target = na->nd_na_target;
	if (IN6_IS_ADDR_MULTICAST(&target)) {
		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
		return;
	}
	if (len > sizeof (nd_neighbor_advert_t)) {
		opt = (nd_opt_hdr_t *)&na[1];
		if (!ndp_verify_optlen(opt,
		    len - sizeof (nd_neighbor_advert_t))) {
			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
			return;
		}
		/* At this point we have a verified NA per spec */
		len -= sizeof (nd_neighbor_advert_t);
		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
		if (opt != NULL) {
			haddr = (uchar_t *)&opt[1];
			/*
			 * nd_opt_len counts 8-byte units and includes the
			 * option header itself, so the address that follows
			 * the header must fit in what is left.  Comparing
			 * hlen against the whole option would let later
			 * code read past the end of the option.
			 */
			if (hlen == 0 ||
			    hlen + sizeof (nd_opt_hdr_t) >
			    opt->nd_opt_len * 8) {
				BUMP_MIB(mib,
				    ipv6IfIcmpInBadNeighborAdvertisements);
				return;
			}
		}
	}

	/*
	 * If this interface is part of the group look at all the
	 * ills in the group.
	 */
	rw_enter(&ill_g_lock, RW_READER);
	if (ill->ill_group != NULL)
		ill = ill->ill_group->illgrp_ill;

	for (; ill != NULL; ill = ill->ill_group_next) {
		mutex_enter(&ill->ill_lock);
		if (!ILL_CAN_LOOKUP(ill)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}
		/* Keep the ill around while ill_g_lock is dropped below */
		ill_refhold_locked(ill);
		mutex_exit(&ill->ill_lock);
		dst_nce = ndp_lookup(ill, &target, B_FALSE);
		/* We have to drop the lock since ndp_process calls put* */
		rw_exit(&ill_g_lock);
		if (dst_nce != NULL) {
			if (na->nd_na_flags_reserved &
			    ND_NA_FLAG_ROUTER) {
				dst_nce->nce_flags |= NCE_F_ISROUTER;
			}
			/* B_TRUE indicates this an advertisement */
			ndp_process(dst_nce, haddr,
			    na->nd_na_flags_reserved, B_TRUE);
			NCE_REFRELE(dst_nce);
		}
		/* Re-take the lock before stepping to the next group member */
		rw_enter(&ill_g_lock, RW_READER);
		ill_refrele(ill);
	}
	rw_exit(&ill_g_lock);
}
/*
 * Entry point for received Neighbor Solicitation and Neighbor
 * Advertisement messages; the ICMPv6 checksum has already been verified
 * by the caller.  The message is validated, handed to the matching
 * per-type handler, and always freed before returning.
 */
void
ndp_input(ill_t *ill, mblk_t *mp)
{
	mib2_ipv6IfIcmpEntry_t	*icmp_mib = ill->ill_icmp6_mib;
	ip6_t		*ip6h;
	icmp6_t		*icmp6;
	int		icmp_len;

	/* The handlers expect the whole packet in one contiguous mblk */
	if (!pullupmsg(mp, -1)) {
		ip1dbg(("ndp_input: pullupmsg failed\n"));
		BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
		goto drop;
	}

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
		BUMP_MIB(icmp_mib, ipv6IfIcmpBadHoplimit);
		goto drop;
	}

	/*
	 * The ICMPv6 header must directly follow the IPv6 header: NDP
	 * accepts no extension headers in between, since e.g. a routing
	 * header could be dangerous.  Any AH or ESP headers are assumed
	 * to have been stripped by ip before the packet got here.
	 */
	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
		    ip6h->ip6_nxt));
		BUMP_MIB(icmp_mib, ipv6IfIcmpInErrors);
		goto drop;
	}

	icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	ASSERT(icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT ||
	    icmp6->icmp6_type == ND_NEIGHBOR_ADVERT);
	if (icmp6->icmp6_code != 0) {
		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
		BUMP_MIB(icmp_mib, ipv6IfIcmpInErrors);
		goto drop;
	}

	/*
	 * Both NS and NA need at least an ICMPv6 header followed by
	 * one IPv6 address (the target).
	 */
	icmp_len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	if (icmp_len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
		ip1dbg(("ndp_input: packet too short\n"));
		BUMP_MIB(icmp_mib, ipv6IfIcmpInErrors);
		goto drop;
	}

	switch (icmp6->icmp6_type) {
	case ND_NEIGHBOR_SOLICIT:
		ndp_input_solicit(ill, mp);
		break;
	default:
		/* Anything that is not a solicitation is an advertisement */
		ndp_input_advert(ill, mp);
		break;
	}

drop:
	freemsg(mp);
}
/*
 * Build and send a Neighbor Solicitation or Neighbor Advertisement.
 *
 * ill		interface to send on.
 * operation	ND_NEIGHBOR_SOLICIT or ND_NEIGHBOR_ADVERT.
 * hwaddr_ill	ill whose link-layer address goes into the option; may be
 *		NULL only when sender is the unspecified address.
 * use_nd_lla	B_TRUE to advertise ill_nd_lla, B_FALSE for ill_phys_addr.
 * sender	source address; when unspecified, a source address and
 *		the matching hwaddr_ill are picked here.
 * target	solicitation target / advertisement destination.
 * flag		NDP_* flags.
 *
 * The link-layer address option is *always* included.
 * Returns B_FALSE only when the packet was put() on ill->ill_wq;
 * returns B_TRUE on every failure (packet not sent).
 */
static boolean_t
nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
    boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
    int flag)
{
	ipif_t		*chosen_ipif = NULL;
	mblk_t		*pkt;
	ip6i_t		*ip6i;
	ip6_t		*ip6h;
	icmp6_t		*icmp6;
	nd_opt_hdr_t	*lla_opt;
	uint_t		opt_units;	/* option length in 8-byte units */
	uint32_t	icmp_len;	/* ND message plus option */
	uint32_t	pkt_len;	/* ip6i + IPv6 header + icmp_len */

	/*
	 * With an unspecified sender we choose the source address right
	 * here so that the hardware address can be taken from the ill
	 * that owns it.  Within an interface group the source may come
	 * from any member, and using ill's own hardware address would
	 * then be wrong.
	 *
	 * An NA never arrives here with an unspecified source, and a
	 * specified source always comes with a hwaddr_ill.
	 */
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
	if (IN6_IS_ADDR_UNSPECIFIED(sender)) {
		ASSERT(operation != ND_NEIGHBOR_ADVERT);
		/*
		 * Restrict the choice to addresses on the output interface
		 * (or its group): the peer will create a neighbor cache
		 * entry for this source, so it had better be a valid
		 * neighbor.
		 */
		chosen_ipif = ipif_select_source_v6(ill, target,
		    RESTRICT_TO_ILL, IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
		if (chosen_ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
			    inet_ntop(AF_INET6, (char *)target, buf,
			    sizeof (buf))));
			return (B_TRUE);
		}
		sender = &chosen_ipif->ipif_v6src_addr;
		hwaddr_ill = chosen_ipif->ipif_ill;
	}

	/*
	 * The packet is prefixed with an ip6i_t carrying ATTACH_IF so that
	 * NS/NA are never load spread.  in.mpathd probe packets are pinned
	 * to an interface the same way, and the NS/NA they indirectly
	 * trigger (neighbor doing NUD) must go out on that same interface.
	 */
	opt_units = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7)/8;
	icmp_len = sizeof (nd_neighbor_advert_t) + opt_units * 8;
	pkt_len = IPV6_HDR_LEN + sizeof (ip6i_t) + icmp_len;

	pkt = allocb(pkt_len, BPRI_LO);
	if (pkt == NULL) {
		if (chosen_ipif != NULL)
			ipif_refrele(chosen_ipif);
		return (B_TRUE);
	}
	bzero((char *)pkt->b_rptr, pkt_len);
	pkt->b_wptr = pkt->b_rptr + pkt_len;

	/* ip6i pseudo-header */
	ip6i = (ip6i_t *)pkt->b_rptr;
	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6i->ip6i_nxt = IPPROTO_RAW;
	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;

	/* IPv6 header */
	ip6h = (ip6_t *)(pkt->b_rptr + sizeof (ip6i_t));
	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(icmp_len);
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	ip6h->ip6_hops = IPV6_MAX_HOPS;
	ip6h->ip6_src = *sender;
	ip6h->ip6_dst = *target;

	/*
	 * ICMPv6 header.  The checksum field is primed with the ICMP
	 * length; the real checksum is computed in ip_wput_v6.
	 */
	icmp6 = (icmp6_t *)&ip6h[1];
	icmp6->icmp6_type = (uint8_t)operation;
	icmp6->icmp6_code = 0;
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	lla_opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
	    sizeof (nd_neighbor_advert_t));
	lla_opt->nd_opt_len = (uint8_t)opt_units;

	if (operation == ND_NEIGHBOR_SOLICIT) {
		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

		lla_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
		ns->nd_ns_target = *target;
		if (!(flag & NDP_UNICAST)) {
			/* Form multicast address of the target */
			ip6h->ip6_dst = ipv6_solicited_node_mcast;
			ip6h->ip6_dst.s6_addr32[3] |=
			    ns->nd_ns_target.s6_addr32[3];
		}
	} else {
		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

		lla_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		na->nd_na_target = *sender;
		if (flag & NDP_ISROUTER)
			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
		if (flag & NDP_SOLICITED)
			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
		if (flag & NDP_ORIDE)
			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
	}

	/* Copy the link layer address in right behind the option header */
	mutex_enter(&hwaddr_ill->ill_lock);
	if (use_nd_lla) {
		bcopy(hwaddr_ill->ill_nd_lla, &lla_opt[1],
		    hwaddr_ill->ill_nd_lla_len);
	} else {
		bcopy(hwaddr_ill->ill_phys_addr, &lla_opt[1],
		    hwaddr_ill->ill_nd_lla_len);
	}
	mutex_exit(&hwaddr_ill->ill_lock);

	/* sender may point into the ipif, so release only after its use */
	if (chosen_ipif != NULL)
		ipif_refrele(chosen_ipif);

	if (!canput(ill->ill_wq)) {
		freemsg(pkt);
		return (B_TRUE);
	}
	put(ill->ill_wq, pkt);
	return (B_FALSE);
}
/*
 * Build a link layer address (without the SAP) for a mapping nce.
 * The template address stored in nce_res_mp is copied to addrpos, and
 * then the trailing bytes of the IPv6 address `addr', filtered through
 * nce_extract_mask, are OR-ed in starting at nce_ll_extract_start.
 * Nothing is done for IRE_IF_NORESOLVER interfaces.
 */
static void
nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
{
	ill_t	*ill = nce->nce_ill;
	uchar_t	*maskp;
	uchar_t	*dstp;
	int	nbytes;
	int	i;

	if (ill->ill_net_type == IRE_IF_NORESOLVER)
		return;

	ASSERT(nce->nce_res_mp != NULL);
	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
	ASSERT(nce->nce_flags & NCE_F_MAPPING);
	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
	ASSERT(addr != NULL);

	/* Start from the template link layer address */
	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
	    addrpos, ill->ill_nd_lla_len);

	/* Number of bytes to merge, taken from the tail of mask and addr */
	nbytes = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
	    IPV6_ADDR_LEN);
	maskp = (uchar_t *)&nce->nce_extract_mask + (IPV6_ADDR_LEN - nbytes);
	addr += (IPV6_ADDR_LEN - nbytes);
	dstp = addrpos + nce->nce_ll_extract_start;

	for (i = 0; i < nbytes; i++)
		dstp[i] |= maskp[i] & addr[i];
}
/*
 * NDD handler: write the neighbor cache report into mp.
 * A heading line is printed first, then ndp_walk() calls nce_report1()
 * with mp as its argument to add the per-entry lines.  Always returns 0.
 */
/* ARGSUSED */
int
ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	uchar_t	*report_arg = (uchar_t *)mp;

	/* Column headings */
	(void) mi_mpprintf(mp, "ifname hardware addr flags"
	    " proto addr/mask");

	/* One line per cache entry */
	ndp_walk(NULL, (pfi_t)nce_report1, report_arg);

	return (0);
}
/*
 * Convert a link level address of arbitrary length to an ASCII string
 * of lower-case hex bytes separated by colons, e.g. "00:1b:21:ab:cd:ef".
 *
 * lla		address bytes to convert.
 * addrlen	number of bytes in lla.
 * buf		output buffer; the caller *must* have verified that it can
 *		hold the entire string including the trailing NUL, i.e.
 *		3 * addrlen bytes (at least 1 byte when addrlen <= 0).
 *
 * For addrlen <= 0 the result is the empty string; nothing is ever
 * written in front of buf.
 */
static void
lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf)
{
	static const char hexdigits[] = "0123456789abcdef";
	int i;

	for (i = 0; i < addrlen; i++) {
		/* Separator goes between bytes, never after the last one */
		if (i != 0)
			*buf++ = ':';
		*buf++ = hexdigits[lla[i] >> 4];
		*buf++ = hexdigits[lla[i] & 0xf];
	}
	*buf = '\0';
}
/*
 * Append one line describing `nce' to the NDP cache report.
 * mp_arg is the report mblk, passed as uchar_t * by ndp_walk().
 *
 * The line shows interface name, link layer address (or "None"), flag
 * letters (P permanent, R router, M mapping) and address/prefix length.
 * If the buffer for the address string cannot be allocated the entry is
 * silently left out of the report.
 */
static void
nce_report1(nce_t *nce, uchar_t *mp_arg)
{
	ill_t		*ill = nce->nce_ill;
	mblk_t		*mp = (mblk_t *)mp_arg;
	uint32_t	flags = nce->nce_flags;
	in6_addr_t	mask;
	char		addr_str[INET6_ADDRSTRLEN];
	uchar_t		flag_str[10];
	uchar_t		*fp = flag_str;

	ASSERT(ill != NULL);
	mask = nce->nce_mask;

	/* Build the flag letters */
	if (flags & NCE_F_PERMANENT)
		*fp++ = 'P';
	if (flags & NCE_F_ISROUTER)
		*fp++ = 'R';
	if (flags & NCE_F_MAPPING)
		*fp++ = 'M';
	*fp = '\0';

	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		dl_unitdata_req_t *dl;
		uchar_t		*lla;
		uchar_t		*lla_str;
		uint32_t	lla_len;
		size_t		str_len;

		/*
		 * nce_lock keeps nce_res_mp from changing underneath us
		 * should an external resolver finish address resolution
		 * while we are reading it.
		 */
		mutex_enter(&nce->nce_lock);
		lla = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;

		/* Any address format is handled, not just Ethernet */
		if (ill->ill_flags & ILLF_XRESOLV)
			lla_len = dl->dl_dest_addr_length;
		else
			lla_len = ill->ill_nd_lla_len;

		/* "xx:" per byte; the last ':' becomes the NUL */
		str_len = 3 * lla_len;
		if (str_len != 0) {
			lla_str = kmem_zalloc(str_len, KM_NOSLEEP);
			if (lla_str == NULL) {
				mutex_exit(&nce->nce_lock);
				return;
			}
			lla2ascii((uint8_t *)lla, lla_len, lla_str);
			mutex_exit(&nce->nce_lock);

			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
			    ill->ill_name, lla_str, flag_str,
			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
			    (char *)addr_str, sizeof (addr_str)),
			    ip_mask_to_plen_v6(&mask));
			kmem_free(lla_str, str_len);
			return;
		}
		mutex_exit(&nce->nce_lock);
	}

	/* No link layer address to show */
	(void) mi_mpprintf(mp,
	    "%8s %9s %5s %s/%d",
	    ill->ill_name,
	    "None",
	    flag_str,
	    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
	    (char *)addr_str, sizeof (addr_str)),
	    ip_mask_to_plen_v6(&mask));
}
/*
 * Allocate a DL_UNITDATA_REQ template for `ill' with room for a
 * destination address of ill_nd_lla_len bytes plus the SAP.  The
 * priority fields are zeroed and the SAP is filled in; the link layer
 * address itself is left for the caller.
 * Returns the template, or NULL if the allocation fails.
 */
mblk_t *
nce_udreq_alloc(ill_t *ill)
{
	int			sap_len = ill->ill_sap_length;
	mblk_t			*udreq_mp;
	dl_unitdata_req_t	*udreq;

	udreq_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
	    ill->ill_nd_lla_len + ABS(sap_len), DL_UNITDATA_REQ);
	if (udreq_mp != NULL) {
		udreq = (dl_unitdata_req_t *)udreq_mp->b_rptr;
		udreq->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
		udreq->dl_dest_addr_length =
		    ABS(sap_len) + ill->ill_nd_lla_len;
		udreq->dl_priority.dl_min = 0;
		udreq->dl_priority.dl_max = 0;

		/* Copy in the SAP value. */
		NCE_LL_SAP_COPY(ill, udreq_mp);
	}
	return (udreq_mp);
}
/*
 * NDP retransmit timer.
 * This timer goes off when:
 * a. It is time to retransmit NS for resolver.
 * b. It is time to send reachability probes.
 * c. It is time to send an unsolicited advertisement (ND_REACHABLE
 * entry with NCE_F_UNSOL_ADV set and a non-zero nce_unsolicit_count).
 *
 * arg is the nce the timer belongs to. The handler takes its own hold
 * on the nce, clears nce_timeout_id, and acts on nce_state. Every path
 * out of the switch drops ill_g_lock and nce_lock and releases the hold.
 */
void
ndp_timer(void *arg)
{
nce_t *nce = arg;
ill_t *ill = nce->nce_ill;
uint32_t ms;
char addrbuf[INET6_ADDRSTRLEN];
mblk_t *mp;
boolean_t dropped = B_FALSE;
/*
* The timer has to be cancelled by ndp_delete before doing the final
* refrele. So the NCE is guaranteed to exist when the timer runs
* until it clears the timeout_id. Before clearing the timeout_id
* bump up the refcnt so that we can continue to use the nce
*/
ASSERT(nce != NULL);
/*
* Grab the ill_g_lock now itself to avoid lock order problems.
* nce_solicit needs ill_g_lock to be able to traverse ills
*/
rw_enter(&ill_g_lock, RW_READER);
mutex_enter(&nce->nce_lock);
NCE_REFHOLD_LOCKED(nce);
nce->nce_timeout_id = 0;
/*
* Check the reachability state first.
*/
switch (nce->nce_state) {
case ND_DELAY:
/*
 * The delay has expired: move to PROBE and send the first unicast
 * probe. The sender is passed as unspecified, so nce_xmit picks
 * the source address and hardware address itself.
 */
rw_exit(&ill_g_lock);
nce->nce_state = ND_PROBE;
mutex_exit(&nce->nce_lock);
(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
&ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
if (ip_debug > 3) {
/* ip2dbg */
pr_addr_dbg("ndp_timer: state for %s changed "
"to PROBE\n", AF_INET6, &nce->nce_addr);
}
NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
NCE_REFRELE(nce);
return;
case ND_PROBE:
/* must be retransmit timer */
rw_exit(&ill_g_lock);
/*
 * nce_pcnt counts the probes still to be sent: positive means send
 * another one, zero means wait one last interval, negative means
 * give up and delete the entry.
 */
nce->nce_pcnt--;
ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
nce->nce_pcnt >= -1);
if (nce->nce_pcnt == 0) {
/* Wait RetransTimer, before deleting the entry */
ip2dbg(("ndp_timer: pcount=%x dst %s\n",
nce->nce_pcnt, inet_ntop(AF_INET6,
&nce->nce_addr, addrbuf, sizeof (addrbuf))));
mutex_exit(&nce->nce_lock);
NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
} else {
/*
* As per RFC2461, the nce gets deleted after
* MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
* Note that the first unicast solicitation is sent
* during the DELAY state.
*/
if (nce->nce_pcnt > 0) {
ip2dbg(("ndp_timer: pcount=%x dst %s\n",
nce->nce_pcnt, inet_ntop(AF_INET6,
&nce->nce_addr,
addrbuf, sizeof (addrbuf))));
mutex_exit(&nce->nce_lock);
dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT,
NULL, B_FALSE, &ipv6_all_zeros,
&nce->nce_addr, NDP_UNICAST);
/* The probe never went out, so do not count it as sent */
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_pcnt++;
mutex_exit(&nce->nce_lock);
}
NDP_RESTART_TIMER(nce,
ill->ill_reachable_retrans_time);
} else {
/* No hope, delete the nce */
nce->nce_state = ND_UNREACHABLE;
mutex_exit(&nce->nce_lock);
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg("ndp_timer: Delete IRE for"
" dst %s\n", AF_INET6,
&nce->nce_addr);
}
ndp_delete(nce);
}
}
NCE_REFRELE(nce);
return;
case ND_INCOMPLETE:
/*
* Must be resolvers retransmit timer.
*/
for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
ip6i_t *ip6i;
ip6_t *ip6h;
mblk_t *data_mp;
/*
* Walk the list of packets queued, and see if there
* are any multipathing probe packets. Such packets
* are always queued at the head. Since this is a
* retransmit timer firing, mark such packets as
* delayed in ND resolution. This info will be used
* in ip_wput_v6(). Multipathing probe packets will
* always have an ip6i_t. Once we hit a packet without
* it, we can break out of this loop.
*/
/* For IPsec packets the IP data follows the M_CTL block */
if (mp->b_datap->db_type == M_CTL)
data_mp = mp->b_cont;
else
data_mp = mp;
ip6h = (ip6_t *)data_mp->b_rptr;
if (ip6h->ip6_nxt != IPPROTO_RAW)
break;
/*
* This message should have been pulled up already in
* ip_wput_v6. We can't do pullups here because the
* b_next/b_prev is non-NULL.
*/
ip6i = (ip6i_t *)ip6h;
ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
sizeof (ip6i_t) + IPV6_HDR_LEN);
/* Mark this packet as delayed due to ND resolution */
if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
ip6i->ip6i_flags |= IP6I_ND_DELAYED;
}
if (nce->nce_qd_mp != NULL) {
/*
 * Packets are still waiting, so solicit again. A return of 0 is
 * treated as "no retransmissions left" (presumably the retry count
 * ran out -- confirm against nce_solicit).
 */
ms = nce_solicit(nce, NULL);
rw_exit(&ill_g_lock);
if (ms == 0) {
/*
 * Give up unless the entry became REACHABLE in the meantime
 * (presumably possible because nce_solicit drops nce_lock while
 * it transmits -- confirm).
 */
if (nce->nce_state != ND_REACHABLE) {
mutex_exit(&nce->nce_lock);
nce_resolv_failed(nce);
ndp_delete(nce);
} else {
mutex_exit(&nce->nce_lock);
}
} else {
mutex_exit(&nce->nce_lock);
NDP_RESTART_TIMER(nce, (clock_t)ms);
}
NCE_REFRELE(nce);
return;
}
/* Nothing queued: no solicitation is sent and the timer is not restarted */
mutex_exit(&nce->nce_lock);
rw_exit(&ill_g_lock);
NCE_REFRELE(nce);
break;
case ND_REACHABLE :
rw_exit(&ill_g_lock);
/* Unsolicited advertisements still owed for this entry? */
if (nce->nce_flags & NCE_F_UNSOL_ADV &&
nce->nce_unsolicit_count != 0) {
nce->nce_unsolicit_count--;
mutex_exit(&nce->nce_lock);
/*
 * NOTE(review): nce_flags (NCE_F_* bits) is OR-ed with NDP_ORIDE and
 * passed as nce_xmit's NDP_* flag argument -- confirm the two flag
 * sets are meant to be mixed like this.
 */
dropped = nce_xmit(ill,
ND_NEIGHBOR_ADVERT,
ill, /* ill to be used for hw addr */
B_FALSE, /* use ill_phys_addr */
&nce->nce_addr,
&ipv6_all_hosts_mcast,
nce->nce_flags | NDP_ORIDE);
/* The advertisement never went out, so it is still owed */
if (dropped) {
mutex_enter(&nce->nce_lock);
nce->nce_unsolicit_count++;
mutex_exit(&nce->nce_lock);
}
if (nce->nce_unsolicit_count != 0) {
NDP_RESTART_TIMER(nce,
ip_ndp_unsolicit_interval);
}
} else {
mutex_exit(&nce->nce_lock);
}
NCE_REFRELE(nce);
break;
default:
/* No timer work for any other state; just undo the locks and the hold */
rw_exit(&ill_g_lock);
mutex_exit(&nce->nce_lock);
NCE_REFRELE(nce);
break;
}
}
/*
 * Store the link layer address ll_addr in the nce's DL_UNITDATA_REQ
 * template (nce_res_mp), together with the SAP taken from the ill.
 * Must be called before any fastpath probe, i.e. while nce_fp_mp is
 * still NULL.
 */
static void
nce_set_ll(nce_t *nce, uchar_t *ll_addr)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(ll_addr != NULL);
	/* Always called before fast_path_probe */
	ASSERT(nce->nce_fp_mp == NULL);

	if (ill->ill_sap_length != 0) {
		/* Put the ill's SAP into the xmit template. */
		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
	}

	/*
	 * Interfaces with a zero length physical address (PPP and the
	 * like) are left alone.  The copy below used to be sized by the
	 * physical address length, which made it a no-op for them;
	 * copying the lla for them now would change how they operate.
	 */
	if (ill->ill_phys_addr_length <= 0)
		return;

	/*
	 * The link layer address length is used rather than the physical
	 * address length.  For ethernet and many other media the two are
	 * identical, but on xresolv interfaces they differ and the
	 * physical address may have no useful meaning, so the lla is what
	 * IPv6 address resolution and destination addressing use.
	 */
	bcopy(ll_addr, nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
	    ill->ill_nd_lla_len);
}
/*
 * Compare ll_addr against the link layer address stored in the nce's
 * nce_res_mp template over ll_addr_len bytes.
 * Returns B_TRUE when they differ, B_FALSE when they match or when
 * ll_addr is NULL.
 */
static boolean_t
nce_cmp_ll_addr(nce_t *nce, char *ll_addr, uint32_t ll_addr_len)
{
	ill_t	*ill = nce->nce_ill;
	uchar_t	*stored;

	ASSERT(nce->nce_res_mp != NULL);

	/* No address supplied means nothing to differ from */
	if (ll_addr != NULL) {
		stored = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
		if (bcmp(ll_addr, (char *)stored, ll_addr_len) != 0)
			return (B_TRUE);
	}
	return (B_FALSE);
}
/*
 * Updates the link layer address or the reachability state of
 * a cache entry. Reset probe counter if needed.
 *
 * new_state is the state to move to, or ND_UNCHANGED to leave the
 * state alone. new_ll_addr is the new link layer address, or NULL to
 * leave the address alone.
 *
 * Called with nce_lock held and returns with it held, but the lock is
 * dropped and re-acquired inside, so callers must not assume the nce
 * was left untouched by others across this call.
 */
static void
nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
{
ill_t *ill = nce->nce_ill;
boolean_t need_stop_timer = B_FALSE;
boolean_t need_fastpath_update = B_FALSE;
ASSERT(MUTEX_HELD(&nce->nce_lock));
/*
* If this interface does not do NUD, there is no point
* in allowing an update to the cache entry. Although
* we will respond to NS.
* The only time we accept an update for a resolver when
* NUD is turned off is when it has just been created.
* Non-Resolvers will always be created as REACHABLE.
*/
if (new_state != ND_UNCHANGED) {
if ((nce->nce_flags & NCE_F_NONUD) &&
(nce->nce_state != ND_INCOMPLETE))
return;
ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
/* Any state change cancels the pending timer (done after unlocking) */
need_stop_timer = B_TRUE;
if (new_state == ND_REACHABLE)
nce->nce_last = TICK_TO_MSEC(lbolt64);
else {
/* We force NUD in this case */
nce->nce_last = 0;
}
nce->nce_state = new_state;
nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
}
/*
* In case of fast path we need to free the fastpath
* M_DATA and do another probe. Otherwise we can just
* overwrite the DL_UNITDATA_REQ data, noting we'll lose
* whatever packets that happens to be transmitting at the time.
*/
if (new_ll_addr != NULL) {
/* The new address must fit inside the existing template */
ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
if (nce->nce_fp_mp != NULL) {
freemsg(nce->nce_fp_mp);
nce->nce_fp_mp = NULL;
}
need_fastpath_update = B_TRUE;
}
/*
 * The remaining work is done without nce_lock: nce_fastpath() ends up
 * taking nce_lock itself (via nce_fastpath_list_add), and the timer
 * handler ndp_timer() takes it too (presumably untimeout may have to
 * wait for a running handler -- confirm).
 */
mutex_exit(&nce->nce_lock);
if (need_stop_timer) {
(void) untimeout(nce->nce_timeout_id);
nce->nce_timeout_id = 0;
}
if (need_fastpath_update)
nce_fastpath(nce);
/* Give the lock back to the caller */
mutex_enter(&nce->nce_lock);
}
/*
 * Queue mp on the nce's list of packets waiting for address resolution
 * (nce_qd_mp, linked through b_next).  Called with nce_lock held.
 *
 * If the queue already holds more than ill_max_buf packets, the oldest
 * ones are freed from the head until it does not.  Multipathing probe
 * packets (ip6i_t with IP6I_DROP_IFDELAYED) are inserted at the head,
 * everything else is appended at the tail.
 */
static void
nce_queue_mp(nce_t *nce, mblk_t *mp)
{
	uint_t		count = 0;
	mblk_t		**mpp;
	mblk_t		*qmp;
	boolean_t	head_insert = B_FALSE;
	ip6_t		*ip6h;
	ip6i_t		*ip6i;
	mblk_t		*data_mp;

	ASSERT(MUTEX_HELD(&nce->nce_lock));

	/* For IPsec packets the IP data follows the M_CTL block */
	if (mp->b_datap->db_type == M_CTL)
		data_mp = mp->b_cont;
	else
		data_mp = mp;
	ip6h = (ip6_t *)data_mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
		    sizeof (ip6i_t) + IPV6_HDR_LEN);

		/*
		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
		 * This has 2 aspects mentioned below.
		 * 1. Perform head insertion in the nce_qd_mp for these packets.
		 * This ensures that next retransmit of ND solicitation
		 * will use the interface specified by the probe packet,
		 * for both NS and NA. This corresponds to the src address
		 * in the IPv6 packet. If we insert at tail, we will be
		 * depending on the packet at the head for successful
		 * ND resolution. This is not reliable, because the interface
		 * on which the NA arrives could be different from the interface
		 * on which the NS was sent, and if the receiving interface is
		 * failed, it will appear that the sending interface is also
		 * failed, causing in.mpathd to misdiagnose this as link
		 * failure.
		 * 2. Drop the original packet, if the ND resolution did not
		 * succeed in the first attempt. However we will create the
		 * nce and the ire, as soon as the ND resolution succeeds.
		 * We don't gain anything by queueing multiple probe packets
		 * and sending them back-to-back once resolution succeeds.
		 * It is sufficient to send just 1 packet after ND resolution
		 * succeeds. Since mpathd is sending down probe packets at a
		 * constant rate, we don't need to send the queued packet. We
		 * need to queue it only for NDP resolution. The benefit of
		 * dropping the probe packets that were delayed in ND
		 * resolution, is that in.mpathd will not see inflated
		 * RTT. If the ND resolution does not succeed within
		 * in.mpathd's failure detection time, mpathd may detect
		 * a failure, and it does not matter whether the packet
		 * was queued or dropped.
		 */
		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
			head_insert = B_TRUE;
	}

	/* How many packets are queued right now? */
	for (qmp = nce->nce_qd_mp; qmp != NULL; qmp = qmp->b_next)
		count++;

	/*
	 * Trim the oldest packets off the head until the queue is within
	 * ill_max_buf.  Trimming is done separately from the search for
	 * the tail so that no cursor is ever left pointing into a packet
	 * that has just been freed, or at a head pointer that has just
	 * become NULL, whatever the value of ill_max_buf.
	 */
	while (count > nce->nce_ill->ill_max_buf) {
		mblk_t *oldest = nce->nce_qd_mp;

		nce->nce_qd_mp = oldest->b_next;
		oldest->b_next = NULL;
		oldest->b_prev = NULL;
		freemsg(oldest);
		ip1dbg(("nce_queue_mp: pkt dropped\n"));
		count--;
	}

	/* put this on the list */
	if (head_insert) {
		mp->b_next = nce->nce_qd_mp;
		nce->nce_qd_mp = mp;
	} else {
		for (mpp = &nce->nce_qd_mp; *mpp != NULL;
		    mpp = &(*mpp)->b_next)
			;
		*mpp = mp;
	}
}
/*
 * Called when address resolution failed due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 *
 * The whole queue is detached from the nce under nce_lock and then
 * processed without the lock.  Each packet is handed to
 * icmp_unreachable_v6() with code ICMP6_DST_UNREACH_ADDR.
 */
void
nce_resolv_failed(nce_t *nce)
{
	mblk_t	*mp, *nxt_mp, *first_mp;
	char	buf[INET6_ADDRSTRLEN];
	ip6_t	*ip6h;

	ip1dbg(("nce_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));

	/* Take the queue away from the nce in one step */
	mutex_enter(&nce->nce_lock);
	mp = nce->nce_qd_mp;
	nce->nce_qd_mp = NULL;
	mutex_exit(&nce->nce_lock);

	while (mp != NULL) {
		/*
		 * The zone is determined per packet: the global zone unless
		 * an IPSEC_OUT control block says otherwise.  It must not
		 * carry over from an earlier packet in the queue.
		 */
		zoneid_t zoneid = GLOBAL_ZONEID;

		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		mp->b_prev = NULL;
		first_mp = mp;
		if (mp->b_datap->db_type == M_CTL) {
			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;

			ASSERT(io->ipsec_out_type == IPSEC_OUT);
			zoneid = io->ipsec_out_zoneid;
			ASSERT(zoneid != ALL_ZONES);
			mp = mp->b_cont;
		}
		ip6h = (ip6_t *)mp->b_rptr;
		if (ip6h->ip6_nxt == IPPROTO_RAW) {
			ip6i_t *ip6i;

			/*
			 * This message should have been pulled up already
			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
			 * the header is pulled up.
			 */
			ip6i = (ip6i_t *)ip6h;
			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
			    sizeof (ip6i_t) + IPV6_HDR_LEN);
			/* Skip over the ip6i_t to the real IPv6 header */
			mp->b_rptr += sizeof (ip6i_t);
		}
		/*
		 * Ignore failure since icmp_unreachable_v6 will silently
		 * drop packets with an unspecified source address.
		 */
		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE);
		mp = nxt_mp;
	}
}
/*
 * SIOCSNDP* ioctl backend: add a neighbor cache entry for the address
 * in lnr, or change the flags of an existing one.
 *
 * Only ND_REACHABLE and ND_STALE are accepted as lnr_state_create.
 * Asking for both the _ON and _OFF variant of the same flag (router,
 * anycast, proxy) is an error.
 *
 * Returns 0 on success, EINVAL for a bad request, or the error from
 * ndp_add() when a new entry cannot be created.
 */
int
ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
{
	sin6_t		*sin6;
	in6_addr_t	*addr;
	nce_t		*nce;
	int		err;
	int		req = lnr->lnr_flags;
	int		sel;
	uint16_t	wanted = 0;
	uint16_t	current = 0;

	if ((lnr->lnr_state_create != ND_REACHABLE) &&
	    (lnr->lnr_state_create != ND_STALE))
		return (EINVAL);

	sin6 = (sin6_t *)&lnr->lnr_addr;
	addr = &sin6->sin6_addr;

	mutex_enter(&ndp_g_lock);
	/* We know it can not be mapping so just look in the hash table */
	nce = nce_lookup_addr(ill, addr);
	if (nce != NULL)
		wanted = nce->nce_flags;

	/* Router flag */
	sel = req & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF);
	if (sel == (NDF_ISROUTER_OFF|NDF_ISROUTER_ON))
		goto conflict;
	if (sel == NDF_ISROUTER_ON)
		wanted |= NCE_F_ISROUTER;
	else if (sel == NDF_ISROUTER_OFF)
		wanted &= ~NCE_F_ISROUTER;

	/* Anycast flag */
	sel = req & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF);
	if (sel == (NDF_ANYCAST_OFF|NDF_ANYCAST_ON))
		goto conflict;
	if (sel == NDF_ANYCAST_ON)
		wanted |= NCE_F_ANYCAST;
	else if (sel == NDF_ANYCAST_OFF)
		wanted &= ~NCE_F_ANYCAST;

	/* Proxy flag */
	sel = req & (NDF_PROXY_ON|NDF_PROXY_OFF);
	if (sel == (NDF_PROXY_OFF|NDF_PROXY_ON))
		goto conflict;
	if (sel == NDF_PROXY_ON)
		wanted |= NCE_F_PROXY;
	else if (sel == NDF_PROXY_OFF)
		wanted &= ~NCE_F_PROXY;

	/* No entry yet: create one with the requested flags and state */
	if (nce == NULL) {
		err = ndp_add(ill,
		    (uchar_t *)lnr->lnr_hdw_addr,
		    addr,
		    &ipv6_all_ones,
		    &ipv6_all_zeros,
		    0,
		    wanted,
		    lnr->lnr_state_create,
		    &nce);
		if (err != 0) {
			mutex_exit(&ndp_g_lock);
			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
			return (err);
		}
	}

	current = nce->nce_flags;
	if (current & NCE_F_ISROUTER && !(wanted & NCE_F_ISROUTER)) {
		/*
		 * Router turned to host, delete all ires.
		 * XXX Just delete the entry, but we need to add too.
		 */
		nce->nce_flags &= ~NCE_F_ISROUTER;
		mutex_exit(&ndp_g_lock);
		ndp_delete(nce);
		NCE_REFRELE(nce);
		return (0);
	}
	mutex_exit(&ndp_g_lock);

	mutex_enter(&nce->nce_lock);
	nce->nce_flags = wanted;
	mutex_exit(&nce->nce_lock);

	/*
	 * The requested state (STALE or REACHABLE) is ignored from here
	 * on; as for incoming packets, the link layer address that was
	 * passed in determines the resulting state.
	 */
	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
	NCE_REFRELE(nce);
	return (0);

conflict:
	/* Both _ON and _OFF were requested for the same flag */
	mutex_exit(&ndp_g_lock);
	if (nce != NULL)
		NCE_REFRELE(nce);
	return (EINVAL);
}
/*
 * Start a fastpath probe for the nce so that, if the device driver
 * supports it, nce_fp_mp ends up holding an M_DATA prepend; otherwise
 * nce_fp_mp stays NULL.
 * The caller must hold a reference on the nce across this call.  No
 * hold is needed beyond that because ill_fastpath_probe() copies the
 * mblk.
 */
static void
nce_fastpath(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(ill != NULL);

	/* Nothing to do with fastpath info already there or no template */
	if (nce->nce_fp_mp != NULL || nce->nce_res_mp == NULL)
		return;

	nce_fastpath_list_add(nce);
	switch (ill_fastpath_probe(ill, nce->nce_res_mp)) {
	case 0:
		break;
	case EAGAIN:
		/*
		 * Transient error such as an allocation failure.  The nce
		 * stays on the list; it is updated when a probe happens
		 * for another ire, or else taken off the list when the
		 * ire is deleted.
		 */
		break;
	default:
		nce_fastpath_list_delete(nce);
		break;
	}
}
/*
 * Drain the list of nce's waiting for fastpath response.
 *
 * Walks ill->ill_fastpath_list under ill_lock. With func == NULL every
 * entry is unlinked (flush); otherwise func(nce, arg) is called for each
 * entry and only those for which it returns true are unlinked.
 */
void
nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void *),
    void *arg)
{
	nce_t	*sentinel;
	nce_t	*head;
	nce_t	*cur;
	nce_t	*succ;
	nce_t	*last_kept = NULL;

	ASSERT(ill != NULL);

	mutex_enter(&ill->ill_lock);
	/* The list is terminated by the address of the list head itself. */
	sentinel = (nce_t *)&ill->ill_fastpath_list;
	head = (nce_t *)ill->ill_fastpath_list;

	for (cur = head; cur != sentinel; cur = succ) {
		/* Fetch the successor first; cur's link may be cleared below. */
		succ = cur->nce_fastpath;

		/*
		 * Leave the nce in the fastpath list when the callback
		 * declines it, to handle any pending response from the lower
		 * layer. We can't drain the list when the callback routine
		 * comparison failed, because the response is asynchronous in
		 * nature, and may not arrive in the same order as the list
		 * insertion.
		 */
		if (func != NULL && !func(cur, arg)) {
			/* previous element that is still in the list */
			last_kept = cur;
			continue;
		}

		/* Flushing, or the callback asked for removal: unlink it. */
		cur->nce_fastpath = NULL;
		if (cur == head) {
			head = succ;
			ill->ill_fastpath_list = head;
		} else {
			last_kept->nce_fastpath = succ;
		}
	}
	mutex_exit(&ill->ill_lock);
}
/*
 * Add nce to the head of its ill's fastpath list.
 *
 * Takes ill_lock, then nce_lock. Nothing is done when the nce is
 * condemned or is already linked (nce_fastpath != NULL).
 */
void
nce_fastpath_list_add(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(ill != NULL);

	mutex_enter(&ill->ill_lock);
	mutex_enter(&nce->nce_lock);

	/* Only link an nce that has not been deleted and is not yet listed. */
	if ((nce->nce_flags & NCE_F_CONDEMNED) == 0) {
		if (nce->nce_fastpath == NULL) {
			nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
			ill->ill_fastpath_list = nce;
		}
	}

	mutex_exit(&nce->nce_lock);
	mutex_exit(&ill->ill_lock);
}
/*
 * Remove nce from its ill's fastpath list.
 *
 * Runs under ill_lock. An nce whose nce_fastpath link is NULL is treated
 * as not being on the list and is left untouched.
 */
void
nce_fastpath_list_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;
	nce_t	*walker;

	ASSERT(ill != NULL);

	mutex_enter(&ill->ill_lock);
	if (nce->nce_fastpath != NULL) {
		/* A linked nce implies the list cannot be empty. */
		ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);

		if (ill->ill_fastpath_list == nce) {
			/* nce is the head: advance the head past it. */
			ill->ill_fastpath_list = nce->nce_fastpath;
		} else {
			/* Find the predecessor and splice nce out. */
			for (walker = ill->ill_fastpath_list;
			    walker != (nce_t *)&ill->ill_fastpath_list;
			    walker = walker->nce_fastpath) {
				if (walker->nce_fastpath == nce) {
					walker->nce_fastpath =
					    nce->nce_fastpath;
					break;
				}
			}
		}
		/* The link is cleared even if no predecessor was found. */
		nce->nce_fastpath = NULL;
	}
	mutex_exit(&ill->ill_lock);
}
/*
 * Fastpath list callback: try to install a fastpath header on one nce.
 *
 * arg is the mblk (mp) carrying the fastpath response; mp->b_cont contains
 * the fastpath header. The nce is updated only when it has no nce_fp_mp
 * yet and the bytes of its nce_res_mp match the bytes of mp exactly.
 *
 * Returns TRUE if entry should be dequeued, or FALSE otherwise:
 *	- B_TRUE for a mapping nce, for an nce that already has nce_fp_mp,
 *	  or for one without nce_res_mp (nothing to wait for);
 *	- B_FALSE when nce_res_mp does not match mp;
 *	- B_TRUE on a match, whether or not dupb() managed to copy the
 *	  fastpath header.
 */
boolean_t
ndp_fastpath_update(nce_t *nce, void *arg)
{
	mblk_t		*mp, *fp_mp;
	mblk_t		*ud_mp;
	uchar_t		*mp_rptr, *ud_mp_rptr;
	ptrdiff_t	cmplen;

	if (nce->nce_flags & NCE_F_MAPPING)
		return (B_TRUE);

	/*
	 * The nce is locked here to prevent any other threads
	 * from accessing and changing nce_res_mp when the IPv6 address
	 * becomes resolved to an lla while we're in the middle
	 * of looking at and comparing the hardware address (lla).
	 * It is also locked to prevent multiple threads in
	 * ndp_fastpath_update from examining nce_res_mp at the same time.
	 *
	 * nce_res_mp and nce_fp_mp are therefore sampled only after the
	 * lock is taken; a pointer captured before the lock could be stale
	 * by the time it is dereferenced for the comparison.
	 */
	mutex_enter(&nce->nce_lock);
	ud_mp = nce->nce_res_mp;
	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL)) {
		mutex_exit(&nce->nce_lock);
		return (B_TRUE);
	}

	ip2dbg(("ndp_fastpath_update: trying\n"));
	mp = (mblk_t *)arg;
	mp_rptr = mp->b_rptr;
	cmplen = mp->b_wptr - mp_rptr;
	ASSERT(cmplen >= 0);
	ud_mp_rptr = ud_mp->b_rptr;

	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
		mutex_exit(&nce->nce_lock);
		/*
		 * Don't take the nce off the fastpath list yet,
		 * since the response may come later.
		 */
		return (B_FALSE);
	}

	/* Matched - install a copy of mp->b_cont as the fastpath mp */
	ip1dbg(("ndp_fastpath_update: match\n"));
	fp_mp = dupb(mp->b_cont);
	if (fp_mp != NULL) {
		nce->nce_fp_mp = fp_mp;
	}
	mutex_exit(&nce->nce_lock);
	return (B_TRUE);
}
/*
 * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
 * driver. Note that it assumes IP is exclusive...
 *
 * Mapping entries are left alone, as are entries that lack either
 * nce_fp_mp or nce_res_mp; every other nce is deleted.
 */
/* ARGSUSED */
void
ndp_fastpath_flush(nce_t *nce, char *arg)
{
	/* Only a non-mapping nce that carries fastpath info is deleted. */
	if (!(nce->nce_flags & NCE_F_MAPPING) &&
	    nce->nce_fp_mp != NULL && nce->nce_res_mp != NULL) {
		/* Just delete the NCE... */
		ndp_delete(nce);
	}
}
/*
 * Return a pointer to a given option in the packet.
 *
 * opt points at the first option, optlen is the number of option bytes
 * that follow, and opt_type is the nd_opt_type value to look for. Each
 * option occupies 8 * nd_opt_len bytes. Returns the first option whose
 * type matches, or NULL if there is none.
 *
 * Assumes that the option part of the packet has already been validated.
 * As a safeguard, a zero-length option ends the search with NULL: it would
 * never advance the cursor, so the walk could not terminate otherwise.
 */
nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
{
	int	step;

	while (optlen > 0) {
		step = 8 * opt->nd_opt_len;
		/* Malformed option: stop rather than loop forever. */
		if (step == 0)
			return (NULL);
		if (opt->nd_opt_type == opt_type)
			return (opt);
		optlen -= step;
		opt = (struct nd_opt_hdr *)((char *)opt + step);
	}
	return (NULL);
}
/*
 * Verify all option lengths present are > 0, also check to see
 * if the option lengths and packet length are consistent.
 *
 * Returns B_FALSE when an option has nd_opt_len == 0 or when an option's
 * size (8 * nd_opt_len bytes) exceeds the remaining optlen; B_TRUE
 * otherwise.
 */
boolean_t
ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
{
	int	nbytes;

	ASSERT(opt != NULL);

	for (; optlen > 0;
	    opt = (struct nd_opt_hdr *)((char *)opt + nbytes)) {
		nbytes = 8 * opt->nd_opt_len;
		/* Reject empty options and options that overrun the buffer. */
		if (nbytes == 0 || nbytes > optlen)
			return (B_FALSE);
		optlen -= nbytes;
	}
	return (B_TRUE);
}
/*
 * ndp_walk function.
 * Free a fraction of the NCE cache entries.
 * A fraction of zero means to not free any in that category.
 *
 * arg points to an nce_cache_reclaim_t. Permanent entries are never
 * deleted. Any other nce is deleted when a value built from lbolt and
 * the hash of its address is an exact multiple of ncr_host.
 */
void
ndp_cache_reclaim(nce_t *nce, char *arg)
{
	nce_cache_reclaim_t	*ncr = (nce_cache_reclaim_t *)arg;
	uint_t			pick;

	if (nce->nce_flags & NCE_F_PERMANENT)
		return;

	pick = (uint_t)lbolt +
	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);

	/* Test for zero first so the modulus below never divides by zero. */
	if (ncr->ncr_host != 0 && (pick % ncr->ncr_host) == 0)
		ndp_delete(nce);
}
/*
 * ndp_walk function.
 * Count the number of NCEs that can be deleted.
 * These would be hosts but not routers.
 *
 * arg points to an ncc_cache_count_t. Permanent entries are not counted.
 * Every other nce bumps ncc_total, and additionally ncc_host when it is
 * not flagged as a router.
 */
void
ndp_cache_count(nce_t *nce, char *arg)
{
	ncc_cache_count_t	*counts = (ncc_cache_count_t *)arg;

	if (!(nce->nce_flags & NCE_F_PERMANENT)) {
		counts->ncc_total++;
		if ((nce->nce_flags & NCE_F_ISROUTER) == 0)
			counts->ncc_host++;
	}
}
#ifdef NCE_DEBUG
/*
 * Find the trace record belonging to the current thread in this nce's
 * trace hash. Returns NULL when the thread has no record.
 * The caller must hold nce_lock.
 */
th_trace_t *
th_trace_nce_lookup(nce_t *nce)
{
	th_trace_t	*th_trace;
	int		bucket_id;

	ASSERT(MUTEX_HELD(&nce->nce_lock));

	bucket_id = IP_TR_HASH(curthread);
	ASSERT(bucket_id < IP_TR_HASH_MAX);

	/* Walk the bucket chain until curthread's record, or its end. */
	th_trace = nce->nce_trace[bucket_id];
	while (th_trace != NULL && th_trace->th_id != curthread)
		th_trace = th_trace->th_next;

	return (th_trace);
}
/*
 * Record a reference taken on this nce by the current thread.
 * The caller must hold nce_lock. Does nothing once tracing has been
 * disabled for the nce.
 */
void
nce_trace_ref(nce_t *nce)
{
	th_trace_t	**bucket;
	th_trace_t	*th_trace;

	ASSERT(MUTEX_HELD(&nce->nce_lock));

	if (nce->nce_trace_disable == B_TRUE)
		return;

	/*
	 * Attempt to locate the trace buffer for the curthread.
	 * If it does not exist, then allocate a new trace buffer
	 * and link it in list of trace bufs for this nce, at the head
	 */
	th_trace = th_trace_nce_lookup(nce);
	if (th_trace == NULL) {
		bucket = &nce->nce_trace[IP_TR_HASH(curthread)];
		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
		    KM_NOSLEEP);
		if (th_trace == NULL) {
			/*
			 * No memory: turn tracing off for this nce and
			 * free the records collected so far.
			 */
			nce->nce_trace_disable = B_TRUE;
			nce_trace_inactive(nce);
			return;
		}
		th_trace->th_id = curthread;
		/* th_prev points at whichever pointer refers to this record */
		th_trace->th_prev = bucket;
		th_trace->th_next = *bucket;
		if (*bucket != NULL)
			(*bucket)->th_prev = &th_trace->th_next;
		*bucket = th_trace;
	}

	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
	th_trace->th_refcnt++;
	th_trace_rrecord(th_trace);
}
/*
 * Record the release of a reference on this nce by the current thread.
 * The caller must hold nce_lock. Does nothing once tracing has been
 * disabled for the nce.
 */
void
nce_untrace_ref(nce_t *nce)
{
	th_trace_t	*th_trace;

	ASSERT(MUTEX_HELD(&nce->nce_lock));

	if (nce->nce_trace_disable != B_TRUE) {
		/* The thread is expected to have a record with a live ref. */
		th_trace = th_trace_nce_lookup(nce);
		ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);

		th_trace_rrecord(th_trace);
		th_trace->th_refcnt--;
	}
}
/*
 * Free every trace record attached to this nce, emptying all
 * IP_TR_HASH_MAX buckets. The caller must hold nce_lock.
 */
void
nce_trace_inactive(nce_t *nce)
{
	th_trace_t	**bucket;
	th_trace_t	*victim;
	int		i;

	ASSERT(MUTEX_HELD(&nce->nce_lock));

	for (i = 0; i < IP_TR_HASH_MAX; i++) {
		bucket = &nce->nce_trace[i];

		/* Pop records off the head of the bucket until it is empty. */
		while ((victim = *bucket) != NULL) {
			/* unlink the record and free it */
			*bucket = victim->th_next;
			if (*bucket != NULL)
				(*bucket)->th_prev = bucket;
			victim->th_next = NULL;
			victim->th_prev = NULL;
			kmem_free(victim, sizeof (th_trace_t));
		}
	}
}
/*
 * Discard the current thread's trace record for this nce, if it has one.
 * Takes and drops nce_lock itself. Always returns 0.
 */
/* ARGSUSED */
int
nce_thread_exit(nce_t *nce, caddr_t arg)
{
	th_trace_t	*th_trace;

	mutex_enter(&nce->nce_lock);

	th_trace = th_trace_nce_lookup(nce);
	if (th_trace != NULL) {
		/* The record must not still account for any reference. */
		ASSERT(th_trace->th_refcnt == 0);

		/* unlink th_trace and free it */
		*th_trace->th_prev = th_trace->th_next;
		if (th_trace->th_next != NULL)
			th_trace->th_next->th_prev = th_trace->th_prev;
		th_trace->th_next = NULL;
		th_trace->th_prev = NULL;
		kmem_free(th_trace, sizeof (th_trace_t));
	}

	mutex_exit(&nce->nce_lock);
	return (0);
}
#endif