ip6_ire.c revision a1b6bb22225758893709ddb81f495b831d79b4ec
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 1990 Mentat Inc.
*/
/*
* This file contains routines that manipulate Internet Routing Entries (IREs).
*/
#include <inet/ipclassifier.h>
#define IS_DEFAULT_ROUTE_V6(ire) \
static ire_t *
ip_stack_t *ipst);
/*
* Initialize the ire that is specific to IPv6 part and call
* ire_init_common to finish it.
* Returns zero or errno.
*/
int
{
int error;
/*
* Reject IRE security attribute creation/initialization
* if system is not running in Trusted mode.
*/
return (EINVAL);
/* Make sure we don't have stray values in some fields */
switch (type) {
case IRE_LOOPBACK:
case IRE_HOST:
case IRE_LOCAL:
case IRE_IF_CLONE:
break;
case IRE_PREFIX:
case IRE_DEFAULT:
case IRE_IF_RESOLVER:
case IRE_IF_NORESOLVER:
ire->ire_masklen =
}
break;
case IRE_MULTICAST:
case IRE_NOROUTE:
break;
default:
ASSERT(0);
return (EINVAL);
}
return (error);
/* Determine which function pointers to use */
case IRE_LOCAL:
break;
case IRE_LOOPBACK:
break;
case IRE_MULTICAST:
break;
default:
/*
* For IRE_IF_ALL and IRE_OFFLINK we forward received
* packets by default.
*/
break;
}
}
return (0);
}
/*
* ire_create_v6 is called to allocate and initialize a new IRE.
*
* NOTE : This is called as writer sometimes though not required
* by this function.
*/
/* ARGSUSED */
ire_t *
{
int error;
return (NULL);
}
if (error != 0) {
return (NULL);
}
return (ire);
}
/*
* Find the ill matching a multicast group.
* Allows different routes for multicast addresses
* in the unicast routing table (akin to FF::0/8 but could be more specific)
* which point at different interfaces. This is used when IPV6_MULTICAST_IF
* isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
* specify the interface to join on.
*
* Supports link-local addresses by using ire_route_recursive which follows
* the ill when recursing.
*
* To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
* and the MULTIRT property can be different for different groups, we
* extract RTF_MULTIRT from the special unicast route added for a group
* with CGTP and pass that back in the multirtp argument.
* This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
* We have a setsrcp argument for the same reason.
*/
ill_t *
{
return (NULL);
}
return (ill);
}
/*
* This function takes a mask and returns number of bits set in the
* mask (the represented prefix length). Assumes a contiguous mask.
*/
int
{
int bits;
/* Begin at the full address width; the loop below subtracts from it. */
int plen = IPV6_ABITS;
int i;
/* Visit indices 3 down to 0. */
for (i = 3; i >= 0; i--) {
/*
* NOTE(review): the condition guarding this whole-word step is not
* visible in this extract; presumably it tests for an all-zero
* 32-bit word of the mask -- confirm against the full source.
*/
plen -= 32;
continue;
}
/*
* NOTE(review): the code that assigns bits (and any per-bit adjustment
* of plen) is not visible here. The scan stops once bits is zero.
*/
if (bits == 0)
break;
}
return (plen);
}
/*
* Convert a prefix length to the mask for that prefix.
* Returns the argument bitmask.
*/
{
/* NOTE(review): the check that leads to this NULL return is not visible here. */
return (NULL);
/* A zero prefix length returns bitmask without entering the loop below. */
if (plen == 0)
return (bitmask);
/*
* While more than 32 prefix bits remain, store an all-ones 32-bit word
* through ptr and consume 32 bits of the prefix length.
*/
while (plen > 32) {
*ptr++ = 0xffffffffU;
plen -= 32;
}
/*
* NOTE(review): the handling of the remaining 1..32 prefix bits is not
* visible in this extract.
*/
return (bitmask);
}
/*
* Add a fully initialized IPv6 IRE to the forwarding table.
* This returns NULL on failure, or a held IRE on success.
* Normally the returned IRE is the same as the argument. But a different
* IRE will be returned if the added IRE is deemed identical to an existing
* one. In that case ire_identical_ref will be increased.
* The caller always needs to do an ire_refrele() on the returned IRE.
*/
ire_t *
{
int mask_table_index;
int match_flags;
int error;
/* Make sure the address is properly masked. */
int i;
sizeof (irb_t)));
return (NULL);
}
for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
}
NULL) {
ptr;
} else {
/*
* Some other thread won the race in
* initializing the forwarding table at the
* same index.
*/
for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
}
}
}
/*
* Start the atomic add of the ire. Grab the bucket lock and the
* ill lock. Check for condemned.
*/
if (error != 0) {
return (NULL);
}
/*
* If we are creating a hidden IRE, make sure we search for
* hidden IREs when searching for duplicates below.
* Otherwise, we might find an IRE on some other interface
* that's not marked hidden.
*/
if (ire->ire_testhidden)
/*
* Atomically check for duplicate and insert in the table.
*/
if (IRE_IS_CONDEMNED(ire1))
continue;
/*
* Here we need an exact match on zoneid, i.e.,
* ire_match_args doesn't fit.
*/
continue;
continue;
/*
* Note: We do not allow multiple routes that differ only
* in the gateway security attributes; such routes are
* considered duplicates.
* To change that we explicitly have to treat them as
* different here.
*/
match_flags)) {
/*
* Return the old ire after doing a REFHOLD.
* As most of the callers continue to use the IRE
* after adding, we return a held ire. This will
* avoid a lookup in the caller again. If the callers
* don't want to use it, they need to do a REFRELE.
*/
ip1dbg(("found dup ire existing %p new %p",
return (ire1);
}
}
/*
* Normally we do head insertion since most things do not care about
* the order of the IREs in the bucket.
* However, due to shared-IP zones (and restrict_interzone_loopback)
* we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
* address. For that reason we do tail insertion for IRE_IF_CLONE.
*/
}
/* Insert at *irep */
/* Link the new one in. */
/*
* ire_walk routines de-reference ire_next without holding
* a lock. Before we point to the new ire, we want to make
* sure the store that sets the ire_next of the new ire
* reaches global visibility, so that ire_walk routines
* don't see a truncated list of ires i.e if the ire_next
* of the new ire gets set after we do "*irep = ire" due
* to re-ordering, the ire_walk thread will see a NULL
* once it accesses the ire_next of the new ire.
* membar_producer() makes sure that the following store
* happens *after* all of the above stores.
*/
/*
* We return a bumped up IRE above. Keep it symmetrical
* so that the callers will always have to release. This
* helps the callers of this function because they continue
* to use the IRE after adding and hence they don't have to
* lookup again after we return the IRE.
*
* NOTE : We don't have to use atomics as this is appearing
* in the list for the first time and no one else can bump
* up the reference count on this yet.
*/
irb_ptr->irb_ire_cnt++;
(char *), "ire", (void *), ire);
}
/* Make any caching of the IREs be notified or updated */
return (ire);
}
/*
* Search for all HOST REDIRECT routes that are
* pointing at the specified gateway and
* delete them. This routine is called only
* when a default gateway is going away.
*/
static void
{
int i;
/* get the hash table for HOST routes */
return;
for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
continue;
}
}
}
/*
* Delete the specified IRE.
* All calls should use ire_delete().
* Sometimes called as writer though not required by this function.
*
* NOTE : This function is called only if the ire was added
* in the list.
*/
void
{
/*
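* Make sure ire_generation increases from ire_flush_cache happen
* after any lookup/reader has read ire_generation.
* Since taking the lock makes us wait until any lookup/reader has
* completed we can exit the lock immediately.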
*/
/*
* when a default gateway is going away
* delete all the host redirects pointing at that
* gateway.
*/
}
/*
* If we are deleting an IRE_INTERFACE then we make sure we also
* delete any IRE_IF_CLONE that has been created from it.
* Those are always in ire_dep_children.
*/
/* Remove from parent dependencies and child */
}
}
/*
* When an IRE is added or deleted this routine is called to make sure
* any caching of IRE information is notified or updated.
*
* The flag argument indicates if the flush request is due to addition
* of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
* or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
*/
void
{
/*
* IRE_IF_CLONE ire's don't provide any new information
* than the parent from which they are cloned, so don't
* perturb the generation numbers.
*/
return;
/*
* Ensure that an ire_add during a lookup serializes the updates of
* the generation numbers under ire_head_lock so that the lookup gets
* either the old ire and old generation number, or a new ire and new
* generation number.
*/
/*
* If a route was just added, we need to notify everybody that
* has cached an IRE_NOROUTE since there might now be a better
* route for them.
*/
if (flag == IRE_FLUSH_ADD) {
}
/* Adding a default can't otherwise provide a better route */
return;
}
switch (flag) {
case IRE_FLUSH_DELETE:
case IRE_FLUSH_GWCHANGE:
/*
* Update ire_generation for all ire_dep_children chains
* starting with this IRE
*/
break;
case IRE_FLUSH_ADD: {
/*
* Find an IRE which is a shorter match than the ire to be added
* For any such IRE (which we repeat) we update the
* ire_generation the same way as in the delete case.
*/
/* We need to handle all in the same bucket */
}
}
break;
}
}
/*
* Matches the arguments passed with the values in the ire.
*
* Note: for match types that match using "ill" passed in, ill
* must be checked for non-NULL before calling this routine.
*/
{
/*
* If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
* is in fact hidden, to ensure the caller gets the right one.
*/
if (ire->ire_testhidden) {
if (!(match_flags & MATCH_IRE_TESTHIDDEN))
return (B_FALSE);
}
/*
* If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
* does not match that of ire_zoneid, a failure to
* match is reported at this point. Otherwise, since some IREs
* that are available in the global zone can be used in local
* zones, additional checks need to be performed:
*
* IRE_LOOPBACK
* entries should never be matched in this situation.
* Each zone has its own IRE_LOOPBACK.
*
* IRE_LOCAL
* We allow them for any zoneid. ire_route_recursive
* does additional checks when
* ip_restrict_interzone_loopback is set.
*
* If ill_usesrc_ifindex is set
* Then we check if the zone has a valid source address
* on the usesrc ill.
*
* If ire_ill is set, then check that the zone has an ipif
* on that ill.
*
* Outside of this function (in ire_round_robin) we check
* that any IRE_OFFLINK has a gateway that is reachable from the
* zone when we have multiple choices (ECMP).
*/
if (match_flags & MATCH_IRE_ZONEONLY)
return (B_FALSE);
return (B_FALSE);
goto matchit;
/*
* The normal case of IRE_ONLINK has a matching zoneid.
* Here we handle the case when shared-IP zones have been
* configured with IP addresses on vniN. In that case it
* is ok for traffic from a zone to use IRE_ONLINK routes
* if the ill has a usesrc pointing at vniN
* Applies to IRE_INTERFACE.
*/
/*
* Note there is no IRE_INTERFACE on vniN thus
* can't do an IRE lookup for a matching route.
*/
if (ifindex == 0)
return (B_FALSE);
/*
* If there is a usable source address in the
* zone, then it's ok to return this IRE_INTERFACE
*/
ip3dbg(("ire_match_args: no usrsrc for zone"
" dst_ill %p\n", (void *)dst_ill));
return (B_FALSE);
}
}
/*
* For example, with
* route add 11.0.0.0 gw1 -ifp bge0
* route add 11.0.0.0 gw2 -ifp bge1
* this code would differentiate based on
* where the sending zone has addresses.
* Only if the zone has an address on bge0 can it use the first
* route. It isn't clear if this behavior is documented
* anywhere.
*/
if (!IPIF_IS_CONDEMNED(tipif) &&
break;
}
return (B_FALSE);
}
}
if (match_flags & MATCH_IRE_GW) {
}
if (match_flags & MATCH_IRE_ILL) {
/*
* If asked to match an ill, we *must* match
* on the ire_ill for ipmp test addresses, or
* any of the ill in the group for data addresses.
* If we don't, we may as well fail.
* However, we need an exception for IRE_LOCALs to ensure
* we loopback packets even sent to test addresses on different
* interfaces in the group.
*/
if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
return (B_FALSE);
} else {
/*
* We know that ill is not NULL, but ire_ill could be
* NULL
*/
return (B_FALSE);
}
}
/* No ire_addr_v6 bits set past the mask */
ire->ire_addr_v6));
((!(match_flags & MATCH_IRE_GW)) ||
((!(match_flags & MATCH_IRE_MASK)) ||
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
/* We found the matched IRE */
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
* gateway address. If ill is non-NULL we also match on it.
* The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
*/
{
if (lock_held)
else
ipst);
if (!lock_held)
return (B_TRUE);
} else {
return (B_FALSE);
}
}
/*
* Look up a route in the forwarding table.
* A specific lookup is requested by passing the
* required parameters and indicating the
* required match in the flags field.
*
*/
ire_t *
{
/*
* ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
* is set.
*/
return (NULL);
return (NULL);
}
/*
* round-robin only if we have more than one route in the bucket.
* ips_ip_ecmp_behavior controls when we do ECMP
* 2: always
* 1: for IRE_DEFAULT and /0 IRE_INTERFACE
* 0: never
*
* Note: if we found an IRE_IF_CLONE we won't look at the bucket with
* other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
* and the IRE_INTERFACEs are likely to be shorter matches.
*/
IS_DEFAULT_ROUTE_V6(ire))) {
/* keep ire if next_ire is null */
goto done;
}
}
}
done:
/* Return generation before dropping lock */
if (generationp != NULL)
/*
* For shared-IP zones we need additional checks to what was
* done in ire_match_args to make sure IRE_LOCALs are handled.
*
* When ip_restrict_interzone_loopback is set, then
* we ensure that IRE_LOCAL are only used for loopback
* between zones when the logical "Ethernet" would
* have looped them back. That is, if in the absence of
* the IRE_LOCAL we would have sent the packet out the
* same ill.
*/
}
return (ire);
}
/*
* Look up a single ire. The caller holds either the read or write lock.
*/
ire_t *
{
int i;
/*
* If the mask is known, the lookup
* is simple, if the mask is not known
* we need to search.
*/
if (flags & MATCH_IRE_MASK) {
return (NULL);
}
if (IRE_IS_CONDEMNED(ire))
continue;
goto found_ire;
}
} else {
/*
* In this case we don't know the mask, we need to
* search the table assuming different mask sizes.
*/
if (flags & MATCH_IRE_SHORTERMASK) {
if (masklen == 0) {
/* Nothing shorter than zero */
return (NULL);
}
masklen--;
} else {
}
for (i = masklen; i >= 0; i--) {
continue;
(void) ip_plen_to_mask_v6(i, &tmpmask);
if (IRE_IS_CONDEMNED(ire))
continue;
goto found_ire;
}
}
}
ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
return (NULL);
return (ire);
}
/*
* This function is called by
* ip_input/ire_route_recursive when doing a route lookup on only the
* destination address.
*
* The optimizations of this function over ire_ftable_lookup are:
* o removing unnecessary flag matching
* o doing longest prefix match instead of overloading it further
* with the unnecessary "best_prefix_match"
*
* If no route is found we return IRE_NOROUTE.
*/
ire_t *
{
if (generationp != NULL)
}
/* ftable_lookup did round robin */
return (ire);
}
ire_t *
{
multirtp));
}
/*
* Recursively look for a route to the destination. Can also match on
* the zoneid, ill, and label. Used for the data paths. See also
* ire_route_recursive_dstonly.
*
* If ill is set this means we will match it by adding MATCH_IRE_ILL.
*
* If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
* create an IRE_IF_CLONE. This is used on the receive side when we are not
* forwarding.
* If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
* resolve the gateway.
*
* Note that this function never returns NULL. It returns an IRE_NOROUTE
* instead.
*
* If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
* is an error.
* Allow at most one RTF_INDIRECT.
*/
ire_t *
{
int i, j;
int prefs[MAX_IRE_RECURSION];
/*
* We iterate up to three times to resolve a route, even though
* we have four slots in the array. The extra slot is for an
* IRE_IF_CLONE we might need to create.
*/
i = 0;
while (i < MAX_IRE_RECURSION - 1) {
/* ire_ftable_lookup handles round-robin/ECMP */
} else {
/* Caller passed it; extra hold since we will rele */
if (generationp != NULL)
else
}
/* Need to return the ire with RTF_REJECT|BLACKHOLE */
goto error;
if (i != 0) {
/*
* Don't allow anything unusual past the first
* iteration.
*/
if (irr_flags & IRR_INCOMPLETE) {
} else {
}
goto error;
}
}
/* We have a usable IRE */
generations[i] = generation;
i++;
/* The first RTF_SETSRC address is passed back if setsrcp */
&ire->ire_setsrc_addr_v6));
}
/* The first ire_gw_secattr is passed back if gwattrp */
/*
* Check if we have a short-cut pointer to an IRE for this
* destination, and that the cached dependency isn't stale.
* In that case we've rejoined an existing tree towards a
* parent, thus we don't need to continue the loop to
* discover the rest of the tree.
*/
goto done;
}
/*
* If this type should have an ire_nce_cache (even if it
* doesn't yet have one) then we are done. Includes
* IRE_INTERFACE with a full 128 bit mask.
*/
if (ire->ire_nce_capable) {
goto done;
}
/*
* For an IRE_INTERFACE we create an IRE_IF_CLONE for this
* particular destination
*/
/*
* In the case of ip_input and ILLF_FORWARDING not
* being set, and in the case of RTM_GET, there is
* no point in allocating an IRE_IF_CLONE. We return
* the IRE_INTERFACE. Note that !IRR_ALLOCATE can
* result in a ire_dep_parent which is IRE_IF_*
* without an IRE_IF_CLONE.
* We recover from that when we need to send packets
* by ensuring that the generations become
* IRE_GENERATION_VERIFY in this case.
*/
if (!(irr_flags & IRR_ALLOCATE)) {
invalidate = B_TRUE;
goto done;
}
&generation);
/*
* Temporary failure - no memory.
* Don't want caller to cache IRE_NOROUTE.
*/
invalidate = B_TRUE;
goto error;
}
/*
* Make clone next to last entry and the
* IRE_INTERFACE the last in the dependency
* chain since the clone depends on the
* IRE_INTERFACE.
*/
ASSERT(i >= 1);
ASSERT(i < MAX_IRE_RECURSION);
i++;
goto done;
}
/*
* We only match on the type and optionally ILL when
* recursing. The type match is used by some callers
* to exclude certain types (such as IRE_IF_CLONE or
* IRE_LOCAL|IRE_LOOPBACK).
*/
}
/*
* We set the prefs[i] value above if i > 0. We've already
* done i++ so i is one in the case of the first time around.
*/
if (i == 1)
}
if (need_refrele)
/*
* In the case of MULTIRT we want to try a different IRE the next
* time. We let the next packet retry in that case.
*/
(void) ire_no_good(ires[0]);
/* cleanup ires[i] */
ire_dep_unbuild(ires, i);
for (j = 0; j < i; j++)
ire_refrele(ires[j]);
(irr_flags & IRR_INCOMPLETE));
/*
* Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
* ip_select_route since the reject or lack of memory might be gone.
*/
if (generationp != NULL)
return (ire);
done:
if (need_refrele)
/* Build dependencies */
/* Something in chain was condemned; tear it apart */
goto cleanup;
}
/*
* Release all refholds except the one for ires[0] that we
* will return to the caller.
*/
for (j = 1; j < i; j++)
ire_refrele(ires[j]);
if (invalidate) {
/*
* Since we needed to allocate but couldn't we need to make
* sure that the dependency chain is rebuilt the next time.
*/
} else {
/*
* IREs can have been added or deleted while we did the
* recursive lookup and we can't catch those until we've built
* the dependencies. We verify the stored
* ire_dep_parent_generation to catch any such changes and
* return IRE_GENERATION_VERIFY (which will cause
* ip_select_route to be called again so we can redo the
* recursive lookup next time we send a packet).
*/
else
/* Something changed at the top */
}
}
if (generationp != NULL)
return (ires[0]);
}
ire_t *
{
gwattrp, generationp));
}
/*
* Recursively look for a route to the destination.
* We only handle a destination match here, yet we have the same arguments
* as the full match to allow function pointers to select between the two.
*
* Note that this function never returns NULL. It returns an IRE_NOROUTE
* instead.
*
* If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
* is an error.
* Allow at most one RTF_INDIRECT.
*/
ire_t *
{
/* ire_ftable_lookup handles round-robin/ECMP */
&generation);
/*
* If this type should have an ire_nce_cache (even if it
* doesn't yet have one) then we are done. Includes
* IRE_INTERFACE with a full 128 bit mask.
*/
if (ire->ire_nce_capable)
return (ire);
/*
* If the IRE has a current cached parent we know that the whole
* parent chain is current, hence we don't need to discover and
* build any dependencies by doing a recursive lookup.
*/
return (ire);
}
/*
* Fallback to loop in the normal code starting with the ire
* we found. Normally this would return the same ire.
*/
&generation);
return (ire1);
}