ip_ftable.c revision 9e3469d3db608feb0e43d9955cbf406c22025463
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* This file contains consumer routines of the IPv4 forwarding engine
*/
#include <inet/ipsec_impl.h>
#include <inet/ip_ftable.h>
#include <inet/ipclassifier.h>
#define IS_DEFAULT_ROUTE(ire) \
static void ire_del_host_redir(ire_t *, char *);
/*
* Lookup a route in forwarding table. A specific lookup is indicated by
* passing the required parameters and indicating the match required in the
* flag field.
*
*/
ire_t *
{
/*
* ire_match_args() will dereference ill if MATCH_IRE_ILL
* is set.
*/
return (NULL);
/*
* The flags argument passed to ire_ftable_lookup may cause the
* search to return, not the longest matching prefix, but the
* "best matching prefix", i.e., the longest prefix that also
* satisfies constraints imposed via the permutation of flags
* passed in. To achieve this, we invoke ire_match_args() on
* each matching leaf in the radix tree. ire_match_args is
* invoked by the callback function ire_find_best_route().
* We hold the global tree lock in read mode when calling
* rn_match_args. Before dropping the global tree lock, ensure
* that the radix node can't be deleted by incrementing ire_refcnt.
*/
return (NULL);
}
/*
* round-robin only if we have more than one route in the bucket.
* ips_ip_ecmp_behavior controls when we do ECMP
* 2: always
* 1: for IRE_DEFAULT and /0 IRE_INTERFACE
* 0: never
*/
IS_DEFAULT_ROUTE(ire))) {
/* keep ire if next_ire is null */
goto done;
}
}
}
done:
/* Return generation before dropping lock */
if (generationp != NULL)
/*
* For shared-IP zones we need additional checks to what was
* done in ire_match_args to make sure IRE_LOCALs are handled.
*
* When ip_restrict_interzone_loopback is set, then
* we ensure that IRE_LOCAL are only used for loopback
* between zones when the logical "Ethernet" would
* have looped them back. That is, if in the absence of
* the IRE_LOCAL we would have sent the packet out the
* same ill.
*/
}
return (ire);
}
/*
* This function is called by
* ip_input/ire_route_recursive when doing a route lookup on only the
* destination address.
*
* The optimizations of this function over ire_ftable_lookup are:
* o removing unnecessary flag matching
* o doing longest prefix match instead of overloading it further
* with the unnecessary "best_prefix_match"
*
* If no route is found we return IRE_NOROUTE.
*/
ire_t *
{
struct rt_sockaddr rdst;
/*
* This is basically inlining a simpler version of ire_match_args
*/
goto bad;
if (irb->irb_ire_cnt == 0)
goto bad;
goto bad;
}
while (IRE_IS_CONDEMNED(ire)) {
goto bad;
}
}
/* we have a ire that matches */
/*
* round-robin only if we have more than one route in the bucket.
* ips_ip_ecmp_behavior controls when we do ECMP
* 2: always
* 1: for IRE_DEFAULT and /0 IRE_INTERFACE
* 0: never
*
* Note: if we found an IRE_IF_CLONE we won't look at the bucket with
* other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
* and the IRE_INTERFACEs are likely to be shorter matches.
*/
IS_DEFAULT_ROUTE(ire))) {
/* keep ire if next_ire is null */
if (generationp != NULL)
return (ire);
}
}
}
/* Return generation before dropping lock */
if (generationp != NULL)
/*
* Since we only did ALL_ZONES matches there is no special handling
* of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
*/
return (ire);
bad:
if (generationp != NULL)
}
/*
* Find the ill matching a multicast group.
* Allows different routes for multicast addresses
* in the unicast routing table (akin to 224.0.0.0 but could be more specific)
* which point at different interfaces. This is used when IP_MULTICAST_IF
* isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
* specify the interface to join on.
*
* Supports link-local addresses by using ire_route_recursive which follows
* the ill when recursing.
*
* To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
* and the MULTIRT property can be different for different groups, we
* extract RTF_MULTIRT from the special unicast route added for a group
* with CGTP and pass that back in the multirtp argument.
* This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
* We have a setsrcp argument for the same reason.
*/
ill_t *
{
return (NULL);
}
return (ill);
}
/*
* Delete the passed in ire if the gateway addr matches
*/
void
{
}
/*
* Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
* pointing at the specified gateway and
* delete them. This routine is called only
* when a default gateway is going away.
*/
void
{
}
/*
* Obtain the rt_entry and rt_irb for the route to be added to
* the ips_ip_ftable.
* First attempt to add a node to the radix tree via rn_addroute. If the
* route already exists, return the bucket for the existing route.
*
* Locking notes: Need to hold the global radix tree lock in write mode to
* add a radix node. To prevent the node from being deleted, ire_get_bucket()
* returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
* while holding the irb_lock, but not the radix tree lock.
*/
irb_t *
{
struct radix_node *rn;
/* first try to see if route exists (based on rtalloc1) */
/*
* add the route. based on BSD's rtrequest1(RTM_ADD)
*/
/* kmem_alloc failed */
return (NULL);
/* found a non-root match */
}
}
}
return (irb);
}
/*
* This function is used when the caller wants to know the outbound
* interface for a packet given only the address.
* If this is an offlink IP address and there are multiple
* routes to this destination, this routine will utilise the
* first route it finds to the IP address.
* Return values:
* 0 - FAILURE
* nonzero - ifindex
*/
{
netstack_t *ns;
else
/*
* For exclusive stacks we set the zoneid to zero
* since IP uses the global zoneid in the exclusive stacks.
*/
}
}
return (ifindex);
}
/*
* Routine to find the route to a destination. If an ifindex is supplied
* it tries to match the route to the corresponding ipif for the ifindex
*/
static ire_t *
{
int match_flags;
/* XXX pass NULL tsl for now */
} else {
}
return (NULL);
}
return (ire);
}
/*
* This routine is called by IP Filter to send a packet out on the wire
* to a specified destination (which may be onlink or offlink). The ifindex may
* or may not be 0. A non-null ifindex indicates IP Filter has stipulated
* an outgoing interface and requires the nexthop to be on that interface.
* IP WILL NOT DO the following to the data packet before sending it out:
* a. manipulate ttl
* b. ipsec work
* c. fragmentation
*
* If the packet has been prepared for hardware checksum then it will be
* passed off to ip_send_align_cksum() to check that the flags set on the
* packet are in alignment with the capabilities of the new outgoing NIC.
*
* Return values:
* 0: IP was able to send off the data pkt
* ECOMM: Could not send packet
* ENONET: No route to dst. It is up to the caller
* to send icmp unreachable error message,
* EINPROGRESS: The macaddr of the onlink dst or that
* of the offlink dst's nexthop needs to get
* resolved before packet can be sent to dst.
* Thus transmission is not guaranteed.
* Note: No longer have visibility to the ARP queue
* hence no EINPROGRESS.
*/
int
{
netstack_t *ns;
int error;
else
/*
* For exclusive stacks we set the zoneid to zero
* since IP uses the global zoneid in the exclusive stacks.
*/
/*
* No IPsec, no fragmentation, and don't let any hooks see
* the packet.
*/
}
} else {
}
}
ixa_cleanup(&ixas);
switch (error) {
case 0:
break;
case EHOSTUNREACH:
case ENETUNREACH:
break;
default:
break;
}
return (error);
}
/*
* callback function provided by ire_ftable_lookup when calling
* rn_match_args(). Invoke ire_match_args on each matching leaf node in
* the radix tree.
*/
{
if (irb_ptr->irb_ire_cnt == 0)
return (B_FALSE);
if (IRE_IS_CONDEMNED(ire))
continue;
else
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* ftable irb_t structures are dynamically allocated, and we need to
* check if the irb_t (and associated ftable tree attachment) needs to
* be cleaned up when the irb_refcnt goes to 0. The conditions that need
* to be verified are:
* - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
* - no other threads holding references to ire's in the bucket,
* i.e., irb_nire == 0
* - no active ire's in the bucket, i.e., irb_ire_cnt == 0
* - need to hold the global tree lock and irb_lock in write mode.
*/
void
{
for (;;) {
/*
* Someone has a reference to this radix node
* or there is some bucket walker.
*/
irb->irb_refcnt--;
return;
} else {
/*
* There is no other walker, nor is there any
* other thread that holds a direct ref to this
* radix node. Do the clean up if needed. Call
* to ire_unlink will clear the IRB_MARK_CONDEMNED flag.
*/
/*
* more CONDEMNED entries could have
* been added while we dropped the lock,
* so we have to re-check.
*/
continue;
}
/*
* Now check if there are still any ires
* associated with this radix node.
*/
/*
* someone is still holding on
* to ires in this bucket
*/
irb->irb_refcnt--;
return;
} else {
/*
* Everything is clear. Zero walkers,
* Zero threads with a ref to this
* radix node, Zero ires associated with
* this radix node. Due to lock order,
* check the above conditions again
* after grabbing all locks in the right order
*/
if (irb_inactive(irb))
return;
/*
* irb_inactive could not free the irb.
* See if there are any walkers, if not
* try to clean up again.
*/
}
}
}
}
/*
* IRE iterator used by ire_ftable_lookup to process multiple equal
* routes. Given a starting point in the hash list (hash), walk the IREs
* in the bucket skipping deleted entries. We treat the bucket as a circular
* list for the purposes of walking it.
* Returns the IRE (held) that corresponds to the hash value. If that IRE is
* not applicable (ire_match_args failed) then it returns a subsequent one.
* If we fail to find an IRE we return NULL.
*
* Assumes that the caller holds a reference on the IRE bucket and a read lock
* on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
*
* Applies to IPv4 and IPv6.
*
* For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
* address and bucket, we compare against ire_type for the orig_ire. We also
* have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
* first in the bucket. Thus we check that RTF_MULTIRT matches the orig_ire.
*
* Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
* reachable from the zone i.e., that the ire_gateway_addr is in a subnet
* in which the zone has an IP address. We check this for the global zone
* even if no shared-IP zones are configured.
*/
ire_t *
{
/*
* Round-robin the routers list looking for a route that
* matches the passed in parameters.
* First we skip "hash" number of non-condemned IREs.
* Then we match the IRE.
* If we find an ire which has a non-zero ire_badcnt then we remember
* it and keep on looking for a lower ire_badcnt.
* If we come to the end of the list we continue (treat the
* bucket list as a circular list) but we match less than "max"
* entries.
*/
while (maxwalk > 0) {
if (IRE_IS_CONDEMNED(ire))
goto next_ire_skip;
/* Skip the first "hash" entries to do ECMP */
if (hash != 0) {
hash--;
goto next_ire_skip;
}
/* See CGTP comment above */
goto next_ire;
/*
* Note: Since IPv6 has hash buckets instead of radix
* buckets we need to explicitly compare the addresses.
* That makes this less efficient since we will be called
* even if there are no alternatives just because the
* bucket has multiple IREs for different addresses.
*/
&ire->ire_addr_v6))
goto next_ire;
}
/*
* For some reason find_best_route uses ire_mask. We do
* the same.
*/
goto next_ire;
/*
* When we're in a zone, we're only
* interested in routers that are
* reachable through ipifs within our zone.
*/
if (!ire_gateway_ok_zone_v4(
B_TRUE))
goto next_ire;
} else {
if (!ire_gateway_ok_zone_v6(
goto next_ire;
}
}
/* Look for stale ire_badcnt and clear */
if (ire->ire_badcnt != 0 &&
ire->ire_badcnt = 0;
if (ire->ire_badcnt == 0) {
/* We found one with a zero badcnt; done */
/*
* Care needed since irb_refrele grabs WLOCK to free
* the irb_t.
*/
} else {
}
return (ire);
}
/*
* keep looking to see if there is a better (lower
* badcnt) matching IRE, but save this one as a last resort.
* If we find a lower badcnt pick that one as the last resort.
*/
}
maxwalk--;
}
/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
} else {
}
return (maybe_ire);
}
void
{
}
void
{
}
/*
* Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
* routes this routine sets up a ire_nce_cache as well. The caller needs to
* lookup an nce for the multicast case.
*/
ire_t *
{
/*
* The content of the ixa will be different if IP_NEXTHOP,
* SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
*/
/* Pick up the IRE_MULTICAST for the ill */
if (ixa->ixa_multicast_ifindex != 0) {
} else if (ixaflags & IXAF_SCOPEID_SET) {
/* sin6_scope_id takes precedence over ixa_ifindex */
} else if (ixa->ixa_ifindex != 0) {
/*
* In the ipmp case, the ixa_ifindex is set to
* point at an under_ill and we would return the
* ire_multicast() corresponding to that under_ill.
*/
} else if (ixaflags & IXAF_IS_IPV4) {
} else {
}
}
/* Get a hold on the IRE_NOROUTE */
return (ire);
}
*errorp = EHOSTUNREACH;
/* Get a hold on the IRE_NOROUTE */
return (ire);
}
/* Get a refcnt on the single IRE_MULTICAST per ill */
if (generationp != NULL)
*errorp = EHOSTUNREACH;
}
return (ire);
}
if (ixaflags & IXAF_SCOPEID_SET) {
/* sin6_scope_id takes precedence over ixa_ifindex */
} else {
}
}
/* Get a hold on the IRE_NOROUTE */
return (ire);
}
/*
* icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
* so for both of them we need to be able to look for an under
* interface.
*/
if (IS_UNDER_IPMP(ill))
} else {
}
if (ixaflags & IXAF_NEXTHOP_SET) {
/* IP_NEXTHOP was set */
} else {
}
ire_type = 0;
/* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
/*
* If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
* we only look for an onlink IRE.
*/
}
if (ixaflags & IXAF_IS_IPV4) {
} else {
}
#ifdef DEBUG
if (match_args & MATCH_IRE_TESTHIDDEN) {
ip3dbg(("looking for hidden; dst %x ire %p\n",
}
#endif
/* No ire_nce_cache */
return (ire);
}
/* Setup ire_nce_cache if it doesn't exist or is condemned. */
(void) ire_revalidate_nce(ire);
} else {
}
return (ire);
}
/*
* Find a route given some xmit attributes and a packet.
* Generic for IPv4 and IPv6
*
* This never returns NULL. But when it returns the IRE_NOROUTE
* it might set errorp.
*/
ire_t *
{
} else {
}
}
ire_t *
{
multirtp);
return (ire);
}
/*
* Recursively look for a route to the destination. Can also match on
* the zoneid, ill, and label. Used for the data paths. See also
* ire_route_recursive.
*
* If ill is set this means we will match it by adding MATCH_IRE_ILL.
*
* If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
* create an IRE_IF_CLONE. This is used on the receive side when we are not
* forwarding.
* If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
* resolve the gateway.
*
* Note that this function never returns NULL. It returns an IRE_NOROUTE
* instead.
*
* If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
* is an error.
* Allow at most one RTF_INDIRECT.
*/
ire_t *
{
int i, j;
int prefs[MAX_IRE_RECURSION];
/*
* We iterate up to three times to resolve a route, even though
* we have four slots in the array. The extra slot is for an
* IRE_IF_CLONE we might need to create.
*/
i = 0;
while (i < MAX_IRE_RECURSION - 1) {
/* ire_ftable_lookup handles round-robin/ECMP */
} else {
/* Caller passed it; extra hold since we will rele */
if (generationp != NULL)
else
}
/* Need to return the ire with RTF_REJECT|BLACKHOLE */
goto error;
if (i != 0) {
/*
* Don't allow anything unusual past the first
* iteration.
*/
if (irr_flags & IRR_INCOMPLETE) {
} else {
}
goto error;
}
}
/* We have a usable IRE */
generations[i] = generation;
i++;
/* The first RTF_SETSRC address is passed back if setsrcp */
}
/* The first ire_gw_secattr is passed back if gwattrp */
/*
* Check if we have a short-cut pointer to an IRE for this
* destination, and that the cached dependency isn't stale.
* In that case we've rejoined an existing tree towards a
* parent, thus we don't need to continue the loop to
* discover the rest of the tree.
*/
goto done;
}
/*
* If this type should have an ire_nce_cache (even if it
* doesn't yet have one) then we are done. Includes
* IRE_INTERFACE with a full 32 bit mask.
*/
if (ire->ire_nce_capable) {
goto done;
}
/*
* For an IRE_INTERFACE we create an IRE_IF_CLONE for this
* particular destination
*/
/*
* In the case of ip_input and ILLF_FORWARDING not
* being set, and in the case of RTM_GET, there is
* no point in allocating an IRE_IF_CLONE. We return
* the IRE_INTERFACE. Note that !IRR_ALLOCATE can
* result in a ire_dep_parent which is IRE_IF_*
* without an IRE_IF_CLONE.
* We recover from that when we need to send packets
* by ensuring that the generations become
* IRE_GENERATION_VERIFY in this case.
*/
if (!(irr_flags & IRR_ALLOCATE)) {
invalidate = B_TRUE;
goto done;
}
&generation);
/*
* Temporary failure - no memory.
* Don't want caller to cache IRE_NOROUTE.
*/
invalidate = B_TRUE;
goto error;
}
/*
* Make clone next to last entry and the
* IRE_INTERFACE the last in the dependency
* chain since the clone depends on the
* IRE_INTERFACE.
*/
ASSERT(i >= 1);
ASSERT(i < MAX_IRE_RECURSION);
i++;
goto done;
}
/*
* We only match on the type and optionally ILL when
* recursing. The type match is used by some callers
* to exclude certain types (such as IRE_IF_CLONE or
* IRE_LOCAL|IRE_LOOPBACK).
*/
}
/*
* We set the prefs[i] value above if i > 0. We've already
* done i++ so i is one in the case of the first time around.
*/
if (i == 1)
}
if (need_refrele)
/*
* In the case of MULTIRT we want to try a different IRE the next
* time. We let the next packet retry in that case.
*/
(void) ire_no_good(ires[0]);
/* cleanup ires[i] */
ire_dep_unbuild(ires, i);
for (j = 0; j < i; j++)
ire_refrele(ires[j]);
(irr_flags & IRR_INCOMPLETE));
/*
* Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
* ip_select_route since the reject or lack of memory might be gone.
*/
if (generationp != NULL)
return (ire);
done:
if (need_refrele) {
}
/* Build dependencies */
/* Something in chain was condemned; tear it apart */
goto cleanup;
}
/*
* Release all refholds except the one for ires[0] that we
* will return to the caller.
*/
for (j = 1; j < i; j++)
ire_refrele(ires[j]);
if (invalidate) {
/*
* Since we needed to allocate but couldn't we need to make
* sure that the dependency chain is rebuilt the next time.
*/
} else {
/*
* IREs can have been added or deleted while we did the
* recursive lookup and we can't catch those until we've built
* the dependencies. We verify the stored
* ire_dep_parent_generation to catch any such changes and
* return IRE_GENERATION_VERIFY (which will cause
* ip_select_route to be called again so we can redo the
* recursive lookup next time we send a packet).
*/
else
/* Something changed at the top */
}
}
if (generationp != NULL)
return (ires[0]);
}
ire_t *
{
gwattrp, generationp));
}
/*
* Recursively look for a route to the destination.
* We only handle a destination match here, yet we have the same arguments
* as the full match to allow function pointers to select between the two.
*
* Note that this function never returns NULL. It returns an IRE_NOROUTE
* instead.
*
* If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
* is an error.
* Allow at most one RTF_INDIRECT.
*/
ire_t *
{
/* ire_ftable_lookup handles round-robin/ECMP */
&generation);
/*
* If this type should have an ire_nce_cache (even if it
* doesn't yet have one) then we are done. Includes
* IRE_INTERFACE with a full 32 bit mask.
*/
if (ire->ire_nce_capable)
return (ire);
/*
* If the IRE has a current cached parent we know that the whole
* parent chain is current, hence we don't need to discover and
* build any dependencies by doing a recursive lookup.
*/
return (ire);
}
/*
* Fallback to loop in the normal code starting with the ire
* we found. Normally this would return the same ire.
*/
&generation);
return (ire1);
}