c793af95640863cd29868fc7c419c5d2496b207bsangeeta * CDDL HEADER START
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * The contents of this file are subject to the terms of the
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Common Development and Distribution License (the "License").
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * You may not use this file except in compliance with the License.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * See the License for the specific language governing permissions
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * and limitations under the License.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * When distributing Covered Code, include this CDDL HEADER in each
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * If applicable, add the following below this CDDL HEADER, with the
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * fields enclosed by brackets "[]" replaced with your own identifying
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * information: Portions Copyright [yyyy] [name of copyright owner]
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * CDDL HEADER END
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * This file contains consumer routines of the IPv4 forwarding engine
c793af95640863cd29868fc7c419c5d2496b207bsangeeta (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan (isv6 ? ipst->ips_ipv6_strict_src_multihoming : \
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dhstatic ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkstatic void ire_del_host_redir(ire_t *, char *);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkstatic boolean_t ire_find_best_route(struct radix_node *, void *);
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Lookup a route in forwarding table. A specific lookup is indicated by
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * passing the required parameters and indicating the match required in the
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * flag field.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Supports IP_BOUND_IF by following the ipif/ill when recursing.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ire_match_args() will dereference ill if MATCH_IRE_ILL
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * The flags argument passed to ire_ftable_lookup may cause the
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * search to return, not the longest matching prefix, but the
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * "best matching prefix", i.e., the longest prefix that also
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * satisfies constraints imposed via the permutation of flags
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * passed in. To achieve this, we invoke ire_match_args() on
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * each matching leaf in the radix tree. ire_match_args is
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * invoked by the callback function ire_find_best_route()
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * We hold the global tree lock in read mode when calling
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * rn_match_args. Before dropping the global tree lock, ensure
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * that the radix node can't be deleted by incrementing ire_refcnt.
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dh rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * round-robin only if we have more than one route in the bucket.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ips_ip_ecmp_behavior controls when we do ECMP
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark next_ire = ire_round_robin(ire->ire_bucket, &margs,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* keep ire if next_ire is null */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Return generation before dropping lock */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * For shared-IP zones we need additional checks to what was
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * done in ire_match_args to make sure IRE_LOCALs are handled.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * When ip_restrict_interzone_loopback is set, then
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * we ensure that IRE_LOCAL are only used for loopback
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * between zones when the logical "Ethernet" would
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * have looped them back. That is, if in the absence of
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * the IRE_LOCAL we would have sent the packet out the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * This function is called by
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ip_input/ire_route_recursive when doing a route lookup on only the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * destination address.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * The optimizations of this function over ire_ftable_lookup are:
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * o removing unnecessary flag matching
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * o doing longest prefix match instead of overloading it further
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * with the unnecessary "best_prefix_match"
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If no route is found we return IRE_NOROUTE.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * This is basically inlining a simpler version of ire_match_args
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* we have an ire that matches */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * round-robin only if we have more than one route in the bucket.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ips_ip_ecmp_behavior controls when we do ECMP
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * and the IRE_INTERFACEs are likely to be shorter matches.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark next_ire = ire_round_robin(ire->ire_bucket, &margs,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* keep ire if next_ire is null */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Return generation before dropping lock */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Since we only did ALL_ZONES matches there is no special handling
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Find the ill matching a multicast group.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Allows different routes for multicast addresses
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * which point at different interfaces. This is used when IP_MULTICAST_IF
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * specify the interface to join on.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Supports link-local addresses by using ire_route_recursive which follows
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * the ill when recursing.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * and the MULTIRT property can be different for different groups, we
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * extract RTF_MULTIRT from the special unicast route added for a group
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * with CGTP and pass that back in the multirtp argument.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * We have a setsrcp argument for the same reason.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Delete the passed in ire if the gateway addr matches
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * pointing at the specified gateway and
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * delete them. This routine is called only
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * when a default gateway is going away.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dh * Obtain the rt_entry and rt_irb for the route to be added to
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dh * the ips_ip_ftable.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * First attempt to add a node to the radix tree via rn_addroute. If the
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * route already exists, return the bucket for the existing route.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Locking notes: Need to hold the global radix tree lock in write mode to
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket()
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * while holding the irb_lock, but not the radix tree lock.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta /* first try to see if route exists (based on rtalloc1) */
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * add the route. based on BSD's rtrequest1(RTM_ADD)
29bc4795a563df76952f94da2aa3b7daa8abf972sangeeta /* kmem_alloc failed */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dh rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta /* found a non-root match */
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * This function is used when the caller wants to know the outbound
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * interface for a packet given only the address.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * If this is an offlink IP address and there are multiple
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * routes to this destination, this routine will utilise the
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * first route it finds to IP address
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Return values:
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * 0 - FAILURE
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * nonzero - ifindex
c793af95640863cd29868fc7c419c5d2496b207bsangeetaifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dh * For exclusive stacks we set the zoneid to zero
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dh * since IP uses the global zoneid in the exclusive stacks.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Routine to find the route to a destination. If an ifindex is supplied
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * it tries to match the route to the corresponding ipif for the ifindex
f4b3ec61df05330d25f55a36b975b4d7519fdeb1dhroute_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
c793af95640863cd29868fc7c419c5d2496b207bsangeeta /* XXX pass NULL tsl for now */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * This routine is called by IP Filter to send a packet out on the wire
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * to a specified destination (which may be onlink or offlink). The ifindex may
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * or may not be 0. A non-zero ifindex indicates IP Filter has stipulated
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * an outgoing interface and requires the nexthop to be on that interface.
edd26dc5eeb3b093945c371e4b6dd8286348d53fdr * IP WILL NOT DO the following to the data packet before sending it out:
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * a. manipulate ttl
edd26dc5eeb3b093945c371e4b6dd8286348d53fdr * b. ipsec work
edd26dc5eeb3b093945c371e4b6dd8286348d53fdr * c. fragmentation
edd26dc5eeb3b093945c371e4b6dd8286348d53fdr * If the packet has been prepared for hardware checksum then it will be
edd26dc5eeb3b093945c371e4b6dd8286348d53fdr * passed off to ip_send_align_cksum() to check that the flags set on the
edd26dc5eeb3b093945c371e4b6dd8286348d53fdr * packet are in alignment with the capabilities of the new outgoing NIC.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Return values:
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * 0: IP was able to send off the data pkt
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * ECOMM: Could not send packet
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * ENONET No route to dst. It is up to the caller
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * to send icmp unreachable error message,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * EINPROGRESS The macaddr of the onlink dst or that
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * of the offlink dst's nexthop needs to get
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * resolved before packet can be sent to dst.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Thus transmission is not guaranteed.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Note: No longer have visibility to the ARP queue
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * hence no EINPROGRESS.
c793af95640863cd29868fc7c419c5d2496b207bsangeetaipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * For exclusive stacks we set the zoneid to zero
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * since IP uses the global zoneid in the exclusive stacks.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (ns->netstack_stackid != GLOBAL_NETSTACKID)
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * No IPsec, no fragmentation, and don't let any hooks see
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * the packet.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * callback function provided by ire_ftable_lookup when calling
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * the radix tree.
c793af95640863cd29868fc7c419c5d2496b207bsangeetaire_find_best_route(struct radix_node *rn, void *arg)
c793af95640863cd29868fc7c419c5d2496b207bsangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
c793af95640863cd29868fc7c419c5d2496b207bsangeeta if (ire_match_args(ire, margs->ift_addr, match_mask,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark margs->ift_gateway, margs->ift_type, margs->ift_ill,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * ftable irb_t structures are dynamically allocated, and we need to
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * check if the irb_t (and associated ftable tree attachment) needs to
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * be verified are:
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * - no other threads holding references to ire's in the bucket,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * i.e., irb_nire == 0
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * - need to hold the global tree lock and irb_lock in write mode.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Someone has a reference to this radix node
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * or there is some bucket walker.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * There is no other walker, nor is there any
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * other thread that holds a direct ref to this
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * radix node. Do the clean up if needed. Call
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * more CONDEMNED entries could have
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * been added while we dropped the lock,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * so we have to re-check.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Now check if there are still any ires
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * associated with this radix node.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * someone is still holding on
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * to ires in this bucket
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Everything is clear. Zero walkers,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Zero threads with a ref to this
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * radix node, Zero ires associated with
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * this radix node. Due to lock order,
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * check the above conditions again
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * after grabbing all locks in the right order
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * irb_inactive could not free the irb.
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * See if there are any walkers, if not
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * try to clean up again.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IRE iterator used by ire_ftable_lookup to process multiple equal
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * routes. Given a starting point in the hash list (hash), walk the IREs
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * in the bucket skipping deleted entries. We treat the bucket as a circular
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * list for the purposes of walking it.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Returns the IRE (held) that corresponds to the hash value. If that IRE is
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * not applicable (ire_match_args failed) then it returns a subsequent one.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If we fail to find an IRE we return NULL.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Assumes that the caller holds a reference on the IRE bucket and a read lock
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Applies to IPv4 and IPv6.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * address and bucket, we compare against ire_type for the orig_ire. We also
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
188e166434dcdde5356d87fb06c169f15dc4dca9Erik Nordmark * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * in which the zone has an IP address. We check this for the global zone
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * even if no shared-IP zones are configured.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Fold in more bits from the hint/hash */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * Round-robin the routers list looking for a route that
c793af95640863cd29868fc7c419c5d2496b207bsangeeta * matches the passed in parameters.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * First we skip "hash" number of non-condemned IREs.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Then we match the IRE.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If we find an ire which has a non-zero ire_badcnt then we remember
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * it and keep on looking for a lower ire_badcnt.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If we come to the end of the list we continue (treat the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * bucket list as a circular list) but we match less than "max"
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark while (maxwalk > 0) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Skip the first "hash" entries to do ECMP */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* See CGTP comment above */
188e166434dcdde5356d87fb06c169f15dc4dca9Erik Nordmark ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Note: Since IPv6 has hash buckets instead of radix
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * buckets we need to explicitly compare the addresses.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * That makes this less efficient since we will be called
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * even if there are no alternatives just because the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * bucket has multiple IREs for different addresses.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * For some reason find_best_route uses ire_mask. We do
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * When we're in a zone, we're only
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * interested in routers that are
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * reachable through ipifs within our zone.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Look for stale ire_badcnt and clear */
d3d50737e566cade9a08d73d2af95105ac7cd960Rafael Vanoni (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* We found one with a zero badcnt; done */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Care needed since irb_refrele grabs WLOCK to free
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * keep looking to see if there is a better (lower
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * badcnt) matching IRE, but save this one as a last resort.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If we find a lower badcnt pick that one as the last resort.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * ip_select_src_ill() is used by ip_select_route() to find the src_ill
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * to be used for source-aware routing table lookup. This function will
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * ignore IPIF_UNNUMBERED interface addresses, and will only return a
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * interfaces).
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhanip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * verify that v6src is configured on ill
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhanip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * routes this routine sets up a ire_nce_cache as well. The caller needs to
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * lookup an nce for the multicast case.
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * When src_multihoming is set to 2 (strict src multihoming) we use the source
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * address to select the interface and route. If IP_BOUND_IF etc are
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * specified, we require that they specify an interface on which the
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * source address is assigned.
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * When src_multihoming is set to 1 (preferred src aware route
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * selection) the unicast lookup prefers a matching source
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * (i.e., that the route points out an ill on which the source is assigned), but
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * if no such route is found we fall back to not considering the source in the
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * route lookup.
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * We skip the src_multihoming check when the source isn't (yet) set, and
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * when secpolicy_net_rawaccess().
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhanip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * We only verify that the src has been configured on a selected
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * interface if the src is not :: or INADDR_ANY, and if the
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * IXAF_VERIFY_SOURCE flag is set.
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * The content of the ixa will be different if IP_NEXTHOP,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Pick up the IRE_MULTICAST for the ill */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * In the ipmp case, the ixa_ifindex is set to
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * point at an under_ill and we would return the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ire_multicast() corresponding to that under_ill.
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan } else if (src_multihoming != 0 && verify_src) {
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan /* Look up the ill based on the source address */
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * Since we looked up the ill from the source there
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * is no need to verify that the source is on the ill
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan } else if (!isv6) {
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Get a hold on the IRE_NOROUTE */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Get a hold on the IRE_NOROUTE */
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * If we are doing the strictest src_multihoming, then
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * an interface that is consistent with the source address.
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan /* Get a hold on the IRE_NOROUTE */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Get a refcnt on the single IRE_MULTICAST per ill */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan /* Now for unicast */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Get a hold on the IRE_NOROUTE */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * so for both of them we need to be able to look for an under
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * If we are doing the strictest src_multihoming, then
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * an interface that is consistent with the source address.
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan /* Get a hold on the IRE_NOROUTE */
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan } else if (src_multihoming != 0 && verify_src) {
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan /* Look up the ill based on the source address */
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan /* Get a hold on the IRE_NOROUTE */
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan preferred_src_aware = (src_multihoming == 1);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* IP_NEXTHOP was set */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * we only look for an onlink IRE.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ip3dbg(("looking for hidden; dst %x ire %p\n",
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * "Preferred Source Aware" send mode. If we cannot
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * find an ire whose ire_ill had the desired source
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * address, retry after relaxing the ill matching
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * constraint.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* No ire_nce_cache */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Setup ire_nce_cache if it doesn't exist or is condemned. */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Find a route given some xmit attributes and a packet.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Generic for IPv4 and IPv6
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * This never returns NULL. But when it returns the IRE_NOROUTE
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * it might set errorp.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan return (ip_select_route(&v6dst, v6src, ixa, generationp,
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ixa, generationp, NULL, errorp, multirtp));
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhanip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Recursively look for a route to the destination. Can also match on
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * the zoneid, ill, and label. Used for the data paths. See also
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ire_route_recursive.
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * create an IRE_IF_CLONE. This is used on the receive side when we are not
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * forwarding.
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * resolve the gateway.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * is an error.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Allow at most one RTF_INDIRECT.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * We iterate up to three times to resolve a route, even though
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * we have four slots in the array. The extra slot is for an
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IRE_IF_CLONE we might need to create.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Caller passed it; extra hold since we will rele */
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Need to return the ire with RTF_REJECT|BLACKHOLE */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * Verify that the IRE_IF_CLONE has a consistent generation
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan * Don't allow anything unusual past the first iteration.
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan * After the first lookup, we should no longer look for
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan * In addition, after we have found a direct IRE_OFFLINK,
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan * we should only look for interface or clone routes.
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan * no more local, loopback, broadcast routes
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* We have a usable IRE */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* The first RTF_SETSRC address is passed back if setsrcp */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* The first ire_gw_secattr is passed back if gwattrp */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Check if we have a short-cut pointer to an IRE for this
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * destination, and that the cached dependency isn't stale.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * In that case we've rejoined an existing tree towards a
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * parent, thus we don't need to continue the loop to
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * discover the rest of the tree.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If this type should have an ire_nce_cache (even if it
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * doesn't yet have one) then we are done. Includes
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IRE_INTERFACE with a full 32 bit mask.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * particular destination
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * In the case of ip_input and ILLF_FORWARDING not
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * being set, and in the case of RTM_GET, there is
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * no point in allocating an IRE_IF_CLONE. We return
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * result in an ire_dep_parent which is IRE_IF_*
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark * without an IRE_IF_CLONE.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * We recover from that when we need to send packets
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * by ensuring that the generations become
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IRE_GENERATION_VERIFY in this case.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Temporary failure - no memory.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Don't want caller to cache IRE_NOROUTE.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Make clone next to last entry and the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IRE_INTERFACE the last in the dependency
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * chain since the clone depends on the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IRE_INTERFACE.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * We only match on the type and optionally ILL when
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * recursing. The type match is used by some callers
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * to exclude certain types (such as IRE_IF_CLONE or
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IRE_LOCAL|IRE_LOOPBACK).
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * ire->ire_ill, and we want to find the IRE_INTERFACE for
44b099c4d944a196d124a02c7403ad891223139eSowmini Varadhan * ire_ill, so we set ill to the ire_ill;
01685f973ffa404db3bc35b99a86c94e268d6587Sowmini Varadhan match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * In the case of MULTIRT we want to try a different IRE the next
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * time. We let the next packet retry in that case.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* cleanup ires[i] */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark for (j = 0; j < i; j++)
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ip_select_route since the reject or lack of memory might be gone.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Build dependencies */
188e166434dcdde5356d87fb06c169f15dc4dca9Erik Nordmark if (i > 1 && !ire_dep_build(ires, generations, i)) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Something in chain was condemned; tear it apart */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Release all refholds except the one for ires[0] that we
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * will return to the caller.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark for (j = 1; j < i; j++)
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Since we needed to allocate but couldn't we need to make
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * sure that the dependency chain is rebuilt the next time.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * IREs can have been added or deleted while we did the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * recursive lookup and we can't catch those until we've built
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * the dependencies. We verify the stored
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ire_dep_parent_generation to catch any such changes and
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * return IRE_GENERATION_VERIFY (which will cause
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * ip_select_route to be called again so we can redo the
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * recursive lookup next time we send a packet).
188e166434dcdde5356d87fb06c169f15dc4dca9Erik Nordmark generation = ire_dep_validate_generations(ires[0]);
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark if (generations[0] != ires[0]->ire_generation) {
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* Something changed at the top */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmarkire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Recursively look for a route to the destination.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * We only handle a destination match here, yet we have the same arguments
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * as the full match to allow function pointers to select between the two.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * is an error.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Allow at most one RTF_INDIRECT.
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmarkire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * If the IRE has a current cached parent we know that the whole
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * parent chain is current, hence we don't need to discover and
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * build any dependencies by doing a recursive lookup.
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * If this type should have an ire_nce_cache (even if it
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * doesn't yet have one) then we are done. Includes
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * IRE_INTERFACE with a full 32 bit mask.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * Fall back to the loop in the normal code starting with the ire
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark * we found. Normally this would return the same ire.
bd670b35a010421b6e1a5536c34453a827007c81Erik Nordmark ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
9e3469d3db608feb0e43d9955cbf406c22025463Erik Nordmark NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
fff7ec1d8ce71b3d8a998ac4391a99860ce07180Sowmini Varadhan * are not consistent, and TRUE otherwise.