ip_input.c revision 44b099c4d944a196d124a02c7403ad891223139e
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
#include <sys/sysmacros.h>
#define _SUN_TPI_VERSION 2
#include <sys/xti_inet.h>
#include <sys/isa_defs.h>
#include <inet/kstatcom.h>
#include <netinet/igmp_var.h>
#include <inet/tcp_impl.h>
#include <inet/ip_multi.h>
#include <inet/ip_ftable.h>
#include <inet/ip_listutils.h>
#include <netinet/ip_mroute.h>
#include <inet/ipp_common.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_netinfo.h>
#include <sys/squeue_impl.h>
#include <sys/ethernet.h>
#include <net/if_types.h>
#include <ipp/ipp_impl.h>
#include <inet/ipclassifier.h>
#include <inet/udp_impl.h>
#ifdef DEBUG
extern boolean_t skip_sctp_cksum;
#endif
ip_recv_attr_t *);
ip_recv_attr_t *);
ip_recv_attr_t *);
/*
* Direct read side procedure capable of dealing with chains. GLDv3 based
* drivers call this function directly with mblk chains while STREAMS
* read side procedure ip_rput() calls this for single packet with ip_ring
* set to NULL to process one packet at a time.
*
* The ill will always be valid if this function is called directly from
* the driver.
*
* If ip_input() is called from GLDv3:
*
* - This must be a non-VLAN IP stream.
* - 'mp' is either an untagged or a special priority-tagged packet.
* - Any VLAN tag that was in the MAC header has been stripped.
*
* If the IP header in packet is not 32-bit aligned, every message in the
* chain will be aligned before further operations. This is required on SPARC
* platform.
*/
void
struct mac_header_info_s *mhip)
{
NULL);
}
/*
* ip_accept_tcp() - This function is called by the squeue when it retrieves
* a chain of packets in the poll mode. The packets have gone through the
* data link processing but not IP processing. For performance and latency
* reasons, the squeue wants to process the chain in line instead of feeding
* it back via ip_input path.
*
* We set up the ip_recv_attr_t with IRAF_TARGET_SQP so that ip_fanout_v4
* will pass back any TCP packets matching the target sqp to
* ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by
* ip_input_v4 and ip_fanout_v4 as normal.
* The TCP packets that match the target squeue are returned to the caller
* as a b_next chain after each packet has been prepended with an mblk
* from ip_recv_attr_to_mblk.
*/
mblk_t *
{
}
/*
* Used by ip_input and ip_accept_tcp
* The last three arguments are only used by ip_accept_tcp, and mhip is
* only used by ip_input.
*/
mblk_t *
{
/* These ones do not change as we loop over packets */
/* For ECMP and outbound transmit ring selection */
if (target_sqp != NULL)
/*
* We try to have a mhip pointer when possible, but
* it might be NULL in some cases. In those cases we
* have to assume unicast.
*/
switch (mhip->mhi_dsttype) {
case MAC_ADDRTYPE_MULTICAST :
break;
case MAC_ADDRTYPE_BROADCAST :
break;
}
}
/*
* Initialize the one-element route cache.
*
* We do ire caching from one iteration to
* another. In the event the packet chain contains
* all packets from the same dst, this caching saves
* an ire_route_recursive for each of the succeeding
* packets in a packet chain.
*/
/* Loop over b_next */
/*
* if db_ref > 1 then copymsg and free original. Packet
* may be changed and we do not want the other entity
* who has a reference to this message to trip over the
* changes. This is a blind change because trying to
* catch all places that might change the packet is too
* difficult.
*
* This corresponds to the fast path case, where we have
* a chain of M_DATA mblks. We check the db_ref count
* of only the 1st data block in the mblk chain. There
* doesn't seem to be a reason why a device driver would
* send up data with varying db_ref counts in the mblk
* chain. In any case the Fast path is a private
* interface, and our drivers don't do such a thing.
* Given the above assumption, there is no need to walk
* down the entire mblk chain (which could have a
* potential performance problem)
*
* The "(DB_REF(mp) > 1)" check was moved from ip_rput()
* to here because of exclusive ip stacks and vnics.
* Packets transmitted from exclusive stack over vnic
* can have db_ref > 1 and when it gets looped back to
* another vnic in a different zone, you have ip_input()
* getting dblks with db_ref > 1. So if someone
* complains of TCP performance under this scenario,
* take a serious look here on the impact of copymsg().
*/
/* mhip might point into 1st packet in chain */
continue;
}
}
/*
* IP header ptr not aligned?
* OR IP header not complete in first mblk
*/
&iras);
/* mhip might point into 1st packet in chain */
continue;
}
}
/* Protect against a mix of Ethertypes and IP versions */
/* mhip might point into 1st packet in the chain. */
continue;
}
/*
* Check for Martian addrs; we have to explicitly
* test for zero dst since this is also used as
* an indication that the rtc is not used.
*/
/* mhip might point into 1st packet in the chain. */
continue;
}
/*
* Keep L2SRC from a previous packet in chain since mhip
* might point into an earlier packet in the chain.
* Keep IRAF_VERIFIED_SRC to avoid redoing broadcast
* source check in forwarding path.
*/
iras.ira_free_flags = 0;
/*
* We must count all incoming packets, even if they end
* up being dropped later on. Defer counting bytes until
* we have the whole IP header in first mblk.
*/
/*
* Call one of:
* ill_input_full_v4
* ill_input_short_v4
* The former is used in unusual cases. See ill_set_inputfn().
*/
/* Any references to clean up? No hold on ira_ill */
/* Better be called from ip_accept_tcp */
/* Found one packet to accept */
else
acnt++;
}
/* mhip might point into 1st packet in the chain. */
}
/* Any remaining references to the route cache? */
}
/* Better be called from ip_accept_tcp */
return (ahead);
}
return (NULL);
}
/*
* This input function is used when
* - is_system_labeled()
* - CGTP filtering
* - DHCP unicast before we have an IP address configured
* - there is a listener for IPPROTO_RSVP
*/
void
{
int cgtp_flt_pkt;
/*
* Attach any necessary label information to
* this packet
*/
if (is_system_labeled()) {
/*
* This updates ira_cred, ira_tsl and ira_free_flags based
* on the label.
*/
return;
}
/* Note that ira_tsl can be NULL here. */
/* tsol_get_pkt_label sometimes does pullupmsg */
}
/*
* Invoke the CGTP (multirouting) filtering module to process
* the incoming packet. Packets identified as duplicates
* must be discarded. Filtering is active only if
* the ip_cgtp_filter ndd variable is non-zero.
*/
if (ipst->ips_ip_cgtp_filter &&
/*
* CGTP and IPMP are mutually exclusive so
* phyint_ifindex is fine here.
*/
if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
return;
}
}
/*
* Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
* server to unicast DHCP packets to a DHCP client using the
* IP address it is offering to the client. This can be
* disabled through the "broadcast bit", but not all DHCP
* servers honor that bit. Therefore, to interoperate with as
* many DHCP servers as possible, the DHCP client allows the
* server to unicast, but we treat those packets as broadcast
* here. Note that we don't rewrite the packet itself since
* (a) that would mess up the checksums and (b) the DHCP
* client conn is bound to INADDR_ANY so ip_fanout_udp() will
* hand it the packet regardless.
*/
if (ill->ill_dhcpinit != 0 &&
return;
}
/* Reload since pullupmsg() can change b_rptr. */
/*
* This assumes that we deliver to all conns for
* multicast and broadcast packets.
*/
}
}
/*
* If rsvpd is running, let RSVP daemon handle its processing
* If rsvpd is not running but mrouted is running, RSVP
* multicast packets are forwarded as multicast traffic
* and RSVP unicast packets are forwarded by unicast router.
* If neither rsvpd nor mrouted is running, RSVP multicast
* packets are not forwarded, but the unicast packets are
* forwarded like unicast traffic.
*/
/* RSVP packet and rsvpd running. Treat as ours */
/*
* We use a multicast address to get the packet to
* ire_recv_multicast_v4. There will not be a membership
* check since we set IRAF_RSVP
*/
}
}
/*
* This is the tail-end of the full receive side packet handling.
* It can be used directly when the configuration is simple.
*/
void
{
/*
* The following test for loopback is faster than
* IP_LOOPBACK_ADDR(), because it avoids any bitwise
* operations.
* Note that these addresses are always in network byte order
*/
return;
}
/* multiple mblk or too short */
if (len != 0) {
return;
}
int, 0);
/*
* The event for packets being received from a 'physical'
* interface is placed after validation of the source and/or
* destination address as being local so that packets can be
* redirected to loopback addresses using ipnat.
*/
if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) {
int ll_multicast = 0;
int error;
return;
/* The length could have changed */
/*
* In case the destination changed we override any previous
* change to nexthop.
*/
if (nexthop == INADDR_ANY) {
return;
}
}
/*
* On the inbound path the src zone will be unknown as
* this packet has come from the wire.
*/
}
/*
* If there is a good HW IP header checksum we clear the need
* to look at the IP header checksum.
*/
/* Header checksum was ok. Clear the flag */
}
/*
* Here we check to see if the machine is set up as an
* L3 load balancer and if the incoming packet is for a VIP.
*
* Check the following:
* - there is at least a rule
* - protocol of the packet is supported
*/
int lb_ret;
/* For convenience, we pull up the mblk. */
ip_drop_input("ipIfStatsInDiscards - pullupmsg",
return;
}
}
/*
* We just drop all fragments going to any VIP, at
* least for now....
*/
(IPH_MF | IPH_OFFSET)) {
goto after_ilb;
}
return;
}
if (lb_ret == ILB_DROPPED) {
/* Is this the right counter to increase? */
return;
}
if (lb_ret == ILB_BALANCED) {
/* Set the dst to that of the chosen server */
DB_CKSUMFLAGS(mp) = 0;
}
}
if (opt_len != 0) {
int error = 0;
/* IP Options present! Validate the length. */
return;
/* Might have changed */
/* Verify IP header checksum before parsing the options */
ip_csum_hdr(ipha)) {
return;
}
/*
* Go off to ip_input_options which returns the next hop
* destination address, which may have been affected
* by source routing.
*/
if (error != 0) {
/*
* An ICMP error has been sent and the packet has
* been dropped.
*/
return;
}
}
else
/* Can not use route cache with TX since the labels can differ */
} else {
/* Match destination and label */
NULL);
}
/* Update the route cache so we do the ire_refrele */
/* Use the route cache */
} else {
/* Update the route cache */
} else {
/* Just match the destination */
}
}
ire->ire_ib_pkt_count++;
/*
* Based on ire_type and ire_flags call one of:
* ire_recv_local_v4 - for IRE_LOCAL
* ire_recv_loopback_v4 - for IRE_LOOPBACK
* ire_recv_multirt_v4 - if RTF_MULTIRT
* ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
* ire_recv_multicast_v4 - for IRE_MULTICAST
* ire_recv_broadcast_v4 - for IRE_BROADCAST
* ire_recv_noaccept_v4 - for ire_noaccept ones
* ire_recv_forward_v4 - for the rest.
*/
}
/*
* ire_recvfn for IREs that need forwarding
*/
void
{
return;
}
return;
}
/*
* Either ire_nce_capable or ire_dep_parent would be set for the IRE
* when it is found by ire_route_recursive, but some other thread
* could have changed the routes with the effect of clearing
* ire_dep_parent. In that case we'd end up dropping the packet, or
* finding a new nce below.
* Get, allocate, or update the nce.
* We get a refhold on ire_nce_cache as a result of this to avoid races
* where ire_nce_cache is deleted.
*
* This ensures that we don't forward if the interface is down since
* ipif_down removes all the nces.
*/
/* Not yet set up - try to set one up */
(void) ire_revalidate_nce(ire);
/* The ire_dep_parent chain went bad, or no memory */
return;
}
}
if (nce->nce_is_condemned) {
return;
}
}
/*
* Unless we are forwarding, drop the packet.
* We have to let source routed packets through if they go out
* the same interface i.e., they are 'ping -l' packets.
*/
return;
}
return;
}
ire->ire_ib_pkt_count--;
/*
* Should only use IREs that are visible from the
* global zone for forwarding.
* Take a source route into account the same way as ip_input
* did.
*/
int error = 0;
}
ire->ire_ib_pkt_count++;
return;
}
/*
* ipIfStatsHCInForwDatagrams should only be incremented if there
* will be an attempt to forward the packet, which is why we
* increment after the above condition has been checked.
*/
/* Initiate Read side IPPF processing */
/* ip_process translates an IS_UNDER_IPMP */
/* ip_drop_packet and MIB done */
"during IPPF processing\n"));
return;
}
}
if (HOOKS4_INTERESTED_FORWARDING(ipst)) {
int error;
return;
}
/*
* Even if the destination was changed by the filter we use the
* forwarding decision that was made based on the address
* in ip_input.
*/
/* Might have changed */
}
/* Packet is being forwarded. Turning off hwcksum flag. */
DB_CKSUMFLAGS(mp) = 0;
/*
* Martian Address Filtering [RFC 1812, Section 5.3.7]
* The loopback address check for both src and dst has already
* been checked in ip_input
* In the future one can envision adding RPF checks using number 3.
* If we already checked the same source address we can skip this.
*/
switch (ipst->ips_src_check) {
case 0:
break;
case 2:
return;
}
/* FALLTHRU */
case 1:
return;
}
break;
}
/* Remember for next packet */
}
/*
* Check if packet is going out the same link on which it arrived.
* Means we might need to send a redirect.
*/
}
added_tx_len = 0;
ip_csum_hdr(ipha)) {
return;
}
/*
* CIPSO options as needed.
*/
return;
}
/*
* Size may have changed. Remember amount added in case
* IP needs to send an ICMP too big.
*/
/* Options can have been added or removed */
else
}
}
/*
* Used for sending out unicast and multicast packets that are
* forwarded.
*/
void
{
/* Perhaps the checksum was bad */
return;
}
return;
}
/* Adjust the checksum to reflect the ttl decrement. */
/* Check if there are options to update */
if (iraflags & IRAF_IPV4_OPTIONS) {
/* ipIfStatsForwProhibits and ip_drop_input done */
return;
}
ipha->ipha_hdr_checksum = 0;
}
/* Initiate Write side IPPF processing before any fragmentation */
/* ip_process translates an IS_UNDER_IPMP */
/* ip_drop_packet and MIB done */
" during IPPF processing\n"));
return;
}
}
/*
* It needs fragging on its way out. If we haven't
* verified the header checksum yet we do it now: since
* we are going to put a surely good checksum in the
* outgoing header, we have to make sure that it
* was good coming in.
*/
return;
}
if (iraflags & IRAF_SYSTEM_LABELED) {
/*
* Remove any CIPSO option added by
* tsol_ip_forward, and make sure we report
* a path MTU so that there
* is room to add such a CIPSO option for future
* packets.
*/
AF_INET);
}
return;
}
return;
}
if (iraflags & IRAF_LOOPBACK_COPY) {
/*
* IXAF_NO_LOOP_ZONEID is not set hence 7th arg
* is don't care
*/
} else {
GLOBAL_ZONEID, 0, NULL);
}
}
/*
* ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
* which is what ire_route_recursive returns when there is no matching ire.
* Send ICMP unreachable unless blackhole.
*/
void
{
/* Would we have forwarded this packet if we had a route? */
return;
}
return;
}
/*
* If we had a route this could have been forwarded. Count as such.
*
* ipIfStatsHCInForwDatagrams should only be incremented if there
* will be an attempt to forward the packet, which is why we
* increment after the above condition has been checked.
*/
ipst);
} else {
} else {
}
}
}
/*
* ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
* VRRP when in noaccept mode.
* We silently drop the packet. ARP handles packets even if noaccept is set.
*/
/* ARGSUSED */
void
{
}
/*
* ire_recvfn for IRE_BROADCAST.
*/
void
{
/* Tag for higher-level protocols */
/*
* Whether local or directed broadcast forwarding: don't allow
* for TCP.
*/
return;
}
/*
* So that we don't end up with dups, only one ill in an IPMP group is
* nominated to receive broadcast traffic.
* If we have no cast_ill we are liberal and accept everything.
*/
if (IS_UNDER_IPMP(ill)) {
/* For an under ill_grp can change under lock */
/* No MIB since this is normal operation */
return;
}
}
/*
* After reassembly and IPsec we will need to duplicate the
* broadcast packet for all matching zones on the ill.
*/
/*
* Check for directed broadcast i.e. ire->ire_ill is different than
* the incoming ill.
* The same broadcast address can be assigned to multiple interfaces
* so have to check explicitly for that case by looking up the alt_ire
*/
/* Reassemble on the ill on which the packet arrived */
/* Restore */
return;
}
/* Is there an IRE_BROADCAST on the incoming ill? */
/* Not a directed broadcast */
/*
* In the special case of multirouted broadcast
* packets, we unconditionally need to "gateway"
* them to the appropriate interface here so that reassembly
* works. We know that the IRE_BROADCAST on cgtp0 doesn't
* have RTF_MULTIRT set so we look for such an IRE in the
* bucket.
*/
if (IRE_IS_CONDEMNED(ire1))
continue;
continue;
break;
}
/* Reassemble on the new ill */
/* Restore */
ira->ira_ruifindex =
return;
}
}
/* Reassemble on the ill on which the packet arrived */
goto done;
}
/*
* This is a directed broadcast
*
* If directed broadcast is allowed, then forward the packet out
* the destination interface with IXAF_LOOPBACK_COPY set. That will
* result in ip_input() receiving a copy of the packet on the
* appropriate ill. (We could optimize this to avoid the extra trip
* via ip_input(), but since directed broadcasts are normally disabled
* it doesn't make sense to optimize it.)
*/
if (!ipst->ips_ip_g_forward_directed_bcast ||
goto done;
}
goto done;
}
/*
* Clear the indication that this may have hardware
* checksum as we are not using it for forwarding.
*/
DB_CKSUMFLAGS(mp) = 0;
/*
* Adjust ttl to 2 (1+1 - the forward engine will decrement it by one).
*/
ipha->ipha_hdr_checksum = 0;
/*
* We use ip_forward_xmit to do any fragmentation
* and loopback copy on the outbound interface.
*
* Make it so that IXAF_LOOPBACK_COPY is set on the transmit side.
*/
goto done;
}
done:
/* Restore */
}
/*
* ire_recvfn for IRE_MULTICAST.
*/
void
{
/* RSVP hook */
goto forus;
/* Tag for higher-level protocols */
/*
* So that we don't end up with dups, only one ill in an IPMP group is
* nominated to receive multicast traffic.
* If we have no cast_ill we are liberal and accept everything.
*/
if (IS_UNDER_IPMP(ill)) {
/* For an under ill_grp can change under lock */
return;
}
/*
* We switch to the upper ill so that mrouter and hasmembers
* can operate on upper here and in ip_input_multicast.
*/
} else {
}
}
/*
* Check if we are a multicast router - send ip_mforward a copy of
* the packet.
* Due to mroute_decap tunnels we consider forwarding packets even if
* mrouted has not joined the allmulti group on this interface.
*/
if (ipst->ips_ip_g_mrouter) {
int retval;
/*
* Clear the indication that this may have hardware
* checksum as we are not using it for forwarding.
*/
DB_CKSUMFLAGS(mp) = 0;
/*
* ip_mforward helps us make these distinctions: If received
* on tunnel and not IGMP, then drop.
* If IGMP packet, then don't check membership
* If received on a phyint and IGMP or PIM, then
* don't check membership
*/
/* ip_mforward updates mib variables if needed */
switch (retval) {
case 0:
/*
* pkt is okay and arrived on phyint.
*
* If we are running as a multicast router
* we need to see all IGMP and/or PIM packets.
*/
goto forus;
}
break;
case -1:
/* pkt is mal-formed, toss it */
goto done;
case 1:
/*
* pkt is okay and arrived on a tunnel
*
* If we are running a multicast router
* we need to see all igmp packets.
*/
goto forus;
}
goto done;
}
}
/*
* Check if we have members on this ill. This is not necessary for
* correctness because even if the NIC/GLD had a leaky filter, we
* filter before passing to each conn_t.
*/
/*
* Nobody interested
*
* This might just be caused by the fact that
* multiple IP Multicast addresses map to the same
* link layer multicast - no need to increment counter!
*/
goto done;
}
ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
/*
* After reassembly and IPsec we will need to duplicate the
* multicast packet for all matching zones on the ill.
*/
/* Reassemble on the ill on which the packet arrived */
done:
}
}
/*
* ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
* Drop packets since we don't forward out multirt routes.
*/
/* ARGSUSED */
void
{
}
/*
* ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
* has rewritten the packet to have a loopback destination address (We
* filter out packet with a loopback destination from arriving over the wire).
* We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
*/
void
{
/* Switch to the lo0 ill for further processing */
/*
* Update ira_ill to be the ILL on which the IP address
* is hosted.
* No need to hold the ill since we have a hold on the ire
*/
/* Restore */
return;
}
}
/*
* ire_recvfn for IRE_LOCAL.
*/
void
{
/* Make a note for DAD that this address is in use */
/* Only target the IRE_LOCAL with the right zoneid. */
/*
* If the packet arrived on the wrong ill, we check that
* this is ok.
* If it is, then we ensure that we do the reassembly on
* the ill on which the address is hosted. We keep ira_rill as
* the one on which the packet arrived, so that IP_PKTINFO and
* friends can report this.
*/
/* Drop packet */
return;
}
/*
* Update ira_ill to be the ILL on which the IP address
* is hosted. No need to hold the ill since we have a
* hold on the ire. Note that we do the switch even if
* new_ire == ire (for IPMP, ire would be the one corresponding
* to the IPMP ill).
*/
/* ira_ruifindex tracks the upper for ira_rill */
if (IS_UNDER_IPMP(ill))
/* Restore */
return;
}
}
/*
* Common function for packets arriving for the host. Handles
* checksum verification, reassembly checks, etc.
*/
static void
{
/*
* Verify IP header checksum. If the packet was AH or ESP then
* this flag has already been cleared. Likewise if the packet
* had a hardware checksum.
*/
return;
}
if (iraflags & IRAF_IPV4_OPTIONS) {
/* Error has been sent and mp consumed */
return;
}
/*
* Some old hardware does partial checksum by including the
* whole IP header, so the partial checksum value might have
* become invalid if any option in the packet has been
* updated. Always clear partial checksum flag here.
*/
}
/*
* Is packet part of fragmented IP packet?
* We compare against defined values in network byte order
*/
(IPH_MF_HTONS | IPH_OFFSET_HTONS)) {
/*
* Make sure we have ira_l2src before we lose the original
* mblk
*/
return;
/* Completed reassembly */
}
/*
* For broadcast and multicast we need some extra work before
* we call ip_fanout_v4(), since in the case of shared-IP zones
* we need to pretend that a packet arrived for each zoneid.
*/
if (iraflags & IRAF_MULTIBROADCAST) {
if (iraflags & IRAF_BROADCAST)
else
return;
}
}
/*
* Handle multiple zones which match the same broadcast address
* and ill by delivering a packet to each of them.
* Walk the bucket and look for different ire_zoneid but otherwise
* the same IRE (same ill/addr/mask/type).
* Note that ire_add() tracks IREs that are identical in all
* fields (addr/mask/type/gw/ill/zoneid) within a single IRE by
* increasing ire_identical_cnt. Thus we don't need to be concerned
* about those.
*/
static void
{
/*
* If we don't have more than one shared-IP zone, or if
* there can't be more than one IRE_BROADCAST for this
* IP address, then just set the zoneid and proceed.
*/
return;
}
/* We do the main IRE after the end of the loop */
continue;
/*
* Only IREs for the same IP address should be in the same
* bucket.
* But could have IRE_HOSTs in the case of CGTP.
*/
continue;
if (IRE_IS_CONDEMNED(ire1))
continue;
/* Failed to deliver to one zone */
continue;
}
/*
* IPsec might have modified ira_pktlen and ira_ip_hdr_length
* so we restore them for a potential next iteration
*/
}
/* Do the main ire */
}
/*
* Handle multiple zones which want to receive the same multicast packets
* on this ill by delivering a packet to each of them.
*
* Note that for packets delivered to transports we could instead do this
* as part of the fanout code, but since we need to handle icmp_inbound
* it is simpler to have multicast work the same as broadcast.
*
* The ip_fanout matching for multicast matches based on ilm independent of
* zoneid since the zoneid restriction is applied when joining a multicast
* group.
*/
/* ARGSUSED */
static void
{
/* ire_recv_multicast has switched to the upper ill for IPMP */
/*
* If we don't have more than one shared-IP zone, or if
* there are no members in anything but the global zone,
* then just set the zoneid and proceed.
*/
GLOBAL_ZONEID)) {
/* If sender didn't want this zone to receive it, drop */
if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
return;
}
return;
}
/*
* Here we loop over all zoneids that have members in the group
* and deliver a packet to ip_fanout for each zoneid.
*
* First find any members in the lowest numeric zoneid by looking for
* first zoneid larger than -1 (ALL_ZONES).
* We terminate the loop when we receive -1 (ALL_ZONES).
*/
/*
* Avoid an extra copymsg/freemsg by skipping global zone here
* and doing that at the end.
*/
if (zoneid == GLOBAL_ZONEID)
continue;
/* If sender didn't want this zone to receive it, skip */
if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
continue;
/* Failed to deliver to one zone */
continue;
}
/*
* IPsec might have modified ira_pktlen and ira_ip_hdr_length
* so we restore them for a potential next iteration
*/
}
/* Do the main ire */
/* If sender didn't want this zone to receive it, drop */
if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
} else {
}
}
/*
* Determine the zoneid and IRAF_TX_* flags if trusted extensions
* is in use. Updates ira_zoneid and ira_flags as a result.
*/
static void
{
/*
* If the packet is unlabeled we might allow read-down
* for MAC_EXEMPT. Below we clear this if it is a multi-level
* port (MLP).
* Note that ira_tsl can be NULL here.
*/
return;
switch (protocol) {
case IPPROTO_TCP:
case IPPROTO_SCTP:
case IPPROTO_UDP:
/* Caller ensures this */
/*
* Only these transports support MLP.
* We know their destination port numbers are in
* the same place in the header.
*/
/*
* No need to handle exclusive-stack zones
* since ALL_ZONES only applies to the shared IP instance.
*/
/*
* If no shared MLP is found, tsol_mlp_findzone returns
* ALL_ZONES. In that case, we assume it's SLP, and
* search for the zone based on the packet label.
*
* If there is such a zone, we prefer to find a
* connection in it. Otherwise, we look for a
* MAC-exempt connection in any zone whose label
* dominates the default label on the packet.
*/
else
break;
default:
/* Handle shared address for other protocols */
break;
}
}
/*
* Increment checksum failure statistics
*/
static void
{
switch (protocol) {
case IPPROTO_TCP:
if (hck_flags & HCK_FULLCKSUM)
else if (hck_flags & HCK_PARTIALCKSUM)
else
break;
case IPPROTO_UDP:
if (hck_flags & HCK_FULLCKSUM)
else if (hck_flags & HCK_PARTIALCKSUM)
else
break;
case IPPROTO_ICMP:
break;
default:
ASSERT(0);
break;
}
}
/* Calculate the IPv4 pseudo-header checksum */
{
switch (protocol) {
case IPPROTO_TCP:
/* Protocol and length */
/* IP addresses */
break;
case IPPROTO_UDP: {
/* Protocol and length */
/* IP addresses */
break;
}
default:
cksum = 0;
break;
}
return (cksum);
}
/*
* Software verification of the ULP checksums.
* Returns B_TRUE if ok.
* Increments statistics if failed.
*/
static boolean_t
{
if (cksum == 0)
return (B_TRUE);
return (B_FALSE);
}
/*
* Verify the ULP checksums.
* Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
* algorithm.
* Increments statistics if failed.
*/
static boolean_t
{
switch (protocol) {
case IPPROTO_TCP:
break;
case IPPROTO_UDP: {
if (udpha->uha_checksum == 0) {
/* Packet doesn't have a UDP checksum */
return (B_TRUE);
}
break;
}
case IPPROTO_SCTP: {
#ifdef DEBUG
if (skip_sctp_cksum)
return (B_TRUE);
#endif
return (B_TRUE);
/*
* Defer until later whether a bad checksum is ok
* in order to allow RAW sockets to use Adler checksum
* with SCTP.
*/
return (B_TRUE);
}
default:
/* No ULP checksum to verify. */
return (B_TRUE);
}
/*
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload.
* We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
* Note: IRAF_NO_HW_CKSUM is not currently used.
*/
!dohwcksum) {
}
/*
* We apply this for all ULP protocols. Does the HW know to
* not set the flags for SCTP and other protocols?
*/
if (hck_flags & HCK_FULLCKSUM) {
/*
* Full checksum has been computed by the hardware
* and has been attached. If the driver wants us to
* verify the correctness of the attached value, in
* order to protect against faulty hardware, compare
* it against -0 (0xFFFF) to see if it's valid.
*/
if (hck_flags & HCK_FULLCKSUM_OK)
return (B_TRUE);
if (cksum == 0xFFFF)
return (B_TRUE);
return (B_FALSE);
}
if ((hck_flags & HCK_PARTIALCKSUM) &&
/*
* Partial checksum has been calculated by hardware
* and attached to the packet; in addition, any
* prepended extraneous data is even byte aligned,
* and there are at most two mblks associated with
* the packet. If any such data exists, we adjust
* the checksum; also take care of any postpended data.
*/
/*
* One's complement subtract extraneous checksum
*/
else
if (!(~cksum & 0xFFFF))
return (B_TRUE);
return (B_FALSE);
}
}
/*
* Handle fanout of received packets.
* Unicast packets that are looped back (from ire_send_local_v4) and packets
* from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
*
* IPQoS Notes
* Before sending it to the client, invoke IPPF processing. Policy processing
* takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
*/
void
{
/*
* Overview: once the common checks are done, delivery is driven by two
* switches on `protocol`. The first one handles TCP, SCTP and UDP, and
* each of those cases returns. The second one handles ICMP, IGMP, PIM,
* AH/ESP, ENCAP and IPV6. Packets that break out of the second switch
* reach the raw socket fanout at the end of the function.
*/
int offset;
/*
* Time for IPP once we've done reassembly and IPsec.
* We skip this for loopback packets since we don't do IPQoS
* on loopback.
*/
!(iraflags & IRAF_LOOPBACK) &&
/*
* Use the interface on which the packet arrived - not where
* the IP address is hosted.
*/
/* ip_process translates an IS_UNDER_IPMP */
/* ip_drop_packet and MIB done */
return;
}
}
/* Determine the minimum required size of the upper-layer header */
/* Need to do this for at least the set of ULPs that TX handles. */
switch (protocol) {
case IPPROTO_TCP:
break;
case IPPROTO_SCTP:
break;
case IPPROTO_UDP:
break;
case IPPROTO_ICMP:
break;
default:
break;
}
/* Make sure we have the min ULP header length */
return;
}
ira);
goto discard;
}
/*
* If trusted extensions then determine the zoneid and TX specific
* ira_flags.
*/
if (iraflags & IRAF_SYSTEM_LABELED) {
/* This can update ira->ira_flags and ira->ira_zoneid */
}
/* Verify ULP checksum. Handles TCP, UDP, and SCTP */
if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
/* Bad checksum. Stats are already incremented */
return;
}
/* IRAF_SCTP_CSUM_ERR could have been set */
}
switch (protocol) {
case IPPROTO_TCP:
/* For TCP, discard broadcast and multicast packets. */
if (iraflags & IRAF_MULTIBROADCAST)
goto discard;
/* First mblk contains IP+TCP headers per above check */
/*
* TCP options present? At this point offset is counted in 32-bit
* words (it is scaled to bytes with "<<= 2" below); a value of 5
* means a header without options, and anything smaller is discarded.
*/
if (offset != 5) {
if (offset < 5)
goto discard;
/*
* There must be TCP options.
* Make sure we can grab them.
*/
offset <<= 2;
offset += ip_hdr_length;
"ipIfStatsInTruncatedPkts",
return;
}
goto discard;
}
}
/*
* Pass up a squeue hint to tcp.
* If ira_sqp is already set (this is loopback) we leave it
* alone.
*/
}
/* Look for AF_INET or AF_INET6 that matches */
/* Send the TH_RST */
return;
}
if (connp->conn_incoming_ifindex != 0 &&
/* Send the TH_RST */
return;
}
(iraflags & IRAF_IPSEC_SECURE)) {
/* Note that mp is NULL */
return;
}
}
/* Found a client; up it goes */
if (!IPCL_IS_TCP(connp)) {
/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
return;
}
/*
* Processing differs between three cases: we are called from
* ip_accept_tcp and match the target, we are called from
* ip_accept_tcp and don't match the target, and we are called
* by ip_input.
*/
if (iraflags & IRAF_TARGET_SQP) {
ip_drop_input("ipIfStatsInDiscards",
} else {
connp);
/*
* Conn ref release when drained from
* the squeue.
*/
}
} else {
}
} else {
}
return;
case IPPROTO_SCTP: {
/* For SCTP, discard broadcast and multicast packets. */
if (iraflags & IRAF_MULTIBROADCAST)
goto discard;
/*
* Since there is no SCTP h/w cksum support yet, just
* clear the flag.
*/
DB_CKSUMFLAGS(mp) = 0;
/* Length ensured above */
/* get the ports */
if (iraflags & IRAF_SCTP_CSUM_ERR) {
/*
* Packets with a potential SCTP checksum error never go
* to the Sun SCTP stack; however, they might be Adler-32
* summed packets that a userland stack bound to a raw IP
* socket could reasonably use. Note though that Adler-32
* is a long deprecated algorithm and customer SCTP
* networks should eventually migrate to CRC-32, at
* which time this facility should be removed.
*/
return;
}
/* Check for raw socket or OOTB handling */
return;
}
if (connp->conn_incoming_ifindex != 0 &&
/* Check for raw socket or OOTB handling */
return;
}
/* Found a client; up it goes */
/* sctp_input does a rele of the sctp_t */
return;
}
case IPPROTO_UDP:
/* First mblk contains IP+UDP headers as checked above */
if (iraflags & IRAF_MULTIBROADCAST) {
return;
}
/* Look for AF_INET or AF_INET6 that matches */
connf_head != NULL) {
} else {
}
return;
}
if (connp->conn_incoming_ifindex != 0 &&
goto no_udp_match;
}
return;
}
(iraflags & IRAF_IPSEC_SECURE)) {
/* Note that mp is NULL */
return;
}
}
/*
* Remove 0-spi if it's 0, or move everything behind
* the UDP header over it and forward to ESP via
* ip_fanout_v4().
*/
if (iraflags & IRAF_IPSEC_SECURE) {
&ipss->ipsec_dropper);
return;
}
/*
* Packet was consumed - probably sent to
* ip_fanout_v4.
*/
return;
}
/* Else continue like a normal UDP packet. */
}
/* Found a client; up it goes */
return;
default:
break;
}
/*
* Clear hardware checksumming flag as it is currently only
* used by TCP and UDP.
*/
DB_CKSUMFLAGS(mp) = 0;
switch (protocol) {
case IPPROTO_ICMP:
/*
* We need to accommodate ICMP messages coming in clear
* until we get everything secure from the wire. If
* icmp_accept_clear_messages is zero we check with
* the global policy and act accordingly. If it is
* non-zero, we accept the message without any checks.
* But *this does not mean* that this will be delivered
* to RAW socket clients. By accepting we might send
* replies back, change our MTU value etc.,
* but delivery to the ULP/clients depends on their
* policy dispositions.
*/
if (ipst->ips_icmp_accept_clear_messages == 0) {
return;
}
/*
* On a labeled system, we have to check whether the zone
* itself is permitted to receive raw traffic.
*/
return;
}
}
/*
* ICMP header checksum, including checksum field,
* should be zero.
*/
return;
}
/* No need to pass to RAW sockets */
return;
}
break;
case IPPROTO_IGMP:
/*
* If we are not willing to accept IGMP packets in clear,
* then check with global policy.
*/
if (ipst->ips_igmp_accept_clear_messages == 0) {
return;
}
return;
}
/*
* Validate checksum
*/
return;
}
/* Bad packet - discarded by igmp_input */
return;
}
break;
case IPPROTO_PIM:
/*
* If we are not willing to accept PIM packets in clear,
* then check with global policy.
*/
if (ipst->ips_pim_accept_clear_messages == 0) {
return;
}
return;
}
/* Checksum is verified in pim_input */
/* Bad packet - discarded by pim_input */
return;
}
break;
case IPPROTO_AH:
case IPPROTO_ESP: {
/*
* AH and ESP share this case; `protocol` selects the
* ESP or AH branch further down.
*/
if (!ipsec_loaded(ipss)) {
return;
}
/* select inbound SA and have IPsec process the pkt */
if (protocol == IPPROTO_ESP) {
return;
IPSA_F_NATT) != 0);
/*
* The following is a fancy, but quick, way of saying:
* ESP-in-UDP SA and Raw ESP packet --> drop
* OR
* ESP SA and ESP-in-UDP packet --> drop
*/
if (esp_in_udp_sa != esp_in_udp_packet) {
&ipss->ipsec_dropper);
return;
}
ira);
} else {
return;
ira);
}
/*
* Either it failed or is pending. In the former case
* ipIfStatsInDiscards was increased.
*/
return;
}
/* we're done with IPsec processing, send it up */
return;
}
case IPPROTO_ENCAP: {
/*
* Handle self-encapsulated packets (IP-in-IP where
* the inner addresses == the outer addresses).
*/
if (ira->ira_pktlen <
ip_hdr_length + sizeof (ipha_t)) {
ip_drop_input("ipIfStatsInTruncatedPkts",
return;
}
return;
}
}
/*
* Check the sanity of the inner IP header.
*/
return;
}
return;
}
/* We fall through to the iptun fanout below */
goto iptun;
}
/*
* Self-encapsulated tunnel packet. Remove
* the outer IP header and fanout again.
* We also need to make sure that the inner
* header is pulled up until options.
*/
ipha = inner_ipha;
if (ira->ira_pktlen <
ip_drop_input("ipIfStatsInTruncatedPkts",
return;
}
return;
}
}
if (ip_hdr_length > sizeof (ipha_t)) {
/* We got options on the inner packet. */
int error = 0;
if (error != 0) {
/*
* An ICMP error has been sent and the packet
* has been dropped.
*/
return;
}
/*
* Someone put a source-route in
* the inside header of a self-
* encapsulated packet. Drop it
* with extreme prejudice and let
* the sender know.
*/
ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
ira);
return;
}
}
/*
* This means that somebody is sending
* self-encapsulated packets (presumably without
* AH/ESP protection -- TODO confirm).
*
* Send this packet to find a tunnel endpoint.
* If I can't find one, an ICMP
* PROTOCOL_UNREACHABLE will get sent.
*/
goto iptun;
}
/* Update based on removed IP header */
/*
* This packet is self-encapsulated multiple
* times. We don't want to recurse infinitely.
* To keep it simple, drop the packet.
*/
return;
}
return;
}
iptun: /* IPPROTO_ENCAP packets that are not self-encapsulated */
case IPPROTO_IPV6:
/* iptun will verify trusted label */
return;
}
/* FALLTHRU */
default:
/*
* On a labeled system, we have to check whether the zone
* itself is permitted to receive raw traffic.
*/
return;
}
}
break;
}
/*
* The above input functions may have returned the pulled up message.
* So ipha needs to be reinitialized.
*/
/*
* No user-level listener for these packets.
* Check for IPPROTO_ENCAP...
*/
/*
* Check policy here,
* THEN ship off to ip_mroute_decap().
*
* BTW, if I match a configured IP-in-IP
* tunnel above, this path will not be reached, and
* ip_mroute_decap will never be called.
*/
} /* Else we already freed everything! */
} else {
}
return;
}
/*
* Handle fanout to raw sockets. There
* can be more than one stream bound to a particular
* protocol. When this is the case, each one gets a copy
* of any incoming packets.
*/
return;
}