ipclassifier.c revision a5628610b3cb18335f49944f353e3be7b9e669f4
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* IP PACKET CLASSIFIER
*
* The IP packet classifier provides mapping between IP packets and persistent
* connection state for connection-oriented protocols. It also provides
* interface for managing connection states.
*
* The connection state is kept in conn_t data structure and contains, among
* other things:
*
* o Transport protocol
* o squeue for the connection (for TCP only)
* o reference counter
* o Connection state
* o hash table linkage
* o credentials
* o ipsec policy
* o send and receive functions.
* o mutex lock.
*
* Connections use a reference counting scheme. They are freed when the
* reference counter drops to zero. A reference is incremented when connection
* is placed in a list or table, when incoming packet for the connection arrives
* and when connection is processed via squeue (squeue processing may be
* asynchronous and the reference protects the connection from being destroyed
* before its processing is finished).
*
* conn_recv is used to pass up packets to the ULP.
* For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
* a listener, and changes to tcp_input_listener as the listener has picked a
* good squeue. For other cases it is set to tcp_input_data.
*
* conn_recvicmp is used to pass up ICMP errors to the ULP.
*
* Classifier uses several hash tables:
*
* ipcl_conn_fanout: contains all TCP connections in CONNECTED state
* ipcl_bind_fanout: contains all connections in BOUND state
* ipcl_proto_fanout: IPv4 protocol fanout
* ipcl_proto_fanout_v6: IPv6 protocol fanout
* ipcl_udp_fanout: contains all UDP connections
* ipcl_iptun_fanout: contains all IP tunnel connections
* ipcl_globalhash_fanout: contains all connections
*
* The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
* which need to view all existing connections.
*
* All tables are protected by per-bucket locks. When both per-bucket lock and
* connection lock need to be held, the per-bucket lock should be acquired
* first, followed by the connection lock.
*
* All functions doing search in one of these tables increment a reference
* counter on the connection found (if any). This reference should be dropped
* when the caller has finished processing the connection.
*
*
* INTERFACES:
* ===========
*
* Connection Lookup:
* ------------------
*
* conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
* conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
*
* Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
* it can't find any associated connection. If the connection is found, its
* reference counter is incremented.
*
* mp: mblock, containing packet header. The full header should fit
* into a single mblock. It should also contain at least full IP
* and TCP or UDP header.
*
* protocol: Either IPPROTO_TCP or IPPROTO_UDP.
*
* hdr_len: The size of IP header. It is used to find TCP or UDP header in
* the packet.
*
* ira->ira_zoneid: The zone in which the returned connection must be; the
* zoneid corresponding to the ire_zoneid on the IRE located for
* the packet's destination address.
*
* ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
* IRAF_TX_SHARED_ADDR flags
*
* For TCP connections, the lookup order is as follows:
* 5-tuple {src, dst, protocol, local port, remote port}
* lookup in ipcl_conn_fanout table.
* 3-tuple {dst, remote port, protocol} lookup in
* ipcl_bind_fanout table.
*
* For UDP connections, a 5-tuple {src, dst, protocol, local port,
* remote port} lookup is done on ipcl_udp_fanout. Note that,
* these interfaces do not handle cases where a packet belongs
* to multiple UDP clients, which is handled in IP itself.
*
* If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
* determine which actual zone gets the segment. This is used only in a
* labeled environment. The matching rules are:
*
* - If it's not a multilevel port, then the label on the packet selects
* the zone. Unlabeled packets are delivered to the global zone.
*
* - If it's a multilevel port, then only the zone registered to receive
* packets on that port matches.
*
* Also, in a labeled environment, packet labels need to be checked. For fully
* bound TCP connections, we can assume that the packet label was checked
* during connection establishment, and doesn't need to be checked on each
* packet. For others, though, we need to check for strict equality or, for
* multilevel ports, membership in the range or set. This part currently does
* a tnrh lookup on each packet, but could be optimized to use cached results
* if that were necessary. (SCTP doesn't come through here, but if it did,
* we would apply the same rules as TCP.)
*
* An implication of the above is that fully-bound TCP sockets must always use
* distinct 4-tuples; they can't be discriminated by label alone.
*
* Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
* as there's no connection set-up handshake and no shared state.
*
* Labels on looped-back packets within a single zone do not need to be
* checked, as all processes in the same zone have the same label.
*
* Finally, for unlabeled packets received by a labeled system, special rules
* apply. We consider only the MLP if there is one. Otherwise, we prefer a
* socket in the zone whose label matches the default label of the sender, if
* any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
* receiver's label must dominate the sender's default label.
*
* conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
* conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
* ip_stack);
*
* Lookup routine to find an exact match for {src, dst, local port,
* remote port} for TCP connections in ipcl_conn_fanout. The address and
* ports are read from the IP and TCP header respectively.
*
* conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
* zoneid, ip_stack);
* conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
* zoneid, ip_stack);
*
* Lookup routine to find a listener with the tuple {lport, laddr,
* protocol} in the ipcl_bind_fanout table. For IPv6, an additional
* parameter interface index is also compared.
*
* void ipcl_walk(func, arg, ip_stack)
*
* Apply 'func' to every connection available. The 'func' is called as
* (*func)(connp, arg). The walk is non-atomic so connections may be
* created and destroyed during the walk. The CONN_CONDEMNED and
* CONN_INCIPIENT flags ensure that connections which are newly created
* or being destroyed are not selected by the walker.
*
* Table Updates
* -------------
*
* int ipcl_conn_insert(connp);
* int ipcl_conn_insert_v4(connp);
* int ipcl_conn_insert_v6(connp);
*
* Insert 'connp' in the ipcl_conn_fanout.
* Arguments :
* connp conn_t to be inserted
*
* Return value :
* 0 if connp was inserted
* EADDRINUSE if the connection with the same tuple
* already exists.
*
* int ipcl_bind_insert(connp);
* int ipcl_bind_insert_v4(connp);
* int ipcl_bind_insert_v6(connp);
*
* Insert 'connp' in ipcl_bind_fanout.
* Arguments :
* connp conn_t to be inserted
*
*
* void ipcl_hash_remove(connp);
*
* Removes the 'connp' from the connection fanout table.
*
* Connection Creation/Destruction
* -------------------------------
*
* conn_t *ipcl_conn_create(type, sleep, netstack_t *)
*
* Creates a new conn based on the type flag, inserts it into
* globalhash table.
*
* type: This flag determines the type of conn_t which needs to be
* created i.e., which kmem_cache it comes from.
* IPCL_TCPCONN indicates a TCP connection
* IPCL_SCTPCONN indicates a SCTP connection
* IPCL_UDPCONN indicates a UDP conn_t.
* IPCL_RTSCONN indicates a RTS conn_t.
* IPCL_IPCCONN indicates all other connections.
*
* void ipcl_conn_destroy(connp)
*
* Destroys the connection state, removes it from the global
* connection hash table and frees its memory.
*/
#include <sys/sysmacros.h>
#define _SUN_TPI_VERSION 2
#include <sys/isa_defs.h>
#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>
#include <inet/ipclassifier.h>
#include <inet/ipsec_impl.h>
uint_t tcp_conn_hash_size = 0;
/* Raw socket fanout size. Must be a power of 2. */
/*
* The IPCL_IPTUN_HASH() function works best with a prime table size. We
* expect that most large deployments would have hundreds of tunnels, and
* thousands in the extreme case.
*/
/*
* Power of 2^N Primes useful for hashing for N of 0-28,
* these primes are the nearest prime <= 2^N - 2^(N-2).
*/
6143, 12281, 24571, 49139, 98299, 196597, 393209, \
786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
50331599, 100663291, 201326557, 0}
/*
* wrapper structure to ensure that conn and what follows it (tcp_t, etc)
* are aligned on cache lines.
*/
typedef union itc_s {
} itc_t;
struct kmem_cache *tcp_conn_cache;
struct kmem_cache *ip_conn_cache;
extern struct kmem_cache *sctp_conn_cache;
struct kmem_cache *udp_conn_cache;
struct kmem_cache *rawip_conn_cache;
struct kmem_cache *rts_conn_cache;
extern void tcp_timermp_free(tcp_t *);
extern mblk_t *tcp_timermp_alloc(int);
static int ip_conn_constructor(void *, void *, int);
static void ip_conn_destructor(void *, void *);
static int tcp_conn_constructor(void *, void *, int);
static void tcp_conn_destructor(void *, void *);
static int udp_conn_constructor(void *, void *, int);
static void udp_conn_destructor(void *, void *);
static int rawip_conn_constructor(void *, void *, int);
static void rawip_conn_destructor(void *, void *);
static int rts_conn_constructor(void *, void *, int);
static void rts_conn_destructor(void *, void *);
/*
* Global (for all stack instances) init routine
*/
void
ipcl_g_init(void)
{
sizeof (conn_t), CACHE_ALIGN_SIZE,
}
/*
* ipclassifier initialization routine, sets up hash tables.
*/
void
{
int i;
/*
*/
if (ipcl_conn_hash_size != 0) {
} else if (tcp_conn_hash_size != 0) {
} else {
}
}
break;
}
}
/* Out of range, use the 2^16 value */
}
for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
}
for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
}
for (i = 0; i < IPPROTO_MAX; i++) {
}
for (i = 0; i < IPPROTO_MAX; i++) {
}
for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
}
for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
}
for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
}
for (i = 0; i < CONN_G_HASH_SIZE; i++) {
}
}
void
ipcl_g_destroy(void)
{
}
/*
* All user-level and kernel use of the stack must be gone
* by now.
*/
void
{
int i;
for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
}
sizeof (connf_t));
for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
}
sizeof (connf_t));
for (i = 0; i < IPPROTO_MAX; i++) {
}
IPPROTO_MAX * sizeof (connf_t));
for (i = 0; i < IPPROTO_MAX; i++) {
}
IPPROTO_MAX * sizeof (connf_t));
for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
}
sizeof (connf_t));
for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
}
for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
}
sizeof (connf_t));
for (i = 0; i < CONN_G_HASH_SIZE; i++) {
}
sizeof (connf_t) * CONN_G_HASH_SIZE);
}
/*
* conn creation routine. initialize the conn, sets the reference
* and inserts it in the global hash table.
*/
conn_t *
{
struct kmem_cache *conn_cache;
switch (type) {
case IPCL_SCTPCONN:
return (NULL);
return (connp);
case IPCL_TCPCONN:
break;
case IPCL_UDPCONN:
break;
case IPCL_RAWIPCONN:
break;
case IPCL_RTSCONN:
break;
case IPCL_IPCCONN:
break;
default:
ASSERT(0);
}
return (NULL);
return (connp);
}
void
{
/* ixa_cred done in ipcl_conn_cleanup below */
}
connp->conn_ht_iphc_allocated = 0;
connp->conn_ht_iphc_len = 0;
connp->conn_ht_ulp_len = 0;
}
}
}
}
}
}
/*
* tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
* the mblk.
*/
}
}
return;
}
return;
}
}
/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
} else {
}
}
/*
* Running in cluster mode - deregister listener information
*/
static void
{
if (cl_inet_unlisten != NULL) {
} else {
}
}
}
/*
* We set the IPCL_REMOVED flag (instead of clearing the flag indicating
* which table the conn belonged to). So for debugging we can see which hash
* table this connection was in.
*/
#define IPCL_HASH_REMOVE(connp) { \
else \
ipcl_conn_unlisten((connp)); \
CONN_DEC_REF((connp)); \
} \
}
void
{
if (protocol == IPPROTO_RSVP)
}
/*
* The whole purpose of this function is to allow removal of
* a conn_t from the connected hash for timewait reclaim.
* This is essentially a TW reclaim fastpath where timewait
* collector checks under fanout lock (so no one else can
* get access to the conn_t) that refcnt is 2 i.e. one for
* TCP and one for the classifier hash list. If ref count
* is indeed 2, we can just remove the conn under lock and
* avoid cleaning up the conn under squeue. This gives us
* improved performance.
*/
void
{
}
} else {
}
}
} \
CONN_INC_REF(connp); \
}
IPCL_HASH_REMOVE((connp)); \
}
IPCL_HASH_REMOVE((connp)); \
} \
} else { \
} \
} \
IPCL_BOUND; \
CONN_INC_REF(connp); \
}
boolean_t isv4mapped = \
IPCL_HASH_REMOVE((connp)); \
if (isv4mapped && \
break; \
} \
} \
IPCL_BOUND; \
CONN_INC_REF((connp)); \
}
void
{
}
/*
* Because the classifier is used to classify inbound packets, the destination
* address is meant to be our local tunnel address (tunnel source), and the
* source the remote tunnel address (tunnel destination).
*
* Note that conn_proto can't be used for fanout since the upper protocol
* can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
*/
conn_t *
{
/* first look for IPv4 tunnel links */
break;
}
goto done;
/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
INADDR_ANY)];
break;
}
done:
return (connp);
}
conn_t *
{
/* Look for an IPv6 tunnel link */
break;
}
}
return (connp);
}
/*
* This function is used only for inserting SCTP raw socket now.
* This may change later.
*
* Note that only one raw socket can be bound to a port. The param
* lport is in network byte order.
*/
static int
{
/* Check for existing raw socket already bound to the port. */
&connp->conn_laddr_v6))) {
break;
}
}
return (EADDRNOTAVAIL);
} else {
}
} else {
}
return (0);
}
static int
{
/* A tunnel is already bound to these addresses. */
return (EADDRINUSE);
}
}
return (0);
}
static int
{
/* A tunnel is already bound to these addresses. */
return (EADDRINUSE);
}
}
return (0);
}
/*
* Check for a MAC exemption conflict on a labeled system. Note that for
* protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
* transport layer. This check is for binding all other protocols.
*
* Returns true if there's a conflict.
*/
static boolean_t
{
/* We don't allow v4 fallback for v6 raw socket */
continue;
/* If neither is exempt, then there's no conflict */
continue;
/* We are only concerned about sockets for a different zone */
continue;
/* If both are bound to different specific addrs, ok */
continue;
/* These two conflict; fail */
break;
}
}
static boolean_t
{
/* We don't allow v4 fallback for v6 raw socket */
continue;
/* If neither is exempt, then there's no conflict */
continue;
/* We are only concerned about sockets for a different zone */
continue;
/* If both are bound to different addrs, ok */
&tconn->conn_laddr_v6))
continue;
/* These two conflict; fail */
break;
}
}
/*
* (v4, v6) bind hash insertion routines
* The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
*/
int
{
/*
 * Thin dispatcher: connp is handed either to the IPv6 bind-hash
 * insertion routine or to the IPv4 one, and that routine's return
 * value is passed back to the caller unchanged.
 *
 * NOTE(review): the condition that selects the IPv6 path is not
 * visible in this view of the file - confirm against the full source.
 */
return (ipcl_bind_insert_v6(connp));
else
return (ipcl_bind_insert_v4(connp));
}
int
{
int ret = 0;
if (IPCL_IS_IPTUN(connp))
switch (protocol) {
default:
if (is_system_labeled() &&
return (EADDRINUSE);
/* FALLTHROUGH */
case IPPROTO_UDP:
if (protocol == IPPROTO_UDP) {
} else {
}
} else {
}
if (protocol == IPPROTO_RSVP)
break;
case IPPROTO_TCP:
/* Insert it in the Bind Hash */
} else {
}
if (cl_inet_listen != NULL) {
(*cl_inet_listen)(
}
break;
case IPPROTO_SCTP:
break;
}
return (ret);
}
int
{
int ret = 0;
if (IPCL_IS_IPTUN(connp)) {
}
switch (protocol) {
default:
if (is_system_labeled() &&
return (EADDRINUSE);
/* FALLTHROUGH */
case IPPROTO_UDP:
if (protocol == IPPROTO_UDP) {
} else {
}
} else {
}
break;
case IPPROTO_TCP:
/* Insert it in the Bind Hash */
} else {
}
if (cl_inet_listen != NULL) {
laddrp =
} else {
}
(*cl_inet_listen)(
}
break;
case IPPROTO_SCTP:
break;
}
return (ret);
}
/*
* ipcl_conn_hash insertion routines.
*/
int
{
/*
 * Thin dispatcher: connp is handed either to the IPv6 connection-hash
 * insertion routine or to the IPv4 one, and that routine's return
 * value is passed back to the caller unchanged.
 *
 * NOTE(review): the condition that selects the IPv6 path is not
 * visible in this view of the file - confirm against the full source.
 */
return (ipcl_conn_insert_v6(connp));
else
return (ipcl_conn_insert_v4(connp));
}
int
{
int ret = 0;
if (IPCL_IS_IPTUN(connp))
switch (protocol) {
case IPPROTO_TCP:
/*
* For TCP, we check whether the connection tuple already
* exists before allowing the connection to proceed. We
* also allow indexing on the zoneid. This is to allow
* multiple shared stack zones to have the same tcp
* connection tuple. In practice this only happens for
* INADDR_LOOPBACK as it's the only local address which
* doesn't have to be unique.
*/
connp->conn_ports) &&
/* Already have a conn. bail out */
return (EADDRINUSE);
}
}
/*
* rebind. Let it happen.
*/
}
break;
case IPPROTO_SCTP:
/*
* The raw socket may have already been bound, remove it
* from the hash first.
*/
break;
default:
/*
* Check for conflicts among MAC exempt bindings. For
* transports with port numbers, this is done by the upper
* level per-transport binding logic. For all others, it's
* done here.
*/
if (is_system_labeled() &&
return (EADDRINUSE);
/* FALLTHROUGH */
case IPPROTO_UDP:
if (protocol == IPPROTO_UDP) {
} else {
}
} else {
}
break;
}
return (ret);
}
int
{
int ret = 0;
if (IPCL_IS_IPTUN(connp))
switch (protocol) {
case IPPROTO_TCP:
/*
* For tcp, we check whether the connection tuple already
* exists before allowing the connection to proceed. We
* also allow indexing on the zoneid. This is to allow
* multiple shared stack zones to have the same tcp
* connection tuple. In practice this only happens for
* ipv6_loopback as it's the only local address which
* doesn't have to be unique.
*/
ipst)];
/* NOTE: need to match zoneid. Bug in onnv-gate */
connp->conn_ports) &&
(tconnp->conn_bound_if == 0 ||
/* Already have a conn. bail out */
return (EADDRINUSE);
}
}
/*
* rebind. Let it happen.
*/
}
break;
case IPPROTO_SCTP:
break;
default:
if (is_system_labeled() &&
return (EADDRINUSE);
/* FALLTHROUGH */
case IPPROTO_UDP:
if (protocol == IPPROTO_UDP) {
} else {
}
} else {
}
break;
}
return (ret);
}
/*
* v4 packet classifying function. looks up the fanout table to
* find the conn, the packet belongs to. returns the conn with
* the reference held, null otherwise.
*
* If zoneid is ALL_ZONES, then the search rules described in the "Connection
* Lookup" comment block are applied. Labels are also checked as described
* above. If the packet is from the inside (looped back), and is from the same
* zone, then label checks are omitted.
*/
conn_t *
{
switch (protocol) {
case IPPROTO_TCP:
connfp =
connp->conn_allzones ||
break;
}
/*
* We have a fully-bound TCP connection.
*
* For labeled systems, there's no need to check the
* label here. It's known to be good as we checked
* before allowing the connection to become bound.
*/
return (connp);
}
lport) &&
connp->conn_allzones ||
break;
}
/*
* If the matching connection is SLP on a private address, then
* the label on the packet must match the local zone's label.
* Otherwise, it must be in the label range defined by tnrh.
* This is ensured by tsol_receive_local.
*
* Note that we don't check tsol_receive_local for
* the connected case.
*/
char *, "connp(1) could not receive mp(2)",
}
/* Have a listener at least */
return (connp);
}
break;
case IPPROTO_UDP:
connp->conn_allzones ||
break;
}
char *, "connp(1) could not receive mp(2)",
}
return (connp);
}
/*
*/
break;
case IPPROTO_ENCAP:
case IPPROTO_IPV6:
}
return (NULL);
}
conn_t *
{
switch (protocol) {
case IPPROTO_TCP:
connfp =
connp->conn_allzones ||
break;
}
/*
* We have a fully-bound TCP connection.
*
* For labeled systems, there's no need to check the
* label here. It's known to be good as we checked
* before allowing the connection to become bound.
*/
return (connp);
}
connp->conn_allzones ||
break;
}
char *, "connp(1) could not receive mp(2)",
}
/* Have a listener at least */
return (connp);
}
break;
case IPPROTO_UDP:
connp->conn_allzones ||
break;
}
char *, "connp(1) could not receive mp(2)",
}
return (connp);
}
/*
*/
break;
case IPPROTO_ENCAP:
case IPPROTO_IPV6:
}
return (NULL);
}
/*
* wrapper around ipcl_classify_(v4,v6) routines.
*/
conn_t *
{
} else {
}
}
/*
* Only used to classify SCTP RAW sockets
*/
conn_t *
{
int ipversion;
const void *dst;
} else {
}
/* We don't allow v4 fallback for v6 raw socket. */
continue;
if (ipversion == IPV4_VERSION) {
continue;
} else {
continue;
}
} else {
if (ipversion == IPV4_VERSION) {
continue;
} else {
continue;
}
}
connp->conn_allzones ||
break;
}
char *, "connp(1) could not receive mp(2)",
}
goto found;
/* Try to look for a wildcard SCTP RAW socket match. */
/* We don't allow v4 fallback for v6 raw socket. */
continue;
continue;
if (ipversion == IPV4_VERSION) {
break;
} else {
break;
}
}
}
goto found;
return (NULL);
return (connp);
}
/* ARGSUSED */
static int
{
return (ENOMEM);
return (ENOMEM);
}
return (0);
}
/* ARGSUSED */
static void
{
/* Can be NULL if constructor failed */
}
}
/* ARGSUSED */
static int
{
return (ENOMEM);
return (0);
}
/* ARGSUSED */
static void
{
/* Can be NULL if constructor failed */
}
}
/* ARGSUSED */
static int
{
return (ENOMEM);
return (0);
}
/* ARGSUSED */
static void
{
/* Can be NULL if constructor failed */
}
}
/* ARGSUSED */
static int
{
return (ENOMEM);
return (0);
}
/* ARGSUSED */
static void
{
/* Can be NULL if constructor failed */
}
}
/* ARGSUSED */
static int
{
return (ENOMEM);
return (0);
}
/* ARGSUSED */
static void
{
/* Can be NULL if constructor failed */
}
}
/*
* Called as part of ipcl_conn_destroy to assert and clear any pointers
* in the conn_t.
*
* Below we list all the pointers in the conn_t as a documentation aid.
* The ones that we can not ASSERT to be NULL are #ifdef'ed out.
* If you add any pointers to the conn_t please add an ASSERT here
* and #ifdef it out if it can't be actually asserted to be NULL.
* In any case, we bzero most of the conn_t at the end of the function.
*/
void
{
#ifdef notdef
#endif
#ifdef notdef
/* conn_idl is not cleared when removed from idl list */
#endif
#ifdef notdef
/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
#endif
/* Need to preserve ixa_protocol */
/* Clear out the conn_t fields that are not preserved */
sizeof (conn_t) -
}
/*
* All conns are inserted in a global multi-list for the benefit of
* walkers. The walk is guaranteed to walk all open conns at the time
* of the start of the walk exactly once. This property is needed to
* achieve some cleanups during unplumb of interfaces. This is achieved
* as follows.
*
* ipcl_conn_create and ipcl_conn_destroy are the only functions that
* call the insert and delete functions below at creation and deletion
* time respectively. The conn never moves or changes its position in this
* multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
* won't increase due to walkers, once the conn deletion has started. Note
* that we can't remove the conn from the global list and then wait for
* the refcnt to drop to zero, since walkers would then see a truncated
* list. CONN_INCIPIENT ensures that walkers don't start looking at
* conns until ip_open is ready to make them globally visible.
* The global round robin multi-list locks are held only to get the
* next member/insertion/deletion and contention should be negligible
* if the multi-list is much greater than the number of cpus.
*/
void
{
int index;
/*
* No need for atomic here. Approximate even distribution
* in the global lists is sufficient.
*/
ipst->ips_conn_g_index++;
/*
* Mark as INCIPIENT, so that walkers will ignore this
* for now, till ip_open is ready to make it visible globally.
*/
/* Insert at the head of the list */
/* The fanout bucket this conn points to */
}
void
{
/*
* We were never inserted in the global multi list.
* IPCL_NONE variety is never inserted in the global multilist
* since it is presumed to not need any cleanup and is transient.
*/
return;
else
/* Better to stumble on a null pointer than to corrupt memory */
}
/*
* Walk the list of all conn_t's in the system, calling the function provided
* with the specified argument for each.
* Applies to both IPv4 and IPv6.
*
* CONNs may hold pointers to ills (conn_dhcpinit_ill and
* conn_oper_pending_ill). To guard against stale pointers
* ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
* unplumbed or removed. New conn_t's that are created while we are walking
* may be missed by this walk, because they are not necessarily inserted
* at the tail of the list. They are new conn_t's and thus don't have any
* stale pointers. The CONN_CLOSING flag ensures that no new reference
* is created to the struct that is going away.
*/
void
{
int i;
/* Visit every bucket of the global conn hash in turn. */
for (i = 0; i < CONN_G_HASH_SIZE; i++) {
/* Nothing has been visited in this bucket yet. */
prev_connp = NULL;
/*
 * Skip conns that are being torn down (CONN_CONDEMNED) or
 * that have not yet been made globally visible
 * (CONN_INCIPIENT); walkers must not select them.
 */
if (connp->conn_state_flags &
(CONN_CONDEMNED | CONN_INCIPIENT)) {
continue;
}
/*
 * prev_connp tracks the conn selected on the previous
 * iteration. NOTE(review): the action taken on it when it is
 * non-NULL is not visible in this view - presumably the
 * walker's reference on it is released; confirm.
 */
if (prev_connp != NULL)
prev_connp = connp;
}
/*
 * Handle the last conn selected in this bucket, if any.
 * NOTE(review): action not visible in this view - confirm.
 */
if (prev_connp != NULL)
}
}
/*
* Search for a peer TCP conn by doing a reverse lookup on
* the {src, dst, lport, fport} quadruplet. Returns with conn reference
* held; caller must call CONN_DEC_REF. Only checks for connected entries
* (peer tcp in ESTABLISHED state).
*/
conn_t *
{
/*
* If either the source or destination address is loopback, then
* both endpoints must be in the same Zone. Otherwise, both of
* the addresses are system-wide unique (tcp is in ESTABLISHED
* state) and the endpoints may reside in different Zones.
*/
return (tconnp);
}
}
return (NULL);
}
/*
* Search for a peer TCP conn by doing a reverse lookup on
* the {src, dst, lport, fport} quadruplet. Returns with conn reference
* held; caller must call CONN_DEC_REF. Only checks for connected entries
* (peer tcp in ESTABLISHED state).
*/
conn_t *
{
/*
* If either the source or destination address is loopback, then
* both endpoints must be in the same Zone. Otherwise, both of
* the addresses are system-wide unique (tcp is in ESTABLISHED
* state) and the endpoints may reside in different Zones. We
* don't do Zone check for link local address(es) because the
* current Zone implementation treats each link local address as
* being unique per system node, i.e. they belong to global Zone.
*/
/* We skip conn_bound_if check here as this is loopback tcp */
return (tconnp);
}
}
return (NULL);
}
/*
* Find an exact {src, dst, lport, fport} match for a bounced datagram.
* Returns with conn reference held. Caller must call CONN_DEC_REF.
* Only checks for connected entries i.e. no INADDR_ANY checks.
*/
conn_t *
{
return (tconnp);
}
}
return (NULL);
}
/*
* Find an exact {src, dst, lport, fport} match for a bounced datagram.
* Returns with conn reference held. Caller must call CONN_DEC_REF.
* Only checks for connected entries i.e. no INADDR_ANY checks.
* Match on ifindex in addition to addresses.
*/
conn_t *
{
(tconnp->conn_bound_if == 0 ||
return (tconnp);
}
}
return (NULL);
}
/*
* Look up a listening conn; used to locate
* a listener when changing state.
*/
conn_t *
{
/*
* Avoid false matches for packets sent to an IP destination of
* all zeros.
*/
if (laddr == 0)
return (NULL);
return (connp);
}
}
return (NULL);
}
/*
* Look up a listening conn; used to locate
* a listener when changing state.
*/
conn_t *
{
/*
* Avoid false matches for packets sent to an IP destination of
* all zeros.
*/
if (IN6_IS_ADDR_UNSPECIFIED(laddr))
return (NULL);
(connp->conn_bound_if == 0 ||
return (connp);
}
}
return (NULL);
}
/*
* ipcl_get_next_conn
* get the next entry in the conn global list
* and put a reference on the next_conn.
* decrement the reference on the current conn.
*
* This is an iterator based walker function that also provides for
* some selection by the caller. It walks through the conn_hash bucket
* searching for the next valid connp in the list, and selects connections
* that are neither closed nor condemned. It also REFHOLDS the conn
* thus ensuring that the conn exists when the caller uses the conn.
*/
conn_t *
{
/*
 * NOTE(review): the condition guarding this early NULL return is not
 * visible in this view of the file - confirm against the full source.
 */
return (NULL);
/*
 * Scan forward until a selectable conn is found (break) or the
 * list is exhausted (next_connp becomes NULL).
 */
while (next_connp != NULL) {
(CONN_CONDEMNED | CONN_INCIPIENT))) {
/*
* This conn has been condemned or
* is closing, or the flags don't match
*/
continue;
}
/* Found a conn that passed the checks above; stop scanning. */
break;
}
/* NULL here means the scan ran off the end without a match. */
return (next_connp);
}
#ifdef CONN_DEBUG
/*
*/
int
{
int last;
last++;
if (last == CONN_TRACE_MAX)
last = 0;
return (1);
}
int
{
int last;
last++;
if (last == CONN_TRACE_MAX)
last = 0;
return (1);
}
#endif