ilb.c revision 6e0672ac23ea9d93b3e86c7f6e2fd7a79fdd78d3
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <inet/udp_impl.h>
#include <inet/kstatcom.h>
#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"
/* ILB kmem cache flag */
int ilb_kmem_flags = 0;
/*
* The default size for the different hash tables. Global for all stacks.
* But each stack has its own table, just that their sizes are the same.
*/
/* This should be a prime number. */
/* Default NAT cache entry expiry time. */
/* Default sticky entry expiry time. */
/* addr is assumed to be a uint8_t * to an ipaddr_t. */
/*
* Note on ILB delayed processing
*
* To avoid in line removal on some of the data structures, such as rules,
* servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
* There are three types of ILB taskq:
*
* 1. rule handling: created at stack initialization time, ilb_stack_init()
* 2. conn hash handling: created at conn hash initialization time,
* ilb_conn_hash_init()
* 3. sticky hash handling: created at sticky hash initialization time,
* ilb_sticky_hash_init()
*
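* The rule taskq is for processing rule and server removal. When a user
* land thread requests a removal, the final part of the removal is
* dispatched to this taskq instead of being done in line.
* So the user land thread requesting the removal does not need to wait
* for the removal completion.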
*
* The conn hash and sticky hash taskqs are for ilb_conn_hash and
* ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers
* and ilb_sticky_timer_size timers running for ilb_conn_hash and
* ilb_sticky_hash cleanup respectively. Each timer is responsible for one
* portion (same size) of the hash table. When a timer fires, it dispatches
* a conn hash taskq to clean up its portion of the table. This avoids in
* line processing of the removal.
*
* There is another delayed processing, the clean up of NAT source address
* table. We just use the timer to directly handle it instead of using
* a taskq. The reason is that the table is small so it is OK to use the
* timer.
*/
/* ILB rule taskq constants. */
#define ILB_RULE_TASKQ_NUM_THR 20
/* Argument passed to ILB rule taskq routines. */
typedef struct {
/* kstat handling routines. */
ilb_server_t *);
/* Rule hash handling routines. */
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
int *);
/* Back end server handling routines. */
static void ilb_server_free(ilb_server_t *);
/* Network stack handling routines. */
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);
/* Sticky connection handling routines. */
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);
/* Handy macro to check for unspecified address. */
#define IS_ADDR_UNSPEC(addr) \
/*
* Global kstat instance counter. When a rule is created, its kstat instance
* number is assigned by ilb_kstat_instance and ilb_kstat_instance is
* incremented.
*/
static uint_t ilb_kstat_instance = 0;
/*
* The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
* A rule's kstat has ILB_RULE_KS_CNAME class name.
*/
#define ILB_G_KS_NAME "global"
#define ILB_G_KS_CNAME "kstat"
#define ILB_RULE_KS_CNAME "rulestat"
static kstat_t *
{
{ "num_rules", KSTAT_DATA_UINT64, 0 },
{ "ip_frag_in", KSTAT_DATA_UINT64, 0 },
{ "ip_frag_dropped", KSTAT_DATA_UINT64, 0 }
};
return (NULL);
return (ksp);
}
static void
{
}
}
static kstat_t *
{
{ "num_servers", KSTAT_DATA_UINT64, 0 },
{ "bytes_not_processed", KSTAT_DATA_UINT64, 0 },
{ "pkt_not_processed", KSTAT_DATA_UINT64, 0 },
{ "bytes_dropped", KSTAT_DATA_UINT64, 0 },
{ "pkt_dropped", KSTAT_DATA_UINT64, 0 },
{ "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 },
{ "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 },
{ "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 },
{ "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 },
{ "icmp_echo_processed", KSTAT_DATA_UINT64, 0 },
{ "icmp_dropped", KSTAT_DATA_UINT64, 0 },
{ "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 },
{ "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 }
};
return (NULL);
return (ksp);
}
static kstat_t *
{
{ "bytes_processed", KSTAT_DATA_UINT64, 0 },
{ "pkt_processed", KSTAT_DATA_UINT64, 0 },
{ "ip_address", KSTAT_DATA_STRING, 0 }
};
char cname_buf[KSTAT_STRLEN];
/* 7 is "-sstat" */
return (NULL);
/* We never change the IP address */
return (ksp);
}
/* Initialize the rule hash table. */
static void
{
int i;
/*
* If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
* the next power of 2.
*/
for (i = 0; i < 31; i++) {
break;
}
}
for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
}
}
/* Clean up the rule hash table. */
static void
{
return;
}
/* Add a rule to the rule hash table. */
static void
{
int i;
}
/*
* Remove a rule from the rule hash table. Note that the rule is not freed
* in this routine.
*/
static void
{
} else {
}
}
}
/*
* Given the info of a packet, look for a match in the rule hash table.
*/
static ilb_rule_t *
{
int i;
if (!rule->ir_port_range) {
continue;
} else {
continue;
}
}
continue;
}
if (l3 == IPPROTO_IP) {
continue;
}
} else {
continue;
}
}
/*
* Just update the stats if the rule is disabled.
*/
break;
/*
* If we are busy...
*
* XXX we should have a queue to postpone the
* packet processing. But this requires a
* mechanism in IP to re-start the packet
* processing. So for now, just drop the packet.
*/
break;
} else {
break;
}
}
return (rule);
}
/*
* Add a rule to the global rule list. This list is for finding all rules
* in an IP stack. The caller is assumed to hold the ilbs_g_lock.
*/
static void
{
}
/* The caller is assumed to hold the ilbs_g_lock. */
static void
{
break;
}
return;
}
else
}
/*
* Helper routine to calculate how many source addresses are in a given
* range.
*/
static int64_t
{
/*
* Here we assume that the max number of NAT source cannot be
* large such that the most significant 2 s6_addr32 must be
* equal.
*/
return (-1);
}
} else {
ret <<= 32;
return (ret + 1);
}
}
/*
* Add an ILB rule.
*/
int
{
int ret;
/* Sanity checks. */
return (EINVAL);
/* Need to support SCTP... */
return (EINVAL);
/* For full NAT, the NAT source must be supplied. */
return (EINVAL);
}
}
/* Check invalid mask */
return (EINVAL);
}
/* Port is passed in network byte order. */
return (EINVAL);
/* min_port == 0 means "all ports". Make it so */
if (min_port == 0) {
min_port = 1;
max_port = 65535;
}
/* Funny address checking. */
v4_addr1 == INADDR_ANY ||
return (EINVAL);
}
v4_addr1 == INADDR_BROADCAST ||
v4_addr2 == INADDR_BROADCAST ||
return (EINVAL);
}
return (EINVAL);
}
} else {
return (EINVAL);
}
return (EINVAL);
}
&cmd->nat_src_end)) < 0 ||
num_src > ILB_MAX_NAT_SRC) {
return (EINVAL);
}
}
}
}
/* Make sure that the new rule does not duplicate an existing one. */
return (EEXIST);
}
return (ENOMEM);
}
/* ir_name is all 0 to begin with */
goto error;
}
}
else
/*
* The default connection drain timeout is indefinite (value 0),
* meaning we will wait for all connections to finish. So we
* can assign cmd->conn_drain_timeout to it directly.
*/
if (cmd->nat_expiry != 0) {
} else {
case IPPROTO_TCP:
break;
case IPPROTO_UDP:
break;
default:
(void *)rule);
break;
}
}
if (cmd->sticky_expiry != 0)
else
}
case ILB_ALG_IMPL_ROUNDROBIN:
goto error;
}
break;
case ILB_ALG_IMPL_HASH_IP:
case ILB_ALG_IMPL_HASH_IP_VIP:
goto error;
}
break;
default:
goto error;
}
/* Add it to the global list and hash array at the end. */
return (0);
/* stackid must be initialized if ir_ksp != NULL */
}
return (ret);
}
/*
* The final part in deleting a rule. Either called directly or by the
* taskq dispatched.
*/
static void
{
/*
* Let the algorithm know that the rule is going away. The
* algorithm fini routine will free all its resources with this
* rule.
*/
if (tmp_rule->ir_conn_drain_timeout != 0) {
/*
* The garbage collection thread checks this value
* without grabbing a lock. So we need to use
* atomic_swap_64() to make sure that the value seen
* by gc thread is intact.
*/
(void) atomic_swap_64(
}
}
}
/*
 * The routine executed by the delayed rule taskq.
 *
 * arg is the opaque pointer handed to the taskq routine when the task
 * was dispatched.
 * NOTE(review): presumably arg points to the "argument passed to ILB
 * rule taskq routines" structure declared near the top of this file --
 * confirm against the dispatching code in the rule deletion path.
 */
static void
ilb_rule_del_tq(void *arg)
{
}
/* Routine to delete a rule. */
int
{
int err;
return (err);
}
/*
* First remove the rule from the hash array and the global list so
* that no one can find this rule any more.
*/
/*
* Now no one can find this rule, we can remove it once all
* references to it are dropped and all references to the list
* of servers are dropped. So dispatch a task to finish the deletion.
* We do this instead of letting the last one referencing the
* rule do it. The reason is that the last one may be the
* interrupt thread. We want to minimize the work it needs to
* do. Rule deletion is not a critical task so it can be delayed.
*/
TQ_SLEEP);
return (0);
}
/*
* Given an IP address, check to see if there is a rule using this
* as the VIP. It can be used to check if we need to drop a fragment.
*/
{
int i;
break;
}
} else {
}
break;
}
}
return (ret);
}
{
int i;
break;
}
} else {
}
break;
}
}
return (ret);
}
static ilb_rule_t *
int *err)
{
continue;
*err = EINPROGRESS;
return (NULL);
}
*err = 0;
return (tmp_rule);
}
}
return (NULL);
}
/* To find a rule with a given name and zone in the global rule list. */
int *err)
{
return (tmp_rule);
}
/* Try to match the given packet info and zone ID with a rule. */
static boolean_t
{
continue;
/*
* We don't allow the same name in different rules even if all
* the other rule components are different.
*/
return (B_TRUE);
continue;
/*
* ir_min_port and ir_max_port are the same if ir_port_range
* is false. In this case, if the ir_min|max_port (same) is
* outside of the given port range, it is OK. In other cases,
* check if min and max port are outside a rule's range.
*/
continue;
}
/*
* If l3 is IPv4, the addr passed in is assumed to be
* mapped address.
*/
if (V6_OR_V4_INADDR_ANY(*addr) ||
return (B_TRUE);
}
}
return (B_FALSE);
}
int
{
int err;
return (err);
}
}
/* Only refrele if the rule is passed in. */
return (0);
}
int
{
int err;
return (err);
}
}
/* Only refrele if the rule is passed in. */
return (0);
}
/*
* XXX We should probably have a walker function to walk all rules.
*/
void
{
continue;
/*
* No need to hold the rule as we are holding the global
* lock so it won't go away. Ignore the return value here
* as the rule is provided so the call cannot fail.
*/
}
}
void
{
continue;
}
}
void
{
continue;
}
}
/*
* This is just an optimization, so don't grab the global lock. The
* worst case is that we missed a couple packets.
*/
{
}
static int
{
int ret;
return (ret);
}
}
break;
}
if (tmp_server == NULL) {
goto done;
}
if (enable) {
if (ret == 0) {
tmp_server->iser_die_time = 0;
}
} else {
if (ret == 0) {
if (rule->ir_conn_drain_timeout != 0) {
(void) atomic_swap_64(
}
}
}
done:
return (ret);
}
int
{
}
int
{
}
/*
* Add a back end server to a rule. If the address is IPv4, it is assumed
* to be passed in as a mapped address.
*/
int
{
int ret = 0;
/* Port is passed in network byte order. */
return (EINVAL);
/* min_port == 0 means "all ports". Make it so */
if (min_port == 0) {
min_port = 1;
max_port = 65535;
}
return (EINTR);
}
}
/*
* Set the rule to be busy to make sure that no new packet can
* use this rule.
*/
/* Now wait for all other guys to finish their work. */
goto end;
}
}
/* Sanity checks... */
goto end;
}
/*
* Check for valid port range.
*
* For DSR, there can be no port shifting. Hence the server
* specification must be the same as the rule's.
*
* For the other topologies, the server port range must either be 0 or
* it must be equal to the same value as the rule port range.
*
*/
goto end;
}
} else {
range != 0) {
goto end;
}
}
/* Check for duplicate. */
break;
}
}
goto end;
}
goto end;
}
sizeof (server->iser_ip_addr));
goto end;
}
else
/*
* Set up the NAT source entry to use
* for this server.
*/
/*
* If the server uses a port range, our port allocation
* scheme needs to treat it as a wildcard. Refer to the
* comments in ilb_nat.c about the scheme.
*/
if (server->iser_port_range)
port = 0;
else
&rule->ir_nat_src_end))) != 0) {
goto end;
}
}
/*
* The iser_lock is only used to protect iser_refcnt. All the other
* fields in ilb_server_t should not change, except for iser_enabled.
* The worst thing that can happen if iser_enabled is messed up is
* that one or two packets may not be load balanced to a server
* correctly.
*/
/* Let the load balancing algorithm know about the addition. */
goto end;
}
/*
* No need to hold ir_lock since no other thread should manipulate
* the following fields until ILB_RULE_BUSY is cleared.
*/
} else {
}
end:
return (ret);
}
/*
 * The routine executed by the delayed rule processing taskq to finish
 * a server removal.
 *
 * arg is the opaque pointer handed to the taskq routine when the task
 * was dispatched.
 * NOTE(review): presumably arg points to the "argument passed to ILB
 * rule taskq routines" structure declared near the top of this file --
 * confirm against the dispatching code in the server deletion path.
 */
static void
ilb_server_del_tq(void *arg)
{
}
/*
* Delete a back end server from a rule. If the address is IPv4, it is assumed
* to be passed in as a mapped address.
*/
int
{
int ret = 0;
return (ret);
}
}
}
return (EINTR);
}
}
/*
* Set the rule to be busy to make sure that no new packet can
* use this rule.
*/
/* Now wait for all other guys to finish their work. */
goto end;
}
}
prev_server = NULL;
break;
}
goto end;
}
/*
* Let the load balancing algorithm know about the removal.
* The algorithm may disallow the removal...
*/
goto end;
}
if (prev_server == NULL)
else
/*
* Mark the server as disabled so that if there is any sticky cache
* using this server around, it won't be used.
*/
/*
* De-allocate the NAT source array. The individual ilb_nat_src_entry_t
* may not go away if there is still a conn using it. The NAT source
* timer will do the garbage collection.
*/
/* If there is a hard limit on when a server should die, set it. */
if (rule->ir_conn_drain_timeout != 0) {
}
} else {
}
end:
return (ret);
}
/*
* First check if the destination of the ICMP message matches a VIP of
* a rule. If it does not, just return ILB_PASSED.
*
* If the destination matches a VIP:
*
* For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
* server.
*
* For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload
* and see which back end server we should send this message to. And we
* need to do NAT on both the payload message and the outside IP packet.
*
* For other ICMP messages, drop them.
*/
/* ARGSUSED */
static int
{
return (ILB_PASSED);
return (ILB_DROPPED);
}
switch (icmph->icmph_type) {
case ICMP_ECHO_REQUEST:
icmph->icmph_checksum = 0;
return (ILB_BALANCED);
case ICMP_DEST_UNREACHABLE: {
int ret;
return (ILB_DROPPED);
}
&addr6)) {
ret = ILB_BALANCED;
} else {
ret = ILB_DROPPED;
}
return (ret);
}
default:
return (ILB_DROPPED);
}
}
/* ARGSUSED */
static int
{
return (ILB_PASSED);
return (ILB_DROPPED);
}
switch (icmp6->icmp6_type) {
case ICMP6_ECHO_REQUEST: {
int hdr_len;
return (ILB_BALANCED);
}
case ICMP6_PACKET_TOO_BIG: {
int ret;
lb_dst)) {
ret = ILB_BALANCED;
} else {
ret = ILB_DROPPED;
}
return (ret);
}
default:
return (ILB_DROPPED);
}
}
/*
* Common routine to check an incoming packet and decide what to do with it.
* called by ilb_check_v4|v6().
*/
static int
{
struct ilb_sticky_s *s = NULL;
int ret;
/*
* We don't really need to switch here since both protocols'
* ports are at the same offset. Just prepare for future protocol
* specific processing.
*/
switch (l4) {
case IPPROTO_TCP:
return (ILB_DROPPED);
break;
case IPPROTO_UDP:
return (ILB_DROPPED);
break;
default:
return (ILB_PASSED);
}
/* Fast path, there is an existing conn. */
return (ILB_BALANCED);
}
/*
* If there is no existing connection for the incoming packet, check
* to see if the packet matches a rule. If not, just let IP decide
* what to do with it.
*
* Note: a reply from back end server should not match a rule. A
* reply should match one existing conn.
*/
/* If the rule is busy, just drop the packet. */
if (busy)
return (ILB_DROPPED);
else
return (ILB_PASSED);
}
/*
* The packet matches a rule, use the rule load balance algorithm
* to find a server.
*/
/*
* This can only happen if there is no server in a rule or all
* the servers are currently disabled.
*/
if (!balanced)
goto no_server;
/*
* If the rule is sticky enabled, we need to check the sticky table.
* If there is a sticky entry for the client, use the previous server
* instead of the one found above (note that both can be the same).
* If there is no entry for that client, add an entry to the sticky
* table. Both the find and add are done in ilb_sticky_find_add()
* to avoid checking for duplicate when adding an entry.
*/
&s, &nat_src_idx)) == NULL) {
goto no_server;
}
}
/*
* We are holding a reference on the rule, so the server
* cannot go away.
*/
case ILB_TOPO_IMPL_NAT: {
/*
* We create a cache even if it is not a SYN segment.
* The server should return a RST. When we see the
* RST, we will destroy this cache. But by having
* a cache, we know how to NAT the returned RST.
*/
/* If stickiness is enabled, use the same source address */
if (s != NULL)
src_idx = &nat_src_idx;
else
if (s != NULL)
ret = ILB_DROPPED;
break;
}
} else {
}
/*
* If ilb_conn_add() fails, it will release the reference on
* sticky info and de-allocate the NAT source port allocated
* above.
*/
ret = ILB_DROPPED;
break;
}
ret = ILB_BALANCED;
break;
}
case ILB_TOPO_IMPL_HALF_NAT:
} else {
}
ret = ILB_DROPPED;
break;
}
ret = ILB_BALANCED;
break;
case ILB_TOPO_IMPL_DSR:
/*
* By decrementing the sticky refcnt, the period of
* stickiness (life time of ilb_sticky_t) will be
* from now to (now + default expiry time).
*/
if (s != NULL)
ret = ILB_BALANCED;
break;
default:
(void *) rule);
break;
}
return (ret);
/* This can only happen if there is no server available. */
return (ILB_DROPPED);
}
int
{
int ret;
if (l4 == IPPROTO_ICMP) {
lb_dst));
}
if (ret == ILB_BALANCED)
return (ret);
}
int
{
if (l4 == IPPROTO_ICMPV6) {
lb_dst));
}
}
void
{
*num_rules = 0;
*num_rules += 1;
}
}
int
{
int err;
return (err);
return (0);
}
int
{
int err;
return (err);
}
*num_servers -= cnt;
return (0);
}
void
char *buf)
{
int cnt;
if (*num_names == 0)
return;
continue;
buf += ILB_RULE_NAMESZ;
break;
}
}
int
{
int err;
return (err);
}
/*
* Except the enabled flags, none of the following will change
* in the life time of a rule. So we don't hold the mutex when
* reading them. The worst is to report a wrong enabled flags.
*/
}
return (0);
}
static void *
{
char tq_name[TASKQ_NAMELEN];
return (NULL);
}
/*
* ilbs_conn/sticky_hash related info is initialized in
* ilb_conn/sticky_hash_init().
*/
/* The allocation is done later when there is a rule using NAT mode. */
ilbs->ilbs_nat_src_tid = 0;
/* For listing the conn hash table */
ilbs->ilbs_conn_list_cur = 0;
/* For listing the sticky hash table */
ilbs->ilbs_sticky_list_cur = 0;
(void *)ns);
return (ilbs);
}
/* ARGSUSED */
static void
{
}
}
static void
{
}
/*
 * Module-wide (global, not per-stack) initialization entry point.
 * Takes no arguments and returns nothing.
 * NOTE(review): presumably this is where the per-netstack hooks
 * (stack init/shutdown/fini routines in this file) get registered --
 * confirm against the module's attach code.
 */
void
ilb_ddi_g_init(void)
{
}
/*
 * Module-wide (global, not per-stack) teardown entry point, the
 * counterpart of ilb_ddi_g_init(). Takes no arguments and returns
 * nothing.
 * NOTE(review): presumably undoes whatever ilb_ddi_g_init() registered --
 * confirm against the module's detach code.
 */
void
ilb_ddi_g_destroy(void)
{
}