/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/kmem.h>
#include <sys/ksynch.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <netinet/in.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <inet/kstatcom.h>
#include <inet/ilb_ip.h>
#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"
/* ILB kmem cache flag */
int ilb_kmem_flags = 0;
/*
 * The default sizes of the various hash tables. These defaults are
 * global for all stacks, but each stack gets its own tables; only the
 * sizes are shared.
 */
static size_t ilb_rule_hash_size = 2048;
static size_t ilb_conn_hash_size = 262144;
static size_t ilb_sticky_hash_size = 262144;
/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;
/* Default NAT cache entry expiry time. */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;
/* Default sticky entry expiry time. */
static uint32_t ilb_sticky_expiry = 60;
/* addr is assumed to be a uint8_t * to an ipaddr_t. */
#define ILB_RULE_HASH(addr, hash_size) \
((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
*(addr)) & ((hash_size) - 1))
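/*
 * The multipliers above are 31^3, 31^2 and 31, so ILB_RULE_HASH() is the
 * usual 31-based polynomial hash over the four bytes of an IPv4 address;
 * a sketch of an equivalent formulation:
 *
 *	uint32_t h = 0;
 *	for (int i = 3; i >= 0; i--)
 *		h = h * 31 + addr[i];
 *	return (h & (hash_size - 1));
 *
 * Masking with (hash_size - 1) only distributes properly because the
 * hash sizes are kept powers of 2 (see ilb_rule_hash_init()). For IPv6,
 * callers pass a pointer to s6_addr32[3], so only the low 32 bits of the
 * address are hashed.
 */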
/*
* Note on ILB delayed processing
*
* To avoid in-line removal of some of the data structures, such as rules,
* servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
* There are three types of ILB taskq:
*
* 1. rule handling: created at stack initialization time, ilb_stack_init()
* 2. conn hash handling: created at conn hash initialization time,
* ilb_conn_hash_init()
* 3. sticky hash handling: created at sticky hash initialization time,
* ilb_sticky_hash_init()
*
* The rule taskq is for processing rule and server removal. When a user
* land rule/server removal request comes in, a taskq is dispatched after
* removing the rule/server from all related hashes. This taskq will wait
* until all references to the rule/server are gone before removing it.
* So the user land thread requesting the removal does not need to wait
* for the removal to complete.
*
* The conn hash/sticky hash taskq is for processing ilb_conn_hash and
* ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers
* and ilb_sticky_timer_size timers running for ilb_conn_hash and
* ilb_sticky_hash cleanup respectively. Each timer is responsible for one
* portion (same size) of the hash table. When a timer fires, it dispatches
* a conn hash taskq to clean up its portion of the table. This avoids
* in-line processing of the removal.
*
* There is one more piece of delayed processing: the cleanup of the
* NAT source address table. Since that table is small, it is handled
* directly from the timer instead of a taskq.
*/
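/*
 * A minimal sketch of the hand-off, as done in ilb_rule_del() below:
 * the requesting thread unlinks the rule and then leaves the potentially
 * long wait for references to drain to the taskq:
 *
 *	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
 *	arg->ilbs = ilbs;
 *	arg->rule = rule;
 *	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
 *	    arg, TQ_SLEEP);
 */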
/* ILB rule taskq constants. */
#define ILB_RULE_TASKQ_NUM_THR 20
/* Argument passed to ILB rule taskq routines. */
typedef struct {
ilb_stack_t *ilbs;
ilb_rule_t *rule;
} ilb_rule_tq_t;
/* kstat handling routines. */
static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
ilb_server_t *);
/* Rule hash handling routines. */
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
in_port_t, zoneid_t, uint32_t, boolean_t *);
static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_del_common(ilb_stack_t *, ilb_rule_t *);
static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
int *);
static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
int, in_port_t, in_port_t, const in6_addr_t *);
/* Back end server handling routines. */
static void ilb_server_free(ilb_server_t *);
/* Network stack handling routines. */
static void *ilb_stack_init(netstackid_t, netstack_t *);
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);
/* Sticky connection handling routines. */
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);
/* Handy macro to check for unspecified address. */
#define IS_ADDR_UNSPEC(addr) \
(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \
IN6_IS_ADDR_UNSPECIFIED(addr))
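/*
 * Note the two-step check: a V4 unspecified address arrives here as the
 * mapped address ::ffff:0.0.0.0, which IN6_IS_ADDR_UNSPECIFIED() alone
 * would not catch; IN6_IS_ADDR_V4MAPPED_ANY() covers that case.
 */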
/*
* Global kstat instance counter. When a rule is created, its kstat
* instance number is obtained by atomically incrementing
* ilb_kstat_instance.
*/
static uint_t ilb_kstat_instance = 0;
/*
* The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
* A rule's kstat has ILB_RULE_KS_CNAME class name.
*/
#define ILB_G_KS_NAME "global"
#define ILB_G_KS_CNAME "kstat"
#define ILB_RULE_KS_CNAME "rulestat"
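/*
 * For example (assuming ILB_KSTAT_MOD_NAME is "ilb"), a rule named
 * "web" assigned kstat instance 5 shows up as ilb:5:web with class
 * "rulestat", and a back end server "srv1" under it as ilb:5:srv1 with
 * class "web-sstat" (see ilb_server_kstat_init()).
 */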
static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
kstat_t *ksp;
ilb_g_kstat_t template = {
{ "num_rules", KSTAT_DATA_UINT64, 0 },
{ "ip_frag_in", KSTAT_DATA_UINT64, 0 },
{ "ip_frag_dropped", KSTAT_DATA_UINT64, 0 }
};
ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
KSTAT_FLAG_VIRTUAL, stackid);
if (ksp == NULL)
return (NULL);
bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
ksp->ks_data = ilbs->ilbs_kstat;
ksp->ks_private = (void *)(uintptr_t)stackid;
kstat_install(ksp);
return (ksp);
}
static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
if (ilbs->ilbs_ksp != NULL) {
ASSERT(stackid == (netstackid_t)(uintptr_t)
ilbs->ilbs_ksp->ks_private);
kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
ilbs->ilbs_ksp = NULL;
}
}
static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
kstat_t *ksp;
ilb_rule_kstat_t template = {
{ "num_servers", KSTAT_DATA_UINT64, 0 },
{ "bytes_not_processed", KSTAT_DATA_UINT64, 0 },
{ "pkt_not_processed", KSTAT_DATA_UINT64, 0 },
{ "bytes_dropped", KSTAT_DATA_UINT64, 0 },
{ "pkt_dropped", KSTAT_DATA_UINT64, 0 },
{ "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 },
{ "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 },
{ "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 },
{ "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 },
{ "icmp_echo_processed", KSTAT_DATA_UINT64, 0 },
{ "icmp_dropped", KSTAT_DATA_UINT64, 0 },
{ "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 },
{ "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 }
};
ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
if (ksp == NULL)
return (NULL);
bcopy(&template, &rule->ir_kstat, sizeof (template));
ksp->ks_data = &rule->ir_kstat;
ksp->ks_private = (void *)(uintptr_t)stackid;
kstat_install(ksp);
return (ksp);
}
static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
ilb_server_t *server)
{
kstat_t *ksp;
ilb_server_kstat_t template = {
{ "bytes_processed", KSTAT_DATA_UINT64, 0 },
{ "pkt_processed", KSTAT_DATA_UINT64, 0 },
{ "ip_address", KSTAT_DATA_STRING, 0 }
};
char cname_buf[KSTAT_STRLEN];
/* 7 is "-sstat" */
ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
if (ksp == NULL)
return (NULL);
bcopy(&template, &server->iser_kstat, sizeof (template));
ksp->ks_data = &server->iser_kstat;
ksp->ks_private = (void *)(uintptr_t)stackid;
kstat_named_setstr(&server->iser_kstat.ip_address,
server->iser_ip_addr);
/* We never change the IP address */
ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;
kstat_install(ksp);
return (ksp);
}
/* Initialize the rule hash table. */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
int i;
/*
 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
 * the next power of 2 (e.g. 1000 becomes 1024) so that the hash
 * value can be masked with (size - 1) instead of using a modulo.
 */
if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
for (i = 0; i < 31; i++) {
if (ilbs->ilbs_rule_hash_size < (1 << i))
break;
}
ilbs->ilbs_rule_hash_size = 1 << i;
}
ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
ilbs->ilbs_rule_hash_size, KM_SLEEP);
for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
MUTEX_DEFAULT, NULL);
}
}
/* Clean up the rule hash table. */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
if (ilbs->ilbs_g_hash == NULL)
return;
kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
ilbs->ilbs_rule_hash_size);
}
/* Add a rule to the rule hash table. */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
{
int i;
i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
ilbs->ilbs_rule_hash_size);
DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
rule->ir_hash_prev = NULL;
ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;
rule->ir_hash = &ilbs->ilbs_g_hash[i];
mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
}
/*
* Remove a rule from the rule hash table. Note that the rule is not freed
* in this routine.
*/
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
mutex_enter(&rule->ir_hash->ilb_hash_lock);
if (rule->ir_hash->ilb_hash_rule == rule) {
rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
if (rule->ir_hash_next != NULL)
rule->ir_hash_next->ir_hash_prev = NULL;
} else {
if (rule->ir_hash_prev != NULL)
rule->ir_hash_prev->ir_hash_next =
rule->ir_hash_next;
if (rule->ir_hash_next != NULL) {
rule->ir_hash_next->ir_hash_prev =
rule->ir_hash_prev;
}
}
mutex_exit(&rule->ir_hash->ilb_hash_lock);
rule->ir_hash_next = NULL;
rule->ir_hash_prev = NULL;
rule->ir_hash = NULL;
}
/*
* Given the info of a packet, look for a match in the rule hash table.
*/
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
int i;
ilb_rule_t *rule;
ipaddr_t v4_addr;
*busy = B_FALSE;
IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
port = ntohs(port);
mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
rule = rule->ir_hash_next) {
if (!rule->ir_port_range) {
if (rule->ir_min_port != port)
continue;
} else {
if (port < rule->ir_min_port ||
port > rule->ir_max_port) {
continue;
}
}
if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
rule->ir_zoneid != zoneid) {
continue;
}
if (l3 == IPPROTO_IP) {
if (rule->ir_target_v4 != INADDR_ANY &&
rule->ir_target_v4 != v4_addr) {
continue;
}
} else {
if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
!IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
continue;
}
}
/*
* Just update the stats if the rule is disabled.
*/
mutex_enter(&rule->ir_lock);
if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
ILB_R_KSTAT(rule, pkt_not_processed);
ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
mutex_exit(&rule->ir_lock);
rule = NULL;
break;
} else if (rule->ir_flags & ILB_RULE_BUSY) {
/*
* If we are busy...
*
* XXX we should have a queue to postpone the
* packet processing. But this requires a
* mechanism in IP to re-start the packet
* processing. So for now, just drop the packet.
*/
ILB_R_KSTAT(rule, pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
mutex_exit(&rule->ir_lock);
*busy = B_TRUE;
rule = NULL;
break;
} else {
rule->ir_refcnt++;
ASSERT(rule->ir_refcnt != 1);
mutex_exit(&rule->ir_lock);
break;
}
}
mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
return (rule);
}
/*
* Add a rule to the global rule list. This list is for finding all rules
* in an IP stack. The caller is assumed to hold the ilbs_g_lock.
*/
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
rule->ir_next = ilbs->ilbs_rule_head;
ilbs->ilbs_rule_head = rule;
ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}
/* The caller is assumed to hold the ilbs_g_lock. */
static void
ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
ilb_rule_t *tmp_rule;
ilb_rule_t *prev_rule;
ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
prev_rule = NULL;
for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
if (tmp_rule == rule)
break;
}
/* The rule should be on the list; the caller still holds the lock. */
if (tmp_rule == NULL)
return;
if (prev_rule == NULL)
ilbs->ilbs_rule_head = tmp_rule->ir_next;
else
prev_rule->ir_next = tmp_rule->ir_next;
ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
}
/*
* Helper routine to calculate how many source addresses are in a given
* range.
*/
static int64_t
num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
{
int64_t ret;
uint32_t addr1, addr2;
/*
* We assume that the number of NAT source addresses is never so
* large that the two most significant s6_addr32 words of the range
* boundaries could differ; they must be equal.
*/
addr1 = ntohl(a1->s6_addr32[3]);
addr2 = ntohl(a2->s6_addr32[3]);
if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
a1->s6_addr32[1] != a2->s6_addr32[1] ||
a1->s6_addr32[2] > a2->s6_addr32[2] ||
(a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
return (-1);
}
if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
return (addr2 - addr1 + 1);
} else {
ret = (int64_t)ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]);
ret <<= 32;
ret += (int64_t)addr2 - addr1;
return (ret + 1);
}
}
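/*
 * A worked example: if the low 64 bits (s6_addr32[2]:s6_addr32[3] in
 * host order) of the two boundary addresses are 0:fffffffe and 1:1,
 * the count is (1 - 0) << 32, plus (0x1 - 0xfffffffe), plus 1, i.e. 4:
 * ...:0:fffffffe, ...:0:ffffffff, ...:1:0 and ...:1:1.
 */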
/*
* Add an ILB rule.
*/
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
ilb_rule_t *rule;
netstackid_t stackid;
int ret;
in_port_t min_port, max_port;
int64_t num_src;
/* Sanity checks. */
if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
return (EINVAL);
/* Need to support SCTP... */
if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
return (EINVAL);
/* For full NAT, the NAT source must be supplied. */
if (cmd->topo == ILB_TOPO_IMPL_NAT) {
if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
return (EINVAL);
}
}
/* Check invalid mask */
if ((cmd->flags & ILB_RULE_STICKY) &&
IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
return (EINVAL);
}
/* Port is passed in network byte order. */
min_port = ntohs(cmd->min_port);
max_port = ntohs(cmd->max_port);
if (min_port > max_port)
return (EINVAL);
/* min_port == 0 means "all ports". Make it so */
if (min_port == 0) {
min_port = 1;
max_port = 65535;
}
/* Funny address checking. */
if (cmd->ip_ver == IPPROTO_IP) {
in_addr_t v4_addr1, v4_addr2;
v4_addr1 = cmd->vip.s6_addr32[3];
if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
v4_addr1 == INADDR_ANY ||
!IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
return (EINVAL);
}
if (cmd->topo == ILB_TOPO_IMPL_NAT) {
v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
(*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
v4_addr1 == INADDR_BROADCAST ||
v4_addr2 == INADDR_BROADCAST ||
v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
!IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
!IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
return (EINVAL);
}
num_src = v4_addr2 - v4_addr1 + 1;
if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
return (EINVAL);
}
} else {
if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
return (EINVAL);
}
if (cmd->topo == ILB_TOPO_IMPL_NAT) {
if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
return (EINVAL);
}
if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
&cmd->nat_src_end)) < 0 ||
num_src > ILB_MAX_NAT_SRC) {
return (EINVAL);
}
}
}
mutex_enter(&ilbs->ilbs_g_lock);
if (ilbs->ilbs_g_hash == NULL)
ilb_rule_hash_init(ilbs);
if (ilbs->ilbs_c2s_conn_hash == NULL) {
ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
ilb_conn_hash_init(ilbs);
ilb_nat_src_init(ilbs);
}
/* Make sure that the new rule does not duplicate an existing one. */
if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
min_port, max_port, &cmd->vip)) {
mutex_exit(&ilbs->ilbs_g_lock);
return (EEXIST);
}
rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
if (rule == NULL) {
mutex_exit(&ilbs->ilbs_g_lock);
return (ENOMEM);
}
/* ir_name is all 0 to begin with */
(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);
rule->ir_ks_instance = atomic_add_int_nv(&ilb_kstat_instance, 1);
stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
ret = ENOMEM;
goto error;
}
if (cmd->topo == ILB_TOPO_IMPL_NAT) {
rule->ir_nat_src_start = cmd->nat_src_start;
rule->ir_nat_src_end = cmd->nat_src_end;
}
rule->ir_ipver = cmd->ip_ver;
rule->ir_proto = cmd->proto;
rule->ir_topo = cmd->topo;
rule->ir_min_port = min_port;
rule->ir_max_port = max_port;
if (rule->ir_min_port != rule->ir_max_port)
rule->ir_port_range = B_TRUE;
else
rule->ir_port_range = B_FALSE;
rule->ir_zoneid = zoneid;
rule->ir_target_v6 = cmd->vip;
rule->ir_servers = NULL;
/*
* The default connection drain timeout is indefinite (value 0),
* meaning we will wait for all connections to finish. So we
* can assign cmd->conn_drain_timeout to it directly.
*/
rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
if (cmd->nat_expiry != 0) {
rule->ir_nat_expiry = cmd->nat_expiry;
} else {
switch (rule->ir_proto) {
case IPPROTO_TCP:
rule->ir_nat_expiry = ilb_conn_tcp_expiry;
break;
case IPPROTO_UDP:
rule->ir_nat_expiry = ilb_conn_udp_expiry;
break;
default:
cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
(void *)rule);
break;
}
}
if (cmd->sticky_expiry != 0)
rule->ir_sticky_expiry = cmd->sticky_expiry;
else
rule->ir_sticky_expiry = ilb_sticky_expiry;
if (cmd->flags & ILB_RULE_STICKY) {
rule->ir_flags |= ILB_RULE_STICKY;
rule->ir_sticky_mask = cmd->sticky_mask;
if (ilbs->ilbs_sticky_hash == NULL)
ilb_sticky_hash_init(ilbs);
}
if (cmd->flags & ILB_RULE_ENABLED)
rule->ir_flags |= ILB_RULE_ENABLED;
mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);
rule->ir_refcnt = 1;
switch (cmd->algo) {
case ILB_ALG_IMPL_ROUNDROBIN:
if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
ret = ENOMEM;
goto error;
}
rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
break;
case ILB_ALG_IMPL_HASH_IP:
case ILB_ALG_IMPL_HASH_IP_SPORT:
case ILB_ALG_IMPL_HASH_IP_VIP:
if ((rule->ir_alg = ilb_alg_hash_init(rule,
&cmd->algo)) == NULL) {
ret = ENOMEM;
goto error;
}
rule->ir_alg_type = cmd->algo;
break;
default:
ret = EINVAL;
goto error;
}
/* Add it to the global list and hash array at the end. */
ilb_rule_g_add(ilbs, rule);
ilb_rule_hash_add(ilbs, rule, &cmd->vip);
mutex_exit(&ilbs->ilbs_g_lock);
return (0);
error:
mutex_exit(&ilbs->ilbs_g_lock);
if (rule->ir_ksp != NULL) {
/* stackid must be initialized if ir_ksp != NULL */
kstat_delete_netstack(rule->ir_ksp, stackid);
}
kmem_free(rule, sizeof (ilb_rule_t));
return (ret);
}
/*
 * The final part in deleting a rule. Either called directly or
 * executed by the dispatched taskq.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
netstackid_t stackid;
ilb_server_t *server;
stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
/*
* Let the algorithm know that the rule is going away. The
* algorithm fini routine will free all its resources with this
* rule.
*/
tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);
while ((server = tmp_rule->ir_servers) != NULL) {
mutex_enter(&server->iser_lock);
ilb_destroy_nat_src(&server->iser_nat_src);
if (tmp_rule->ir_conn_drain_timeout != 0) {
/*
* The garbage collection thread checks this value
* without grabbing a lock. So we need to use
* atomic_swap_64() to make sure that the value seen
* by gc thread is intact.
*/
(void) atomic_swap_64(
(uint64_t *)&server->iser_die_time, lbolt64 +
SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
}
while (server->iser_refcnt > 1)
cv_wait(&server->iser_cv, &server->iser_lock);
tmp_rule->ir_servers = server->iser_next;
kstat_delete_netstack(server->iser_ksp, stackid);
kmem_free(server, sizeof (ilb_server_t));
}
ASSERT(tmp_rule->ir_ksp != NULL);
kstat_delete_netstack(tmp_rule->ir_ksp, stackid);
kmem_free(tmp_rule, sizeof (ilb_rule_t));
}
/* The routine executed by the delayed rule taskq. */
static void
ilb_rule_del_tq(void *arg)
{
ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;
mutex_enter(&rule->ir_lock);
while (rule->ir_refcnt > 1)
cv_wait(&rule->ir_cv, &rule->ir_lock);
ilb_rule_del_common(ilbs, rule);
kmem_free(arg, sizeof (ilb_rule_tq_t));
}
/* Routine to delete a rule. */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
ilb_rule_t *tmp_rule;
ilb_rule_tq_t *arg;
int err;
mutex_enter(&ilbs->ilbs_g_lock);
if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
&err)) == NULL) {
mutex_exit(&ilbs->ilbs_g_lock);
return (err);
}
/*
* First remove the rule from the hash array and the global list so
* that no one can find this rule any more.
*/
ilb_rule_hash_del(tmp_rule);
ilb_rule_g_del(ilbs, tmp_rule);
mutex_exit(&ilbs->ilbs_g_lock);
ILB_RULE_REFRELE(tmp_rule);
/*
* Now that no one can find this rule, we can remove it once all
* references to it are dropped and all references to the list
* of servers are dropped. So dispatch a task to finish the deletion.
* We do this instead of letting the last one referencing the
* rule do it. The reason is that the last one may be the
* interrupt thread. We want to minimize the work it needs to
* do. Rule deletion is not a critical task so it can be delayed.
*/
arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
arg->ilbs = ilbs;
arg->rule = tmp_rule;
(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
TQ_SLEEP);
return (0);
}
/*
* Given an IP address, check to see if there is a rule using this
* as the VIP. It can be used to check if we need to drop a fragment.
*/
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
{
int i;
ilb_rule_t *rule;
boolean_t ret = B_FALSE;
i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
ilbs->ilbs_rule_hash_size);
mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
rule = rule->ir_hash_next) {
if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
mutex_enter(&rule->ir_lock);
if (rule->ir_flags & ILB_RULE_BUSY) {
mutex_exit(&rule->ir_lock);
break;
}
if (ret_rule != NULL) {
rule->ir_refcnt++;
mutex_exit(&rule->ir_lock);
*ret_rule = rule;
} else {
mutex_exit(&rule->ir_lock);
}
ret = B_TRUE;
break;
}
}
mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
return (ret);
}
boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
{
int i;
ilb_rule_t *rule;
boolean_t ret = B_FALSE;
i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
rule = rule->ir_hash_next) {
if (rule->ir_target_v6.s6_addr32[3] == addr) {
mutex_enter(&rule->ir_lock);
if (rule->ir_flags & ILB_RULE_BUSY) {
mutex_exit(&rule->ir_lock);
break;
}
if (ret_rule != NULL) {
rule->ir_refcnt++;
mutex_exit(&rule->ir_lock);
*ret_rule = rule;
} else {
mutex_exit(&rule->ir_lock);
}
ret = B_TRUE;
break;
}
}
mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
return (ret);
}
static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
int *err)
{
ilb_rule_t *tmp_rule;
ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
tmp_rule = tmp_rule->ir_next) {
if (tmp_rule->ir_zoneid != zoneid)
continue;
if (strcasecmp(tmp_rule->ir_name, name) == 0) {
mutex_enter(&tmp_rule->ir_lock);
if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
mutex_exit(&tmp_rule->ir_lock);
*err = EINPROGRESS;
return (NULL);
}
tmp_rule->ir_refcnt++;
mutex_exit(&tmp_rule->ir_lock);
*err = 0;
return (tmp_rule);
}
}
*err = ENOENT;
return (NULL);
}
/* Find a rule with a given name and zone in the global rule list. */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
int *err)
{
ilb_rule_t *tmp_rule;
mutex_enter(&ilbs->ilbs_g_lock);
tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
mutex_exit(&ilbs->ilbs_g_lock);
return (tmp_rule);
}
/*
 * Check whether the given rule spec (name, zone, protocols, port range
 * and VIP) duplicates or overlaps an existing rule.
 */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
ilb_rule_t *tmp_rule;
ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
tmp_rule = tmp_rule->ir_next) {
if (tmp_rule->ir_zoneid != zoneid)
continue;
/*
* We don't allow the same name in different rules even if all
* the other rule components are different.
*/
if (strcasecmp(tmp_rule->ir_name, name) == 0)
return (B_TRUE);
if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
continue;
/*
 * This is the standard interval overlap test; two port ranges
 * conflict unless one lies entirely below the other. For example,
 * an existing rule on ports 80-89 conflicts with a new one on
 * 85-100, but not with one on 90-100. A single port rule
 * (ir_port_range is false) simply has ir_min_port == ir_max_port.
 */
if (tmp_rule->ir_max_port < min_port ||
tmp_rule->ir_min_port > max_port) {
continue;
}
/*
* If l3 is IPv4, the addr passed in is assumed to be a
* v4-mapped address.
*/
if (V6_OR_V4_INADDR_ANY(*addr) ||
V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
return (B_TRUE);
}
}
return (B_FALSE);
}
int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
const char *rule_name, ilb_rule_t *in_rule)
{
ilb_rule_t *rule;
int err;
ASSERT((in_rule == NULL && rule_name != NULL) ||
(in_rule != NULL && rule_name == NULL));
if ((rule = in_rule) == NULL) {
if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
&err)) == NULL) {
return (err);
}
}
mutex_enter(&rule->ir_lock);
rule->ir_flags |= ILB_RULE_ENABLED;
mutex_exit(&rule->ir_lock);
/* Only refrele if the rule is passed in. */
if (in_rule == NULL)
ILB_RULE_REFRELE(rule);
return (0);
}
int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
const char *rule_name, ilb_rule_t *in_rule)
{
ilb_rule_t *rule;
int err;
ASSERT((in_rule == NULL && rule_name != NULL) ||
(in_rule != NULL && rule_name == NULL));
if ((rule = in_rule) == NULL) {
if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
&err)) == NULL) {
return (err);
}
}
mutex_enter(&rule->ir_lock);
rule->ir_flags &= ~ILB_RULE_ENABLED;
mutex_exit(&rule->ir_lock);
/* Only refrele if the rule is passed in. */
if (in_rule == NULL)
ILB_RULE_REFRELE(rule);
return (0);
}
/*
* XXX We should probably have a walker function to walk all rules. For
* now, just add a simple loop for enable/disable/del.
*/
void
ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
ilb_rule_t *rule;
mutex_enter(&ilbs->ilbs_g_lock);
for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
if (rule->ir_zoneid != zoneid)
continue;
/*
* No need to hold the rule as we are holding the global
* lock so it won't go away. Ignore the return value here
* as the rule is provided so the call cannot fail.
*/
(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
}
mutex_exit(&ilbs->ilbs_g_lock);
}
void
ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
ilb_rule_t *rule;
mutex_enter(&ilbs->ilbs_g_lock);
for (rule = ilbs->ilbs_rule_head; rule != NULL;
rule = rule->ir_next) {
if (rule->ir_zoneid != zoneid)
continue;
(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
}
mutex_exit(&ilbs->ilbs_g_lock);
}
void
ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
ilb_rule_t *rule;
ilb_rule_tq_t *arg;
mutex_enter(&ilbs->ilbs_g_lock);
rule = ilbs->ilbs_rule_head;
while (rule != NULL) {
/*
 * Skip rules of other zones; re-reading the list head here
 * would otherwise spin forever on them.
 */
if (rule->ir_zoneid != zoneid) {
rule = rule->ir_next;
continue;
}
ilb_rule_hash_del(rule);
ilb_rule_g_del(ilbs, rule);
mutex_exit(&ilbs->ilbs_g_lock);
arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
arg->ilbs = ilbs;
arg->rule = rule;
(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
arg, TQ_SLEEP);
mutex_enter(&ilbs->ilbs_g_lock);
/* The list may have changed while the lock was dropped. */
rule = ilbs->ilbs_rule_head;
}
mutex_exit(&ilbs->ilbs_g_lock);
}
/*
* This is just an optimization, so don't grab the global lock. The
* worst case is that we miss a couple of packets.
*/
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
return (ilbs->ilbs_rule_head != NULL);
}
static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
ilb_server_t *tmp_server;
int ret;
ASSERT((rule == NULL && rule_name != NULL) ||
(rule != NULL && rule_name == NULL));
if (rule == NULL) {
if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
&ret)) == NULL) {
return (ret);
}
}
/* Once we get a hold on the rule, no server can be added/deleted. */
for (tmp_server = rule->ir_servers; tmp_server != NULL;
tmp_server = tmp_server->iser_next) {
if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
break;
}
if (tmp_server == NULL) {
ret = ENOENT;
goto done;
}
if (enable) {
ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
rule->ir_alg->ilb_alg_data);
if (ret == 0) {
tmp_server->iser_enabled = B_TRUE;
tmp_server->iser_die_time = 0;
}
} else {
ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
rule->ir_alg->ilb_alg_data);
if (ret == 0) {
tmp_server->iser_enabled = B_FALSE;
if (rule->ir_conn_drain_timeout != 0) {
(void) atomic_swap_64(
(uint64_t *)&tmp_server->iser_die_time,
lbolt64 + SEC_TO_TICK(
rule->ir_conn_drain_timeout));
}
}
}
done:
if (rule_name != NULL)
ILB_RULE_REFRELE(rule);
return (ret);
}
int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
ilb_rule_t *rule, in6_addr_t *addr)
{
return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}
int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
ilb_rule_t *rule, in6_addr_t *addr)
{
return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}
/*
* Add a back end server to a rule. If the address is IPv4, it is assumed
* to be passed in as a mapped address.
*/
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
ilb_server_t *server;
netstackid_t stackid;
int ret = 0;
in_port_t min_port, max_port;
in_port_t range;
/* Port is passed in network byte order. */
min_port = ntohs(info->min_port);
max_port = ntohs(info->max_port);
if (min_port > max_port)
return (EINVAL);
/* min_port == 0 means "all ports". Make it so */
if (min_port == 0) {
min_port = 1;
max_port = 65535;
}
range = max_port - min_port;
mutex_enter(&rule->ir_lock);
/* If someone is already doing a server add/del, sleep and wait. */
while (rule->ir_flags & ILB_RULE_BUSY) {
if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
mutex_exit(&rule->ir_lock);
return (EINTR);
}
}
/*
* Set the rule to be busy to make sure that no new packet can
* use this rule.
*/
rule->ir_flags |= ILB_RULE_BUSY;
/*
 * Now wait for all other references to drain; a count of 2 is the
 * quiescent state here (the rule's own reference plus the caller's
 * hold).
 */
while (rule->ir_refcnt > 2) {
if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
mutex_exit(&rule->ir_lock);
ret = EINTR;
goto end;
}
}
mutex_exit(&rule->ir_lock);
/* Sanity checks... */
if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
rule->ir_ipver != IPPROTO_IP) ||
(!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
rule->ir_ipver != IPPROTO_IPV6)) {
ret = EINVAL;
goto end;
}
/*
* Check for valid port range.
*
* For DSR, there can be no port shifting. Hence the server
* specification must be the same as the rule's.
*
* For half-NAT/NAT, the range must either be 0 (port collapsing) or
* the same as the rule's port range.
*
*/
if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
if (rule->ir_max_port != max_port ||
rule->ir_min_port != min_port) {
ret = EINVAL;
goto end;
}
} else {
if ((range != rule->ir_max_port - rule->ir_min_port) &&
range != 0) {
ret = EINVAL;
goto end;
}
}
/* Check for duplicate. */
for (server = rule->ir_servers; server != NULL;
server = server->iser_next) {
if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
strcasecmp(server->iser_name, info->name) == 0) {
break;
}
}
if (server != NULL) {
ret = EEXIST;
goto end;
}
if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
ret = ENOMEM;
goto end;
}
(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
sizeof (server->iser_ip_addr));
stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
if (server->iser_ksp == NULL) {
kmem_free(server, sizeof (ilb_server_t));
ret = EINVAL;
goto end;
}
server->iser_stackid = stackid;
server->iser_addr_v6 = info->addr;
server->iser_min_port = min_port;
server->iser_max_port = max_port;
if (min_port != max_port)
server->iser_port_range = B_TRUE;
else
server->iser_port_range = B_FALSE;
/*
* If the rule uses NAT, find/create the NAT source entry to use
* for this server.
*/
if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
in_port_t port;
/*
* If the server uses a port range, our port allocation
* scheme needs to treat it as a wildcard. Refer to the
* comments in ilb_nat.c about the scheme.
*/
if (server->iser_port_range)
port = 0;
else
port = server->iser_min_port;
if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
&server->iser_addr_v6, port, &rule->ir_nat_src_start,
num_nat_src_v6(&rule->ir_nat_src_start,
&rule->ir_nat_src_end))) != 0) {
kstat_delete_netstack(server->iser_ksp, stackid);
kmem_free(server, sizeof (ilb_server_t));
goto end;
}
}
/*
* The iser_lock is only used to protect iser_refcnt. All the other
* fields in ilb_server_t should not change, except for iser_enabled.
* The worst thing that can happen if iser_enabled is messed up is
* that one or two packets may not be load balanced to a server
* correctly.
*/
server->iser_refcnt = 1;
server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
B_FALSE;
mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);
/* Let the load balancing algorithm know about the addition. */
ASSERT(rule->ir_alg != NULL);
if ((ret = rule->ir_alg->ilb_alg_server_add(server,
rule->ir_alg->ilb_alg_data)) != 0) {
kstat_delete_netstack(server->iser_ksp, stackid);
kmem_free(server, sizeof (ilb_server_t));
goto end;
}
/*
* No need to hold ir_lock since no other thread should manipulate
* the following fields until ILB_RULE_BUSY is cleared.
*/
if (rule->ir_servers == NULL) {
server->iser_next = NULL;
} else {
server->iser_next = rule->ir_servers;
}
rule->ir_servers = server;
ILB_R_KSTAT(rule, num_servers);
end:
mutex_enter(&rule->ir_lock);
rule->ir_flags &= ~ILB_RULE_BUSY;
cv_signal(&rule->ir_cv);
mutex_exit(&rule->ir_lock);
return (ret);
}
/* The routine executed by the delayed rule processing taskq. */
static void
ilb_server_del_tq(void *arg)
{
ilb_server_t *server = (ilb_server_t *)arg;
mutex_enter(&server->iser_lock);
while (server->iser_refcnt > 1)
cv_wait(&server->iser_cv, &server->iser_lock);
kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
kmem_free(server, sizeof (ilb_server_t));
}
/*
* Delete a back end server from a rule. If the address is IPv4, it is assumed
* to be passed in as a mapped address.
*/
int
ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
ilb_rule_t *rule, in6_addr_t *addr)
{
ilb_server_t *server;
ilb_server_t *prev_server;
int ret = 0;
ASSERT((rule == NULL && rule_name != NULL) ||
(rule != NULL && rule_name == NULL));
if (rule == NULL) {
if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
&ret)) == NULL) {
return (ret);
}
}
mutex_enter(&rule->ir_lock);
/* If someone is already doing a server add/del, sleep and wait. */
while (rule->ir_flags & ILB_RULE_BUSY) {
if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
if (rule_name != NULL) {
if (--rule->ir_refcnt <= 2)
cv_signal(&rule->ir_cv);
}
mutex_exit(&rule->ir_lock);
return (EINTR);
}
}
/*
* Set the rule to be busy to make sure that no new packet can
* use this rule.
*/
rule->ir_flags |= ILB_RULE_BUSY;
/* Now wait for all other references to the rule to drain. */
while (rule->ir_refcnt > 2) {
if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
mutex_exit(&rule->ir_lock);
ret = EINTR;
goto end;
}
}
mutex_exit(&rule->ir_lock);
prev_server = NULL;
for (server = rule->ir_servers; server != NULL;
prev_server = server, server = server->iser_next) {
if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
break;
}
if (server == NULL) {
ret = ENOENT;
goto end;
}
/*
* Let the load balancing algorithm know about the removal.
* The algorithm may disallow the removal...
*/
if ((ret = rule->ir_alg->ilb_alg_server_del(server,
rule->ir_alg->ilb_alg_data)) != 0) {
goto end;
}
if (prev_server == NULL)
rule->ir_servers = server->iser_next;
else
prev_server->iser_next = server->iser_next;
ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
/*
* Mark the server as disabled so that if there is any sticky cache
* using this server around, it won't be used.
*/
server->iser_enabled = B_FALSE;
mutex_enter(&server->iser_lock);
/*
* De-allocate the NAT source array. The individual ilb_nat_src_entry_t
* may not go away if there is still a conn using it. The NAT source
* timer will do the garbage collection.
*/
ilb_destroy_nat_src(&server->iser_nat_src);
/* If there is a hard limit on when a server should die, set it. */
if (rule->ir_conn_drain_timeout != 0) {
(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
lbolt64 + SEC_TO_TICK(rule->ir_conn_drain_timeout));
}
if (server->iser_refcnt > 1) {
(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
server, TQ_SLEEP);
mutex_exit(&server->iser_lock);
} else {
kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
kmem_free(server, sizeof (ilb_server_t));
}
end:
mutex_enter(&rule->ir_lock);
rule->ir_flags &= ~ILB_RULE_BUSY;
if (rule_name != NULL)
rule->ir_refcnt--;
cv_signal(&rule->ir_cv);
mutex_exit(&rule->ir_lock);
return (ret);
}
/*
* First check if the destination of the ICMP message matches a VIP of
* a rule. If it does not, just return ILB_PASSED.
*
* If the destination matches a VIP:
*
* For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
* server.
*
* For ICMP_DEST_UNREACHABLE with code "fragmentation needed", look inside
* the payload to determine which back end server this message should be
* sent to, and NAT both the payload message and the outer IP packet.
*
* For other ICMP messages, drop them.
*/
/* ARGSUSED */
static int
ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
icmph_t *icmph, ipaddr_t *lb_dst)
{
ipaddr_t vip;
ilb_rule_t *rule;
in6_addr_t addr6;
if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
return (ILB_PASSED);
if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
ILB_R_KSTAT(rule, icmp_dropped);
ILB_RULE_REFRELE(rule);
return (ILB_DROPPED);
}
switch (icmph->icmph_type) {
case ICMP_ECHO_REQUEST:
ILB_R_KSTAT(rule, icmp_echo_processed);
ILB_RULE_REFRELE(rule);
icmph->icmph_type = ICMP_ECHO_REPLY;
icmph->icmph_checksum = 0;
icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
ipha->ipha_ttl =
ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
*lb_dst = ipha->ipha_src;
vip = ipha->ipha_dst;
ipha->ipha_dst = ipha->ipha_src;
ipha->ipha_src = vip;
return (ILB_BALANCED);
case ICMP_DEST_UNREACHABLE: {
int ret;
if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
ILB_R_KSTAT(rule, icmp_dropped);
ILB_RULE_REFRELE(rule);
return (ILB_DROPPED);
}
if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
&addr6)) {
ILB_R_KSTAT(rule, icmp_2big_processed);
ret = ILB_BALANCED;
} else {
ILB_R_KSTAT(rule, icmp_2big_dropped);
ret = ILB_DROPPED;
}
ILB_RULE_REFRELE(rule);
IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
return (ret);
}
default:
ILB_R_KSTAT(rule, icmp_dropped);
ILB_RULE_REFRELE(rule);
return (ILB_DROPPED);
}
}
/* ARGSUSED */
static int
ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
icmp6_t *icmp6, in6_addr_t *lb_dst)
{
ilb_rule_t *rule;
if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
return (ILB_PASSED);
if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
ILB_R_KSTAT(rule, icmp_dropped);
ILB_RULE_REFRELE(rule);
return (ILB_DROPPED);
}
switch (icmp6->icmp6_type) {
case ICMP6_ECHO_REQUEST: {
int hdr_len;
ILB_R_KSTAT(rule, icmp_echo_processed);
ILB_RULE_REFRELE(rule);
icmp6->icmp6_type = ICMP6_ECHO_REPLY;
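/*
 * Seed the checksum field with the payload length as it sits in the
 * header so that the IP_CSUM() below ends up covering the full
 * pseudo-header, with ilb_pseudo_sum_v6() contributing the addresses
 * and the next header value.
 */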
icmp6->icmp6_cksum = ip6h->ip6_plen;
hdr_len = (char *)icmp6 - (char *)ip6h;
icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
ip6h->ip6_hops =
ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
*lb_dst = ip6h->ip6_src;
ip6h->ip6_src = ip6h->ip6_dst;
ip6h->ip6_dst = *lb_dst;
return (ILB_BALANCED);
}
case ICMP6_PACKET_TOO_BIG: {
int ret;
if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
lb_dst)) {
ILB_R_KSTAT(rule, icmp_2big_processed);
ret = ILB_BALANCED;
} else {
ILB_R_KSTAT(rule, icmp_2big_dropped);
ret = ILB_DROPPED;
}
ILB_RULE_REFRELE(rule);
return (ret);
}
default:
ILB_R_KSTAT(rule, icmp_dropped);
ILB_RULE_REFRELE(rule);
return (ILB_DROPPED);
}
}
/*
* Common routine to check an incoming packet and decide what to do with it.
* Called by ilb_check_v4|v6().
*/
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
in6_addr_t *lb_dst)
{
in_port_t sport, dport;
tcpha_t *tcph;
udpha_t *udph;
ilb_rule_t *rule;
ilb_server_t *server;
boolean_t balanced;
struct ilb_sticky_s *s = NULL;
int ret;
uint32_t ip_sum, tp_sum;
ilb_nat_info_t info;
uint16_t nat_src_idx;
boolean_t busy;
/*
* We don't really need to switch here since both protocols'
* ports are at the same offset. Just prepare for future protocol
* specific processing.
*/
switch (l4) {
case IPPROTO_TCP:
if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
return (ILB_DROPPED);
tcph = (tcpha_t *)tph;
sport = tcph->tha_lport;
dport = tcph->tha_fport;
break;
case IPPROTO_UDP:
if (tph + sizeof (udpha_t) > mp->b_wptr)
return (ILB_DROPPED);
udph = (udpha_t *)tph;
sport = udph->uha_src_port;
dport = udph->uha_dst_port;
break;
default:
return (ILB_PASSED);
}
/* Fast path, there is an existing conn. */
if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
pkt_len, lb_dst)) {
return (ILB_BALANCED);
}
/*
* If there is no existing connection for the incoming packet, check
* to see if the packet matches a rule. If not, just let IP decide
* what to do with it.
*
* Note: a reply from a back end server should not match a rule; a
* reply should match an existing conn.
*/
rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
pkt_len, &busy);
if (rule == NULL) {
/* If the rule is busy, just drop the packet. */
if (busy)
return (ILB_DROPPED);
else
return (ILB_PASSED);
}
/*
* The packet matches a rule; use the rule's load balancing algorithm
* to find a server.
*/
balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
rule->ir_alg->ilb_alg_data, &server);
/*
* This can only happen if there is no server in a rule or all
* the servers are currently disabled.
*/
if (!balanced)
goto no_server;
/*
* If the rule is sticky enabled, we need to check the sticky table.
* If there is a sticky entry for the client, use the previous server
* instead of the one found above (note that both can be the same).
* If there is no entry for that client, add an entry to the sticky
* table. Both the find and the add are done in ilb_sticky_find_add()
* to avoid a separate duplicate check when adding an entry.
*/
if (rule->ir_flags & ILB_RULE_STICKY) {
in6_addr_t addr;
V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
&s, &nat_src_idx)) == NULL) {
ILB_R_KSTAT(rule, nomem_pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
goto no_server;
}
}
/*
* We are holding a reference on the rule, so the server
* cannot go away.
*/
*lb_dst = server->iser_addr_v6;
ILB_S_KSTAT(server, pkt_processed);
ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
switch (rule->ir_topo) {
case ILB_TOPO_IMPL_NAT: {
ilb_nat_src_entry_t *src_ent;
uint16_t *src_idx;
/*
* We create a cache even if it is not a SYN segment.
* The server should return a RST. When we see the
* RST, we will destroy this cache. But by having
* a cache, we know how to NAT the returned RST.
*/
info.vip = *dst;
info.dport = dport;
info.src = *src;
info.sport = sport;
/* If stickiness is enabled, use the same source address */
if (s != NULL)
src_idx = &nat_src_idx;
else
src_idx = NULL;
if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
&info.nat_src, &info.nat_sport, src_idx)) == NULL) {
if (s != NULL)
ilb_sticky_refrele(s);
ILB_R_KSTAT(rule, pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
ILB_R_KSTAT(rule, noport_pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
ret = ILB_DROPPED;
break;
}
info.src_ent = src_ent;
info.nat_dst = server->iser_addr_v6;
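/*
 * Port shifting: e.g. if the rule covers ports 80-89 and the server
 * 8080-8089, dport 85 is mapped to 8085; if either side is a single
 * port, everything collapses to the server's minimum port. The same
 * logic applies in the half-NAT case below.
 */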
if (rule->ir_port_range && server->iser_port_range) {
info.nat_dport = htons(ntohs(dport) -
rule->ir_min_port + server->iser_min_port);
} else {
info.nat_dport = htons(server->iser_min_port);
}
/*
* If ilb_conn_add() fails, it will release the reference on
* sticky info and de-allocate the NAT source port allocated
* above.
*/
if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
dport, &info, &ip_sum, &tp_sum, s) != 0) {
ILB_R_KSTAT(rule, pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
ILB_R_KSTAT(rule, nomem_pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
ret = ILB_DROPPED;
break;
}
ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
ret = ILB_BALANCED;
break;
}
case ILB_TOPO_IMPL_HALF_NAT:
info.vip = *dst;
info.nat_dst = server->iser_addr_v6;
info.dport = dport;
if (rule->ir_port_range && server->iser_port_range) {
info.nat_dport = htons(ntohs(dport) -
rule->ir_min_port + server->iser_min_port);
} else {
info.nat_dport = htons(server->iser_min_port);
}
if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
dport, &info, &ip_sum, &tp_sum, s) != 0) {
ILB_R_KSTAT(rule, pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
ILB_R_KSTAT(rule, nomem_pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
ret = ILB_DROPPED;
break;
}
ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
ret = ILB_BALANCED;
break;
case ILB_TOPO_IMPL_DSR:
/*
* By decrementing the sticky refcnt, the period of
* stickiness (life time of ilb_sticky_t) will be
* from now to (now + default expiry time).
*/
if (s != NULL)
ilb_sticky_refrele(s);
ret = ILB_BALANCED;
break;
default:
cmn_err(CE_PANIC, "data corruption unknown topology: %p",
(void *) rule);
break;
}
ILB_RULE_REFRELE(rule);
return (ret);
no_server:
/* This can only happen if there is no server available. */
ILB_R_KSTAT(rule, pkt_dropped);
ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
ILB_RULE_REFRELE(rule);
return (ILB_DROPPED);
}
int
ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
uint8_t *tph, ipaddr_t *lb_dst)
{
in6_addr_t v6_src, v6_dst, v6_lb_dst;
int ret;
ASSERT(DB_REF(mp) == 1);
if (l4 == IPPROTO_ICMP) {
return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
lb_dst));
}
IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
tph, ntohs(ipha->ipha_length), &v6_lb_dst);
if (ret == ILB_BALANCED)
IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
return (ret);
}
int
ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
uint8_t *tph, in6_addr_t *lb_dst)
{
uint32_t pkt_len;
ASSERT(DB_REF(mp) == 1);
if (l4 == IPPROTO_ICMPV6) {
return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
lb_dst));
}
pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
}
void
ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
{
ilb_rule_t *tmp_rule;
mutex_enter(&ilbs->ilbs_g_lock);
*num_rules = 0;
for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
tmp_rule = tmp_rule->ir_next) {
if (tmp_rule->ir_zoneid == zoneid)
*num_rules += 1;
}
mutex_exit(&ilbs->ilbs_g_lock);
}
int
ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
uint32_t *num_servers)
{
ilb_rule_t *rule;
int err;
if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
return (err);
*num_servers = rule->ir_kstat.num_servers.value.ui64;
ILB_RULE_REFRELE(rule);
return (0);
}
int
ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
ilb_server_info_t *servers, uint32_t *num_servers)
{
ilb_rule_t *rule;
ilb_server_t *server;
size_t cnt;
int err;
if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
return (err);
for (server = rule->ir_servers, cnt = *num_servers;
server != NULL && cnt > 0;
server = server->iser_next, cnt--, servers++) {
(void) memcpy(servers->name, server->iser_name,
ILB_SERVER_NAMESZ);
servers->addr = server->iser_addr_v6;
servers->min_port = htons(server->iser_min_port);
servers->max_port = htons(server->iser_max_port);
servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
servers->err = 0;
}
ILB_RULE_REFRELE(rule);
*num_servers -= cnt;
return (0);
}
void
ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
char *buf)
{
ilb_rule_t *tmp_rule;
int cnt;
if (*num_names == 0)
return;
mutex_enter(&ilbs->ilbs_g_lock);
for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
tmp_rule = tmp_rule->ir_next) {
if (tmp_rule->ir_zoneid != zoneid)
continue;
(void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
buf += ILB_RULE_NAMESZ;
if (++cnt == *num_names)
break;
}
mutex_exit(&ilbs->ilbs_g_lock);
*num_names = cnt;
}
int
ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
{
ilb_rule_t *rule;
int err;
if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
return (err);
}
/*
 * Except for the enabled flag, none of the following will change
 * during the lifetime of a rule, so we don't hold the mutex while
 * reading them. The worst that can happen is reporting a stale
 * enabled flag.
 */
cmd->ip_ver = rule->ir_ipver;
cmd->proto = rule->ir_proto;
cmd->min_port = htons(rule->ir_min_port);
cmd->max_port = htons(rule->ir_max_port);
cmd->vip = rule->ir_target_v6;
cmd->algo = rule->ir_alg_type;
cmd->topo = rule->ir_topo;
cmd->nat_src_start = rule->ir_nat_src_start;
cmd->nat_src_end = rule->ir_nat_src_end;
cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
cmd->nat_expiry = rule->ir_nat_expiry;
cmd->sticky_expiry = rule->ir_sticky_expiry;
cmd->flags = 0;
if (rule->ir_flags & ILB_RULE_ENABLED)
cmd->flags |= ILB_RULE_ENABLED;
if (rule->ir_flags & ILB_RULE_STICKY) {
cmd->flags |= ILB_RULE_STICKY;
cmd->sticky_mask = rule->ir_sticky_mask;
}
ILB_RULE_REFRELE(rule);
return (0);
}
static void *
ilb_stack_init(netstackid_t stackid, netstack_t *ns)
{
ilb_stack_t *ilbs;
char tq_name[TASKQ_NAMELEN];
ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
ilbs->ilbs_netstack = ns;
ilbs->ilbs_rule_head = NULL;
ilbs->ilbs_g_hash = NULL;
mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
kmem_free(ilbs, sizeof (ilb_stack_t));
return (NULL);
}
/*
* ilbs_conn/sticky_hash related info is initialized in
* ilb_conn/sticky_hash_init().
*/
ilbs->ilbs_conn_taskq = NULL;
ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
ilbs->ilbs_c2s_conn_hash = NULL;
ilbs->ilbs_s2c_conn_hash = NULL;
ilbs->ilbs_conn_timer_list = NULL;
ilbs->ilbs_sticky_hash = NULL;
ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
ilbs->ilbs_sticky_timer_list = NULL;
ilbs->ilbs_sticky_taskq = NULL;
/* The allocation is done later when there is a rule using NAT mode. */
ilbs->ilbs_nat_src = NULL;
ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
ilbs->ilbs_nat_src_tid = 0;
/* For listing the conn hash table */
mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
ilbs->ilbs_conn_list_busy = B_FALSE;
ilbs->ilbs_conn_list_cur = 0;
ilbs->ilbs_conn_list_connp = NULL;
/* For listing the sticky hash table */
mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
ilbs->ilbs_sticky_list_busy = B_FALSE;
ilbs->ilbs_sticky_list_cur = 0;
ilbs->ilbs_sticky_list_curp = NULL;
(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
(void *)ns);
ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
return (ilbs);
}
/* ARGSUSED */
static void
ilb_stack_shutdown(netstackid_t stackid, void *arg)
{
ilb_stack_t *ilbs = (ilb_stack_t *)arg;
ilb_rule_t *tmp_rule;
ilb_sticky_hash_fini(ilbs);
ilb_conn_hash_fini(ilbs);
mutex_enter(&ilbs->ilbs_g_lock);
while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
ilb_rule_hash_del(tmp_rule);
ilb_rule_g_del(ilbs, tmp_rule);
mutex_exit(&ilbs->ilbs_g_lock);
ilb_rule_del_common(ilbs, tmp_rule);
mutex_enter(&ilbs->ilbs_g_lock);
}
mutex_exit(&ilbs->ilbs_g_lock);
if (ilbs->ilbs_nat_src != NULL)
ilb_nat_src_fini(ilbs);
}
static void
ilb_stack_fini(netstackid_t stackid, void *arg)
{
ilb_stack_t *ilbs = (ilb_stack_t *)arg;
ilb_rule_hash_fini(ilbs);
taskq_destroy(ilbs->ilbs_rule_taskq);
ilb_kstat_g_fini(stackid, ilbs);
kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
kmem_free(ilbs, sizeof (ilb_stack_t));
}
void
ilb_ddi_g_init(void)
{
netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
ilb_stack_fini);
}
void
ilb_ddi_g_destroy(void)
{
netstack_unregister(NS_ILB);
ilb_conn_cache_fini();
ilb_sticky_cache_fini();
}