ilb_conn.c revision d17b05b6ba5ce4569b13b250fe44164219de8c53
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2014 Joyent, Inc. All rights reserved.
*/
#include <sys/sysmacros.h>
#include <inet/udp_impl.h>
#include "ilb_stack.h"
#include "ilb_impl.h"
#include "ilb_conn.h"
#include "ilb_nat.h"
/*
* Timer struct for ilb_conn_t and ilb_sticky_t garbage collection
*
* start: starting index into the hash table to do gc
* end: ending index into the hash table to do gc
* ilbs: pointer to the ilb_stack_t of the IP stack
* tid_lock: mutex to protect the timer id.
* tid: timer id of the timer
*/
typedef struct ilb_timer_s {
} ilb_timer_t;
/* Hash macro for finding the index to the conn hash table */
((hash_size) - 1))
/* Kmem cache for the conn hash entry */
/*
* There are 60 timers running to do conn cache garbage collection. Each
* gc thread is responsible for 1/60 of the conn hash table.
*/
static int ilb_conn_timer_size = 60;
/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_conn_cache_timeout = 15;
/*
* There are 60 timers running to do sticky cache garbage collection. Each
* gc thread is responsible for 1/60 of the sticky hash table.
*/
static int ilb_sticky_timer_size = 60;
/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_sticky_timeout = 15;
/*
* Drop one reference on the sticky entry (s) and stamp its atime with the
* current lbolt64 value.
*
* The expansion is a bare brace block rather than a do { } while (0)
* statement, and (s) is evaluated more than once, so pass an expression
* without side effects.
*
* NOTE(review): no locking is visible in this expansion -- confirm what
* protects refcnt/atime at the call sites.
*/
#define ILB_STICKY_REFRELE(s) \
{ \
(s)->refcnt--; \
(s)->atime = ddi_get_lbolt64(); \
}
/*
* One-time set-up hook for the conn cache.
*
* NOTE(review): the body is not visible in this view; presumably it creates
* the kmem cache for conn hash entries (ilb_conn_cache) -- TODO confirm.
*/
static void
ilb_conn_cache_init(void)
{
}
/*
* Tear-down counterpart for the conn cache. Takes no arguments and returns
* nothing; it only acts when ilb_conn_cache is non-NULL.
*/
void
ilb_conn_cache_fini(void)
{
/* Nothing to do if the cache pointer was never set. */
if (ilb_conn_cache != NULL) {
/* NOTE(review): guarded work not visible here; presumably destroys the cache -- confirm. */
}
}
static void
{
if (c2s) {
} else {
}
} else {
}
hash->ilb_conn_cnt--;
}
static void
{
}
}
/*
* Routine to do periodic garbage collection of conn hash entries. When
* a conn hash timer fires, it dispatches a taskq to call this function
* to do the gc. Note that each taskq is responisble for a portion of
* the table. The portion is stored in timer->start, timer->end.
*/
static void
ilb_conn_cleanup(void *arg)
{
/*
* arg is an opaque pointer supplied by the dispatcher; presumably the
* ilb_timer_t whose start/end bound this pass -- TODO confirm.
*/
uint32_t i;
/* Capture the current lbolt64 tick count in now. */
now = ddi_get_lbolt64();
continue;
}
do {
else
/* Need to update the nat list cur_connp */
}
goto nxt_connp;
}
goto nxt_connp;
/* Update and check TCP related conn info */
/*
* Client-to-server half: examined only when a FIN has been sent in that
* direction (conn_c2s_tcp_fin_sent); conn_c2s_tcp_fss is part of the
* same test.
*/
if (connp->conn_c2s_tcp_fin_sent &&
connp->conn_c2s_tcp_fss)) {
}
/* Server-to-client half: the same test using the s2c fields. */
if (connp->conn_s2c_tcp_fin_sent &&
connp->conn_s2c_tcp_fss)) {
}
/*
* Gated on the c2s FIN having been acked (conn_c2s_tcp_fin_acked); the
* remainder of this condition is not visible in this view.
*/
if (connp->conn_c2s_tcp_fin_acked &&
}
}
}
/* Conn hash timer routine. It dispatches a taskq and restarts the timer. */
static void
ilb_conn_timer(void *arg)
{
/*
* NOTE(review): the teardown path states that setting tid to 0 tells this
* handler not to restart; the if/else below is presumably that check
* (condition not visible in this view) -- confirm.
*/
} else {
}
}
void
{
extern pri_t minclsyspri;
int i, part;
char tq_name[TASKQ_NAMELEN];
/*
* If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
* the next power of 2.
*/
for (i = 0; i < 31; i++) {
break;
}
}
/*
* Can sleep since this should be called when a rule is being added,
* hence we are not in interrupt context.
*/
for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
}
for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
}
if (ilb_conn_cache == NULL)
(void *)ilbs->ilbs_netstack);
/*
* The hash table is divided into equal partitions for those timers
* to do garbage collection.
*/
for (i = 0; i < ilb_conn_timer_size; i++) {
/* Spread out the starting execution time of all the timers. */
}
}
void
{
uint32_t i;
return;
}
/* Stop all the timers first. */
for (i = 0; i < ilb_conn_timer_size; i++) {
/* Setting tid to 0 tells the timer handler not to restart. */
}
/* Then remove all the conns. */
for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
/*
* src_ent will be freed in ilb_nat_src_fini().
*/
}
}
}
}
/*
* Internet checksum adjustment calculation routines. We pre-calculate
* checksum adjustment so that we don't need to compute the checksum on
* the fly.
*/
static void
{
while ((sum >> 16) != 0)
}
static void
{
while ((sum >> 16) != 0)
}
static void
{
while ((sum >> 16) != 0)
}
static void
{
while ((sum >> 16) != 0)
}
/*
* Add a conn hash entry to the tables. Note that a conn hash entry
* (ilb_conn_t) contains info on both directions. And there are two hash
* tables, one for client to server and the other for server to client.
* So the same entry is added to both tables and can be accessed by two
* threads simultaneously. But each thread will only access data on one
* direction, so there is no conflict.
*/
int
{
int i;
if (s != NULL) {
1);
}
}
return (ENOMEM);
}
connp->conn_sticky = s;
/* Client to server info. */
/* The packet that triggers this creation should be counted */
/* Server to client info, before NAT */
case ILB_TOPO_IMPL_HALF_NAT:
/* Pre-calculate checksum changes for both directions */
&connp->conn_c2s_ip_sum);
&connp->conn_s2c_ip_sum);
&connp->conn_s2c_tp_sum);
} else {
connp->conn_c2s_ip_sum = 0;
*ip_sum = 0;
connp->conn_s2c_ip_sum = 0;
&connp->conn_s2c_tp_sum);
}
break;
case ILB_TOPO_IMPL_NAT:
0, 0, 0, 0, &connp->conn_c2s_ip_sum);
0, 0, 0, 0, &connp->conn_s2c_ip_sum);
} else {
connp->conn_c2s_ip_sum = 0;
*ip_sum = 0;
&connp->conn_s2c_tp_sum);
connp->conn_s2c_ip_sum = 0;
}
break;
}
/* Add it to the s2c hash table. */
hash[i].ilb_conn_cnt++;
/* Add it to the c2s hash table. */
hash[i].ilb_conn_cnt++;
return (0);
}
/*
* If a connection is using TCP, we keep track of simple TCP state transition
* so that we know when to clean up an entry.
*/
static boolean_t
{
return (B_FALSE);
if (c2s) {
}
/* Port reuse by the client, restart the conn. */
if (connp->conn_c2s_tcp_fin_sent &&
}
} else {
}
/* Port reuse by the client, restart the conn. */
if (connp->conn_s2c_tcp_fin_sent &&
}
}
return (B_TRUE);
}
/*
* Helper routine to find a conn hash entry given some packet information and
* the traffic direction (c2s, client to server?)
*/
static boolean_t
{
uint_t i;
if (c2s) {
break;
}
}
} else {
break;
}
}
}
if (ret) {
pkt_len);
switch (l4) {
case (IPPROTO_TCP):
c2s);
if (!tcp_alive) {
}
break;
default:
break;
}
}
return (ret);
}
/*
* To check if a given packet matches an existing conn hash entry. If it
* does, return the information about this entry so that the caller can
* do the proper NAT.
*/
{
/* Check the incoming hash table. */
switch (rule_cache.topo) {
case ILB_TOPO_IMPL_NAT:
break;
case ILB_TOPO_IMPL_HALF_NAT:
break;
default:
break;
}
return (ret);
}
switch (rule_cache.topo) {
case ILB_TOPO_IMPL_NAT:
break;
case ILB_TOPO_IMPL_HALF_NAT:
break;
default:
break;
}
return (ret);
}
return (B_FALSE);
}
/*
* To check if an ICMP packet belongs to a connection in one of the conn
* hash entries.
*/
{
int l4;
uint_t i;
if (l3 == IPPROTO_IP) {
return (B_FALSE);
}
return (B_FALSE);
} else {
return (B_FALSE);
}
/* We don't go deep inside an IPv6 packet yet. */
return (B_FALSE);
}
break;
}
}
return (B_FALSE);
}
switch (rule_cache.topo) {
case ILB_TOPO_IMPL_NAT:
break;
case ILB_TOPO_IMPL_HALF_NAT:
break;
default:
return (B_FALSE);
}
if (l3 == IPPROTO_IP) {
} else {
}
return (B_TRUE);
}
/*
* This routine sends up the conn hash table to user land. Note that the
* request is an ioctl, hence we cannot really differentiate requests
* from different clients. There is no context shared between different
* ioctls. Here we make the assumption that the user land ilbd will
* only allow one client to show the conn hash table at any time.
* Otherwise, the results will be "very" inconsistent.
*
* In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
* to read from the beginning of the table. After a certain number of entries
* are reported, the kernel remembers the position of the last returned
* entry. When the next ioctl comes in with the ILB_LIST_CONT flag,
* it will return entries starting from where it left off. When
* the end of the table is reached, a flag (ILB_LIST_END) is set to tell
* the client that there are no more entries.
*
* It is assumed that the caller has checked the size of nat so that it
* can hold num entries.
*/
/* ARGSUSED */
int
{
uint32_t i, j;
int ret = 0;
while (ilbs->ilbs_conn_list_busy) {
&ilbs->ilbs_conn_list_lock) == 0) {
return (EINTR);
}
}
*num = 0;
*flags |= ILB_LIST_END;
return (0);
}
if (*flags & ILB_LIST_BEGIN) {
i = 0;
} else if (*flags & ILB_LIST_CONT) {
*num = 0;
*flags |= ILB_LIST_END;
goto done;
}
i = ilbs->ilbs_conn_list_cur;
} else {
goto done;
}
j = 0;
while (j < *num) {
if (++i == ilbs->ilbs_conn_hash_size) {
*flags |= ILB_LIST_END;
break;
}
continue;
}
nat[j].last_access_time =
/*
* The conn_s2c_pkt_cnt may not be accurate since we are not
* holding the s2c hash lock.
*/
j++;
}
if (j == *num)
ilbs->ilbs_conn_list_cur = i;
*num = j;
done:
return (ret);
}
/*
* Stickiness (persistence) handling routines.
*/
/*
* One-time set-up hook for the sticky cache.
*
* NOTE(review): the body is not visible in this view; presumably it creates
* the kmem cache backing ilb_sticky_cache -- TODO confirm.
*/
static void
ilb_sticky_cache_init(void)
{
}
/*
* Tear-down counterpart for the sticky cache. Takes no arguments and returns
* nothing; it only acts when ilb_sticky_cache is non-NULL.
*/
void
ilb_sticky_cache_fini(void)
{
/* Nothing to do if the cache pointer was never set. */
if (ilb_sticky_cache != NULL) {
/* NOTE(review): guarded work not visible here; presumably destroys the cache -- confirm. */
}
}
void
{
}
static ilb_sticky_t *
{
ilb_sticky_t *s;
return (s);
}
}
return (NULL);
}
static ilb_sticky_t *
{
ilb_sticky_t *s;
return (NULL);
/*
* The rule instance is for handling the scenario when the same
* client talks to different rules at the same time. Stickiness
* is per rule so we can use the rule instance to differentiate
* the client's request.
*/
/*
* Copy the rule name for listing all sticky cache entries. ir_name
* is guaranteed to be NULL terminated.
*/
/*
* Grab a ref cnt on the server so that it won't go away while
* it is still in the sticky table.
*/
s->refcnt = 1;
/*
* There is no need to set atime here since the refcnt is not
* zero. A sticky entry is removed only when the refcnt is
* zero. But just set it here for debugging purpose. The
* atime is set when a refrele is done on a sticky entry.
*/
s->atime = ddi_get_lbolt64();
hash->sticky_cnt++;
return (s);
}
/*
* This routine checks if there is an existing sticky entry which matches
* a given packet. If there is one, return it. If there is not, create
* a sticky entry using the packet's info.
*/
{
int i;
ilb_sticky_t *s;
/* First check if there is already an entry. */
/* No sticky entry, add one. */
if (s == NULL) {
if (s == NULL) {
return (NULL);
}
/*
* Find a source for this server. All subsequent requests from
* the same client matching this sticky entry will use this
* source address in doing NAT. The current algorithm is
* simple, rotate the source address. Note that the
* source address array does not change after it's created, so
* it is OK to just increment the cur index.
*/
/* It is a hint, does not need to be atomic. */
s->nat_src_idx = *src_ent_idx;
}
*res = s;
return (server);
}
/*
* We don't hold any lock accessing iser_enabled. Refer to the
* comment in ilb_server_add() about iser_lock.
*/
if (!s->server->iser_enabled) {
/*
* s->server == server can only happen if there is a race in
* toggling the iser_enabled flag (we don't hold a lock doing
* that) so that the load balance algorithm still returns a
* disabled server. In this case, just drop the packet...
*/
return (NULL);
}
/*
* The old server is disabled and there is a new server, use
* the new one to create a sticky entry. Since we will
* add the entry at the beginning, subsequent lookup will
* find this new entry instead of the old one.
*/
goto add_new_entry;
}
s->refcnt++;
*res = s;
*src_ent_idx = s->nat_src_idx;
return (s->server);
}
/*
* Garbage-collection pass over sticky hash entries.
*
* arg is an opaque pointer supplied by the dispatcher; presumably the
* ilb_timer_t describing the slice of the table to scan -- TODO confirm.
*
* Entries that still hold references (refcnt != 0) are never reclaimed.
* For an entry that is reclaimed, the reference held on its server is
* released and the bucket's sticky_cnt is decremented.
*/
static void
ilb_sticky_cleanup(void *arg)
{
uint32_t i;
ilb_sticky_t *s, *nxt_s;
/* Capture the current lbolt64 tick count in now. */
now = ddi_get_lbolt64();
/* The walk advances through nxt_s rather than through s itself. */
s = nxt_s) {
/* Still referenced: leave this entry alone. */
if (s->refcnt != 0)
continue;
/*
* NOTE(review): the expiry test that precedes reclamation is not visible
* in this view.
*/
ILB_SERVER_REFRELE(s->server);
hash[i].sticky_cnt--;
}
}
}
}
/*
* Sticky hash timer routine.
*
* NOTE(review): by analogy with the conn hash timer this presumably
* dispatches ilb_sticky_cleanup() and re-arms itself -- the body is not
* visible in this view, confirm.
*/
static void
ilb_sticky_timer(void *arg)
{
/*
* NOTE(review): the teardown path states that setting tid to 0 tells this
* handler not to restart; the if/else below is presumably that check
* (condition not visible in this view) -- confirm.
*/
} else {
}
}
void
{
extern pri_t minclsyspri;
int i, part;
char tq_name[TASKQ_NAMELEN];
for (i = 0; i < 31; i++) {
break;
}
}
for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
sizeof (ilb_sticky_t),
}
if (ilb_sticky_cache == NULL)
(void *)ilbs->ilbs_netstack);
for (i = 0; i < ilb_sticky_timer_size; i++) {
/* Spread out the starting execution time of all the timers. */
SEC_TO_TICK(ilb_sticky_timeout + i));
}
}
void
{
int i;
ilb_sticky_t *s;
return;
/* Stop all the timers first. */
for (i = 0; i < ilb_sticky_timer_size; i++) {
/* Setting tid to 0 tells the timer handler not to restart. */
}
for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
!= NULL) {
ILB_SERVER_REFRELE(s->server);
kmem_free(s, sizeof (ilb_sticky_t));
}
}
sizeof (ilb_sticky_hash_t));
}
/*
* This routine sends up the sticky hash table to user land. Refer to
* the comments before ilb_list_nat(). Both routines assume similar
* conditions.
*
* It is assumed that the caller has checked the size of st so that it
* can hold num entries.
*/
/* ARGSUSED */
int
{
uint32_t i, j;
int ret = 0;
while (ilbs->ilbs_sticky_list_busy) {
&ilbs->ilbs_sticky_list_lock) == 0) {
return (EINTR);
}
}
*num = 0;
*flags |= ILB_LIST_END;
return (0);
}
if (*flags & ILB_LIST_BEGIN) {
i = 0;
} else if (*flags & ILB_LIST_CONT) {
*num = 0;
*flags |= ILB_LIST_END;
goto done;
}
i = ilbs->ilbs_sticky_list_cur;
} else {
goto done;
}
j = 0;
while (j < *num) {
if (++i == ilbs->ilbs_sticky_hash_size) {
*flags |= ILB_LIST_END;
break;
}
continue;
}
j++;
}
if (j == *num)
ilbs->ilbs_sticky_list_cur = i;
*num = j;
done:
return (ret);
}