/* ip_squeue.c revision 7c478bd95313f5f23a4c958a745db2134aa03244 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* IP interface to squeues.
*
* IP creates an squeue instance for each CPU. The squeue pointer is saved in
* cpu_squeue field of the cpu structure. Each squeue is associated with a
* connection instance (conn_t).
*
* For CPUs available at system startup time the squeue creation and association
* with CPU happens at MP initialization time. For CPUs added during dynamic
* reconfiguration, the initialization happens when the new CPU is configured in
* the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
* return per-CPU squeue or random squeue based on the ip_squeue_fanout
* variable.
*
* There are two modes of associating connection with squeues. The first mode
* associates each connection with the CPU that creates the connection (either
* during open time or during accept time). The second mode associates each
* connection with a random CPU, effectively distributing load over all CPUs
* and all squeues in the system. The mode is controlled by the
* ip_squeue_fanout variable.
*
* NOTE: The fact that there is an association between each connection and
* squeue and squeue and CPU does not mean that each connection is always
* processed on this CPU and on this CPU only. Any thread calling squeue_enter()
* may process the connection on whatever CPU it is scheduled. The squeue to CPU
* binding is only relevant for the worker thread.
*
* The list of all created squeues is kept in squeue_set structure. This list is
* used when ip_squeue_fanout is set and the load is distributed across all
* squeues.
*
* INTERFACE:
*
* squeue_t *ip_squeue_get(hint)
*
* Find an squeue based on the 'hint' value. The hint is used as an index
* in the array of IP squeues available. The way hint is computed may
* affect the effectiveness of the squeue distribution. Currently squeues
* are assigned in round-robin fashion using lbolt as a hint.
*
*
* DR Notes
* ========
*
* The ip_squeue_init() registers a call-back function with the CPU DR
* subsystem using register_cpu_setup_func(). The call-back function does two
* things:
*
* o When the CPU is going off-line or unconfigured, the worker thread is
* unbound from the CPU. This allows the CPU unconfig code to move it to
* another CPU.
*
* o When the CPU is going online, it creates a new squeue for this CPU if
* necessary and binds the squeue worker thread to this CPU.
*
* TUNABLES:
*
* ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
* associated with an squeue instance.
*
* ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
* should be compiled with SQUEUE_PROFILE enabled for this variable to have
* an impact.
*
* ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
* otherwise get it from CPU->cpu_squeue.
*
* ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
* changed using ndd on /dev/tcp or /dev/ip.
*
* ip_squeue_worker_wait: global value for the sq_wait field for all squeues
* created. This is the time squeue code waits before waking up the worker
* thread after queuing a request.
*/
#include <inet/ipclassifier.h>
#include <sys/squeue_impl.h>
/*
* We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
* mapping between squeue and NIC (or Rx ring) for performance reasons so
* each squeue can uniquely own a NIC or a Rx ring and do polling
* (PSARC 2004/630). So we allow up to MAX_THREAD_PER_CPU squeues per CPU.
* We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues
* can be created dynamically as needed.
*/
/* Per-CPU squeue count bounds (see the PSARC 2004/630 note above). */
#define MAX_THREAD_PER_CPU 32
#define MIN_THREAD_PER_CPU 1
/*
 * List of all created squeue sets. The size is protected by cpu_lock
 */
/*
 * NOTE(review): the declarations this comment describes (presumably
 * sqset_global_list[] and sqset_global_size, which the functions below
 * reference) are not visible in this copy — confirm against the complete
 * source.
 */
/* 1: bind each squeue worker thread to its associated CPU. */
int ip_squeue_bind = B_TRUE;
/* 1: enable squeue profiling (squeue.c must be built with SQUEUE_PROFILE). */
int ip_squeue_profile = B_TRUE;
/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
/*
 * NOTE(review): the ip_squeue_worker_wait definition itself is not visible
 * here even though the comment above describes it — confirm against the
 * complete source.
 */
/* CPU DR callback and worker-thread (un)bind helpers, defined below. */
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
/*
* Create squeue set containing ip_threads_per_cpu number of squeues
* for this CPU and bind them all to the CPU.
*/
/*
 * Create (or reuse) the squeue set for a CPU and bind its squeues to it
 * (per the comment above).
 *
 * NOTE(review): this copy is incomplete — the function name/parameter line
 * is missing (the body references `reuse`, so the signature presumably
 * takes a cpu pointer and a boolean_t reuse flag), as are the statements
 * declaring `sqs`, matching a reused set to this CPU, allocating the set,
 * creating the individual squeues and binding them. The surviving skeleton
 * is kept byte-for-byte; confirm all claims against the complete source.
 */
static squeue_set_t *
{
int i;
char sqname[64];
if (reuse) {
int i;
/*
* We may already have an squeue created for this CPU. Try to
* find one and reuse it if possible.
*/
for (i = 0; i < sqset_global_size; i++) {
sqs = sqset_global_list[i];
/* NOTE(review): the CPU-match test guarding this return is missing. */
return (sqs);
}
}
/* Create ip_threads_per_cpu squeues; the creation calls are missing here. */
for (i = 0; i < ip_threads_per_cpu; i++) {
if (ip_squeue_create_callback != NULL)
}
if (ip_squeue_bind)
return (sqs);
}
/*
* Initialize IP squeues.
*/
/*
 * Initialize IP squeues at MP-startup time (presumably ip_squeue_init —
 * the name/parameter line is missing from this copy).
 *
 * NOTE(review): also missing: the branch clamping ip_threads_per_cpu to
 * MIN_THREAD_PER_CPU (only the dangling `else if` for the MAX clamp
 * survives), the cpu_lock acquisition, the per-CPU set-creation call
 * inside the NCPU loop, and the register_cpu_setup_func() registration
 * the "DR Notes" header describes. Confirm against the complete source.
 */
void
{
int i;
else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU)
squeue_init();
sqset_global_size = 0;
/* Create squeue for each active CPU available */
for (i = 0; i < NCPU; i++) {
}
}
if (ip_squeue_profile)
}
/*
* Get squeue_t structure based on index.
* Since the squeue list can only grow, no need to grab any lock.
*/
/*
 * Return a squeue_t selected by index into the global squeue list (per the
 * comment above; presumably ip_squeue_random or similar).
 *
 * NOTE(review): the name/parameter line and the entire body are missing
 * from this copy — confirm against the complete source.
 */
squeue_t *
{
}
/* ARGSUSED */
/*
 * Squeue/Rx-ring cleanup handler (presumably ip_squeue_clean — the
 * name/parameter line is missing from this copy).
 *
 * NOTE(review): only an early `return;` (its guard condition is missing)
 * and the stage-marker comments survive; the actual cleanup statements
 * between them are gone. Confirm against the complete source.
 */
void
{
return;
}
/*
* Clean up squeue
*/
/*
* Cleanup the ring
*/
/*
* Signal ill that cleanup is done
*/
}
/*
 * Argument bundle handed to the ip_squeue_extend() system-taskq callback.
 * NOTE(review): the member declarations and the closing `} ip_taskq_arg_t;`
 * of this typedef are missing from this copy — confirm against the
 * complete source.
 */
typedef struct ip_taskq_arg {
/*
* Do a Rx ring to squeue binding. Find a unique squeue that is not
* managing a receive ring. If no such squeue exists, dynamically
* create a new one in the squeue set.
*
* The function runs via the system taskq. The ill passed as an
* argument can't go away since we hold a ref. The lock order is
* ill_lock -> sqs_lock -> sq_lock.
*
* If we are binding a Rx ring to a squeue attached to the offline CPU,
* no need to check that because squeues are never destroyed once
* created.
*/
/* ARGSUSED */
/*
 * Taskq callback: bind a Rx ring to a unique squeue, creating a new squeue
 * in the CPU's set if none is free (see the block comment above).
 *
 * NOTE(review): this copy is incomplete — missing are the local
 * declarations for `sqp`/the ill/the set, the ill_lock acquisition the
 * comment below refers to, the loop scanning for an unused squeue (only
 * its `break`/closing braces survive), the MAX_THREAD_PER_CPU limit test,
 * the squeue_create() call, and the final ill_waiter_dcr() call. Confirm
 * against the complete source.
 */
static void
ip_squeue_extend(void *arg)
{
char sqname[64];
int i;
/*
* If this ill represents link aggregation, then there might be
* multiple NICs trying to register themselves at the same time
* and in order to ensure that test and assignment of free rings
* is sequential, we need to hold the ill_lock.
*/
break;
}
}
/* Need to create a new squeue */
/*
* Reached the max limit for squeue
* we can allocate on this CPU. Leave
* ill_ring_state set to ILL_RING_INPROC
* so that ip_squeue_direct will just
* assign the default squeue for this
* ring for future connections.
*/
#ifdef DEBUG
" threads per CPU for sqp = %p\n", (void *)sqp);
#endif
return;
}
if (ip_squeue_create_callback != NULL)
if (ip_squeue_bind) {
}
}
/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
}
/*
* Find the squeue assigned to manage this Rx ring. If the Rx ring is not
* owned by a squeue yet, do the assignment. When the NIC registers it
* Rx rings with IP, we don't know where the interrupts will land and
* hence we need to wait till this point to do the assignment.
*/
/*
 * Find (or lazily assign) the squeue managing this Rx ring; fall back to
 * IP_SQUEUE_GET(lbolt) whenever binding is not possible (see the comment
 * above; presumably ip_squeue_get(ill_rx_ring)).
 *
 * NOTE(review): this copy is incomplete — the name/parameter line is
 * missing, as are the fast-path load of `sqp`, the interrupt-context /
 * ring-state test, the ill refhold (`refheld`), and the taskq_dispatch()
 * of ip_squeue_extend() that the comments below describe. Confirm against
 * the complete source.
 */
squeue_t *
{
int interrupt;
if (ill_rx_ring == NULL)
return (IP_SQUEUE_GET(lbolt));
/*
* Do a quick check. If it's not NULL, we are done.
* Squeues are never destroyed so worse we will bind
* this connection to a suboptimal squeue.
*
* This is the fast path case.
*/
return (sqp);
/*
* Do the ring to squeue binding only if we are in interrupt
* context and there is no one else trying the bind already.
*/
return (IP_SQUEUE_GET(lbolt));
}
/*
* No sqp assigned yet. Can't really do that in interrupt
* context. Assign the default sqp to this connection and
* trigger creation of new sqp and binding it to this ring
* via taskq. Need to make sure ill stays around.
*/
if (refheld) {
return (IP_SQUEUE_GET(lbolt));
}
}
/*
* The ill is closing and we could not get a reference on the ill OR
* taskq_dispatch failed probably due to memory allocation failure.
* We will try again next time.
*/
if (refheld)
return (IP_SQUEUE_GET(lbolt));
}
/*
* NDD hooks for setting ip_squeue_xxx tuneables.
*/
/* ARGSUSED */
/*
 * NDD set-handler for ip_squeue_bind: (un)bind every squeue set's worker
 * threads when the tunable flips (presumably ip_squeue_bind_set — the
 * name/parameter line is missing from this copy).
 *
 * NOTE(review): also missing: the ddi_strtol() parse of the new value
 * guarding the EINVAL return, the cpu_lock acquisition around the loops,
 * the ip_squeue_set_unbind()/ip_squeue_set_bind() loop bodies, and the
 * `ip_squeue_bind = new_value` assignment. Confirm against the complete
 * source.
 */
int
{
int *bind_enabled = (int *)addr;
long new_value;
int i;
return (EINVAL);
if (ip_squeue_bind == new_value)
return (0);
if (new_value == 0) {
for (i = 0; i < sqset_global_size; i++)
} else {
for (i = 0; i < sqset_global_size; i++)
}
return (0);
}
/*
* Set squeue profiling.
* 0 means "disable"
* 1 means "enable"
* 2 means "enable and reset"
*/
/* ARGSUSED */
/*
 * NDD set-handler for ip_squeue_profile: 0 disables, 1 enables, 2 enables
 * and resets per-squeue statistics (see the comment above; presumably
 * ip_squeue_profile_set — the name/parameter line is missing).
 *
 * NOTE(review): also missing: the ddi_strtol() parse guarding the EINVAL
 * return, the enable/disable statements under each branch, the inner loop
 * resetting each set's squeue stats, and the final store into
 * *profile_enabled. Confirm against the complete source.
 */
int
{
int *profile_enabled = (int *)cp;
long new_value;
return (EINVAL);
if (new_value == 0)
else if (new_value == 1)
else if (new_value == 2) {
int i, j;
for (i = 0; i < sqset_global_size; i++) {
sqs = sqset_global_list[i];
}
}
/* "enable and reset" collapses to plain "enable" once stats are cleared. */
new_value = 1;
}
return (0);
}
/*
* Reconfiguration callback
*/
/* ARGSUSED */
/*
 * CPU DR callback registered via register_cpu_setup_func() (see "DR
 * Notes" in the file header): on CPU online/configure, create the CPU's
 * squeue set if needed and bind its workers; on offline/unconfigure,
 * unbind them so the CPU can be removed.
 *
 * NOTE(review): the parameter line `(cpu_setup_t what, int id, void *arg)`
 * and the statements inside each case (set creation, the bind/unbind
 * calls, cpu_lock assertions) are missing from this copy. The dangling
 * `}` before `if (ip_squeue_bind)` and before the second `break` suggest
 * dropped blocks. Confirm against the complete source.
 */
static int
{
switch (what) {
case CPU_ON:
case CPU_INIT:
case CPU_CPUPART_IN:
/* New CPU! */
}
if (ip_squeue_bind)
break;
case CPU_UNCONFIG:
case CPU_OFF:
case CPU_CPUPART_OUT:
}
break;
default:
break;
}
return (0);
}
/* ARGSUSED */
/*
 * Bind every squeue worker thread in the given set to the set's CPU;
 * no-op when the ip_squeue_bind tunable is off (presumably
 * ip_squeue_set_bind(squeue_set_t *) per the prototype above — the
 * name/parameter line is missing from this copy).
 *
 * NOTE(review): the loop header, the already-bound test guarding the
 * `continue`, and the squeue_bind() call are missing. Confirm against
 * the complete source.
 */
static void
{
int i;
if (!ip_squeue_bind)
return;
continue;
}
}
/*
 * Unbind every squeue worker thread in the given set from its CPU, used
 * when the CPU goes offline or binding is disabled (presumably
 * ip_squeue_set_unbind(squeue_set_t *) per the prototype above — the
 * name/parameter line is missing from this copy).
 *
 * NOTE(review): the loop header, the skip test guarding the `continue`,
 * and the squeue_unbind() call are missing. Confirm against the complete
 * source.
 */
static void
{
int i;
continue;
}
}