/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* IP interface to squeues.
*
* IP creates an squeue instance for each CPU. The squeue pointer is saved in
* cpu_squeue field of the cpu structure. Each squeue is associated with a
* connection instance (conn_t).
*
* For CPUs available at system startup time the squeue creation and association
* with CPU happens at MP initialization time. For CPUs added during dynamic
* reconfiguration, the initialization happens when the new CPU is configured in
 * the system. The squeue is chosen using the IP_SQUEUE_GET macro, which returns
 * either the per-CPU squeue or a random squeue based on the ip_squeue_fanout
* variable.
*
* There are two modes of associating connection with squeues. The first mode
* associates each connection with the CPU that creates the connection (either
* during open time or during accept time). The second mode associates each
* connection with a random CPU, effectively distributing load over all CPUs
* and all squeues in the system. The mode is controlled by the
* ip_squeue_fanout variable.
*
* NOTE: The fact that there is an association between each connection and
* squeue and squeue and CPU does not mean that each connection is always
* processed on this CPU and on this CPU only. Any thread calling squeue_enter()
 * may process the connection on whatever CPU it is scheduled on. The squeue to CPU
* binding is only relevant for the worker thread.
*
 * The list of all created squeues is kept in the squeue_set structure. This list is
* used when ip_squeue_fanout is set and the load is distributed across all
* squeues.
*
* INTERFACE:
*
* squeue_t *ip_squeue_get(hint)
*
* Find an squeue based on the 'hint' value. The hint is used as an index
* in the array of IP squeues available. The way hint is computed may
* affect the effectiveness of the squeue distribution. Currently squeues
* are assigned in round-robin fashion using lbolt as a hint.
*
*
* DR Notes
* ========
*
 * ip_squeue_init() registers a callback function with the CPU DR
 * subsystem using register_cpu_setup_func(). The callback function does two
* things:
*
* o When the CPU is going off-line or unconfigured, the worker thread is
* unbound from the CPU. This allows the CPU unconfig code to move it to
* another CPU.
*
* o When the CPU is going online, it creates a new squeue for this CPU if
* necessary and binds the squeue worker thread to this CPU.
*
 * TUNABLES:
*
* ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
* associated with an squeue instance.
*
* ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
* should be compiled with SQUEUE_PROFILE enabled for this variable to have
* an impact.
*
* ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
* otherwise get it from CPU->cpu_squeue.
*
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using the NDD hooks defined at the end of this file.
 *
* ip_squeue_worker_wait: global value for the sq_wait field for all squeues
* created. This is the time squeue code waits before waking up the worker
* thread after queuing a request.
*/
#include <inet/ipclassifier.h>
#include <sys/squeue_impl.h>
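/*
 * A minimal sketch of the hint-based selection described in the block
 * comment above, using hypothetical names (example_pick_squeue, sq_array).
 * With fanout enabled the hint (e.g. lbolt) is folded into the array with a
 * modulo, which gives the round-robin behaviour; with fanout disabled the
 * caller simply uses the per-CPU squeue, which is what CPU->cpu_squeue
 * provides.
 */
static squeue_t *
example_pick_squeue(squeue_t **sq_array, uint_t nsqueues, uint_t hint,
    squeue_t *cpu_squeue, int fanout)
{
	if (!fanout)
		return (cpu_squeue);		/* per-CPU squeue */
	return (sq_array[hint % nsqueues]);	/* round-robin over all squeues */
}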
/*
* We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
* mapping between squeue and NIC (or Rx ring) for performance reasons so
* each squeue can uniquely own a NIC or a Rx ring and do polling
* (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
* We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
* can be created dynamically as needed.
*/
#define MAX_SQUEUES_PER_CPU 32
#define MIN_SQUEUES_PER_CPU 1
#define IP_NUM_SOFT_RINGS 2
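/*
 * A minimal sketch of the per-CPU limit check implied by the comment above,
 * using the constants just defined and a hypothetical current count. A CPU
 * starts with MIN_SQUEUES_PER_CPU squeues and additional ones are created on
 * demand only while the count stays below MAX_SQUEUES_PER_CPU; past that, a
 * new Rx ring falls back to the CPU's default squeue.
 */
static boolean_t
example_can_grow_squeue_set(uint_t cur_squeues)
{
	return ((cur_squeues >= MIN_SQUEUES_PER_CPU &&
	    cur_squeues < MAX_SQUEUES_PER_CPU) ? B_TRUE : B_FALSE);
}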
/*
* List of all created squeue sets. The size is protected by cpu_lock
*/
int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
/*
* ip_squeue_worker_wait: global value for the sq_wait field for all squeues
* created. This is the time squeue code waits before waking up the worker
* thread after queuing a request.
*/
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static void ip_squeue_clean(void *, mblk_t *, void *);
/*
* Create squeue set containing ip_squeues_per_cpu number of squeues
* for this CPU and bind them all to the CPU.
*/
static squeue_set_t *
{
int i;
char sqname[64];
if (reuse) {
int i;
/*
* We may already have an squeue created for this CPU. Try to
* find one and reuse it if possible.
*/
for (i = 0; i < sqset_global_size; i++) {
sqs = sqset_global_list[i];
return (sqs);
}
}
for (i = 0; i < ip_squeues_per_cpu; i++) {
/*
* The first squeue in each squeue_set is the DEFAULT
* squeue.
*/
if (ip_squeue_create_callback != NULL)
}
return (sqs);
}
/*
* Initialize IP squeues.
*/
void
{
int i;
else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
squeue_init();
sqset_global_size = 0;
/* Create squeue for each active CPU available */
for (i = 0; i < NCPU; i++) {
}
}
if (ip_squeue_profile)
}
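/*
 * A sketch of the registration step described in the DR Notes above,
 * assuming the ip_squeue_cpu_setup() callback declared earlier in this file
 * (example_register_dr_callback is a hypothetical name).
 * register_cpu_setup_func() is called with cpu_lock held so that no CPU
 * state change can race with the registration itself.
 */
static void
example_register_dr_callback(void)
{
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}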
/*
* Get squeue_t structure based on index.
* Since the squeue list can only grow, no need to grab any lock.
*/
squeue_t *
{
}
/* ARGSUSED */
static void
{
return;
}
/*
* Clean up squeue
*/
}
/*
* Cleanup the ring
*/
/*
* Signal ill that cleanup is done
*/
}
/*
* Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
* The real cleanup happens behind the squeue via ip_squeue_clean function but
 * we need to protect ourselves from 2 threads trying to clean up at the same
* time (possible with one port going down for aggr and someone tearing down the
* entire aggr simultaneously). So we use ill_inuse_ref protected by ill_lock
* to indicate when the cleanup has started (1 ref) and when the cleanup
* is done (0 ref). When a new ring gets assigned to squeue, we start by
* putting 2 ref on ill_inuse_ref.
*/
static void
{
/* Just clean one squeue */
/*
* Reset the ILL_SOFT_RING_ASSIGN bit so that
 * ip_squeue_soft_ring_affinity() will not go
* ahead with assigning rings.
*/
/* Some operations pending on the ring. Wait */
/*
* Someone already trying to clean
* this squeue or it's already been cleaned.
*/
return;
}
/*
* The rx_ring never had a squeue assigned to it.
* We are under ill_lock so we can clean it up
* here itself since no one can get to it.
*/
return;
}
/* Indicate that it's being cleaned */
/*
* Use the preallocated ill_unbind_conn for this purpose
*/
} else {
"concurrent use of tcp_closemp_used: connp %p tcp %p\n",
}
}
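/*
 * A self-contained sketch of the two-reference handshake described in the
 * comment above, using hypothetical names (example_ring_t, er_lock,
 * er_inuse_ref). The ring starts with a count of 2 when it is assigned; the
 * first thread to start cleanup drops it to 1, and it reaches 0 only when
 * the cleanup has completed, so a concurrent cleaner can tell the
 * difference between "in use", "cleanup in progress" and "cleaned".
 */
typedef struct example_ring {
	kmutex_t	er_lock;	/* plays the role of ill_lock */
	uint_t		er_inuse_ref;	/* plays the role of ill_inuse_ref */
} example_ring_t;

static void
example_ring_cleanup(example_ring_t *er)
{
	mutex_enter(&er->er_lock);
	if (er->er_inuse_ref < 2) {
		/* Someone else started (1) or finished (0) the cleanup. */
		mutex_exit(&er->er_lock);
		return;
	}
	er->er_inuse_ref--;		/* cleanup has started: 2 -> 1 */
	mutex_exit(&er->er_lock);

	/* ... the real cleanup work would run here, outside the lock ... */

	mutex_enter(&er->er_lock);
	er->er_inuse_ref--;		/* cleanup is done: 1 -> 0 */
	mutex_exit(&er->er_lock);
}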
void
{
int idx;
/*
* No need to clean if poll_capab isn't set for this ill
*/
return;
}
}
typedef struct ip_taskq_arg {
/*
* Do a Rx ring to squeue binding. Find a unique squeue that is not
* managing a receive ring. If no such squeue exists, dynamically
* create a new one in the squeue set.
*
* The function runs via the system taskq. The ill passed as an
* argument can't go away since we hold a ref. The lock order is
* ill_lock -> sqs_lock -> sq_lock.
*
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
* no need to check that because squeues are never destroyed once
* created.
*/
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
/*
* Make sure the CPU that originally took the interrupt still
* exists.
*/
/*
* If this ill represents link aggregation, then there might be
 * multiple NICs trying to register themselves at the same time
* and in order to ensure that test and assignment of free rings
* is sequential, we need to hold the ill_lock.
*/
/*
* We hit the max limit of squeues allowed per CPU.
* Assign this rx_ring to DEFAULT squeue of the
* interrupted CPU but the squeue will not manage
* the ring. Also print a warning.
*/
"has max number of squeues. System performance might "
/* the first squeue in the list is the default squeue */
return;
}
/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
}
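/*
 * A sketch of the dispatch step described above, with hypothetical names
 * (example_dispatch, hold, rele standing in for the ill reference counting).
 * A reference is taken before taskq_dispatch() so the ill cannot go away
 * under the worker; the worker releases it when done, and if the dispatch
 * itself fails the caller drops the reference and the binding is simply
 * retried on a later packet.
 */
static boolean_t
example_dispatch(void (*worker)(void *), void *arg,
    boolean_t (*hold)(void *), void (*rele)(void *))
{
	if (!hold(arg))
		return (B_FALSE);	/* the object is going away */

	if (taskq_dispatch(system_taskq, worker, arg, TQ_NOSLEEP) == 0) {
		rele(arg);		/* dispatch failed; retry later */
		return (B_FALSE);
	}
	return (B_TRUE);		/* the worker releases the reference */
}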
/*
* Do a Rx ring to squeue binding. Find a unique squeue that is not
* managing a receive ring. If no such squeue exists, dynamically
* create a new one in the squeue set.
*
* The function runs via the system taskq. The ill passed as an
* argument can't go away since we hold a ref. The lock order is
* ill_lock -> sqs_lock -> sq_lock.
*
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
* no need to check that because squeues are never destroyed once
* created.
*/
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
int min_cpu_id, max_cpu_id;
int i, j;
/*
* Make sure the CPU that originally took the interrupt still
* exists.
*/
}
/*
* If this ill represents link aggregation, then there might be
 * multiple NICs trying to register themselves at the same time
* and in order to ensure that test and assignment of free rings
* is sequential, we need to hold the ill_lock.
*/
return;
}
/*
* We need to fanout the interrupts from the NIC. We do that by
* telling the driver underneath to create soft rings and use
 * worker threads (if the driver advertised SOFT_RING capability).
 * It is still a big performance win if we can fanout to the
* threads on the same core that is taking interrupts.
*
* Since we don't know the interrupt to CPU binding, we don't
* assign any squeues or affinity to worker threads in the NIC.
* At the time of the first interrupt, we know which CPU is
* taking interrupts and try to find other threads on the same
 * core. Assuming ip_threads_per_cpu is correct and CPUs are
* numbered sequentially for each core (XXX need something better
* than this in future), find the lowest number and highest
* number thread for that core.
*
 * If we have one more thread per core than the number of soft rings,
* then don't assign any worker threads to the H/W thread (cpu)
* taking interrupts (capability negotiation tries to ensure this)
*
 * If the number of threads per core is the same as the number of
* soft rings, then assign the worker affinity and squeue to
* the same cpu.
*
* Otherwise, just fanout to higher number CPUs starting from
* the interrupted CPU.
*/
/*
* Quickly check if there are enough CPUs present for fanout
 * and also that max_cpu_id is less than the id of the active CPU.
* We use the cpu_id stored in the last squeue_set to get
* an idea. The scheme is by no means perfect since it doesn't
* take into account CPU DR operations and the fact that
* interrupts themselves might change. An ideal scenario
 * would be to ensure that interrupts run on CPUs by themselves
* and worker threads never have affinity to those CPUs. If
 * the interrupts move to a CPU which had a worker thread, it
* should be changed. Probably callbacks similar to CPU offline
* are needed to make it work perfectly.
*/
if ((max_cpu_id - min_cpu_id) >
else if ((max_cpu_id - min_cpu_id) >=
}
j = 0;
for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
if (enough_uniq_cpus) {
if ((min_cpu_id + i) == cpu_id) {
j++;
continue;
}
} else if (enough_cpus) {
} else {
/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
}
/*
 * Check if the CPU actually exists and is active. If not,
* use the interrupted CPU. ip_find_unused_squeue() will
* find the right CPU to fanout anyway.
*/
/*
* We hit the max limit of squeues allowed per CPU.
* Assign this rx_ring to DEFAULT squeue of the
 * interrupted CPU but the squeue will not manage
* the ring. Also print a warning.
*/
"%d/%p already has max number of squeues. System "
"performance might become suboptimal\n",
/* the first squeue in the list is the default squeue */
continue;
}
/* assign affinity to soft ring */
}
}
/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
}
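/*
 * A self-contained sketch of the fanout heuristic described above, assuming
 * (as the comment does) that the hardware threads of a core are numbered
 * sequentially and that threads_per_core is accurate. The names here are
 * hypothetical; the real code also checks each candidate CPU against the
 * CPUs that actually exist and are active.
 */
static void
example_pick_fanout_range(int intr_cpu_id, int threads_per_core,
    int soft_ring_cnt, int *min_cpu_id, int *max_cpu_id,
    boolean_t *enough_uniq_cpus, boolean_t *enough_cpus)
{
	/* Lowest and highest numbered hardware thread on the same core. */
	*min_cpu_id = (intr_cpu_id / threads_per_core) * threads_per_core;
	*max_cpu_id = *min_cpu_id + threads_per_core - 1;

	/*
	 * A spare thread beyond the soft ring count lets the workers avoid
	 * the interrupted CPU entirely; an exact fit means they share it;
	 * otherwise the caller fans out to higher numbered CPUs starting
	 * from the interrupted one.
	 */
	*enough_uniq_cpus = (threads_per_core > soft_ring_cnt) ?
	    B_TRUE : B_FALSE;
	*enough_cpus = (threads_per_core >= soft_ring_cnt) ?
	    B_TRUE : B_FALSE;
}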
/* ARGSUSED */
void
{
taskq_arg = (ip_taskq_arg_t *)
goto out;
/*
* Set ILL_SOFT_RING_ASSIGN flag. We don't want
* the next interrupt to schedule a task for calling
* ip_squeue_soft_ring_affinity();
*/
} else {
goto out;
}
if (refheld) {
goto out;
/* release ref on ill if taskq dispatch fails */
}
/*
* Turn on CAPAB_SOFT_RING so that affinity assignment
* can be tried again later.
*/
out:
}
static squeue_t *
{
int i;
int min_sq = 0;
char sqname[64];
/*
* If fanout is set and the passed squeue_set already has some
* squeues which are managing the NICs, try to find squeues on
 * an unused CPU.
*/
/*
* First check to see if any squeue on the CPU passed
* is managing a NIC.
*/
break;
}
}
for (i = sqset_global_size - 2; i >= 0; i--) {
curr_sqs = sqset_global_list[i];
}
}
}
}
(SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
break;
}
}
/* Need to create a new squeue */
/*
 * Reached the max limit of squeues
* we can allocate on this CPU.
*/
return (NULL);
}
if (ip_squeue_create_callback != NULL)
}
}
return (sqp);
}
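/*
 * A minimal sketch of the search performed above, assuming a hypothetical
 * array of per-squeue state words that carry the SQS_DEFAULT and
 * SQS_ILL_BOUND flags used in this file. The first squeue that is neither
 * the default squeue of its set nor already bound to an ill Rx ring is a
 * candidate; if none is found the caller creates a new squeue, or gives up
 * once the per-CPU limit has been reached.
 */
static int
example_find_unused_squeue(uint_t *sq_states, int nsqueues)
{
	int i;

	for (i = 0; i < nsqueues; i++) {
		if ((sq_states[i] & (SQS_DEFAULT | SQS_ILL_BOUND)) == 0)
			return (i);	/* unused squeue found */
	}
	return (-1);			/* none free; create a new one */
}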
/*
* Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
* Rx rings with IP, we don't know where the interrupts will land and
* hence we need to wait till this point to do the assignment.
*/
squeue_t *
{
int interrupt;
if (ill_rx_ring == NULL)
return (IP_SQUEUE_GET(lbolt));
/*
* Do a quick check. If it's not NULL, we are done.
 * Squeues are never destroyed so at worst we will bind
* this connection to a suboptimal squeue.
*
* This is the fast path case.
*/
return (sqp);
/*
* Check sqp under the lock again for atomicity. Possible race with
* a previously scheduled ip_squeue_get -> ip_squeue_extend.
* Do the ring to squeue binding only if we are in interrupt context
* AND the ring is not already bound AND there is no one else trying
* the bind already.
*/
/*
* Note that the ring might get bound once we drop the lock
* below, if a previous request is in progress i.e. if the ring
* state is ILL_RING_INPROC. The incoming connection on whose
* behalf we are currently here might get a suboptimal squeue
* via the call to IP_SQUEUE_GET below, but there is no
* correctness issue.
*/
return (sqp);
return (IP_SQUEUE_GET(lbolt));
}
/*
* No sqp assigned yet. Can't really do that in interrupt
* context. Assign the default sqp to this connection and
* trigger creation of new sqp and binding it to this ring
* via taskq. Need to make sure ill stays around.
*/
if (refheld) {
return (IP_SQUEUE_GET(lbolt));
}
}
/*
* The ill is closing and we could not get a reference on the ill OR
 * taskq_dispatch failed, probably due to a memory allocation failure.
* We will try again next time.
*/
if (refheld)
return (IP_SQUEUE_GET(lbolt));
}
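/*
 * A sketch of the check-then-lock pattern used by the function above, with
 * hypothetical names (example_rx_ring_t, er_sqp, er_lock). The unlocked
 * read is safe because squeues are never destroyed: the worst case is
 * binding a connection to a suboptimal squeue, never to a freed one. The
 * locked re-check closes the race with a concurrent binding.
 */
typedef struct example_rx_ring {
	kmutex_t	er_lock;
	squeue_t	*er_sqp;	/* set once, never cleared */
} example_rx_ring_t;

static squeue_t *
example_ring_to_squeue(example_rx_ring_t *rr, squeue_t *fallback)
{
	squeue_t *sqp;

	if ((sqp = rr->er_sqp) != NULL)
		return (sqp);			/* fast path, no lock */

	mutex_enter(&rr->er_lock);
	sqp = rr->er_sqp;			/* re-check under the lock */
	mutex_exit(&rr->er_lock);

	return (sqp != NULL ? sqp : fallback);	/* e.g. IP_SQUEUE_GET(lbolt) */
}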
/*
* NDD hooks for setting ip_squeue_xxx tuneables.
*/
/* ARGSUSED */
int
{
int *bind_enabled = (int *)addr;
long new_value;
int i;
return (EINVAL);
if (ip_squeue_bind == new_value)
return (0);
if (new_value == 0) {
for (i = 0; i < sqset_global_size; i++)
} else {
for (i = 0; i < sqset_global_size; i++)
}
return (0);
}
/*
* Set squeue profiling.
* 0 means "disable"
* 1 means "enable"
* 2 means "enable and reset"
*/
/* ARGSUSED */
int
{
int *profile_enabled = (int *)cp;
long new_value;
return (EINVAL);
if (new_value == 0)
else if (new_value == 1)
else if (new_value == 2) {
int i, j;
for (i = 0; i < sqset_global_size; i++) {
sqs = sqset_global_list[i];
}
}
new_value = 1;
}
return (0);
}
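/*
 * A sketch of the validation step an NDD "set" routine like the one above
 * performs, assuming the DDI ddi_strtol() routine and a hypothetical helper
 * name. The string is parsed and must fall in the documented 0-2 range
 * (disable / enable / enable and reset) before it is acted on; anything
 * else is rejected with EINVAL, which is what the routines in this file
 * return on bad input.
 */
static int
example_parse_profile_value(const char *value, long *new_value)
{
	if (ddi_strtol(value, NULL, 10, new_value) != 0 ||
	    *new_value < 0 || *new_value > 2)
		return (EINVAL);
	return (0);
}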
/*
* Reconfiguration callback
*/
/* ARGSUSED */
static int
{
switch (what) {
case CPU_CONFIG:
/*
* A new CPU is added. Create an squeue for it but do not bind
* it yet.
*/
break;
case CPU_ON:
case CPU_INIT:
case CPU_CPUPART_IN:
}
if (ip_squeue_bind)
break;
case CPU_UNCONFIG:
case CPU_OFF:
case CPU_CPUPART_OUT:
}
break;
default:
break;
}
return (0);
}
/* ARGSUSED */
static void
{
int i;
if (!ip_squeue_bind)
return;
continue;
}
}
static void
{
int i;
/*
* CPU is going offline. Remove the thread affinity
* for any soft ring threads the squeue is managing.
*/
}
}
continue;
}
}