/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* IP interface to squeues.
*
* IP creates an squeue instance for each CPU. The squeue pointer is saved in
* cpu_squeue field of the cpu structure. Each squeue is associated with a
* connection instance (conn_t).
*
* For CPUs available at system startup time the squeue creation and association
* with CPU happens at MP initialization time. For CPUs added during dynamic
* reconfiguration, the initialization happens when the new CPU is configured in
 * the system. The squeue is chosen using the IP_SQUEUE_GET macro, which returns
 * either the per-CPU squeue or a random squeue based on the ip_squeue_fanout
* variable.
*
* There are two modes of associating connection with squeues. The first mode
* associates each connection with the CPU that creates the connection (either
* during open time or during accept time). The second mode associates each
* connection with a random CPU, effectively distributing load over all CPUs
* and all squeues in the system. The mode is controlled by the
* ip_squeue_fanout variable.
*
* NOTE: The fact that there is an association between each connection and
* squeue and squeue and CPU does not mean that each connection is always
* processed on this CPU and on this CPU only. Any thread calling squeue_enter()
 * may process the connection on whatever CPU it is scheduled on. The squeue to CPU
* binding is only relevant for the worker thread.
*
 * The list of all created squeues is kept in the squeue_set structure. This list is
* used when ip_squeue_fanout is set and the load is distributed across all
* squeues.
*
* INTERFACE:
*
* squeue_t *ip_squeue_get(hint)
*
* Find an squeue based on the 'hint' value. The hint is used as an index
* in the array of IP squeues available. The way hint is computed may
* affect the effectiveness of the squeue distribution. Currently squeues
* are assigned in round-robin fashion using lbolt as a hint.
*
*
* DR Notes
* ========
*
 * ip_squeue_init() registers a callback function with the CPU DR
 * subsystem using register_cpu_setup_func(). The callback function does two
* things:
*
* o When the CPU is going off-line or unconfigured, the worker thread is
* unbound from the CPU. This allows the CPU unconfig code to move it to
* another CPU.
*
* o When the CPU is going online, it creates a new squeue for this CPU if
* necessary and binds the squeue worker thread to this CPU.
*
 * TUNABLES:
*
* ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
* associated with an squeue instance.
*
* ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
* should be compiled with SQUEUE_PROFILE enabled for this variable to have
* an impact.
*
* ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
* otherwise get it from CPU->cpu_squeue.
*
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using the NDD hooks defined at the end of this file.
 *
* ip_squeue_worker_wait: global value for the sq_wait field for all squeues
* created. This is the time squeue code waits before waking up the worker
* thread after queuing a request.
*/
#include <inet/ipclassifier.h>
#include <sys/squeue_impl.h>
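/*
 * A minimal sketch of the hint-based selection described in the block
 * comment above, using hypothetical names (example_pick_squeue, sq_array).
 * With fanout enabled the hint (e.g. lbolt) is folded into the array with a
 * modulo, which gives the round-robin behaviour; with fanout disabled the
 * caller simply uses the per-CPU squeue, which is what CPU->cpu_squeue
 * provides.
 */
static squeue_t *
example_pick_squeue(squeue_t **sq_array, uint_t nsqueues, uint_t hint,
    squeue_t *cpu_squeue, int fanout)
{
	if (!fanout)
		return (cpu_squeue);		/* per-CPU squeue */
	return (sq_array[hint % nsqueues]);	/* round-robin over all squeues */
}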
/*
* We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
* mapping between squeue and NIC (or Rx ring) for performance reasons so
* each squeue can uniquely own a NIC or a Rx ring and do polling
* (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
* We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
* can be created dynamically as needed.
*/
#define MAX_SQUEUES_PER_CPU 32
#define MIN_SQUEUES_PER_CPU 1
#define IP_NUM_SOFT_RINGS 2
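/*
 * A minimal sketch of the per-CPU limit check implied by the comment above,
 * using the constants just defined and a hypothetical current count. A CPU
 * starts with MIN_SQUEUES_PER_CPU squeues and additional ones are created on
 * demand only while the count stays below MAX_SQUEUES_PER_CPU; past that, a
 * new Rx ring falls back to the CPU's default squeue.
 */
static boolean_t
example_can_grow_squeue_set(uint_t cur_squeues)
{
	return ((cur_squeues >= MIN_SQUEUES_PER_CPU &&
	    cur_squeues < MAX_SQUEUES_PER_CPU) ? B_TRUE : B_FALSE);
}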
/*
* List of all created squeue sets. The size is protected by cpu_lock
*/
int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
/*
* ip_squeue_worker_wait: global value for the sq_wait field for all squeues
* created. This is the time squeue code waits before waking up the worker
* thread after queuing a request.
*/
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static void ip_squeue_clean(void *, mblk_t *, void *);
/*
* Create squeue set containing ip_squeues_per_cpu number of squeues
* for this CPU and bind them all to the CPU.
*/
static squeue_set_t *
{
int i;
char sqname[64];
if (reuse) {
int i;
/*
* We may already have an squeue created for this CPU. Try to
* find one and reuse it if possible.
*/
for (i = 0; i < sqset_global_size; i++) {
sqs = sqset_global_list[i];
return (sqs);
}
}
for (i = 0; i < ip_squeues_per_cpu; i++) {
/*
* The first squeue in each squeue_set is the DEFAULT
* squeue.
*/
if (ip_squeue_create_callback != NULL)
}
return (sqs);
}
/*
* Initialize IP squeues.
*/
void
{
int i;
else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
squeue_init();
sqset_global_size = 0;
/* Create squeue for each active CPU available */
for (i = 0; i < NCPU; i++) {
}
}
if (ip_squeue_profile)
}
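/*
 * A sketch of the registration step described in the DR Notes above,
 * assuming the ip_squeue_cpu_setup() callback declared earlier in this file
 * (example_register_dr_callback is a hypothetical name).
 * register_cpu_setup_func() is called with cpu_lock held so that no CPU
 * state change can race with the registration itself.
 */
static void
example_register_dr_callback(void)
{
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}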
/*
* Get squeue_t structure based on index.
* Since the squeue list can only grow, no need to grab any lock.
*/
squeue_t *
{
}
/* ARGSUSED */
static void
{
return;
}
/*
* Clean up squeue
*/
}
/*
* Cleanup the ring
*/
/*
* Signal ill that cleanup is done
*/
}
/*
* Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
* The real cleanup happens behind the squeue via ip_squeue_clean function but
 * we need to protect ourselves from 2 threads trying to clean up at the same
* time (possible with one port going down for aggr and someone tearing down the
* entire aggr simultaneously). So we use ill_inuse_ref protected by ill_lock
* to indicate when the cleanup has started (1 ref) and when the cleanup
* is done (0 ref). When a new ring gets assigned to squeue, we start by
* putting 2 ref on ill_inuse_ref.
*/
static void
{
/* Just clean one squeue */
/*
* Reset the ILL_SOFT_RING_ASSIGN bit so that
 * ip_squeue_soft_ring_affinity() will not go
* ahead with assigning rings.
*/
/* Some operations pending on the ring. Wait */
/*
* Someone already trying to clean
* this squeue or it's already been cleaned.
*/
return;
}
/*
* The rx_ring never had a squeue assigned to it.
* We are under ill_lock so we can clean it up
* here itself since no one can get to it.
*/
return;
}
/* Indicate that it's being cleaned */
/*
* Use the preallocated ill_unbind_conn for this purpose
*/
} else {
"concurrent use of tcp_closemp_used: connp %p tcp %p\n",
}
}
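/*
 * A self-contained sketch of the two-reference handshake described in the
 * comment above, using hypothetical names (example_ring_t, er_lock,
 * er_inuse_ref). The ring starts with a count of 2 when it is assigned; the
 * first thread to start cleanup drops it to 1, and it reaches 0 only when
 * the cleanup has completed, so a concurrent cleaner can tell the
 * difference between "in use", "cleanup in progress" and "cleaned".
 */
typedef struct example_ring {
	kmutex_t	er_lock;	/* plays the role of ill_lock */
	uint_t		er_inuse_ref;	/* plays the role of ill_inuse_ref */
} example_ring_t;

static void
example_ring_cleanup(example_ring_t *er)
{
	mutex_enter(&er->er_lock);
	if (er->er_inuse_ref < 2) {
		/* Someone else started (1) or finished (0) the cleanup. */
		mutex_exit(&er->er_lock);
		return;
	}
	er->er_inuse_ref--;		/* cleanup has started: 2 -> 1 */
	mutex_exit(&er->er_lock);

	/* ... the real cleanup work would run here, outside the lock ... */

	mutex_enter(&er->er_lock);
	er->er_inuse_ref--;		/* cleanup is done: 1 -> 0 */
	mutex_exit(&er->er_lock);
}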
void
{
int idx;
/*
* No need to clean if poll_capab isn't set for this ill
*/
return;
}
}
typedef struct ip_taskq_arg {
/*
* Do a Rx ring to squeue binding. Find a unique squeue that is not
* managing a receive ring. If no such squeue exists, dynamically
* create a new one in the squeue set.
*
* The function runs via the system taskq. The ill passed as an
* argument can't go away since we hold a ref. The lock order is
* ill_lock -> sqs_lock -> sq_lock.
*
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
* no need to check that because squeues are never destroyed once
* created.
*/
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
/*
* Make sure the CPU that originally took the interrupt still
* exists.
*/
/*
* If this ill represents link aggregation, then there might be
 * multiple NICs trying to register themselves at the same time
* and in order to ensure that test and assignment of free rings
* is sequential, we need to hold the ill_lock.
*/
/*
* We hit the max limit of squeues allowed per CPU.
* Assign this rx_ring to DEFAULT squeue of the
* interrupted CPU but the squeue will not manage
* the ring. Also print a warning.
*/
"has max number of squeues. System performance might "
/* the first squeue in the list is the default squeue */
return;
}
/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
}
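/*
 * A sketch of the dispatch step described above, with hypothetical names
 * (example_dispatch, hold, rele standing in for the ill reference counting).
 * A reference is taken before taskq_dispatch() so the ill cannot go away
 * under the worker; the worker releases it when done, and if the dispatch
 * itself fails the caller drops the reference and the binding is simply
 * retried on a later packet.
 */
static boolean_t
example_dispatch(void (*worker)(void *), void *arg,
    boolean_t (*hold)(void *), void (*rele)(void *))
{
	if (!hold(arg))
		return (B_FALSE);	/* the object is going away */

	if (taskq_dispatch(system_taskq, worker, arg, TQ_NOSLEEP) == 0) {
		rele(arg);		/* dispatch failed; retry later */
		return (B_FALSE);
	}
	return (B_TRUE);		/* the worker releases the reference */
}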
/*
* Do a Rx ring to squeue binding. Find a unique squeue that is not
* managing a receive ring. If no such squeue exists, dynamically
* create a new one in the squeue set.
*
* The function runs via the system taskq. The ill passed as an
* argument can't go away since we hold a ref. The lock order is
* ill_lock -> sqs_lock -> sq_lock.
*
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
* no need to check that because squeues are never destroyed once
* created.
*/
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
int min_cpu_id, max_cpu_id;
int i, j;
/*
* Make sure the CPU that originally took the interrupt still
* exists.
*/
}
/*
* If this ill represents link aggregation, then there might be
 * multiple NICs trying to register themselves at the same time
* and in order to ensure that test and assignment of free rings
* is sequential, we need to hold the ill_lock.
*/
return;
}
/*
* We need to fanout the interrupts from the NIC. We do that by
* telling the driver underneath to create soft rings and use
 * worker threads (if the driver advertised SOFT_RING capability).
 * It is still a big performance win if we can fanout to the
* threads on the same core that is taking interrupts.
*
* Since we don't know the interrupt to CPU binding, we don't
* assign any squeues or affinity to worker threads in the NIC.
* At the time of the first interrupt, we know which CPU is
* taking interrupts and try to find other threads on the same
 * core. Assuming ip_threads_per_cpu is correct and CPUs are
* numbered sequentially for each core (XXX need something better
* than this in future), find the lowest number and highest
* number thread for that core.
*
 * If we have one more thread per core than the number of soft rings,
* then don't assign any worker threads to the H/W thread (cpu)
* taking interrupts (capability negotiation tries to ensure this)
*
 * If the number of threads per core is the same as the number of
* soft rings, then assign the worker affinity and squeue to
* the same cpu.
*
* Otherwise, just fanout to higher number CPUs starting from
* the interrupted CPU.
*/
/*
* Quickly check if there are enough CPUs present for fanout
 * and also that max_cpu_id is less than the id of the active CPU.
* We use the cpu_id stored in the last squeue_set to get
* an idea. The scheme is by no means perfect since it doesn't
* take into account CPU DR operations and the fact that
* interrupts themselves might change. An ideal scenario
 * would be to ensure that interrupts run on CPUs by themselves
* and worker threads never have affinity to those CPUs. If
 * the interrupts move to a CPU which had a worker thread, it
* should be changed. Probably callbacks similar to CPU offline
* are needed to make it work perfectly.
*/
if ((max_cpu_id - min_cpu_id) >
else if ((max_cpu_id - min_cpu_id) >=
}
j = 0;
for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
if (enough_uniq_cpus) {
if ((min_cpu_id + i) == cpu_id) {
j++;
continue;
}
} else if (enough_cpus) {
} else {
/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
}
/*
 * Check if the CPU actually exists and is active. If not,
* use the interrupted CPU. ip_find_unused_squeue() will
* find the right CPU to fanout anyway.
*/
/*
* We hit the max limit of squeues allowed per CPU.
* Assign this rx_ring to DEFAULT squeue of the
 * interrupted CPU but the squeue will not manage
* the ring. Also print a warning.
*/
"%d/%p already has max number of squeues. System "
"performance might become suboptimal\n",
/* the first squeue in the list is the default squeue */
continue;
}
/* assign affinity to soft ring */
}
}
/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
}
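/*
 * A self-contained sketch of the fanout heuristic described above, assuming
 * (as the comment does) that the hardware threads of a core are numbered
 * sequentially and that threads_per_core is accurate. The names here are
 * hypothetical; the real code also checks each candidate CPU against the
 * CPUs that actually exist and are active.
 */
static void
example_pick_fanout_range(int intr_cpu_id, int threads_per_core,
    int soft_ring_cnt, int *min_cpu_id, int *max_cpu_id,
    boolean_t *enough_uniq_cpus, boolean_t *enough_cpus)
{
	/* Lowest and highest numbered hardware thread on the same core. */
	*min_cpu_id = (intr_cpu_id / threads_per_core) * threads_per_core;
	*max_cpu_id = *min_cpu_id + threads_per_core - 1;

	/*
	 * A spare thread beyond the soft ring count lets the workers avoid
	 * the interrupted CPU entirely; an exact fit means they share it;
	 * otherwise the caller fans out to higher numbered CPUs starting
	 * from the interrupted one.
	 */
	*enough_uniq_cpus = (threads_per_core > soft_ring_cnt) ?
	    B_TRUE : B_FALSE;
	*enough_cpus = (threads_per_core >= soft_ring_cnt) ?
	    B_TRUE : B_FALSE;
}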
/* ARGSUSED */
void
{
taskq_arg = (ip_taskq_arg_t *)
goto out;
/*
* Set ILL_SOFT_RING_ASSIGN flag. We don't want
* the next interrupt to schedule a task for calling
* ip_squeue_soft_ring_affinity();
*/
} else {
goto out;
}
if (refheld) {
goto out;
/* release ref on ill if taskq dispatch fails */
}
/*
* Turn on CAPAB_SOFT_RING so that affinity assignment
* can be tried again later.
*/
out:
}
static squeue_t *
{
int i;
int min_sq = 0;
char sqname[64];
/*
* If fanout is set and the passed squeue_set already has some
* squeues which are managing the NICs, try to find squeues on
 * an unused CPU.
*/
/*
* First check to see if any squeue on the CPU passed
* is managing a NIC.
*/
break;
}
}
for (i = sqset_global_size - 2; i >= 0; i--) {
curr_sqs = sqset_global_list[i];
}
}
}
}
(SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
break;
}
}
/* Need to create a new squeue */
/*
 * Reached the max limit of squeues
* we can allocate on this CPU.
*/
return (NULL);
}
if (ip_squeue_create_callback != NULL)
}
}
return (sqp);
}
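/*
 * A minimal sketch of the search performed above, assuming a hypothetical
 * array of per-squeue state words that carry the SQS_DEFAULT and
 * SQS_ILL_BOUND flags used in this file. The first squeue that is neither
 * the default squeue of its set nor already bound to an ill Rx ring is a
 * candidate; if none is found the caller creates a new squeue, or gives up
 * once the per-CPU limit has been reached.
 */
static int
example_find_unused_squeue(uint_t *sq_states, int nsqueues)
{
	int i;

	for (i = 0; i < nsqueues; i++) {
		if ((sq_states[i] & (SQS_DEFAULT | SQS_ILL_BOUND)) == 0)
			return (i);	/* unused squeue found */
	}
	return (-1);			/* none free; create a new one */
}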
/*
* Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
* Rx rings with IP, we don't know where the interrupts will land and
* hence we need to wait till this point to do the assignment.
*/
squeue_t *
{
int interrupt;
if (ill_rx_ring == NULL)
return (IP_SQUEUE_GET(lbolt));
/*
* Do a quick check. If it's not NULL, we are done.
 * Squeues are never destroyed so at worst we will bind
* this connection to a suboptimal squeue.
*
* This is the fast path case.
*/
return (sqp);
/*
* Check sqp under the lock again for atomicity. Possible race with
* a previously scheduled ip_squeue_get -> ip_squeue_extend.
* Do the ring to squeue binding only if we are in interrupt context
* AND the ring is not already bound AND there is no one else trying
* the bind already.
*/
/*
* Note that the ring might get bound once we drop the lock
* below, if a previous request is in progress i.e. if the ring
* state is ILL_RING_INPROC. The incoming connection on whose
* behalf we are currently here might get a suboptimal squeue
* via the call to IP_SQUEUE_GET below, but there is no
* correctness issue.
*/
return (sqp);
return (IP_SQUEUE_GET(lbolt));
}
/*
* No sqp assigned yet. Can't really do that in interrupt
* context. Assign the default sqp to this connection and
* trigger creation of new sqp and binding it to this ring
* via taskq. Need to make sure ill stays around.
*/
if (refheld) {
return (IP_SQUEUE_GET(lbolt));
}
}
/*
* The ill is closing and we could not get a reference on the ill OR
 * taskq_dispatch failed, probably due to a memory allocation failure.
* We will try again next time.
*/
if (refheld)
return (IP_SQUEUE_GET(lbolt));
}
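/*
 * A sketch of the check-then-lock pattern used by the function above, with
 * hypothetical names (example_rx_ring_t, er_sqp, er_lock). The unlocked
 * read is safe because squeues are never destroyed: the worst case is
 * binding a connection to a suboptimal squeue, never to a freed one. The
 * locked re-check closes the race with a concurrent binding.
 */
typedef struct example_rx_ring {
	kmutex_t	er_lock;
	squeue_t	*er_sqp;	/* set once, never cleared */
} example_rx_ring_t;

static squeue_t *
example_ring_to_squeue(example_rx_ring_t *rr, squeue_t *fallback)
{
	squeue_t *sqp;

	if ((sqp = rr->er_sqp) != NULL)
		return (sqp);			/* fast path, no lock */

	mutex_enter(&rr->er_lock);
	sqp = rr->er_sqp;			/* re-check under the lock */
	mutex_exit(&rr->er_lock);

	return (sqp != NULL ? sqp : fallback);	/* e.g. IP_SQUEUE_GET(lbolt) */
}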
/*
* NDD hooks for setting ip_squeue_xxx tuneables.
*/
/* ARGSUSED */
int
{
int *bind_enabled = (int *)addr;
long new_value;
int i;
return (EINVAL);
if (ip_squeue_bind == new_value)
return (0);
if (new_value == 0) {
for (i = 0; i < sqset_global_size; i++)
} else {
for (i = 0; i < sqset_global_size; i++)
}
return (0);
}
/*
* Set squeue profiling.
* 0 means "disable"
* 1 means "enable"
* 2 means "enable and reset"
*/
/* ARGSUSED */
int
{
int *profile_enabled = (int *)cp;
long new_value;
return (EINVAL);
if (new_value == 0)
else if (new_value == 1)
else if (new_value == 2) {
int i, j;
for (i = 0; i < sqset_global_size; i++) {
sqs = sqset_global_list[i];
}
}
new_value = 1;
}
return (0);
}
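/*
 * A sketch of the validation step an NDD "set" routine like the one above
 * performs, assuming the DDI ddi_strtol() routine and a hypothetical helper
 * name. The string is parsed and must fall in the documented 0-2 range
 * (disable / enable / enable and reset) before it is acted on; anything
 * else is rejected with EINVAL, which is what the routines in this file
 * return on bad input.
 */
static int
example_parse_profile_value(const char *value, long *new_value)
{
	if (ddi_strtol(value, NULL, 10, new_value) != 0 ||
	    *new_value < 0 || *new_value > 2)
		return (EINVAL);
	return (0);
}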
/*
* Reconfiguration callback
*/
/* ARGSUSED */
static int
{
switch (what) {
case CPU_CONFIG:
/*
* A new CPU is added. Create an squeue for it but do not bind
* it yet.
*/
break;
case CPU_ON:
case CPU_INIT:
case CPU_CPUPART_IN:
}
if (ip_squeue_bind)
break;
case CPU_UNCONFIG:
case CPU_OFF:
case CPU_CPUPART_OUT:
}
break;
default:
break;
}
return (0);
}
/* ARGSUSED */
static void
{
int i;
if (!ip_squeue_bind)
return;
continue;
}
}
static void
{
int i;
/*
* CPU is going offline. Remove the thread affinity
* for any soft ring threads the squeue is managing.
*/
}
}
continue;
}
}