/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/note.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/ddi.h>
#include <sys/sunndi.h>
#include <sys/ndi_impldefs.h> /* include prototypes */
#if defined(__i386) || defined(__amd64)
/*
* MSI-X allocation limit.
*/
extern uint_t ddi_msix_alloc_limit;
#endif
/*
* Interrupt Resource Management (IRM).
*/
#define DDI_IRM_BALANCE_DELAY (60) /* In seconds */
#define	DDI_IRM_HAS_CB(c)	((c) && ((c)->cb_flags & DDI_CB_FLAG_INTR))
#define	DDI_IRM_IS_REDUCIBLE(r)	((((r)->ireq_flags & DDI_IRM_FLAG_CALLBACK) && \
				((r)->ireq_type == DDI_INTR_TYPE_MSIX)) || \
				((r)->ireq_flags & DDI_IRM_FLAG_NEW))
extern pri_t minclsyspri;
/* Global policies */
int irm_enable = 1;
boolean_t irm_active = B_FALSE;
int irm_default_policy = DDI_IRM_POLICY_LARGE;
uint_t irm_balance_delay = DDI_IRM_BALANCE_DELAY;
/* Global list of interrupt pools */
kmutex_t irm_pools_lock;
list_t irm_pools_list;
/* Global debug tunables */
#ifdef DEBUG
int irm_debug_policy = 0;
uint_t irm_debug_size = 0;
#endif /* DEBUG */
static void irm_balance_thread(ddi_irm_pool_t *);
static void i_ddi_irm_balance(ddi_irm_pool_t *);
static void i_ddi_irm_enqueue(ddi_irm_pool_t *, boolean_t);
static void i_ddi_irm_reduce(ddi_irm_pool_t *pool);
static int i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *, int, int);
static void i_ddi_irm_reduce_new(ddi_irm_pool_t *, int);
static void i_ddi_irm_insertion_sort(list_t *, ddi_irm_req_t *);
static int i_ddi_irm_notify(ddi_irm_pool_t *, ddi_irm_req_t *);
static int i_ddi_irm_modify_increase(ddi_irm_req_t *, int);
/*
* OS Initialization Routines
*/
/*
* irm_init()
*
* Initialize IRM subsystem before any drivers are attached.
*/
void
irm_init(void)
{
/* Do nothing if IRM is disabled */
if (!irm_enable)
return;
/* Verify that the default balancing policy is valid */
if (!DDI_IRM_POLICY_VALID(irm_default_policy))
irm_default_policy = DDI_IRM_POLICY_LARGE;
/* Initialize the global list of interrupt pools */
mutex_init(&irm_pools_lock, NULL, MUTEX_DRIVER, NULL);
list_create(&irm_pools_list, sizeof (ddi_irm_pool_t),
offsetof(ddi_irm_pool_t, ipool_link));
}
/*
* i_ddi_irm_poststartup()
*
* IRM is not activated until after the IO subsystem is initialized.
* When activated, per-pool balancing threads are spawned and a flag
* is set so that all future pools will be activated when created.
*
* NOTE: the global variable 'irm_enable' disables IRM if zero.
*/
void
i_ddi_irm_poststartup(void)
{
ddi_irm_pool_t *pool_p;
/* Do nothing if IRM is disabled */
if (!irm_enable)
return;
/* Lock the global list */
mutex_enter(&irm_pools_lock);
/* Activate all defined pools */
for (pool_p = list_head(&irm_pools_list); pool_p;
pool_p = list_next(&irm_pools_list, pool_p))
pool_p->ipool_thread = thread_create(NULL, 0,
irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
/* Set future pools to be active */
irm_active = B_TRUE;
/* Unlock the global list */
mutex_exit(&irm_pools_lock);
}
/*
* NDI interfaces for creating/destroying IRM pools.
*/
/*
* ndi_irm_create()
*
* Nexus interface to create an IRM pool. Create the new
* pool and add it to the global list of interrupt pools.
*/
int
ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp,
ddi_irm_pool_t **pool_retp)
{
ddi_irm_pool_t *pool_p;
ASSERT(dip != NULL);
ASSERT(paramsp != NULL);
ASSERT(pool_retp != NULL);
ASSERT(paramsp->iparams_total >= 1);
ASSERT(paramsp->iparams_types != 0);
DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_create: dip %p\n", (void *)dip));
/* Check if IRM is enabled */
if (!irm_enable)
return (NDI_FAILURE);
/* Validate parameters */
if ((dip == NULL) || (paramsp == NULL) || (pool_retp == NULL) ||
(paramsp->iparams_total < 1) || (paramsp->iparams_types == 0))
return (NDI_FAILURE);
/* Allocate and initialize the pool */
pool_p = kmem_zalloc(sizeof (ddi_irm_pool_t), KM_SLEEP);
pool_p->ipool_owner = dip;
pool_p->ipool_policy = irm_default_policy;
pool_p->ipool_types = paramsp->iparams_types;
pool_p->ipool_totsz = paramsp->iparams_total;
pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC, MAX(DDI_MIN_MSIX_ALLOC,
paramsp->iparams_total / DDI_MSIX_ALLOC_DIVIDER));
list_create(&pool_p->ipool_req_list, sizeof (ddi_irm_req_t),
offsetof(ddi_irm_req_t, ireq_link));
list_create(&pool_p->ipool_scratch_list, sizeof (ddi_irm_req_t),
offsetof(ddi_irm_req_t, ireq_scratch_link));
cv_init(&pool_p->ipool_cv, NULL, CV_DRIVER, NULL);
mutex_init(&pool_p->ipool_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&pool_p->ipool_navail_lock, NULL, MUTEX_DRIVER, NULL);
/* Add to global list of pools */
mutex_enter(&irm_pools_lock);
list_insert_tail(&irm_pools_list, pool_p);
mutex_exit(&irm_pools_lock);
/* If IRM is active, then activate the pool */
if (irm_active)
pool_p->ipool_thread = thread_create(NULL, 0,
irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
*pool_retp = pool_p;
return (NDI_SUCCESS);
}
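/*
 * Example (illustrative sketch only, not part of this file): a
 * hypothetical nexus driver that owns 128 MSI-X vectors could
 * export them to IRM from its attach path as follows:
 *
 *	ddi_irm_params_t params;
 *	ddi_irm_pool_t *pool_p;
 *
 *	params.iparams_types = DDI_INTR_TYPE_MSIX;
 *	params.iparams_total = 128;
 *	if (ndi_irm_create(dip, &params, &pool_p) != NDI_SUCCESS)
 *		return (DDI_FAILURE);
 */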
/*
* ndi_irm_resize_pool()
*
 * Nexus interface to resize an IRM pool. If the new pool size would
 * drop below the number of vectors currently reserved, then initiate
 * a rebalance operation before resizing the pool. If the rebalance
 * operation fails, then return NDI_FAILURE.
*/
int
ndi_irm_resize_pool(ddi_irm_pool_t *pool_p, uint_t new_size)
{
uint_t prev_size;
ASSERT(pool_p != NULL);
	if (pool_p == NULL)
		return (NDI_EINVAL);
	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
	    " current-size 0x%x new-size 0x%x\n",
	    (void *)pool_p, pool_p->ipool_totsz, new_size));
/* Check if IRM is enabled */
if (!irm_enable)
return (NDI_FAILURE);
mutex_enter(&pool_p->ipool_lock);
/*
	 * If we are increasing the pool size, or if the reserved
	 * number of vectors is <= the new pool size, then simply
	 * update the pool size and enqueue a rebalance operation
	 * if necessary to use the new vectors.
*/
if ((pool_p->ipool_totsz < new_size) ||
(pool_p->ipool_resno <= new_size)) {
/* set new pool size */
pool_p->ipool_totsz = new_size;
/* adjust the default allocation limit */
pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
/* queue a rebalance operation to use the new vectors */
if (pool_p->ipool_reqno > pool_p->ipool_resno)
i_ddi_irm_enqueue(pool_p, B_FALSE);
mutex_exit(&pool_p->ipool_lock);
return (NDI_SUCCESS);
}
DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
" needs a rebalance operation\n", (void *)pool_p));
/*
* requires a rebalance operation
*/
/* save the current pool size */
prev_size = pool_p->ipool_totsz;
/* set the pool size to the desired new value */
pool_p->ipool_totsz = new_size;
/* perform the rebalance operation */
i_ddi_irm_enqueue(pool_p, B_TRUE);
/*
* If rebalance operation couldn't free up enough
* vectors then fail the resize operation.
*/
if (pool_p->ipool_resno > new_size) { /* rebalance failed */
/* restore the pool size to the previous value */
pool_p->ipool_totsz = prev_size;
/* enqueue a rebalance operation for the original pool size */
i_ddi_irm_enqueue(pool_p, B_FALSE);
mutex_exit(&pool_p->ipool_lock);
return (NDI_FAILURE);
} else { /* rebalance worked */
/* adjust the default allocation limit */
pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
mutex_exit(&pool_p->ipool_lock);
DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
" resized from %x to %x\n",
(void *)pool_p, prev_size, pool_p->ipool_totsz));
return (NDI_SUCCESS);
}
}
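/*
 * Example (illustrative sketch only): shrinking the pool from the
 * sketch above, e.g. if a hypothetical platform event leaves only
 * 64 usable vectors. The call fails if a rebalance cannot free
 * enough of the reserved vectors to fit the new size.
 *
 *	if (ndi_irm_resize_pool(pool_p, 64) != NDI_SUCCESS)
 *		cmn_err(CE_WARN, "failed to shrink interrupt pool");
 */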
/*
* ndi_irm_destroy()
*
* Nexus interface to destroy an IRM pool. Destroy the pool
* and remove it from the global list of interrupt pools.
*/
int
ndi_irm_destroy(ddi_irm_pool_t *pool_p)
{
ASSERT(pool_p != NULL);
ASSERT(pool_p->ipool_resno == 0);
DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_destroy: pool_p %p\n",
(void *)pool_p));
/* Validate parameters */
if (pool_p == NULL)
return (NDI_FAILURE);
/* Validate that pool is empty */
if (pool_p->ipool_resno != 0)
return (NDI_BUSY);
/* Remove the pool from the global list */
mutex_enter(&irm_pools_lock);
list_remove(&irm_pools_list, pool_p);
mutex_exit(&irm_pools_lock);
/* Terminate the balancing thread */
mutex_enter(&pool_p->ipool_lock);
if (pool_p->ipool_thread &&
(pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) {
pool_p->ipool_flags |= DDI_IRM_FLAG_EXIT;
cv_signal(&pool_p->ipool_cv);
mutex_exit(&pool_p->ipool_lock);
thread_join(pool_p->ipool_thread->t_did);
} else
mutex_exit(&pool_p->ipool_lock);
/* Destroy the pool */
cv_destroy(&pool_p->ipool_cv);
mutex_destroy(&pool_p->ipool_lock);
mutex_destroy(&pool_p->ipool_navail_lock);
list_destroy(&pool_p->ipool_req_list);
list_destroy(&pool_p->ipool_scratch_list);
kmem_free(pool_p, sizeof (ddi_irm_pool_t));
return (NDI_SUCCESS);
}
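/*
 * Example (illustrative sketch only): tearing down the pool from
 * the sketches above in the nexus detach path. The pool must be
 * empty (no reserved vectors), otherwise the call fails with
 * NDI_BUSY.
 *
 *	if (ndi_irm_destroy(pool_p) != NDI_SUCCESS)
 *		return (DDI_FAILURE);
 */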
/*
* Insert/Modify/Remove Interrupt Requests
*/
/*
* i_ddi_irm_insert()
*
* Insert a new request into an interrupt pool, and balance the pool.
*/
int
i_ddi_irm_insert(dev_info_t *dip, int type, int count)
{
ddi_irm_req_t *req_p;
devinfo_intr_t *intr_p;
ddi_irm_pool_t *pool_p;
uint_t nreq, nmin, npartial;
boolean_t irm_flag = B_FALSE;
ASSERT(dip != NULL);
ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
ASSERT(count > 0);
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: dip %p type %d count %d\n",
(void *)dip, type, count));
/* Validate parameters */
if ((dip == NULL) || (count < 1) || !DDI_INTR_TYPE_FLAG_VALID(type)) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: invalid args\n"));
return (DDI_EINVAL);
}
/* Check for an existing request */
if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
(intr_p->devi_irm_req_p != NULL))
return (DDI_SUCCESS);
/* Check for IRM support from the system */
if ((pool_p = i_ddi_intr_get_pool(dip, type)) == NULL) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: not supported\n"));
return (DDI_ENOTSUP);
}
/* Check for IRM support from the driver */
if (i_ddi_irm_supported(dip, type) == DDI_SUCCESS)
irm_flag = B_TRUE;
/* Determine request size */
nreq = (irm_flag) ? count :
MIN(count, i_ddi_intr_get_limit(dip, type, pool_p));
nmin = (irm_flag) ? 1 : nreq;
npartial = MIN(nreq, pool_p->ipool_defsz);
/* Allocate and initialize the request */
req_p = kmem_zalloc(sizeof (ddi_irm_req_t), KM_SLEEP);
req_p->ireq_type = type;
req_p->ireq_dip = dip;
req_p->ireq_pool_p = pool_p;
req_p->ireq_nreq = nreq;
req_p->ireq_flags = DDI_IRM_FLAG_NEW;
if (irm_flag)
req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
/* Lock the pool */
mutex_enter(&pool_p->ipool_lock);
/* Check for minimal fit before inserting */
if ((pool_p->ipool_minno + nmin) > pool_p->ipool_totsz) {
cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
ddi_driver_name(dip), ddi_get_instance(dip));
mutex_exit(&pool_p->ipool_lock);
kmem_free(req_p, sizeof (ddi_irm_req_t));
return (DDI_EAGAIN);
}
/* Insert the request into the pool */
pool_p->ipool_reqno += nreq;
pool_p->ipool_minno += nmin;
i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
/*
* Try to fulfill the request.
*
* If all the interrupts are available, and either the request
* is static or the pool is active, then just take them directly.
*
* If only some of the interrupts are available, and the request
* can receive future callbacks, then take some now but queue the
* pool to be rebalanced later.
*
* Otherwise, immediately rebalance the pool and wait.
*/
if ((!irm_flag || (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) &&
((pool_p->ipool_resno + nreq) <= pool_p->ipool_totsz)) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
"request completely fulfilled.\n"));
pool_p->ipool_resno += nreq;
req_p->ireq_navail = nreq;
req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
} else if (irm_flag &&
((pool_p->ipool_resno + npartial) <= pool_p->ipool_totsz)) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
"request partially fulfilled.\n"));
pool_p->ipool_resno += npartial;
req_p->ireq_navail = npartial;
req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
i_ddi_irm_enqueue(pool_p, B_FALSE);
} else {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
"request needs immediate rebalance.\n"));
i_ddi_irm_enqueue(pool_p, B_TRUE);
req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
}
/* Fail if the request cannot be fulfilled at all */
if (req_p->ireq_navail == 0) {
cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
ddi_driver_name(dip), ddi_get_instance(dip));
pool_p->ipool_reqno -= nreq;
pool_p->ipool_minno -= nmin;
list_remove(&pool_p->ipool_req_list, req_p);
mutex_exit(&pool_p->ipool_lock);
kmem_free(req_p, sizeof (ddi_irm_req_t));
return (DDI_EAGAIN);
}
/* Unlock the pool */
mutex_exit(&pool_p->ipool_lock);
intr_p->devi_irm_req_p = req_p;
return (DDI_SUCCESS);
}
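/*
 * Worked example (illustrative numbers only): in a pool with
 * ipool_totsz = 256 and ipool_defsz = 8, an IRM-aware driver
 * requesting 32 MSI-X vectors is sized as nreq = 32, nmin = 1,
 * and npartial = MIN(32, 8) = 8. If all 32 vectors are free, the
 * request is fulfilled directly. If only the partial fit is
 * possible, 8 vectors are granted now and a background rebalance
 * may later grow the allocation through the driver's callback.
 */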
/*
* i_ddi_irm_modify()
*
* Modify an existing request in an interrupt pool, and balance the pool.
*/
int
i_ddi_irm_modify(dev_info_t *dip, int nreq)
{
devinfo_intr_t *intr_p;
ddi_irm_req_t *req_p;
ddi_irm_pool_t *pool_p;
int type;
int retval = DDI_SUCCESS;
ASSERT(dip != NULL);
ASSERT(nreq > 0);
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: dip %p nreq %d\n",
(void *)dip, nreq));
/* Validate parameters */
if ((dip == NULL) || (nreq < 1)) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
return (DDI_EINVAL);
}
/* Do nothing if not mapped to an IRM pool */
if (((intr_p = DEVI(dip)->devi_intr_p) == NULL) ||
((req_p = intr_p->devi_irm_req_p) == NULL))
return (DDI_SUCCESS);
/* Do nothing if new size is the same */
if (nreq == req_p->ireq_nreq)
return (DDI_SUCCESS);
/* Do not allow MSI requests to be resized */
if ((type = req_p->ireq_type) == DDI_INTR_TYPE_MSI) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid type\n"));
return (DDI_ENOTSUP);
}
/* Select the pool */
if ((pool_p = req_p->ireq_pool_p) == NULL) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: missing pool\n"));
return (DDI_FAILURE);
}
/* Validate request size is not too large */
if (nreq > i_ddi_intr_get_limit(dip, type, pool_p)) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
return (DDI_EINVAL);
}
/* Lock the pool */
mutex_enter(&pool_p->ipool_lock);
/*
* Process the modification.
*
* - To increase a non-IRM request, call the implementation in
* i_ddi_irm_modify_increase().
*
* - To decrease a non-IRM request, directly update the pool and
* request, then queue the pool for later rebalancing.
*
* - To modify an IRM request, always queue the pool for later
* rebalancing. IRM consumers rely upon callbacks for changes.
*/
if ((nreq > req_p->ireq_nreq) &&
(i_ddi_irm_supported(dip, type) != DDI_SUCCESS)) {
retval = i_ddi_irm_modify_increase(req_p, nreq);
} else {
/* Update pool and request */
pool_p->ipool_reqno -= req_p->ireq_nreq;
pool_p->ipool_reqno += nreq;
if (i_ddi_irm_supported(dip, type) != DDI_SUCCESS) {
pool_p->ipool_minno -= req_p->ireq_navail;
pool_p->ipool_resno -= req_p->ireq_navail;
pool_p->ipool_minno += nreq;
pool_p->ipool_resno += nreq;
req_p->ireq_navail = nreq;
}
req_p->ireq_nreq = nreq;
/* Re-sort request into the pool */
list_remove(&pool_p->ipool_req_list, req_p);
i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
/* Queue pool for asynchronous rebalance */
i_ddi_irm_enqueue(pool_p, B_FALSE);
}
/* Unlock the pool */
mutex_exit(&pool_p->ipool_lock);
return (retval);
}
/*
* i_ddi_irm_modify_increase()
*
* Increase a non-IRM request. The additional interrupts are
* directly taken from the pool when possible. Otherwise, an
* immediate, synchronous rebalance is performed. A temporary
* proxy request is used for any rebalance operation to ensure
* the request is not reduced below its current allocation.
*
* NOTE: pool must already be locked.
*/
static int
i_ddi_irm_modify_increase(ddi_irm_req_t *req_p, int nreq)
{
dev_info_t *dip = req_p->ireq_dip;
ddi_irm_pool_t *pool_p = req_p->ireq_pool_p;
ddi_irm_req_t new_req;
int count, delta;
ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
/* Compute number of additional vectors */
count = nreq - req_p->ireq_nreq;
/* Check for minimal fit */
if ((pool_p->ipool_minno + count) > pool_p->ipool_totsz) {
cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
ddi_driver_name(dip), ddi_get_instance(dip));
return (DDI_EAGAIN);
}
/* Update the pool */
pool_p->ipool_reqno += count;
pool_p->ipool_minno += count;
/* Attempt direct implementation */
if ((pool_p->ipool_resno + count) <= pool_p->ipool_totsz) {
req_p->ireq_nreq += count;
req_p->ireq_navail += count;
pool_p->ipool_resno += count;
return (DDI_SUCCESS);
}
/* Rebalance required: fail if pool is not active */
if ((pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE) == 0) {
pool_p->ipool_reqno -= count;
pool_p->ipool_minno -= count;
return (DDI_EAGAIN);
}
/* Insert temporary proxy request */
bzero(&new_req, sizeof (ddi_irm_req_t));
new_req.ireq_dip = dip;
new_req.ireq_nreq = count;
new_req.ireq_pool_p = pool_p;
new_req.ireq_type = req_p->ireq_type;
new_req.ireq_flags = DDI_IRM_FLAG_NEW;
i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, &new_req);
/* Synchronously rebalance */
i_ddi_irm_enqueue(pool_p, B_TRUE);
/* Remove proxy request, and merge into original request */
req_p->ireq_nreq += count;
if ((delta = (count - new_req.ireq_navail)) > 0) {
req_p->ireq_nreq -= delta;
pool_p->ipool_reqno -= delta;
pool_p->ipool_minno -= delta;
}
req_p->ireq_navail += new_req.ireq_navail;
list_remove(&pool_p->ipool_req_list, req_p);
list_remove(&pool_p->ipool_req_list, &new_req);
i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
return (DDI_SUCCESS);
}
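/*
 * Worked example (illustrative numbers only): a non-IRM driver
 * holding 4 vectors asks for 8 in a pool with no free vectors. A
 * proxy request for the 4 additional vectors is sorted into the
 * pool and a synchronous rebalance runs. Whatever the proxy is
 * granted (possibly fewer than 4) is merged back into the original
 * request, which is never reduced below its existing 4 vectors.
 */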
/*
* i_ddi_irm_remove()
*
* Remove a request from an interrupt pool, and balance the pool.
*/
int
i_ddi_irm_remove(dev_info_t *dip)
{
devinfo_intr_t *intr_p;
ddi_irm_pool_t *pool_p;
ddi_irm_req_t *req_p;
uint_t nmin;
ASSERT(dip != NULL);
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: dip %p\n", (void *)dip));
/* Validate parameters */
if (dip == NULL) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: invalid args\n"));
return (DDI_EINVAL);
}
/* Check if the device has a request */
if (!(intr_p = DEVI(dip)->devi_intr_p) ||
!(req_p = intr_p->devi_irm_req_p)) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: not found\n"));
return (DDI_EINVAL);
}
/* Lock the pool */
pool_p = req_p->ireq_pool_p;
mutex_enter(&pool_p->ipool_lock);
/* Remove request */
nmin = DDI_IRM_IS_REDUCIBLE(req_p) ? 1 : req_p->ireq_nreq;
pool_p->ipool_minno -= nmin;
pool_p->ipool_reqno -= req_p->ireq_nreq;
pool_p->ipool_resno -= req_p->ireq_navail;
list_remove(&pool_p->ipool_req_list, req_p);
/* Queue pool to be rebalanced */
i_ddi_irm_enqueue(pool_p, B_FALSE);
/* Unlock the pool */
mutex_exit(&pool_p->ipool_lock);
/* Destroy the request */
intr_p->devi_irm_req_p = NULL;
kmem_free(req_p, sizeof (ddi_irm_req_t));
return (DDI_SUCCESS);
}
/*
* i_ddi_irm_set_cb()
*
* Change the callback flag for a request, in response to
* a change in its callback registration. Then rebalance
* the interrupt pool.
*
* NOTE: the request is not locked because the navail value
* is not directly affected. The balancing thread may
* modify the navail value in the background after it
* locks the request itself.
*/
void
i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag)
{
devinfo_intr_t *intr_p;
ddi_irm_pool_t *pool_p;
ddi_irm_req_t *req_p;
uint_t nreq;
ASSERT(dip != NULL);
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: dip %p has_cb_flag %d\n",
(void *)dip, (int)has_cb_flag));
/* Validate parameters */
if (dip == NULL)
return;
/* Check for association with interrupt pool */
if (!(intr_p = DEVI(dip)->devi_intr_p) ||
!(req_p = intr_p->devi_irm_req_p)) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: not in pool\n"));
return;
}
/* Lock the pool */
pool_p = req_p->ireq_pool_p;
mutex_enter(&pool_p->ipool_lock);
/*
* Update the request and the pool
*/
if (has_cb_flag) {
/* Update pool statistics */
if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
pool_p->ipool_minno -= (req_p->ireq_nreq - 1);
/* Update request */
req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
/* Rebalance in background */
i_ddi_irm_enqueue(pool_p, B_FALSE);
} else {
/* Determine new request size */
nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz);
#if defined(__i386) || defined(__amd64)
/* Use the default static limit for non-IRM drivers */
if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
nreq = MIN(nreq, ddi_msix_alloc_limit);
#endif
/* Update pool statistics */
pool_p->ipool_reqno -= req_p->ireq_nreq;
pool_p->ipool_reqno += nreq;
if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) {
pool_p->ipool_minno -= 1;
pool_p->ipool_minno += nreq;
} else {
pool_p->ipool_minno -= req_p->ireq_nreq;
pool_p->ipool_minno += nreq;
}
/* Update request size, and re-sort in pool */
req_p->ireq_nreq = nreq;
list_remove(&pool_p->ipool_req_list, req_p);
i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
/* Rebalance synchronously, before losing callback */
i_ddi_irm_enqueue(pool_p, B_TRUE);
/* Remove callback flag */
req_p->ireq_flags &= ~(DDI_IRM_FLAG_CALLBACK);
}
/* Unlock the pool */
mutex_exit(&pool_p->ipool_lock);
}
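/*
 * Worked example (illustrative numbers only): an MSI-X request of
 * 16 vectors with a callback contributes only 1 to ipool_minno.
 * If its callback is unregistered in a pool with ipool_defsz = 8,
 * the request is clamped to nreq = 8 (possibly further clamped by
 * ddi_msix_alloc_limit on x86), ipool_minno rises from 1 to 8, and
 * a synchronous rebalance runs before the callback is lost.
 */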
/*
* i_ddi_irm_supported()
*
* Query if IRM is supported by a driver using a specific interrupt type.
* Notice that IRM is limited to MSI-X users with registered callbacks.
*/
int
i_ddi_irm_supported(dev_info_t *dip, int type)
{
ddi_cb_t *cb_p = DEVI(dip)->devi_cb_p;
return ((DDI_IRM_HAS_CB(cb_p) && (type == DDI_INTR_TYPE_MSIX)) ?
DDI_SUCCESS : DDI_ENOTSUP);
}
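/*
 * Example (illustrative sketch only): a driver becomes an IRM
 * participant by registering an interrupt callback before it
 * allocates MSI-X vectors; "xx_cb_func" and "xx_state" are
 * hypothetical driver names.
 *
 *	ddi_cb_handle_t hdl;
 *
 *	if (ddi_cb_register(dip, DDI_CB_FLAG_INTR, xx_cb_func,
 *	    xx_state, NULL, &hdl) != DDI_SUCCESS)
 *		return (DDI_FAILURE);
 */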
/*
* Interrupt Pool Balancing
*/
/*
* irm_balance_thread()
*
 * One instance of this thread operates for each defined IRM pool.
* It does the initial activation of the pool, as well as balancing
* any requests that were queued up before the pool was active.
* Once active, it waits forever to service balance operations.
*/
static void
irm_balance_thread(ddi_irm_pool_t *pool_p)
{
clock_t interval;
DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: pool_p %p\n",
(void *)pool_p));
/* Lock the pool */
mutex_enter(&pool_p->ipool_lock);
/* Perform initial balance if required */
if (pool_p->ipool_reqno > pool_p->ipool_resno)
i_ddi_irm_balance(pool_p);
/* Activate the pool */
pool_p->ipool_flags |= DDI_IRM_FLAG_ACTIVE;
/* Main loop */
for (;;) {
/* Compute the delay interval */
interval = drv_usectohz(irm_balance_delay * 1000000);
/* Sleep until queued */
cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: signaled.\n"));
/* Wait one interval, or until there are waiters */
if ((interval > 0) &&
!(pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) &&
!(pool_p->ipool_flags & DDI_IRM_FLAG_EXIT)) {
(void) cv_reltimedwait(&pool_p->ipool_cv,
&pool_p->ipool_lock, interval, TR_CLOCK_TICK);
}
/* Check if awakened to exit */
if (pool_p->ipool_flags & DDI_IRM_FLAG_EXIT) {
DDI_INTR_IRMDBG((CE_CONT,
"irm_balance_thread: exiting...\n"));
mutex_exit(&pool_p->ipool_lock);
thread_exit();
}
/* Balance the pool */
i_ddi_irm_balance(pool_p);
/* Notify waiters */
if (pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) {
cv_broadcast(&pool_p->ipool_cv);
pool_p->ipool_flags &= ~(DDI_IRM_FLAG_WAITERS);
}
/* Clear QUEUED condition */
pool_p->ipool_flags &= ~(DDI_IRM_FLAG_QUEUED);
}
}
/*
* i_ddi_irm_balance()
*
* Balance a pool. The general algorithm is to first reset all
* requests to their maximum size, use reduction algorithms to
* solve any imbalance, and then notify affected drivers.
*/
static void
i_ddi_irm_balance(ddi_irm_pool_t *pool_p)
{
ddi_irm_req_t *req_p;
#ifdef DEBUG
uint_t debug_totsz = 0;
int debug_policy = 0;
#endif /* DEBUG */
ASSERT(pool_p != NULL);
ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: pool_p %p\n",
(void *)pool_p));
#ifdef DEBUG /* Adjust size and policy settings */
if (irm_debug_size > pool_p->ipool_minno) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: debug size %d\n",
irm_debug_size));
debug_totsz = pool_p->ipool_totsz;
pool_p->ipool_totsz = irm_debug_size;
}
if (DDI_IRM_POLICY_VALID(irm_debug_policy)) {
DDI_INTR_IRMDBG((CE_CONT,
"i_ddi_irm_balance: debug policy %d\n", irm_debug_policy));
debug_policy = pool_p->ipool_policy;
pool_p->ipool_policy = irm_debug_policy;
}
#endif /* DEBUG */
/* Lock the availability lock */
mutex_enter(&pool_p->ipool_navail_lock);
/*
* Put all of the reducible requests into a scratch list.
* Reset each one of them to their maximum availability.
*/
for (req_p = list_head(&pool_p->ipool_req_list); req_p;
req_p = list_next(&pool_p->ipool_req_list, req_p)) {
if (DDI_IRM_IS_REDUCIBLE(req_p)) {
pool_p->ipool_resno -= req_p->ireq_navail;
req_p->ireq_scratch = req_p->ireq_navail;
req_p->ireq_navail = req_p->ireq_nreq;
pool_p->ipool_resno += req_p->ireq_navail;
list_insert_tail(&pool_p->ipool_scratch_list, req_p);
}
}
/* Balance the requests */
i_ddi_irm_reduce(pool_p);
/* Unlock the availability lock */
mutex_exit(&pool_p->ipool_navail_lock);
/*
* Process REMOVE notifications.
*
* If a driver fails to release interrupts: exclude it from
* further processing, correct the resulting imbalance, and
* start over again at the head of the scratch list.
*/
req_p = list_head(&pool_p->ipool_scratch_list);
while (req_p) {
if ((req_p->ireq_navail < req_p->ireq_scratch) &&
(i_ddi_irm_notify(pool_p, req_p) != DDI_SUCCESS)) {
list_remove(&pool_p->ipool_scratch_list, req_p);
mutex_enter(&pool_p->ipool_navail_lock);
i_ddi_irm_reduce(pool_p);
mutex_exit(&pool_p->ipool_navail_lock);
req_p = list_head(&pool_p->ipool_scratch_list);
} else {
req_p = list_next(&pool_p->ipool_scratch_list, req_p);
}
}
/*
* Process ADD notifications.
*
* This is the last use of the scratch list, so empty it.
*/
	while ((req_p = list_remove_head(
	    &pool_p->ipool_scratch_list)) != NULL) {
if (req_p->ireq_navail > req_p->ireq_scratch) {
(void) i_ddi_irm_notify(pool_p, req_p);
}
}
#ifdef DEBUG /* Restore size and policy settings */
if (debug_totsz != 0)
pool_p->ipool_totsz = debug_totsz;
if (debug_policy != 0)
pool_p->ipool_policy = debug_policy;
#endif /* DEBUG */
}
/*
* i_ddi_irm_reduce()
*
* Use reduction algorithms to correct an imbalance in a pool.
*/
static void
i_ddi_irm_reduce(ddi_irm_pool_t *pool_p)
{
int imbalance;
ASSERT(pool_p != NULL);
ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
ASSERT(DDI_IRM_POLICY_VALID(pool_p->ipool_policy));
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_reduce: pool_p %p\n",
(void *)pool_p));
/* Compute the imbalance. Do nothing if already balanced. */
if ((imbalance = pool_p->ipool_resno - pool_p->ipool_totsz) <= 0)
return;
/*
* Try policy based reduction first. If it failed, then
* possibly reduce new requests as a last resort.
*/
if (i_ddi_irm_reduce_by_policy(pool_p, imbalance, pool_p->ipool_policy)
!= DDI_SUCCESS) {
DDI_INTR_IRMDBG((CE_CONT,
"i_ddi_irm_reduce: policy reductions failed.\n"));
/* Compute remaining imbalance */
imbalance = pool_p->ipool_resno - pool_p->ipool_totsz;
ASSERT(imbalance > 0);
i_ddi_irm_reduce_new(pool_p, imbalance);
}
}
/*
* i_ddi_irm_enqueue()
*
* Queue a pool to be balanced. Signals the balancing thread to wake
* up and process the pool. If 'wait_flag' is true, then the current
* thread becomes a waiter and blocks until the balance is completed.
*/
static void
i_ddi_irm_enqueue(ddi_irm_pool_t *pool_p, boolean_t wait_flag)
{
ASSERT(pool_p != NULL);
ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool_p %p wait_flag %d\n",
(void *)pool_p, (int)wait_flag));
/* Do nothing if pool is already balanced */
#ifndef DEBUG
if ((pool_p->ipool_reqno == pool_p->ipool_resno)) {
#else
if ((pool_p->ipool_reqno == pool_p->ipool_resno) && !irm_debug_size) {
#endif /* DEBUG */
DDI_INTR_IRMDBG((CE_CONT,
"i_ddi_irm_enqueue: pool already balanced\n"));
return;
}
/* Avoid deadlocks when IRM is not active */
if (!irm_active && wait_flag) {
DDI_INTR_IRMDBG((CE_CONT,
"i_ddi_irm_enqueue: pool not active.\n"));
return;
}
if (wait_flag)
pool_p->ipool_flags |= DDI_IRM_FLAG_WAITERS;
if (wait_flag || !(pool_p->ipool_flags & DDI_IRM_FLAG_QUEUED)) {
pool_p->ipool_flags |= DDI_IRM_FLAG_QUEUED;
cv_signal(&pool_p->ipool_cv);
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool queued.\n"));
}
if (wait_flag) {
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: waiting...\n"));
cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
}
}
/*
* i_ddi_irm_reduce_by_policy()
*
* Reduces requests based on reduction policies.
*
 * For the DDI_IRM_POLICY_LARGE reduction policy, the algorithm
 * generally reduces larger requests first, before advancing
 * to smaller requests.
 *
 * For the DDI_IRM_POLICY_EVEN reduction policy, the algorithm
 * reduces requests evenly, without giving a specific preference
 * to smaller or larger requests. Each iteration reduces all
 * reducible requests by the same amount until the imbalance is
 * corrected.
 *
 * The scratch list is initially sorted in descending order by current
 * navail values, which are maximized prior to reduction. This sorted
 * order is preserved, and no request is ever reduced below the
 * threshold of the interrupt pool's default allocation size.
 *
 * To minimize the total number of iterations, the algorithm reduces
 * multiple requests together whenever possible, and always reduces
 * in the largest increments it can.
*/
static int
i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *pool_p, int imbalance, int policy)
{
ASSERT(pool_p != NULL);
ASSERT(imbalance > 0);
ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
while (imbalance > 0) {
list_t *slist_p = &pool_p->ipool_scratch_list;
ddi_irm_req_t *req_p = list_head(slist_p), *last_p;
uint_t nreduce = 0, nremain = 0, stop_navail;
uint_t pool_defsz = pool_p->ipool_defsz;
uint_t reduction, max_redu;
/* Fail if none are reducible */
if (!req_p || req_p->ireq_navail <= pool_defsz) {
DDI_INTR_IRMDBG((CE_CONT,
"i_ddi_irm_reduce_by_policy: Failure. "
"All requests have downsized to low limit.\n"));
return (DDI_FAILURE);
}
/* Count reducible requests */
stop_navail = (policy == DDI_IRM_POLICY_LARGE) ?
req_p->ireq_navail - 1 : pool_defsz;
for (; req_p; req_p = list_next(slist_p, req_p)) {
if (req_p->ireq_navail <= stop_navail)
break;
nreduce++;
}
/* Compute reduction */
last_p = req_p ? list_prev(slist_p, req_p) : list_tail(slist_p);
if ((policy == DDI_IRM_POLICY_LARGE) && req_p &&
req_p->ireq_navail > pool_defsz)
reduction = last_p->ireq_navail - req_p->ireq_navail;
else
reduction = last_p->ireq_navail - pool_defsz;
if ((max_redu = reduction * nreduce) > imbalance) {
reduction = imbalance / nreduce;
nremain = imbalance % nreduce;
pool_p->ipool_resno -= imbalance;
imbalance = 0;
} else {
pool_p->ipool_resno -= max_redu;
imbalance -= max_redu;
}
/* Reduce */
for (req_p = list_head(slist_p); (reduction != 0) && nreduce--;
req_p = list_next(slist_p, req_p)) {
req_p->ireq_navail -= reduction;
}
for (req_p = last_p; nremain--;
req_p = list_prev(slist_p, req_p)) {
req_p->ireq_navail--;
}
}
return (DDI_SUCCESS);
}
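/*
 * Worked example (illustrative numbers only): with the LARGE policy,
 * ipool_defsz = 4, scratch navail values {12, 8, 8, 4}, and an
 * imbalance of 6: the first iteration reduces the single largest
 * request from 12 down to the next level, 8 (reduction = 4),
 * leaving an imbalance of 2. The second iteration needs only 2 of
 * the 12 vectors a full step would yield, so reduction becomes
 * 2 / 3 = 0 with nremain = 2, and two of the three requests at 8
 * each drop by one, giving a final distribution of {8, 7, 7, 4}.
 */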
/*
* i_ddi_irm_reduce_new()
*
* Reduces new requests. This is only used as a last resort
* after another reduction algorithm failed.
*
* NOTE: The pool locking in i_ddi_irm_insert() ensures
* there can be only one new request at a time in a pool.
*/
static void
i_ddi_irm_reduce_new(ddi_irm_pool_t *pool_p, int imbalance)
{
ddi_irm_req_t *req_p;
ASSERT(pool_p != NULL);
ASSERT(imbalance > 0);
ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
DDI_INTR_IRMDBG((CE_CONT,
"i_ddi_irm_reduce_new: pool_p %p imbalance %d\n",
(void *)pool_p, imbalance));
for (req_p = list_head(&pool_p->ipool_scratch_list); req_p;
req_p = list_next(&pool_p->ipool_scratch_list, req_p)) {
if (req_p->ireq_flags & DDI_IRM_FLAG_NEW) {
ASSERT(req_p->ireq_navail >= imbalance);
req_p->ireq_navail -= imbalance;
pool_p->ipool_resno -= imbalance;
return;
}
}
	/* should never get here */
ASSERT(B_FALSE);
}
/*
* Miscellaneous Helper Functions
*/
/*
* i_ddi_intr_get_pool()
*
* Get an IRM pool that supplies interrupts of a specified type.
 * Issues a DDI_INTROP_GETPOOL operation to the bus nexus driver.
 * Fails if no pool exists.
*/
ddi_irm_pool_t *
i_ddi_intr_get_pool(dev_info_t *dip, int type)
{
devinfo_intr_t *intr_p;
ddi_irm_pool_t *pool_p;
ddi_irm_req_t *req_p;
ddi_intr_handle_impl_t hdl;
ASSERT(dip != NULL);
ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
((req_p = intr_p->devi_irm_req_p) != NULL) &&
((pool_p = req_p->ireq_pool_p) != NULL) &&
(pool_p->ipool_types & type)) {
return (pool_p);
}
bzero(&hdl, sizeof (ddi_intr_handle_impl_t));
hdl.ih_dip = dip;
hdl.ih_type = type;
if (i_ddi_intr_ops(dip, dip, DDI_INTROP_GETPOOL,
&hdl, (void *)&pool_p) == DDI_SUCCESS)
return (pool_p);
return (NULL);
}
/*
* i_ddi_irm_insertion_sort()
*
* Use the insertion sort method to insert a request into a list.
* The list is sorted in descending order by request size.
*/
static void
i_ddi_irm_insertion_sort(list_t *req_list, ddi_irm_req_t *req_p)
{
ddi_irm_req_t *next_p;
next_p = list_head(req_list);
while (next_p && (next_p->ireq_nreq > req_p->ireq_nreq))
next_p = list_next(req_list, next_p);
list_insert_before(req_list, next_p, req_p);
}
/*
* i_ddi_irm_notify()
*
* Notify a driver of changes to its interrupt request using the
* generic callback mechanism. Checks for errors in processing.
*/
static int
i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
{
ddi_cb_action_t action;
ddi_cb_t *cb_p;
uint_t nintrs;
int ret, count;
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: pool_p %p req_p %p\n",
(void *)pool_p, (void *)req_p));
/* Do not notify new or unchanged requests */
if ((req_p->ireq_navail == req_p->ireq_scratch) ||
(req_p->ireq_flags & DDI_IRM_FLAG_NEW))
return (DDI_SUCCESS);
/* Determine action and count */
if (req_p->ireq_navail > req_p->ireq_scratch) {
action = DDI_CB_INTR_ADD;
count = req_p->ireq_navail - req_p->ireq_scratch;
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: adding %d\n",
count));
} else {
action = DDI_CB_INTR_REMOVE;
count = req_p->ireq_scratch - req_p->ireq_navail;
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: removing %d\n",
count));
}
/* Lookup driver callback */
if ((cb_p = DEVI(req_p->ireq_dip)->devi_cb_p) == NULL) {
DDI_INTR_IRMDBG((CE_WARN, "i_ddi_irm_notify: no callback!\n"));
return (DDI_FAILURE);
}
/* Do callback */
ret = cb_p->cb_func(req_p->ireq_dip, action, (void *)(uintptr_t)count,
cb_p->cb_arg1, cb_p->cb_arg2);
/* Log callback errors */
if (ret != DDI_SUCCESS) {
cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
ddi_driver_name(req_p->ireq_dip),
ddi_get_instance(req_p->ireq_dip), (int)action, ret);
}
/* Check if the driver exceeds its availability */
nintrs = i_ddi_intr_get_current_nintrs(req_p->ireq_dip);
if (nintrs > req_p->ireq_navail) {
cmn_err(CE_WARN, "%s%d: failed to release interrupts "
"(nintrs=%d, navail=%d).\n",
ddi_driver_name(req_p->ireq_dip),
ddi_get_instance(req_p->ireq_dip), nintrs,
req_p->ireq_navail);
pool_p->ipool_resno += (nintrs - req_p->ireq_navail);
req_p->ireq_navail = nintrs;
return (DDI_FAILURE);
}
/* Update request */
req_p->ireq_scratch = req_p->ireq_navail;
return (DDI_SUCCESS);
}
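/*
 * Example (illustrative sketch only): the general shape of a driver
 * callback that honors these notifications; "xx_grow_intrs" and
 * "xx_shrink_intrs" are hypothetical helpers that would allocate or
 * free vectors via ddi_intr_alloc()/ddi_intr_free() and rebind
 * handlers accordingly.
 *
 *	static int
 *	xx_cb_func(dev_info_t *dip, ddi_cb_action_t action, void *cbarg,
 *	    void *arg1, void *arg2)
 *	{
 *		int count = (int)(uintptr_t)cbarg;
 *
 *		switch (action) {
 *		case DDI_CB_INTR_ADD:
 *			return (xx_grow_intrs(dip, count));
 *		case DDI_CB_INTR_REMOVE:
 *			return (xx_shrink_intrs(dip, count));
 *		default:
 *			return (DDI_ENOTSUP);
 *		}
 *	}
 */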
/*
* i_ddi_irm_debug_balance()
*
* A debug/test only routine to force the immediate,
* synchronous rebalancing of an interrupt pool.
*/
#ifdef DEBUG
void
i_ddi_irm_debug_balance(dev_info_t *dip, boolean_t wait_flag)
{
ddi_irm_pool_t *pool_p;
int type;
DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_debug_balance: dip %p wait %d\n",
(void *)dip, (int)wait_flag));
if (((type = i_ddi_intr_get_current_type(dip)) != 0) &&
((pool_p = i_ddi_intr_get_pool(dip, type)) != NULL)) {
mutex_enter(&pool_p->ipool_lock);
i_ddi_irm_enqueue(pool_p, wait_flag);
mutex_exit(&pool_p->ipool_lock);
}
}
#endif