pool.c revision e76e762ef75f893b9c9cd50e3212110e2dce7d6f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/pool_impl.h>
#include <sys/pool_pset.h>
#include <sys/id_space.h>
/*
* RESOURCE POOLS
*
* The resource pools facility brings together process-bindable resources into
* a common abstraction called a pool. Processor sets and other entities can
* be configured, grouped, and labelled such that workload components can be
* associated with a subset of a system's total resources.
*
* When disabled, the pools facility is "invisible". All processes belong
* to the same pool (pool_default), and processor sets can be managed through
* the old pset() system call. When enabled, processor sets can only be
* managed via the pools facility. New pools can be created and associated
* with processor sets. Processes can be bound to pools which have non-empty
* resource sets.
*
* Locking: pool_lock() protects global pools state and must be called
* before modifying the configuration, or when taking a snapshot of the
* configuration. If pool_lock_intr() is used, the operation may be
* interrupted by a signal or a request.
*
* To prevent processes from being rebound between pools while they are
* in the middle of an operation which affects resource set bindings, such
* operations must be surrounded by calls to pool_barrier_enter() and
* pool_barrier_exit(). This mechanism guarantees that such processes will
* be stopped either at the beginning or at the end of the barrier so that
* the rebind operation can atomically bind the process and its threads
* to new resource sets, and then let the process run again.
*
* Lock ordering with respect to other locks is as follows:
*
* pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
*
* Most static and global variables defined in this file are protected
* by calling pool_lock().
*
* The operation that binds tasks and projects to pools is atomic. That is,
* either all processes in a given task or a project will be bound to a
* new pool, or (in case of an error) they will be all left bound to the
* old pool. Processes in a given task or a given project can only be bound to
* different pools if they were rebound individually one by one as single
* processes. Threads or LWPs of the same process do not have pool bindings,
* and are bound to the same resource sets associated with the resource pool
* of that process.
*
* The following picture shows one possible pool configuration with three
* pools and three processor sets. Note that processor set "foo" is not
* associated with any pools and therefore cannot have any processes
* bound to it. Two pools (default and foo) are associated with the
* same processor set (default). Also, note that processes in Task 2
* are bound to different pools.
*
*
* Processor Sets
* +---------+
* +--------------+========================>| default |
* a| | +---------+
* s| | ||
* s| | +---------+
* o| | | foo |
* c| | +---------+
* i| | ||
* a| | +---------+
* t| | +------>| bar |
* e| | | +---------+
* d| | |
* | | |
* +---------+ +---------+ +---------+
* Pools | default |======| foo |======| bar |
* +---------+ +---------+ +---------+
* @ @ @ @ @ @
* b| | | | | |
* o| | | | | |
* u| +-----+ | +-------+ | +---+
* n| | | | | |
* ....d|........|......|......|.........|.......|....
* : | :: | | | :: | | :
* : +---+ :: +---+ +---+ +---+ :: +---+ +---+ :
* Processes : | p | :: | p | | p | | p | :: | p |...| p | :
* : +---+ :: +---+ +---+ +---+ :: +---+ +---+ :
* :........::......................::...............:
* Task 1 Task 2 Task N
* | | |
* | | |
* | +-----------+ | +-----------+
* +--| Project 1 |--+ | Project N |
* +-----------+ +-----------+
*
* This is just an illustration of relationships between processes, tasks,
* projects, pools, and processor sets. New types of resource sets will be
* added in the future.
*/
/*
* Global pools state. Per the locking notes above, most static and global
* variables in this file are protected by pool_lock().
*/
int pool_count; /* number of pools created on this system */
int pool_state; /* pools state -- enabled/disabled */
void *pool_buf; /* pre-commit snapshot of the pools state */
/*
* Running total of p_poolcnt values contributed by processes that have been
* marked PBWAIT; the bind path waits for it to drop to zero.
*/
static int pool_barrier_count; /* synch. with pool_barrier_* */
/*
* Boot-time pool initialization.
*
* Takes no arguments and returns nothing. On return pool_count is 1
* (presumably accounting for the default pool -- confirm).
*/
void
pool_init(void)
{
/*
* Initialize default pool.
*/
/*
* Initialize plugins for resource sets.
*/
/* Start the system-wide pool counter at one. */
pool_count = 1;
}
/*
* Synchronization routines.
*
* pool_lock is only called from syscall-level routines (processor_bind(),
* pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long
* periods of time, including across sleeping operations, so we allow its
* acquisition to be interruptible.
*
* The current thread that owns the "lock" is stored in the variable
* pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
*/
/*
* Acquire the pools "lock". This is the counterpart of pool_lock_intr()
* that has no way to report an interruption (it returns void).
*
* The caller must not already hold the lock. The loop runs for as long as
* pool_busy_thread is non-NULL, i.e. while some thread owns the lock.
*
* NOTE(review): the wait performed inside the loop and the recording of the
* new owner are not visible in this view -- confirm against the full source.
*/
void
pool_lock(void)
{
ASSERT(!pool_lock_held());
while (pool_busy_thread != NULL)
}
/*
* Interruptible variant of pool_lock(). The caller must not already hold
* the lock.
*
* Returns 0 once pool_busy_thread is no longer set by another owner, or 1
* from inside the wait loop (presumably when the wait was interrupted by a
* signal or a stop request -- confirm; the condition guarding that return
* is not visible in this view).
*/
int
pool_lock_intr(void)
{
ASSERT(!pool_lock_held());
while (pool_busy_thread != NULL) {
return (1);
}
}
return (0);
}
/*
* Report whether the calling thread currently owns the pools "lock".
*
* Returns non-zero when pool_busy_thread is curthread, zero otherwise.
*/
int
pool_lock_held(void)
{
	int held_by_me = (curthread == pool_busy_thread);

	return (held_by_me);
}
/*
* Release the pools "lock". Must be called by the thread that holds it.
*
* NOTE(review): only the ownership assertion is visible in this view; the
* statements that give up ownership are not shown -- confirm against the
* full source.
*/
void
pool_unlock(void)
{
ASSERT(pool_lock_held());
}
/*
* Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
* with pool_do_bind().
*
* Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
* operations which modify pool or pset associations. They can be called
* while the process is multi-threaded. In the common case, when current
* process is not being rebound (PBWAIT flag is not set), these functions
* will be just incrementing and decrementing reference counts.
*/
/*
* Enter the pool barrier.
*
* While PBWAIT is set in p_poolflag (a rebind of this process is pending)
* the caller is held off; p_poolcnt is incremented to record one more
* thread inside the barrier.
*
* NOTE(review): the declaration of p, the locking, and the wait performed
* in the loop body are not visible in this view. As the text reads here the
* increment is the loop body, which is unlikely to be the intent -- confirm
* against the full source.
*/
void
pool_barrier_enter(void)
{
while (p->p_poolflag & PBWAIT)
p->p_poolcnt++;
}
/*
* Leave the pool barrier: drop p_poolcnt and, if a rebind is pending
* (PBWAIT set), take part in the hand-off with the binding thread and then
* wait for PBWAIT to clear.
*
* NOTE(review): the declaration of p, the pool_barrier_count decrement, the
* action taken when that count reaches zero, and the wait inside the final
* loop are not visible in this view -- confirm against the full source.
*/
void
pool_barrier_exit(void)
{
/* One fewer thread inside the barrier. */
p->p_poolcnt--;
if (p->p_poolflag & PBWAIT) {
ASSERT(pool_barrier_count > 0);
/* Count has drained: this was the last thread being waited for. */
if (pool_barrier_count == 0)
/* Remain here for as long as PBWAIT stays set. */
while (p->p_poolflag & PBWAIT)
}
}
/*
* Enable pools facility.
*
* Caller must hold pool_lock(). The processor-set side is enabled first via
* pool_pset_enable(); if that fails its error is returned unchanged.
*
* Returns 0 on success or the non-zero value from pool_pset_enable().
*
* NOTE(review): the calls that the string arguments below belong to are not
* visible in this view; from the literals they appear to seed default
* properties ("pool.name" = "pool_default", "pool.importance" = 1) --
* confirm against the full source.
*/
static int
pool_enable(void)
{
int ret;
ASSERT(pool_lock_held());
/* Bail out early if the processor-set subsystem cannot be enabled. */
ret = pool_pset_enable();
if (ret != 0)
return (ret);
"default");
"wt-load");
"pool.name", "pool_default");
"pool.importance", 1);
return (ret);
}
/*
* Disable pools facility.
*
* Caller must hold pool_lock().
*
* Returns EBUSY when the facility cannot be disabled (the guarding condition
* is not visible in this view -- confirm), the non-zero value from
* pool_pset_disable() if that call fails, and 0 on success.
*
* NOTE(review): the cleanup done when pool_sys_prop is non-NULL is not
* visible in this view.
*/
static int
pool_disable(void)
{
int ret;
ASSERT(pool_lock_held());
return (EBUSY);
/* Disable the processor-set side; propagate its error untouched. */
ret = pool_pset_disable();
if (ret != 0)
return (ret);
if (pool_sys_prop != NULL) {
}
}
return (0);
}
/*
* Look up a pool by name.
*
* name: name to search for.
*
* Caller must hold pool_lock(). Returns a pool_t pointer when a pool is
* found and NULL otherwise.
*
* NOTE(review): the iteration over pools and the comparison against "name"
* are not visible in this view -- confirm how a match is decided.
*/
pool_t *
pool_lookup_pool_by_name(char *name)
{
char *p;
ASSERT(pool_lock_held());
return (pool);
}
return (NULL);
}
pool_t *
{
ASSERT(pool_lock_held());
return (pool);
}
return (NULL);
}
/*
* Create new pool, associate it with default resource sets, and give
* it a temporary name.
*/
static int
{
char pool_name[40];
ASSERT(pool_lock_held());
pool_pool_mod = gethrtime();
pool_count++;
return (0);
}
/*
* Argument block used on the pool-destroy path, where a local of this type
* (dzarg) is declared.
*
* NOTE(review): no members are visible in this view -- confirm the field
* list against the full source.
*/
struct destroy_zone_arg {
};
/*
* Update pool pointers for zones that are currently bound to pool "old"
* to be bound to pool "new".
*/
static int
{
ASSERT(pool_lock_held());
return (0);
}
/*
* Destroy specified pool, and rebind all processes in it
* to the default pool.
*/
static int
{
int ret;
ASSERT(pool_lock_held());
if (poolid == POOL_DEFAULT)
return (EINVAL);
return (ESRCH);
if (ret == 0) {
struct destroy_zone_arg dzarg;
pool_count--;
pool_pool_mod = gethrtime();
}
return (ret);
}
/*
* Create new pool or resource set.
*/
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (class) {
case PEC_POOL:
break;
case PEC_RES_COMP:
switch (subclass) {
case PREC_PSET:
break;
default:
}
break;
case PEC_RES_AGG:
break;
default:
}
return (ret);
}
/*
* Destroy an existing pool or resource set.
*/
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (class) {
case PEC_POOL:
break;
case PEC_RES_COMP:
switch (subclass) {
case PREC_PSET:
break;
default:
}
break;
case PEC_RES_AGG:
break;
default:
}
return (ret);
}
/*
* Enable or disable pools.
*
* status: requested state; POOL_ENABLED and POOL_DISABLED are the two values
* handled explicitly.
*
* Caller must hold pool_lock(). Returns 0 immediately when pool_state
* already equals the requested status. Otherwise returns the non-zero error
* from pool_enable() or pool_disable() if the transition fails, and ret
* (0 on the visible success paths) at the end.
*
* NOTE(review): the statement under "default:" and any update of pool_state
* after a successful transition are not visible in this view -- confirm.
*/
int
pool_status(int status)
{
int ret = 0;
ASSERT(pool_lock_held());
/* Nothing to do if we are already in the requested state. */
if (pool_state == status)
return (0);
switch (status) {
case POOL_ENABLED:
ret = pool_enable();
if (ret != 0)
return (ret);
break;
case POOL_DISABLED:
ret = pool_disable();
if (ret != 0)
return (ret);
break;
default:
}
return (ret);
}
/*
* Associate pool with resource set.
*/
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (idtype) {
case PREC_PSET:
break;
default:
}
if (ret == 0)
pool_pool_mod = gethrtime();
return (ret);
}
/*
* Disassociate resource set from pool.
*/
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (idtype) {
case PREC_PSET:
break;
default:
}
if (ret == 0)
pool_pool_mod = gethrtime();
return (ret);
}
/*
* Transfer specified quantity of resources between resource sets.
*/
/*ARGSUSED*/
int
{
return (ret);
}
/*
* Transfer resources specified by their IDs between resource sets.
*/
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (type) {
case PREC_PSET:
break;
default:
}
return (ret);
}
/*
* Bind processes to pools.
*/
int
{
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
return (ESRCH);
switch (idtype) {
case P_PID:
case P_TASKID:
case P_PROJID:
case P_ZONEID:
break;
default:
return (EINVAL);
}
}
/*
* Query pool binding of the specified process.
*/
int
{
proc_t *p;
return (ENOTSUP);
ASSERT(pool_lock_held());
return (ESRCH);
}
mutex_enter(&p->p_lock);
/*
* In local zones, lie about pool bindings of processes from
* the global zone.
*/
} else {
}
mutex_exit(&p->p_lock);
return (0);
}
/*
* Build and return the exacct object (eo_system) describing the system
* element.
*
* Caller must hold pool_lock(). What gets attached depends on whether the
* calling process is in the global zone (INGLOBALZONE(curproc)); in both
* arms the attached item is sizeof (hrtime_t) bytes long.
*
* NOTE(review): the creation of eo_system and the full argument lists of the
* attach calls are not visible in this view -- confirm which values are
* attached in each arm.
*/
static ea_object_t *
pool_system_pack(void)
{
ASSERT(pool_lock_held());
if (INGLOBALZONE(curproc))
sizeof (hrtime_t),
else
(void) ea_attach_item(eo_system,
sizeof (hrtime_t),
return (eo_system);
}
/*
* Pack information about pools and attach it to specified exacct group.
*/
static int
{
char *buf;
ASSERT(pool_lock_held());
continue;
bufsz = 0;
NV_ENCODE_NATIVE, 0);
}
return (0);
}
/*
* Pack the whole pool configuration in the specified buffer.
*/
int
{
int ret = 0;
ASSERT(pool_lock_held());
else
return (ret);
}
/*
* Start or finish a commit transaction. If a commit transaction is currently
* in progress, then all POOL_QUERY ioctls will return pools configuration
* at the beginning of transaction.
*
* state: 1 to begin a commit transaction, 0 to finish one.
*
* Caller must hold pool_lock(). Returns ENOTACTIVE when pools are disabled,
* EBUSY from the "begin" arm (presumably when a transaction is already
* open -- confirm; the guarding condition is not visible in this view), and
* otherwise ret.
*
* NOTE(review): the snapshot handling around pool_bufsz and the statement
* under "default:" are not visible in this view.
*/
int
pool_commit(int state)
{
int ret = 0;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (state) {
case 1:
/*
* Beginning commit transaction.
*/
return (EBUSY);
break;
case 0:
/*
* Finishing commit transaction.
*/
pool_bufsz = 0;
}
break;
default:
}
return (ret);
}
/*
* Check if the specified property is special.
*/
static pool_property_t *
{
return (prop);
return (NULL);
}
/*
* Table of special "system.*" properties. Each entry gives a property name,
* its data type (DATA_TYPE_*) and a set of PP_* flags; every entry here is
* PP_RDWR | PP_OPTIONAL. The table ends with an entry whose name is NULL.
*
* NOTE(review): the meaning of the three initializer positions is taken from
* their values; the pool_property_t definition is not visible in this view.
*/
static pool_property_t pool_prop_sys[] = {
{ "system.allocate-method", DATA_TYPE_STRING,
PP_RDWR | PP_OPTIONAL },
{ "system.poold.log-level", DATA_TYPE_STRING,
PP_RDWR | PP_OPTIONAL },
{ "system.poold.log-location", DATA_TYPE_STRING,
PP_RDWR | PP_OPTIONAL },
{ "system.poold.monitor-interval", DATA_TYPE_UINT64,
PP_RDWR | PP_OPTIONAL },
{ "system.poold.history-file", DATA_TYPE_STRING,
PP_RDWR | PP_OPTIONAL },
{ "system.poold.objectives", DATA_TYPE_STRING,
PP_RDWR | PP_OPTIONAL },
{ NULL, 0, 0 }
};
/*
* Table of special "pool.*" properties, in the same layout as the system
* table: name, data type (DATA_TYPE_*), PP_* flags. It currently holds only
* "pool.scheduler" and ends with an entry whose name is NULL.
*/
static pool_property_t pool_prop_pool[] = {
{ "pool.scheduler", DATA_TYPE_STRING,
PP_RDWR | PP_OPTIONAL },
{ NULL, 0, 0 }
};
/*
* Common routine to put new property on the specified list
*/
int
{
/*
* No read-only properties or properties with bad types
*/
return (EINVAL);
}
}
/*
* Common routine to remove property from the given list
*/
int
{
return (EINVAL);
}
}
static int
{
int ret;
ASSERT(pool_lock_held());
if (ret == 0)
pool_sys_mod = gethrtime();
return (ret);
}
/*
* Remove the property called "name" from the system element.
*
* name: name of the property to remove.
*
* Caller must hold pool_lock(). On success (ret == 0) pool_sys_mod is
* stamped with gethrtime(). Returns ret.
*
* NOTE(review): the call that assigns ret is not visible in this view, so
* the possible error values cannot be listed here -- confirm against the
* full source.
*/
static int
pool_system_proprm(char *name)
{
int ret;
ASSERT(pool_lock_held());
if (ret == 0)
pool_sys_mod = gethrtime();
return (ret);
}
static int
{
int ret;
ASSERT(pool_lock_held());
return (ESRCH);
if (ret == 0)
pool_pool_mod = gethrtime();
return (ret);
}
static int
{
int ret;
ASSERT(pool_lock_held());
return (ESRCH);
if (ret == 0)
pool_pool_mod = gethrtime();
return (ret);
}
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (class) {
case PEC_SYSTEM:
break;
case PEC_POOL:
break;
case PEC_RES_COMP:
switch (subclass) {
case PREC_PSET:
break;
default:
}
break;
case PEC_RES_AGG:
break;
case PEC_COMP:
switch (subclass) {
case PCEC_CPU:
break;
default:
}
break;
default:
}
return (ret);
}
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (class) {
case PEC_SYSTEM:
break;
case PEC_POOL:
break;
case PEC_RES_COMP:
switch (subclass) {
case PREC_PSET:
break;
default:
}
break;
case PEC_RES_AGG:
break;
case PEC_COMP:
switch (subclass) {
case PCEC_CPU:
break;
default:
}
break;
default:
}
return (ret);
}
int
{
int ret;
ASSERT(pool_lock_held());
if (pool_state == POOL_DISABLED)
return (ENOTACTIVE);
switch (class) {
case PEC_SYSTEM:
case PEC_POOL:
break;
case PEC_RES_COMP:
switch (subclass) {
case PREC_PSET:
break;
default:
}
break;
case PEC_RES_AGG:
break;
case PEC_COMP:
switch (subclass) {
case PCEC_CPU:
break;
default:
}
break;
default:
}
if (ret == 0)
else
return (ret);
}
/*
* pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
* in case of failure in pool_do_bind().
*
* pool_bind_wake(p) is also used to release a process once it has been
* dealt with by the bind code. Under p->p_lock it removes p's in-barrier
* thread count (p_poolcnt) from pool_barrier_count and clears PBWAIT in
* p_poolflag. Caller must hold pool_lock().
*
* NOTE(review): no wakeup of threads waiting for PBWAIT to clear is visible
* in this view -- confirm against the full source.
*/
static void
pool_bind_wake(proc_t *p)
{
ASSERT(pool_lock_held());
mutex_enter(&p->p_lock);
/* Stop counting this process's in-barrier threads. */
if (p->p_poolcnt > 0) {
pool_barrier_count -= p->p_poolcnt;
}
p->p_poolflag &= ~PBWAIT;
mutex_exit(&p->p_lock);
}
static void
{
ASSERT(pool_lock_held());
pool_bind_wake(p);
}
/*
* Return the scheduling class for this pool, or
* POOL_CLASS_UNSET if not set
* POOL_CLASS_INVAL if set to an invalid class ID.
*/
{
char *name;
ASSERT(pool_lock_held());
&name) == 0) {
return (cid);
else
return (POOL_CLASS_INVAL);
}
return (POOL_CLASS_UNSET);
}
/*
* Move process to the new scheduling class.
*/
static void
{
kthread_t *t;
void *cldata;
void **bufs;
void **buf;
int nlwp;
int ret;
int i;
/*
* Do not move kernel processes (such as zsched).
*/
return;
/*
* This process is in the pool barrier, so it can't possibly be
* adding new threads and we can use p_lwpcnt + p_zombcnt + 1
* (for possible agent LWP which doesn't use pool barrier) as
* our upper bound.
*/
/*
* Pre-allocate scheduling class specific buffers before
* grabbing p_lock.
*/
}
/*
* Move threads one by one to the new scheduling class.
* This never fails because we have all the right
* privileges here.
*/
mutex_enter(&p->p_lock);
t = p->p_tlist;
do {
}
mutex_exit(&p->p_lock);
/*
* Free unused scheduling class specific buffers.
*/
}
}
}
/*
* The meat of the bind operation. The steps in pool_do_bind are:
*
* 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
* such processes to an array. For any interesting process that has
* threads inside the pool barrier set, increment a counter by the
* count of such threads. Once PBWAIT is set on a process, that process
* will not disappear.
*
* 2) Wait for the counter from step 1 to drop to zero. Any process which
* calls pool_barrier_exit() and notices that PBWAIT has been set on it
* will decrement that counter before going to sleep, and the process
* calling pool_barrier_exit() which does the final decrement will wake us.
*
* 3) For each interesting process, perform a calculation on it to see if
* the bind will actually succeed. This uses the following three
* resource-set-specific functions:
*
* - int set_bind_start(procs, pool)
*
* Determine whether the given array of processes can be bound to the
* resource set associated with the given pool. If it can, take and hold
* any locks necessary to ensure that the operation will succeed, and
* make any necessary reservations in the target resource set. If it
* can't, return failure with no reservations made and no new locks held.
*
* - void set_bind_abort(procs, pool)
*
* set_bind_start() has completed successfully, but another resource set's
* set_bind_start() has failed, and we haven't begun the bind yet. Undo
* any reservations made and drop any locks acquired by our
* set_bind_start().
*
* - void set_bind_finish(void)
*
* The bind has completed successfully. The processes have been released,
* and the reservation acquired in set_bind_start() has been depleted as
* the processes have finished their bindings. Drop any locks acquired by
* set_bind_start().
*
* 4) If we've decided that we can proceed with the bind, iterate through
* the list of interesting processes, grab the necessary locks (which
* may differ per resource set), perform the bind, and ASSERT that it
* succeeds. Once a process has been rebound, it can be awakened.
*
* The operations from step 4 must be kept in sync with anything which might
* cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
* are thus located in the same source files as the associated bind operations.
*/
int
{
int procs_count = 0;
int procs_size;
int rv = 0;
proc_t *p;
ASSERT(pool_lock_held());
return (EINVAL);
return (ESRCH);
return (EBUSY);
}
}
return (ESRCH);
}
/*
* Fast-path for a single process case.
*/
} else {
/*
* We will need enough slots for proc_t pointers for as many as
* twice the number of currently running processes (assuming
* that each one could be in fork() creating a new child).
*/
for (;;) {
KM_SLEEP);
break;
/*
* If nproc has changed, try again.
*/
}
}
/*
* Do a first scan, and select target processes.
*/
else
mutex_enter(&p->p_lock);
/*
* Skip processes that don't match our (id, idtype) set or
* are on their way to becoming zombies. Skip kernel processes
* from the global zone.
*/
p->p_poolflag & PEXITED ||
mutex_exit(&p->p_lock);
continue;
}
if (!INGLOBALZONE(p)) {
switch (idtype) {
case P_PID:
case P_TASKID:
/*
* Can't bind processes or tasks
* in local zones to pools.
*/
mutex_exit(&p->p_lock);
goto out;
case P_PROJID:
/*
* Only projects in the global
* zone can be rebound.
*/
mutex_exit(&p->p_lock);
continue;
case P_POOLID:
/*
* When rebinding pools, processes can be
* in different zones.
*/
break;
}
}
p->p_poolflag |= PBWAIT;
/*
* If some threads in this process are inside the pool
* barrier, add them to pool_barrier_count, as we have
* to wait for all of them to exit the barrier.
*/
if (p->p_poolcnt > 0) {
pool_barrier_count += p->p_poolcnt;
}
*pp++ = p;
procs_count++;
mutex_exit(&p->p_lock);
/*
* We just found our process, so if we're only rebinding a
* single process then get out of this loop.
*/
break;
}
/*
* Wait for relevant processes to stop before they try to enter the
* barrier or at the exit from the barrier. Make sure that we do
* not get stopped here while we're holding pool_lock. If we were
* requested to stop, or got a signal then return EAGAIN to let the
* library know that it needs to retry.
*/
lwp->lwp_nostop++;
while (pool_barrier_count > 0) {
if (pool_barrier_count > 0) {
/*
* We either got a signal or were requested to
* stop by /proc. Bail out with EAGAIN. If we were
* requested to stop, we'll stop in post_syscall()
* on our way back to userland.
*/
lwp->lwp_nostop--;
goto out;
}
}
lwp->lwp_nostop--;
goto skip;
/*
* Do another run, and drop processes that were inside the barrier
* in exit(), but when they have dropped to pool_barrier_exit
* they have become of no interest to us. Pick up child processes that
* were created by fork() but didn't exist during our first scan.
* Their parents are now stopped at pool_barrier_exit in cfork().
*/
if (p->p_poolflag & PEXITED) {
pool_bind_wake(p);
procs_count--;
pp--; /* try this slot again */
continue;
}
/*
* Look at the child and check if it should be rebound also.
* We're holding pidlock, so it is safe to reference p_child.
*/
continue;
mutex_enter(&p->p_lock);
/*
* Skip processes in local zones if we're not binding
* zones to pools (P_ZONEID). Skip kernel processes also.
*/
mutex_exit(&p->p_lock);
continue;
}
/*
* If the child process has been already created by fork(), has
* not exited, and has not been added to the list already,
* then add it now. We will hit this process again (since we
* stick it at the end of the procs list) but it will be ignored
* because it will have the PBWAIT flag set.
*/
!(p->p_poolflag & PEXITED) &&
!(p->p_poolflag & PBWAIT)) {
procs[procs_count] = p;
procs_count++;
p->p_poolflag |= PBWAIT;
}
mutex_exit(&p->p_lock);
}
skip:
/*
* If there's no processes to rebind then return ESRCH, unless
* we're associating a pool with new resource set, destroying it,
* or binding a zone to a pool.
*/
if (procs_count == 0) {
rv = 0;
else
goto out;
}
#ifdef DEBUG
/*
* All processes in the array should have PBWAIT set, and none should
* be in the critical section. Even though p_poolflag is protected by
* the p_lock, these assertions should be stable across the dropping of
* p_lock.
*/
}
#endif
/*
* Do the check if processor set rebinding is going to succeed or not.
*/
if ((flags & POOL_BIND_PSET) &&
goto out;
}
/*
* At this point, all bind operations should succeed.
*/
if (flags & POOL_BIND_PSET) {
void *zonebuf;
void *projbuf;
/*
* Pre-allocate one buffer for FSS (per-project
* buffer for a new pset) in case this is the
* first thread from its current project getting
* bound to this processor set.
*/
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
/*
* Free buffers pre-allocated above if they
* weren't actually used.
*/
}
/*
* Now let's change the scheduling class of this
* process if our target pool has it defined.
*/
if (cid != POOL_CLASS_UNSET)
pool_change_class(p, cid);
/*
* It is safe to reference p_pool here without holding
* p_lock because it cannot change underneath of us.
* We're holding pool_lock here, so nobody else can be
* moving this process between pools. If process "p"
* would be exiting, we're guaranteed that it would be blocked
* at pool_barrier_enter() in exit(). Otherwise, it would've
* been skipped by one of our scans of the practive list
* as a process with PEXITED flag set.
*/
}
/*
* Okay, we've tortured this guy enough.
* Let this poor process go now.
*/
pool_bind_wake(p);
}
if (flags & POOL_BIND_PSET)
case P_PROJID:
break;
case P_ZONEID:
if (rv == 0) {
}
break;
}
ASSERT(pool_barrier_count == 0);
return (rv);
}