/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/sysmacros.h>
#include <sys/cpucaps_impl.h>
/*
* CPU Caps implementation
* =======================
*
* A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
* usage for all projects running inside the zone. If the zone CPU cap is set
* below the project CPU cap, the latter will have no effect.
*
* When CPU usage of projects and/or zones reaches specified caps, threads in
* them do not get scheduled and instead are placed on wait queues associated
* with a cap. Such threads will start running again only when CPU usage drops
* below the cap level. Each zone and each project has its own wait queue.
*
* When CPU cap is set, the kernel continuously keeps track of CPU time used
* by capped zones and/or projects over short time intervals and calculates
* their current CPU usage as a percentage. When the accumulated usage reaches the CPU
* cap, LWPs running in the user-land (when they are not holding any critical
* kernel locks) are placed on special wait queues until their project's or
* zone's CPU usage drops below the cap.
*
* The system maintains a list of all capped projects and all capped zones. On
* every clock tick every active thread belonging to a capped project adds its
* CPU usage to its project. Usage from all projects belonging to a capped zone
* is aggregated to get the zone usage.
*
* When the current CPU usage is above the cap, a project or zone is considered
* over-capped. Every user thread caught running in an over-capped project or
* zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
* is requested to surrender its CPU. This causes scheduling class specific
* CL_PREEMPT() callback to be invoked. The callback function places threads
* marked as TS_PROJWAITQ or TS_ZONEWAITQ on a wait queue and calls swtch().
*
* Threads are only placed on wait queues after trapping from user-land
* (they could be holding some user locks, but no kernel locks) and while
* returning from the trap back to the user-land when no kernel locks are held.
* Putting threads on wait queues in random places while running in the
* kernel might lead to all kinds of locking problems.
*
* Accounting
* ==========
*
* Accounting of CPU usage is based on per-thread micro-state accounting data.
* On every clock tick clock() adds new on-CPU time for every thread found on
* CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
* "New time" means the time since the thread was last accounted for. On-CPU times greater
* than 1 tick are truncated to 1 tick.
*
* Project CPU usage is aggregated from all threads within the project.
* Zone CPU usage is the sum of usages for all projects within the zone. Zone
* CPU usage is calculated on every clock tick by walking list of projects and
* adding their usage together.
*
* Decay
* =====
*
* CPU usage is decayed by the caps_update() routine which is called once per
* every clock tick. It walks lists of project caps and decays their usages by
* one per cent. If CPU usage drops below cap levels, threads on the wait queue
* are made runnable again, one thread per clock tick.
*
* Interfaces
* ==========
*
* The CPU Caps facility provides the following interfaces to the rest of the
* system:
*
* cpucaps_project_add(kproject_t *)
*
* Notifies the framework of a new project. It should be put on the
* capped_projects list if its zone has a cap.
*
* cpucaps_project_remove(kproject_t *)
*
* Remove the association between the specified project and its cap.
* Called right before the project is destroyed.
*
* cpucaps_project_set(kproject_t *, rctl_qty_t)
*
* Set project cap of the specified project to the specified value. Setting the
* value to NOCAP is equivalent to removing the cap.
*
* cpucaps_zone_set(zone_t *, rctl_qty_t)
*
* Set zone cap of the specified zone to the specified value. Setting the value
* to NOCAP is equivalent to removing the cap.
*
* cpucaps_zone_remove(zone_t *)
*
* Remove the association between the zone and its cap.
*
* cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
*
* Charges specified thread's project the amount of on-CPU time that it used.
* If the third argument is CPUCAPS_CHARGE_ONLY returns False.
* Otherwise returns True if project or zone should be penalized because its
* project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
* bits in t_schedflag in this case.
*
* CPUCAPS_ENFORCE(kthread_id_t *)
*
* Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
* state on project or zone wait queues, as requested by TS_PROJWAITQ or
* TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
* wait queue or False otherwise.
*
* cpucaps_sc_init(caps_sc_t *)
*
* Initializes the scheduling-class specific CPU Caps data for a thread.
*
* LOCKS
* =====
*
* All the individual caps structures and their lists are protected by a global
* caps_lock mutex. The lock is grabbed either by clock() or by events modifying
* caps, so it is usually uncontended. We avoid all blocking memory allocations
* while holding caps_lock to prevent clock() from blocking.
*
* Thread state is protected by the thread lock. It protects the association
* between a thread and its project and, as a consequence, to its zone. The
* association can not break while thread lock is held, so the project or zone
* cap are not going to disappear while thread lock is held.
*
* Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
* grabbed by scheduling classes already holding thread lock at high PIL and by
* clock thread performing usage decay. We should do as little work as possible
* while holding the lock since it may be very hot. All threads in the project
* contend for the same cache line doing cap usage updates.
*/
/*
* caps_lock protects list of capped projects and zones, changes in the cap
* state and changes of the global cpucaps_enabled flag.
*
* Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
* modified in parallel. This could be a per-zone cap flag, but we don't keep any
* cap state for now.
*/
/*
* The accounting is based on the number of nanoseconds threads spend running
* during a tick which is kept in the cap_tick_cost variable.
*/
/*
* How much of the usage value is decayed every clock tick
* Decay one per cent of value per tick
*/
/*
* Scale the value and round it to the closest integer value
*/
/*
 * Forward declaration of the per-tick usage decay/update routine (defined
 * later in this file).  Use an explicit (void) prototype: an empty parameter
 * list is the obsolescent K&R form and declares a function with unspecified
 * parameters rather than one taking no arguments.
 */
static void caps_update(void);
/*
* CAP kstats.
*/
/*
 * Template of per-cap kstat data exported for observability (one kstat per
 * capped project/zone).
 *
 * NOTE(review): the struct body is empty here but the initializer supplies
 * seven entries -- the kstat_named_t member declarations (value, usage,
 * nwait, below_sec, above_sec, maxusage, zonename) appear to have been
 * stripped from this copy. Restore them from the complete cpucaps.c; as
 * written this aggregate initializer cannot compile. TODO: confirm.
 */
struct cap_kstat {
} cap_kstat = {
{ "value", KSTAT_DATA_UINT64 },
{ "usage", KSTAT_DATA_UINT64 },
{ "nwait", KSTAT_DATA_UINT64 },
{ "below_sec", KSTAT_DATA_UINT64 },
{ "above_sec", KSTAT_DATA_UINT64 },
{ "maxusage", KSTAT_DATA_UINT64 },
{ "zonename", KSTAT_DATA_STRING },
};
/* kstat update callback shared by all cap kstats; defined at end of file. */
static int cap_kstat_update(kstat_t *, int);
/*
* Initialize CPU caps infrastructure.
* - Initialize lists of capped zones and capped projects
* - Set cpucaps_clock_callout to NULL
*/
/*
 * NOTE(review): truncated definition -- the function name and parameter list
 * (presumably cpucaps_init(void), per the comment above) and the body that
 * initializes the capped-zone/capped-project lists and clears
 * cpucaps_clock_callout are missing from this copy. Restore from the
 * complete cpucaps.c. TODO: confirm.
 */
void
{
/*
 * Initialize global variables
 */
}
/*
* Initialize scheduling-class specific CPU Caps data.
*/
/*
 * NOTE(review): truncated definition -- the name/parameter line is missing;
 * per the interface comment earlier in the file this is
 * cpucaps_sc_init(caps_sc_t *csc). Only the accumulated-on-CPU-time reset
 * survives. TODO: restore from the complete cpucaps.c.
 */
void
{
/* Start with zero accumulated on-CPU time for this thread. */
csc->csc_cputime = 0;
}
/*
* Allocate and initialize cpucap structure
*/
/*
 * NOTE(review): truncated definition -- the declaration and allocation of
 * `cap` (presumably a kmem_zalloc of a cpucap_t plus waitq/kstat init) are
 * missing from this copy; only the return survives. TODO: restore from the
 * complete cpucaps.c.
 */
static cpucap_t *
cap_alloc(void)
{
return (cap);
}
/*
* Free cpucap structure
*/
/*
 * NOTE(review): truncated definition -- the name/parameter line, the NULL
 * check guarding the early return, the inactivity ASSERTs that the comment
 * below refers to, and the actual free are missing from this copy. TODO:
 * restore from the complete cpucaps.c.
 */
static void
{
return;
/*
 * This cap should not be active
 */
}
/*
* Activate cap - insert into active list and unblock its
* wait queue. Should be called with caps_lock held.
* The cap_value field is set to the value supplied.
*/
/*
 * NOTE(review): truncated definition -- the signature (a list `l`, a cap,
 * and a value, per the header comment above), the not-already-enabled
 * ASSERT, the cap_value assignment, the waitq unblock, and the body of the
 * CPUCAPS_OFF() branch (globally enabling caps / installing the clock
 * callout, presumably) are missing from this copy. What survives: insert
 * the cap on list `l` and reset its observed maximum usage. TODO: restore
 * from the complete cpucaps.c.
 */
static void
{
/*
 * Cap can not be already enabled
 */
list_insert_tail(l, cap);
cap->cap_maxusage = 0;
if (CPUCAPS_OFF()) {
}
}
/*
* Deactivate cap
* - Block its wait queue. This prevents any new threads from being
* enqueued there and moves all enqueued threads to the run queue.
* - Remove cap from list l.
* - Disable CPU caps globally if there are no capped projects or zones
*
* Should be called with caps_lock held.
*/
/*
 * NOTE(review): truncated definition -- the signature, the waitq blocking,
 * the cap-state clearing, and the global-disable branch are missing, and
 * the brace nesting is visibly unbalanced (two stray closing braces).
 * What survives: assert caps are globally on and remove the cap from list
 * `l`. TODO: restore from the complete cpucaps.c.
 */
static void
{
/*
 * Cap should be currently active
 */
ASSERT(CPUCAPS_ON());
list_remove(l, cap);
}
}
}
/*
* Enable cap for a project kpj
* It is safe to enable already enabled project cap.
* Should be called with caps_lock held.
*/
/*
 * NOTE(review): truncated definition -- the signature, the kstat_create()
 * call whose argument tail (named-kstat count, KSTAT_FLAG_VIRTUAL) survives
 * below, the kstat setup inside the != NULL branch, and the cap_enable()
 * call are missing from this copy. TODO: restore from the complete
 * cpucaps.c.
 */
static void
{
if (CAP_DISABLED(cap)) {
/*
 * Create cap kstats
 */
sizeof (cap_kstat) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
}
}
}
/*
* Disable project cap.
* It is safe to disable already disabled project cap.
* Should be called with caps_lock held.
*/
/*
 * NOTE(review): truncated definition -- the signature and the statement
 * executed when CAP_ENABLED(cap) holds (presumably cap_disable() on the
 * capped_projects list) are missing from this copy. TODO: restore from the
 * complete cpucaps.c.
 */
static void
{
if (CAP_ENABLED(cap))
}
/*
* Enable cap for a zone
* It is safe to enable already enabled zone cap.
* Should be called with caps_lock held.
*/
/*
 * NOTE(review): truncated definition -- parallel to the project-enable
 * routine above: the signature, the kstat_create() call whose argument tail
 * survives below, the kstat setup, and the cap_enable() call are missing
 * from this copy. TODO: restore from the complete cpucaps.c.
 */
static void
{
if (CAP_DISABLED(cap)) {
/*
 * Create cap kstats
 */
sizeof (cap_kstat) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
}
}
}
/*
* Disable zone cap.
* It is safe to disable already disabled zone cap.
* Should be called with caps_lock held.
*/
/*
 * NOTE(review): truncated definition -- the signature and the statement
 * executed when CAP_ENABLED(cap) holds (presumably cap_disable() on the
 * capped_zones list) are missing from this copy. TODO: restore from the
 * complete cpucaps.c.
 */
static void
{
if (CAP_ENABLED(cap))
}
/*
* Apply specified callback to all caps contained in the list `l'.
*/
/*
 * NOTE(review): truncated definition -- the signature (a list and a
 * callback, per the comment above), the iteration loop, and the callback
 * invocation are missing; the brace nesting is unbalanced (stray closing
 * brace). TODO: restore from the complete cpucaps.c.
 */
static void
{
}
}
/*
* If cap limit is not reached, make one thread from wait queue runnable.
* The waitq_isempty check is performed without the waitq lock. If a new thread
* is placed on the waitq right after the check, it will be picked up during the
* next invocation of cap_poke_waitq().
*/
/*
 * NOTE(review): truncated definition -- the signature (per the call site
 * later: cap_poke_waitq(cap, 0), so a cap plus an unused second argument,
 * hence ARGSUSED), the usage-vs-cap comparison, the if-branch body, and
 * the waitq_runone()-style wakeup guarded by !waitq_isempty(wq) are missing
 * from this copy. TODO: restore from the complete cpucaps.c.
 */
/* ARGSUSED */
static void
{
} else {
if (!waitq_isempty(wq))
}
}
/*
* The callback function called for every cap on capped_projects list.
* Decay cap usage by CAP_DECAY_FACTOR
* Add this cap project usage to its zone usage.
* Kick off a thread from the cap waitq if cap is not reached.
*/
/*
 * NOTE(review): truncated definition -- the signature, the CAP_REACHED
 * flag set/clear logic, the zone-usage aggregation and cap_gen first-touch
 * reset, the overflow check, and the usage decay expression are all missing
 * from this copy; only cap_poke_waitq(cap, 0) and the ZONE_IS_CAPPED(zone)
 * test survive. TODO: restore from the complete cpucaps.c.
 */
static void
{
/*
 * Set or clear the CAP_REACHED flag based on the current usage.
 * Only projects having their own caps are ever marked as CAP_REACHED.
 */
cap_poke_waitq(cap, 0);
/*
 * Add project's CPU usage to our zone's CPU usage.
 */
if (ZONE_IS_CAPPED(zone)) {
/*
 * If we haven't reset this zone's usage during this clock tick
 * yet, then do it now. The cap_gen field is used to check
 * whether this is the first zone's project we see during this
 * tick or a subsequent one.
 */
}
/* Check for overflows */
}
/*
 * Decay project usage.
 */
}
/*
* On every clock tick walk the list of project caps and update the CPU usage.
* Also walk the list of zone caps checking whether any threads should
* transition from wait queue to run queue.
*
* This function gets called by the clock thread directly when there are any
* defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
* caps_lock for long periods of time, so there should be almost no contention
* for it.
*/
/*
 * NOTE(review): truncated definition -- the caps_update() name line and the
 * entire body (grab caps_lock, walk capped_projects with the usage walker,
 * walk capped_zones poking wait queues, per the comment above) are missing
 * from this copy. TODO: restore from the complete cpucaps.c.
 */
static void
{
}
/*
* The function is called for each project in a zone when the zone cap is
* modified. It enables project caps if zone cap is enabled and disables if the
* zone cap is disabled and project doesn't have its own cap.
*
* For each project that does not have cpucap structure allocated it allocates a
* new structure and assigns to kpj->cpu_cap. The allocation is performed
* without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
* held.
*/
/*
 * NOTE(review): truncated definition -- the signature, the kpj_cpucap NULL
 * checks bracketing the unlocked cap_alloc(), the caps_lock re-check and
 * assignment, the cap_free() of the losing allocation, and the
 * cap_project_disable()/cap_project_enable() calls inside the two branches
 * are missing from this copy. What survives shows the intended shape:
 * allocate without caps_lock, double-check under the lock, then either take
 * capless projects off capped_projects (zone cap disabled) or add this
 * project to it (zone cap enabled, project cap disabled). TODO: restore
 * from the complete cpucaps.c.
 */
static int
{
/*
 * This is the first time any cap was established for this
 * project. Allocate a new cpucap structure for it.
 */
project_cap = cap_alloc();
}
/*
 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
 * and assign the newly allocated cpucap structure to it.
 */
} else if (project_cap != NULL) {
}
if (CAP_DISABLED(zone_cap)) {
/*
 * Remove all projects in this zone without caps
 * from the capped_projects list.
 */
}
} else if (CAP_DISABLED(project_cap)) {
/*
 * Add the project to capped_projects list.
 */
}
return (0);
}
/*
* Set zone cap to cap_val
* If cap_val is equal to NOCAP, disable zone cap.
*
* If this is the first time a cap is set on a zone, allocate cpucap structure
* without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
*/
/*
 * NOTE(review): truncated definition -- the cpucaps_zone_set(zone_t *,
 * rctl_qty_t cap_val) signature, the unlocked cap_alloc(), the caps_lock
 * acquisition/release, the cpucaps_busy set/clear, the value scaling that
 * produces `value`, the statement guarded by `if (value < 0)`, the
 * cap_zone_enable()/cap_zone_disable() calls, and the project-walk calls
 * inside the three branches are all missing from this copy. The surviving
 * skeleton still shows the contract: reject cap_val == 0, return EBUSY if
 * another zone-cap change is in flight, and handle the three cases
 * (enabled->removed, disabled->enabled, value-only change). TODO: restore
 * from the complete cpucaps.c.
 */
int
{
if (cap_val == 0)
return (EINVAL);
/*
 * Nothing to do if trying to disable a cap on a zone when caps are off
 * or a zone which does not have a cap yet.
 */
return (0);
if (cpucaps_busy) {
return (EBUSY);
}
/*
 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
 * held. If it is still NULL, assign a newly allocated cpucap to it.
 */
}
if (value < 0)
/* Nothing to do if the value is staying the same */
return (0);
}
/*
 * Clear cap statistics since the cap value itself changes.
 */
if (CAP_ENABLED(cap)) {
/*
 * Remove cap for the zone
 */
/*
 * Disable caps for all projects belonging to this zone
 * unless they have their own cap.
 */
}
} else if (CAP_DISABLED(cap)) {
/*
 * Set a cap on a zone which previously was not capped.
 */
/*
 * Enable cap for all projects belonging to this zone.
 */
} else {
/*
 * No state transitions, just change the value
 */
}
return (0);
}
/*
* The project is going away so disable its cap.
*/
/*
 * NOTE(review): truncated definition -- the cpucaps_project_remove(
 * kproject_t *kpj) name line, the caps_lock acquisition/release, the
 * cap_project_disable() call under PROJECT_IS_CAPPED, and the cap_free()
 * of kpj's cpucap are missing from this copy. TODO: restore from the
 * complete cpucaps.c.
 */
void
{
if (PROJECT_IS_CAPPED(kpj))
}
}
/*
* The zone is going away, so disable its cap.
*/
/*
 * NOTE(review): truncated definition -- the cpucaps_zone_remove(zone_t *)
 * name line, the caps_lock handling, the loop body that retries
 * cpucaps_zone_set(zone, NOCAP)-style teardown while the zone is still
 * capped, and the final cap_free() are missing from this copy. TODO:
 * restore from the complete cpucaps.c.
 */
void
{
while (ZONE_IS_CAPPED(zone)) {
}
}
}
/*
* New project was created. It should be put on the capped_projects list if
* its zone has a cap.
*/
/*
 * NOTE(review): truncated definition -- the cpucaps_project_add(
 * kproject_t *) name line, the early-return condition (caps off or zone not
 * capped, presumably), the unlocked cap_alloc(), the caps_lock double-check
 * and assignment, and the capped_projects insertion are missing from this
 * copy. TODO: restore from the complete cpucaps.c.
 */
void
{
return;
/*
 * This project was never capped before, so allocate its cap structure.
 */
/*
 * Double-check with caps_lock held
 */
}
}
/*
* Set project cap to cap_val
* If cap_val is equal to NOCAP, disable project cap.
*
* If this is the first time a cap is set on a project, allocate cpucap
* structure without holding caps_lock to avoid KM_SLEEP allocation with
* caps_lock held.
*/
/*
 * NOTE(review): truncated definition -- the cpucaps_project_set(
 * kproject_t *, rctl_qty_t cap_val) signature, the unlocked cap_alloc(),
 * the caps_lock double-check/assignment, the value scaling producing
 * `value`, the statement guarded by `if (value < 0)`, the
 * cap_project_enable() / value assignment under CAP_DISABLED, and the
 * MAX_USAGE-vs-disable handling described in the CAP_ENABLED branch are
 * missing from this copy. The surviving skeleton shows: reject
 * cap_val == 0, reset cap_maxusage when the value changes, and branch on
 * the cap's prior state. TODO: restore from the complete cpucaps.c.
 */
int
{
if (cap_val == 0)
return (EINVAL);
/*
 * Nothing to do if trying to disable project cap and caps are not
 * enabled or if trying to disable cap on a project that does not have
 * cap enabled.
 */
return (0);
/*
 * This project was never capped before, so allocate its cap
 * structure.
 */
}
/*
 * Double-check with caps_lock held.
 */
}
/*
 * Get the actual pointer to the project cap.
 */
if (value < 0)
/*
 * Nothing to do if the value is not changing
 */
return (0);
}
/*
 * Clear cap statistics since the cap value itself changes.
 */
cap->cap_maxusage = 0;
/*
 * Enable this cap if it is not already enabled.
 */
if (CAP_DISABLED(cap))
else
} else if (CAP_ENABLED(cap)) {
/*
 * User requested to drop a cap on the project. If it is part of
 * capped zone, keep the cap and set the value to MAX_USAGE,
 * otherwise disable the cap.
 */
} else {
}
}
return (0);
}
/*
* Get cap usage.
*/
/*
 * NOTE(review): truncated definition -- the name/parameter line of this
 * usage-getter helper and its body (presumably scaling cap_usage under the
 * usage lock) are missing from this copy. TODO: restore from the complete
 * cpucaps.c.
 */
static rctl_qty_t
{
}
/*
* Get current project usage.
*/
/*
 * NOTE(review): truncated definition -- the entire signature (per the
 * comment above, the project-usage getter, presumably
 * rctl_qty_t cpucaps_project_get(kproject_t *)) and its body are missing;
 * only the braces remain. TODO: restore from the complete cpucaps.c.
 */
{
}
/*
* Get current zone usage.
*/
/*
 * NOTE(review): truncated definition -- the entire signature (per the
 * comment above, the zone-usage getter, presumably
 * rctl_qty_t cpucaps_zone_get(zone_t *)) and its body are missing; only
 * the braces remain. TODO: restore from the complete cpucaps.c.
 */
{
}
/*
* Charge project of thread t the time thread t spent on CPU since previously
* adjusted.
*
* Record the current on-CPU time in the csc structure.
*
* Do not adjust for more than one tick worth of time.
*
* It is possible that the project cap is being disabled while this routine is
* executed. This should not cause any issues since the association between the
* thread and its project is protected by thread lock.
*/
/*
 * NOTE(review): truncated definition -- the caps_charge_adjust(
 * kthread_id_t t, caps_sc_t *csc) name line, the micro-state time reads
 * that compute usage_delta, the csc_cputime save, the clamp statement under
 * `if (usage_delta > cap_tick_cost)`, the locked usage addition with its
 * overflow check, and the cap_maxusage update are missing from this copy;
 * only the thread-lock ASSERT and the control-flow skeleton survive. TODO:
 * restore from the complete cpucaps.c.
 */
static void
{
ASSERT(THREAD_LOCK_HELD(t));
/* Get on-CPU time since birth of a thread */
/* Time spent on CPU since last checked */
/* Save the accumulated on-CPU time */
/* Charge at most one tick worth of on-CPU time */
if (usage_delta > cap_tick_cost)
/* Add usage_delta to the project usage value. */
if (usage_delta > 0) {
/* Check for overflows */
/*
 * cap_maxusage is only kept for observability. Move it outside
 * the lock to reduce the time spent while holding the lock.
 */
}
}
/*
* Charge thread's project and return True if project or zone should be
* penalized because its project or zone is exceeding its cap. Also sets
* TS_PROJWAITQ or TS_ZONEWAITQ in this case.
*
* It is possible that the project cap is being disabled while this routine is
* executed. This should not cause any issues since the association between the
* thread and its project is protected by thread lock. It will still set
* TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
* anything on the blocked wait queue.
*
*/
/*
 * NOTE(review): truncated definition -- the cpucaps_charge(kthread_id_t t,
 * caps_sc_t *csc, cpucaps_charge_t charge_type) signature line, the
 * project/zone lookups, the PROJECT_IS_CAPPED early-out condition, the
 * CAP_REACHED tests that decide the waitq flags, the `rc` computation, and
 * (visibly) at least one `if (...) {` opener for the TS_PROJWAITQ set are
 * missing from this copy; the brace structure around the zone branch does
 * not balance as shown. Surviving logic: charge via caps_charge_adjust(),
 * return B_FALSE for CPUCAPS_CHARGE_ONLY, and set/clear TS_PROJWAITQ /
 * TS_ZONEWAITQ in t_schedflag based on cap state. TODO: restore from the
 * complete cpucaps.c.
 */
{
ASSERT(THREAD_LOCK_HELD(t));
/* Nothing to do for projects that are not capped. */
return (B_FALSE);
caps_charge_adjust(t, csc);
/*
 * The caller only requested to charge the project usage, no enforcement
 * part.
 */
if (charge_type == CPUCAPS_CHARGE_ONLY)
return (B_FALSE);
t->t_schedflag |= TS_PROJWAITQ;
} else if (t->t_schedflag & TS_PROJWAITQ) {
t->t_schedflag &= ~TS_PROJWAITQ;
}
if (!ZONE_IS_CAPPED(zone)) {
if (t->t_schedflag & TS_ZONEWAITQ)
t->t_schedflag &= ~TS_ZONEWAITQ;
} else {
t->t_schedflag |= TS_ZONEWAITQ;
} else if (t->t_schedflag & TS_ZONEWAITQ) {
t->t_schedflag &= ~TS_ZONEWAITQ;
}
}
return (rc);
}
/*
* Enforce CPU caps. If got preempted in the user-land, we know that thread does
* not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
*
* CPU Caps are only enforced for user threads.
*
* Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
* threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
*
* It is possible that by the time we enter cpucaps_enforce() the cap is already
* disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
* still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
* apply.
*/
/*
 * NOTE(review): truncated definition -- the cpucaps_enforce(kthread_id_t t)
 * signature line, the user-thread / LWP_USER guard implied by the header
 * comment, and the waitq_enqueue() calls (the dangling `t)) {` lines below
 * are the tail ends of those multi-line calls) are missing from this copy.
 * Surviving logic: clear the waitq flags (TS_ANYWAITQ covers both for the
 * project case) and return B_TRUE when the enqueue succeeded, B_FALSE when
 * the thread was not placed on any wait queue. TODO: restore from the
 * complete cpucaps.c.
 */
{
ASSERT(THREAD_LOCK_HELD(t));
if (t->t_schedflag & TS_PROJWAITQ) {
t->t_schedflag &= ~TS_ANYWAITQ;
t)) {
return (B_TRUE);
}
}
if (t->t_schedflag & TS_ZONEWAITQ) {
t->t_schedflag &= ~TS_ZONEWAITQ;
t)) {
return (B_TRUE);
}
}
}
/*
 * The thread is not enqueued on the wait queue.
 */
return (B_FALSE);
}
/*
* Convert internal cap statistics into values exported by cap kstat.
*/
/*
 * NOTE(review): truncated definition -- the cap_kstat_update(kstat_t *,
 * int rw) name line (declared earlier in this file) and the body that
 * copies the cap's internal counters into the cap_kstat named fields are
 * missing from this copy. Surviving logic: the kstat is read-only, so
 * KSTAT_WRITE is rejected with EACCES. TODO: restore from the complete
 * cpucaps.c.
 */
static int
{
if (rw == KSTAT_WRITE)
return (EACCES);
return (0);
}