cpu_pm.c revision 0e7515250c8395f368aa45fb9acae7c4f8f8b786
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/cpu_pm.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>
/*
* Solaris Event Based CPU Power Manager
*
* This file implements platform independent event based CPU power management.
* When CPUs are configured into the system, the CMT scheduling subsystem will
* query the platform to determine if the CPU belongs to any power management
* domains. That is, sets of CPUs that share power management states.
*
* Active Power Management domains represent a group of CPUs across which the
* Operating System can request speed changes (which may in turn result
* in voltage changes). This allows the operating system to trade off
* performance for power savings.
*
* Idle Power Management domains can enter power savings states when they are
* unutilized. These states allow the Operating System to trade off power
* for performance (in the form of latency to transition from the idle state
* to an active one).
*
* For each active and idle power domain the CMT subsystem instantiates, a
* cpupm_domain_t structure is created. As the dispatcher schedules threads
* to run on the system's CPUs, it will also track the utilization of the
* enumerated power domains. Significant changes in utilization will result
* in the dispatcher sending the power manager events that relate to the
* utilization of the power domain. The power manager recieves the events,
* and in the context of the policy objectives in force, may decide to request
* the domain's power/performance state be changed.
*
* Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
* manager will request the CPUs in the domain run at their fastest (and most
* power consuming) state. When the domain becomes idle (utilization at zero),
* the power manager will request that the CPUs run at a speed that saves the
* most power.
*
* The advantage of this scheme, is that the CPU power manager working with the
* dispatcher can be extremely responsive to changes in utilization. Optimizing
* for performance in the presence of utilization, and power savings in the
* presence of idleness. Such close collaboration with the dispatcher has other
* benefits that will play out in the form of more sophisticated power /
* performance policy in the near future.
*
* Avoiding state thrashing in the presence of transient periods of utilization
* and idleness while still being responsive to non-transient periods is key.
* The power manager implmeents several "governors" that are used to throttle
* state transitions when a significant amount of transient idle or transient
* work is detected.
*
* Kernel background activity (e.g. taskq threads) are by far the most common
* form of transient utilization. Ungoverned in the face of this utililzation,
* hundreds of state transitions per second would result on an idle system.
*
* Transient idleness is common when a thread briefly yields the CPU to
* wait for an event elsewhere in the system. Where the idle period is short
* enough, the overhead associated with making the state transition doesn't
* justify the power savings.
*/
static cpupm_domain_t *cpupm_domains = NULL;
/*
* Uninitialized state of CPU power management is disabled
*/
cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
/*
* Periods of utilization lasting less than this time interval are characterized
* as transient. State changes associated with transient work are considered
* to be mispredicted. That is, it's not worth raising and lower power states
* where the utilization lasts for less than this interval.
*/
hrtime_t cpupm_tw_predict_interval;
/*
* Periods of idleness lasting less than this time interval are characterized
* as transient. State changes associated with transient idle are considered
* to be mispredicted. That is, it's not worth lowering and raising power
* states where the idleness lasts for less than this interval.
*/
hrtime_t cpupm_ti_predict_interval;
/*
* Number of mispredictions after which future transitions will be governed.
*/
int cpupm_mispredict_thresh = 2;
/*
* Likewise, the number of mispredicted governed transitions after which the
* governor will be removed.
*/
int cpupm_mispredict_gov_thresh = 10;
/*
* The transient work and transient idle prediction intervals are initialized
* to be some multiple of the amount of time it takes to transition a power
* domain from the highest to the lowest power state, and back again, which
* is measured.
*
* The default values of those multiples are specified here. Tuning them higher
* will result in the transient work, and transient idle governors being used
* more aggresively, which limits the frequency of state transitions at the
* expense of performance and power savings, respectively.
*/
#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
/*
* Number of high=>low=>high measurements performed, of which the average
* is taken.
*/
#define CPUPM_BENCHMARK_ITERS 5
int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
static int cpupm_governor_initialize(void);
static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
cpupm_policy_t
cpupm_get_policy(void)
{
return (cpupm_policy);
}
int
cpupm_set_policy(cpupm_policy_t new_policy)
{
static int gov_init = 0;
int result = 0;
mutex_enter(&cpu_lock);
if (new_policy == cpupm_policy) {
mutex_exit(&cpu_lock);
return (result);
}
/*
* Pausing CPUs causes a high priority thread to be scheduled
* on all other CPUs (besides the current one). This locks out
* other CPUs from making CPUPM state transitions.
*/
switch (new_policy) {
case CPUPM_POLICY_DISABLED:
pause_cpus(NULL);
cpupm_policy = CPUPM_POLICY_DISABLED;
start_cpus();
result = cmt_pad_disable(PGHW_POW_ACTIVE);
/*
* Once PAD has been enabled, it should always be possible
* to disable it.
*/
ASSERT(result == 0);
/*
* Bring all the active power domains to the maximum
* performance state.
*/
cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
CPUPM_STATE_MAX_PERF);
break;
case CPUPM_POLICY_ELASTIC:
result = cmt_pad_enable(PGHW_POW_ACTIVE);
if (result < 0) {
/*
* Failed to enable PAD across the active power
* domains, which may well be because none were
* enumerated.
*/
break;
}
pause_cpus(NULL);
/*
* Attempt to initialize the governor parameters the first
* time through.
*/
if (gov_init == 0) {
result = cpupm_governor_initialize();
if (result == 0) {
gov_init = 1;
} else {
/*
* Failed to initialize the governor parameters
*/
start_cpus();
break;
}
}
cpupm_policy = CPUPM_POLICY_ELASTIC;
start_cpus();
break;
default:
cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
new_policy);
ASSERT(0);
break;
}
mutex_exit(&cpu_lock);
return (result);
}
/*
* Look for an existing power domain
*/
static cpupm_domain_t *
cpupm_domain_find(id_t id, cpupm_dtype_t type)
{
ASSERT(MUTEX_HELD(&cpu_lock));
cpupm_domain_t *dom;
dom = cpupm_domains;
while (dom != NULL) {
if (id == dom->cpd_id && type == dom->cpd_type)
return (dom);
dom = dom->cpd_next;
}
return (NULL);
}
/*
* Create a new domain
*/
static cpupm_domain_t *
cpupm_domain_create(id_t id, cpupm_dtype_t type)
{
cpupm_domain_t *dom;
ASSERT(MUTEX_HELD(&cpu_lock));
dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
dom->cpd_id = id;
dom->cpd_type = type;
/* Link into the known domain list */
dom->cpd_next = cpupm_domains;
cpupm_domains = dom;
return (dom);
}
static void
cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
{
/*
* In the envent we're enumerating because the domain's state
* configuration has changed, toss any existing states.
*/
if (dom->cpd_nstates > 0) {
kmem_free(dom->cpd_states,
sizeof (cpupm_state_t) * dom->cpd_nstates);
dom->cpd_nstates = 0;
}
/*
* Query to determine the number of states, allocate storage
* large enough to hold the state information, and pass it back
* to the platform driver to complete the enumeration.
*/
dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
if (dom->cpd_nstates == 0)
return;
dom->cpd_states =
kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
}
/*
* Initialize the specified type of power domain on behalf of the CPU
*/
cpupm_domain_t *
cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
{
cpupm_domain_t *dom;
id_t did;
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* Instantiate the domain if it doesn't already exist
* and enumerate its power states.
*/
did = cpupm_domain_id(cp, type);
dom = cpupm_domain_find(did, type);
if (dom == NULL) {
dom = cpupm_domain_create(did, type);
cpupm_domain_state_enum(cp, dom);
}
/*
* Named state initialization
*/
if (type == CPUPM_DTYPE_ACTIVE) {
/*
* For active power domains, the highest performance
* state is defined as first state returned from
* the domain enumeration.
*/
dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
&dom->cpd_states[0];
dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
&dom->cpd_states[dom->cpd_nstates - 1];
/*
* Begin by assuming CPU is running at the max perf state.
*/
dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
}
return (dom);
}
/*
* Return the id associated with the given type of domain
* to which cp belongs
*/
id_t
cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
{
return (cpupm_plat_domain_id(cp, type));
}
/*
* Initiate a state change for the specified domain on behalf of cp
*/
int
cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
{
if (cpupm_plat_change_state(cp, state) < 0)
return (-1);
DTRACE_PROBE2(cpupm__change__state,
cpupm_domain_t *, dom,
cpupm_state_t *, state);
dom->cpd_state = state;
return (0);
}
/*
* Interface into the CPU power manager to indicate a significant change
* in utilization of the specified active power domain
*/
void
cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
cpupm_util_event_t event)
{
cpupm_state_t *new_state = NULL;
hrtime_t last;
if (cpupm_policy == CPUPM_POLICY_DISABLED) {
return;
}
/*
* What follows is a simple elastic power state management policy.
*
* If the utilization has become non-zero, and the domain was
* previously at it's lowest power state, then transition it
* to the highest state in the spirit of "race to idle".
*
* If the utilization has dropped to zero, then transition the
* domain to its lowest power state.
*
* Statistics are maintained to implement governors to reduce state
* transitions resulting from either transient work, or periods of
* transient idleness on the domain.
*/
switch (event) {
case CPUPM_DOM_REMAIN_BUSY:
/*
* We've received an event that the domain is running a thread
* that's made it to the end of it's time slice. If we are at
* low power, then raise it. If the transient work governor
* is engaged, then remove it.
*/
if (dom->cpd_state ==
dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
new_state =
dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
if (dom->cpd_tw_governed == B_TRUE) {
dom->cpd_tw_governed = B_FALSE;
dom->cpd_tw = 0;
}
}
break;
case CPUPM_DOM_BUSY_FROM_IDLE:
last = dom->cpd_last_lower;
dom->cpd_last_raise = now;
DTRACE_PROBE3(cpupm__raise__req,
cpupm_domain_t *, dom,
hrtime_t, last,
hrtime_t, now);
if (dom->cpd_state ==
dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
/*
* There's non-zero utilization, and the domain is
* running in the lower power state. Before we
* consider raising power, perform some book keeping
* for the transient idle governor.
*/
if (dom->cpd_ti_governed == B_FALSE) {
if ((now - last) < cpupm_ti_predict_interval) {
/*
* We're raising the domain power and
* we *just* lowered it. Consider
* this a mispredicted power state
* transition due to a transient
* idle period.
*/
if (++dom->cpd_ti >=
cpupm_mispredict_thresh) {
/*
* There's enough transient
* idle transitions to
* justify governing future
* lowering requests.
*/
dom->cpd_ti_governed = B_TRUE;
dom->cpd_ti = 0;
DTRACE_PROBE1(
cpupm__ti__governed,
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted the last
* lowering.
*/
dom->cpd_ti = 0;
}
}
if (dom->cpd_tw_governed == B_TRUE) {
/*
* Raise requests are governed due to
* transient work.
*/
DTRACE_PROBE1(cpupm__raise__governed,
cpupm_domain_t *, dom);
/*
* It's likely that we'll be governed for a
* while. If the transient idle governor is
* also in place, examine the preceeding idle
* interval to see if that still makes sense.
*/
if (dom->cpd_ti_governed == B_TRUE &&
((now - last) >=
cpupm_ti_predict_interval)) {
if (++dom->cpd_ti >=
cpupm_mispredict_gov_thresh) {
dom->cpd_ti_governed =
B_FALSE;
dom->cpd_ti = 0;
}
}
return;
}
/*
* Prepare to transition to the higher power state
*/
new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
} else if (dom->cpd_state ==
dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
/*
* Utilization is non-zero, and we're already running
* in the higher power state. Take this opportunity to
* perform some book keeping if the last lowering
* request was governed.
*/
if (dom->cpd_ti_governed == B_TRUE) {
if ((now - last) >= cpupm_ti_predict_interval) {
/*
* The domain is transient idle
* governed, and we mispredicted
* governing the last lowering request.
*/
if (++dom->cpd_ti >=
cpupm_mispredict_gov_thresh) {
/*
* There's enough non-transient
* idle periods to justify
* removing the governor.
*/
dom->cpd_ti_governed = B_FALSE;
dom->cpd_ti = 0;
DTRACE_PROBE1(
cpupm__ti__ungoverned,
cpupm_domain_t *, dom);
}
} else {
/*
* Correctly predicted governing the
* last lowering request.
*/
dom->cpd_ti = 0;
}
}
}
break;
case CPUPM_DOM_IDLE_FROM_BUSY:
last = dom->cpd_last_raise;
dom->cpd_last_lower = now;
DTRACE_PROBE3(cpupm__lower__req,
cpupm_domain_t *, dom,
hrtime_t, last,
hrtime_t, now);
if (dom->cpd_state ==
dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
/*
* The domain is idle, and is running in the highest
* performance state. Before we consider lowering power,
* perform some book keeping for the transient work
* governor.
*/
if (dom->cpd_tw_governed == B_FALSE) {
if ((now - last) < cpupm_tw_predict_interval) {
/*
* We're lowering the domain power and
* we *just* raised it. Consider the
* last raise mispredicted due to
* transient work.
*/
if (++dom->cpd_tw >=
cpupm_mispredict_thresh) {
/*
* There's enough transient idle
* transitions to justify
* governing future lowering
* requests.
*/
dom->cpd_tw_governed = B_TRUE;
dom->cpd_tw = 0;
DTRACE_PROBE1(
cpupm__tw__governed,
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted during the
* last raise.
*/
dom->cpd_tw = 0;
}
}
if (dom->cpd_ti_governed == B_TRUE) {
/*
* Lowering requests are governed due to
* transient idleness.
*/
DTRACE_PROBE1(cpupm__lowering__governed,
cpupm_domain_t *, dom);
/*
* It's likely that we'll be governed for a
* while. If the transient work governor is
* also in place, examine the preceeding busy
* interval to see if that still makes sense.
*/
if (dom->cpd_tw_governed == B_TRUE &&
((now - last) >=
cpupm_tw_predict_interval)) {
if (++dom->cpd_tw >=
cpupm_mispredict_gov_thresh) {
dom->cpd_tw_governed =
B_FALSE;
dom->cpd_tw = 0;
}
}
return;
}
/*
* Prepare to transition to a lower power state.
*/
new_state =
dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
} else if (dom->cpd_state ==
dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
/*
* The domain is idle, and we're already running in
* the lower power state. Take this opportunity to
* perform some book keeping if the last raising
* request was governed.
*/
if (dom->cpd_tw_governed == B_TRUE) {
if ((now - last) >= cpupm_tw_predict_interval) {
/*
* The domain is transient work
* governed, and we mispredicted
* governing the last raising request.
*/
if (++dom->cpd_tw >=
cpupm_mispredict_gov_thresh) {
/*
* There's enough non-transient
* work to justify removing
* the governor.
*/
dom->cpd_tw_governed = B_FALSE;
dom->cpd_tw = 0;
DTRACE_PROBE1(
cpupm__tw__ungoverned,
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted governing
* the last raise.
*/
dom->cpd_tw = 0;
}
}
}
break;
}
/*
* Change the power state
* Not much currently done if this doesn't succeed
*/
if (new_state)
(void) cpupm_change_state(cp, dom, new_state);
}
/*
* Interface called by platforms to dynamically change the
* MAX performance cpupm state
*/
void
cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
{
cpupm_domain_t *dom;
id_t did;
cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
boolean_t change_state = B_FALSE;
cpupm_state_t *new_state = NULL;
did = cpupm_domain_id(cp, type);
mutex_enter(&cpu_lock);
dom = cpupm_domain_find(did, type);
mutex_exit(&cpu_lock);
/*
* Can use a lock to avoid changing the power state of the cpu when
* CPUPM_STATE_MAX_PERF is getting changed.
* Since the occurance of events to change MAX_PERF is not frequent,
* it may not be a good idea to overburden with locks. In the worst
* case, for one cycle the power may not get changed to the required
* level
*/
if (dom != NULL) {
if (dom->cpd_state ==
dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
change_state = B_TRUE;
}
/*
* If an out of range level is passed, use the lowest supported
* speed.
*/
if (max_perf_level >= dom->cpd_nstates &&
dom->cpd_nstates > 1) {
max_perf_level = dom->cpd_nstates - 1;
}
dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
&dom->cpd_states[max_perf_level];
/*
* If the current state is MAX_PERF, change the current state
* to the new MAX_PERF
*/
if (change_state) {
new_state =
dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
if (new_state) {
(void) cpupm_change_state(cp, dom, new_state);
}
}
}
}
/*
* Benchmark some power state transitions and use the transition latencies as
* a basis for initializing parameters for the transient idle and transient
* work governors.
*
* Returns 0 on success or -1 if the governor parameters could not be
* initialized.
*/
static int
cpupm_governor_initialize(void)
{
cpu_t *cp = CPU;
cpupm_domain_t *dom;
cpupm_state_t *low, *high;
id_t did;
hrtime_t start, delta, deltas = 0;
int iterations;
did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
if (did == CPUPM_NO_DOMAIN)
return (-1);
dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
if (dom == NULL)
return (-1);
low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
/*
* Measure the amount of time it takes to transition the
* domain down to the lowest, and back to the highest power
* state.
*/
start = gethrtime_unscaled();
(void) cpupm_change_state(cp, dom, low);
(void) cpupm_change_state(cp, dom, high);
delta = gethrtime_unscaled() - start;
DTRACE_PROBE1(cpupm__benchmark__latency,
hrtime_t, delta);
deltas += delta;
}
/*
* Figure the average latency, and tune the transient work and
* transient idle prediction intervals accordingly.
*/
delta = deltas / iterations;
cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
return (0);
}
/*
* Initiate a state change in all CPUPM domain instances of the specified type
*/
static void
cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
{
cpu_t *cp;
pg_cmt_t *pwr_pg;
cpupm_domain_t *dom;
group_t *hwset;
group_iter_t giter;
pg_cpu_itr_t cpu_iter;
pghw_type_t hw;
ASSERT(MUTEX_HELD(&cpu_lock));
switch (type) {
case CPUPM_DTYPE_ACTIVE:
hw = PGHW_POW_ACTIVE;
break;
default:
/*
* Power domain types other than "active" unsupported.
*/
ASSERT(type == CPUPM_DTYPE_ACTIVE);
return;
}
if ((hwset = pghw_set_lookup(hw)) == NULL)
return;
/*
* Iterate over the power domains
*/
group_iter_init(&giter);
while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
/*
* Iterate over the CPUs in each domain
*/
PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
(void) cpupm_change_state(cp, dom,
dom->cpd_named_states[state]);
}
}
}