cpu_pm.c revision 0e7515250c8395f368aa45fb9acae7c4f8f8b786
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Solaris Event Based CPU Power Manager
*
* This file implements platform independent event based CPU power management.
* When CPUs are configured into the system, the CMT scheduling subsystem will
* query the platform to determine if the CPU belongs to any power management
* domains. That is, sets of CPUs that share power management states.
*
* Active Power Management domains represent a group of CPUs across which the
* Operating System can request speed changes (which may in turn result
* in voltage changes). This allows the operating system to trade off
* performance for power savings.
*
* Idle Power Management domains can enter power savings states when they are
* unutilized. These states allow the Operating System to trade off power
* for performance (in the form of latency to transition from the idle state
* to an active one).
*
* For each active and idle power domain the CMT subsystem instantiates, a
* cpupm_domain_t structure is created. As the dispatcher schedules threads
* to run on the system's CPUs, it will also track the utilization of the
* enumerated power domains. Significant changes in utilization will result
* in the dispatcher sending the power manager events that relate to the
* utilization of the power domain. The power manager receives the events,
* and in the context of the policy objectives in force, may decide to request
* the domain's power/performance state be changed.
*
* Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
* manager will request the CPUs in the domain run at their fastest (and most
* power consuming) state. When the domain becomes idle (utilization at zero),
* the power manager will request that the CPUs run at a speed that saves the
* most power.
*
* The advantage of this scheme is that the CPU power manager, working with the
* dispatcher, can be extremely responsive to changes in utilization, optimizing
* for performance in the presence of utilization and for power savings in the
* presence of idleness. Such close collaboration with the dispatcher has other
* benefits that will play out in the form of more sophisticated power /
* performance policy in the near future.
*
* Avoiding state thrashing in the presence of transient periods of utilization
* and idleness while still being responsive to non-transient periods is key.
* The power manager implements several "governors" that are used to throttle
* state transitions when a significant amount of transient idle or transient
* work is detected.
*
* Kernel background activity (e.g. taskq threads) is by far the most common
* form of transient utilization. Ungoverned in the face of this utilization,
* hundreds of state transitions per second would result on an idle system.
*
* Transient idleness is common when a thread briefly yields the CPU to
* wait for an event elsewhere in the system. Where the idle period is short
* enough, the overhead associated with making the state transition doesn't
* justify the power savings.
*/
/*
* Uninitialized state of CPU power management is disabled
*/
/*
* Periods of utilization lasting less than this time interval are characterized
* as transient. State changes associated with transient work are considered
* to be mispredicted. That is, it's not worth raising and lowering power
* states where the utilization lasts for less than this interval.
*/
/*
* Periods of idleness lasting less than this time interval are characterized
* as transient. State changes associated with transient idle are considered
* to be mispredicted. That is, it's not worth lowering and raising power
* states where the idleness lasts for less than this interval.
*/
/*
* Number of mispredictions after which future transitions will be governed.
*/
int cpupm_mispredict_thresh = 2;
/*
* Likewise, the number of mispredicted governed transitions after which the
* governor will be removed.
*/
int cpupm_mispredict_gov_thresh = 10;
/*
* The transient work and transient idle prediction intervals are initialized
* to be some multiple of the amount of time it takes to transition a power
* domain from the highest to the lowest power state, and back again, which
* is measured.
*
* The default values of those multiples are specified here. Tuning them higher
* will result in the transient work, and transient idle governors being used
* more aggressively, which limits the frequency of state transitions at the
* expense of performance and power savings, respectively.
*/
#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
/*
* Number of high=>low=>high measurements performed, of which the average
* is taken.
*/
#define CPUPM_BENCHMARK_ITERS 5
static int cpupm_governor_initialize(void);
cpupm_get_policy(void)
{
/* Report the policy value currently stored in cpupm_policy. */
return (cpupm_policy);
}
int
{
static int gov_init = 0;
int result = 0;
if (new_policy == cpupm_policy) {
return (result);
}
/*
* Pausing CPUs causes a high priority thread to be scheduled
* on all other CPUs (besides the current one). This locks out
* other CPUs from making CPUPM state transitions.
*/
switch (new_policy) {
case CPUPM_POLICY_DISABLED:
start_cpus();
/*
* Once PAD has been enabled, it should always be possible
* to disable it.
*/
/*
* Bring all the active power domains to the maximum
* performance state.
*/
break;
case CPUPM_POLICY_ELASTIC:
if (result < 0) {
/*
* Failed to enable PAD across the active power
* domains, which may well be because none were
* enumerated.
*/
break;
}
/*
* Attempt to initialize the governor parameters the first
* time through.
*/
if (gov_init == 0) {
if (result == 0) {
gov_init = 1;
} else {
/*
* Failed to initialize the governor parameters
*/
start_cpus();
break;
}
}
start_cpus();
break;
default:
ASSERT(0);
break;
}
return (result);
}
/*
* Look for an existing power domain
*/
static cpupm_domain_t *
{
dom = cpupm_domains;
return (dom);
}
return (NULL);
}
/*
* Create a new domain
*/
static cpupm_domain_t *
{
/* Link into the known domain list */
cpupm_domains = dom;
return (dom);
}
static void
{
/*
* In the event we're enumerating because the domain's state
* configuration has changed, toss any existing states.
*/
if (dom->cpd_nstates > 0) {
dom->cpd_nstates = 0;
}
/*
* Query to determine the number of states, allocate storage
* large enough to hold the state information, and pass it back
* to the platform driver to complete the enumeration.
*/
if (dom->cpd_nstates == 0)
return;
dom->cpd_states =
}
/*
* Initialize the specified type of power domain on behalf of the CPU
*/
{
/*
* Instantiate the domain if it doesn't already exist
* and enumerate its power states.
*/
}
/*
* Named state initialization
*/
if (type == CPUPM_DTYPE_ACTIVE) {
/*
* For active power domains, the highest performance
* state is defined as first state returned from
* the domain enumeration.
*/
&dom->cpd_states[0];
/*
* Begin by assuming CPU is running at the max perf state.
*/
}
return (dom);
}
/*
* Return the id associated with the given type of domain
* to which cp belongs
*/
{
}
/*
* Initiate a state change for the specified domain on behalf of cp
*/
int
{
return (-1);
cpupm_domain_t *, dom,
cpupm_state_t *, state);
return (0);
}
/*
* Interface into the CPU power manager to indicate a significant change
* in utilization of the specified active power domain
*/
void
{
if (cpupm_policy == CPUPM_POLICY_DISABLED) {
return;
}
/*
* What follows is a simple elastic power state management policy.
*
* If the utilization has become non-zero, and the domain was
* previously at its lowest power state, then transition it
* to the highest state in the spirit of "race to idle".
*
* If the utilization has dropped to zero, then transition the
* domain to its lowest power state.
*
* Statistics are maintained to implement governors to reduce state
* transitions resulting from either transient work, or periods of
* transient idleness on the domain.
*/
switch (event) {
case CPUPM_DOM_REMAIN_BUSY:
/*
* We've received an event that the domain is running a thread
* that's made it to the end of its time slice. If we are at
* low power, then raise it. If the transient work governor
* is engaged, then remove it.
*/
}
}
break;
case CPUPM_DOM_BUSY_FROM_IDLE:
cpupm_domain_t *, dom,
/*
* There's non-zero utilization, and the domain is
* running in the lower power state. Before we
* consider raising power, perform some book keeping
* for the transient idle governor.
*/
/*
* We're raising the domain power and
* we *just* lowered it. Consider
* this a mispredicted power state
* transition due to a transient
* idle period.
*/
/*
* There's enough transient
* idle transitions to
* justify governing future
* lowering requests.
*/
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted the last
* lowering.
*/
}
}
/*
* Raise requests are governed due to
* transient work.
*/
cpupm_domain_t *, dom);
/*
* It's likely that we'll be governed for a
* while. If the transient idle governor is
* also in place, examine the preceding idle
* interval to see if that still makes sense.
*/
}
}
return;
}
/*
* Prepare to transition to the higher power state
*/
/*
* Utilization is non-zero, and we're already running
* in the higher power state. Take this opportunity to
* perform some book keeping if the last lowering
* request was governed.
*/
/*
* The domain is transient idle
* governed, and we mispredicted
* governing the last lowering request.
*/
/*
* There's enough non-transient
* idle periods to justify
* removing the governor.
*/
cpupm_domain_t *, dom);
}
} else {
/*
* Correctly predicted governing the
* last lowering request.
*/
}
}
}
break;
case CPUPM_DOM_IDLE_FROM_BUSY:
cpupm_domain_t *, dom,
/*
* The domain is idle, and is running in the highest
* performance state. Before we consider lowering power,
* perform some book keeping for the transient work
* governor.
*/
/*
* We're lowering the domain power and
* we *just* raised it. Consider the
* last raise mispredicted due to
* transient work.
*/
/*
* There's enough transient work
* transitions to justify
* governing future raise
* requests.
*/
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted during the
* last raise.
*/
}
}
/*
* Lowering requests are governed due to
* transient idleness.
*/
cpupm_domain_t *, dom);
/*
* It's likely that we'll be governed for a
* while. If the transient work governor is
* also in place, examine the preceding busy
* interval to see if that still makes sense.
*/
}
}
return;
}
/*
* Prepare to transition to a lower power state.
*/
/*
* The domain is idle, and we're already running in
* the lower power state. Take this opportunity to
* perform some book keeping if the last raising
* request was governed.
*/
/*
* The domain is transient work
* governed, and we mispredicted
* governing the last raising request.
*/
/*
* There's enough non-transient
* work to justify removing
* the governor.
*/
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted governing
* the last raise.
*/
}
}
}
break;
}
/*
* Change the power state
* Not much currently done if this doesn't succeed
*/
if (new_state)
}
/*
* Interface called by platforms to dynamically change the
* MAX performance cpupm state
*/
void
{
/*
* A lock could be used to avoid changing the power state of the cpu while
* CPUPM_STATE_MAX_PERF is being changed. Since the occurrence of events
* that change MAX_PERF is not frequent, it may not be a good idea to
* overburden this path with locks. In the worst case, for one cycle the
* power may not get changed to the required level.
*/
}
/*
* If an out of range level is passed, use the lowest supported
* speed.
*/
}
/*
* If the current state is MAX_PERF, change the current state
* to the new MAX_PERF
*/
if (change_state) {
if (new_state) {
}
}
}
}
/*
* Benchmark some power state transitions and use the transition latencies as
* a basis for initializing parameters for the transient idle and transient
* work governors.
*
* Returns 0 on success or -1 if the governor parameters could not be
* initialized.
*/
static int
{
int iterations;
if (did == CPUPM_NO_DOMAIN)
return (-1);
return (-1);
/*
* Measure the amount of time it takes to transition the
* domain down to the lowest, and back to the highest power
* state.
*/
start = gethrtime_unscaled();
}
/*
* Figure the average latency, and tune the transient work and
* transient idle prediction intervals accordingly.
*/
return (0);
}
/*
* Initiate a state change in all CPUPM domain instances of the specified type
*/
static void
{
switch (type) {
case CPUPM_DTYPE_ACTIVE:
break;
default:
/*
* Power domain types other than "active" unsupported.
*/
return;
}
return;
/*
* Iterate over the power domains
*/
/*
* Iterate over the CPUs in each domain
*/
}
}
}