cpu_pm.c revision 113b131b48d0e653a91612bb4461ea90adbd849a
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Solaris Event Based CPU Power Manager
*
* This file implements platform independent event based CPU power management.
* When CPUs are configured into the system, the CMT scheduling subsystem will
* query the platform to determine if the CPU belongs to any power management
* domains. That is, sets of CPUs that share power management states.
*
* Active Power Management domains represent a group of CPUs across which the
* Operating System can request speed changes (which may in turn result
* in voltage changes). This allows the operating system to trade off
* performance for power savings.
*
* Idle Power Management domains can enter power savings states when they are
* unutilized. These states allow the Operating System to trade off power
* for performance (in the form of latency to transition from the idle state
* to an active one).
*
* For each active and idle power domain the CMT subsystem instantiates, a
* cpupm_domain_t structure is created. As the dispatcher schedules threads
* to run on the system's CPUs, it will also track the utilization of the
* enumerated power domains. Significant changes in utilization will result
* in the dispatcher sending the power manager events that relate to the
* utilization of the power domain. The power manager receives the events,
* and in the context of the policy objectives in force, may decide to request
* the domain's power/performance state be changed.
*
* Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
* manager will request the CPUs in the domain run at their fastest (and most
* power consuming) state. When the domain becomes idle (utilization at zero),
* the power manager will request that the CPUs run at a speed that saves the
* most power.
*
* The advantage of this scheme is that the CPU power manager, working with
* the dispatcher, can be extremely responsive to changes in utilization,
* optimizing for performance in the presence of utilization and for power
* savings in the presence of idleness. Such close collaboration with the
* dispatcher has other benefits that will play out in the form of more
* sophisticated power / performance policy in the near future.
*
* Avoiding state thrashing in the presence of transient periods of utilization
* and idleness while still being responsive to non-transient periods is key.
* The power manager implements a "governor" that is used to throttle
* state transitions when a significant amount of transient idle or transient
* work is detected.
*
* Kernel background activity (e.g. taskq threads) is by far the most common
* form of transient utilization. Ungoverned in the face of this utilization,
* hundreds of state transitions per second would result on an idle system.
*
* Transient idleness is common when a thread briefly yields the CPU to
* wait for an event elsewhere in the system. Where the idle period is short
* enough, the overhead associated with making the state transition doesn't
* justify the power savings.
*
* The following is the state machine for the governor implemented by
* cpupm_utilization_event():
*
*         ----->---tw---->-----
*        /                     \
*      (I)-<-ti-<-     -<-ntw-<(W)
*       |         \   /         |
*       \          \ /          /
*        >-nti/rm->(D)--->-tw->-
* Key:
*
* States
* - (D): Default (ungoverned)
* - (W): Transient work governed
* - (I): Transient idle governed
* State Transitions
* - tw: transient work
* - ti: transient idleness
* - ntw: non-transient work
* - nti: non-transient idleness
* - rm: thread remain event
*/
/*
* Uninitialized state of CPU power management is disabled
*/
/*
* Periods of utilization lasting less than this time interval are characterized
* as transient. State changes associated with transient work are considered
* to be mispredicted. That is, it's not worth raising and lowering power
* states where the utilization lasts for less than this interval.
*/
/*
* Periods of idleness lasting less than this time interval are characterized
* as transient. State changes associated with transient idle are considered
* to be mispredicted. That is, it's not worth lowering and raising power
* states where the idleness lasts for less than this interval.
*/
/*
 * Number of mispredictions after which future transitions will be governed.
 * Defaults to 4.
 */
int cpupm_mispredict_thresh = 4;
/*
 * Likewise, the number of mispredicted governed transitions after which the
 * governor will be removed. Defaults to 4.
 */
int cpupm_mispredict_gov_thresh = 4;
/*
 * The transient work and transient idle prediction intervals are specified
 * here. Tuning them higher will result in the transient work, and transient
 * idle governors being used more aggressively, which limits the frequency of
 * state transitions at the expense of performance and power savings,
 * respectively. The intervals are specified in nanoseconds.
 */
/*
 * Default transient idle (TI) prediction interval: 400000 ns = 400 usec
 */
#define CPUPM_DEFAULT_TI_INTERVAL 400000
/*
 * Default transient work (TW) prediction interval: 400000 ns = 400 usec
 */
#define CPUPM_DEFAULT_TW_INTERVAL 400000
/*
 * Forward declaration of a file-local (static) helper that takes no
 * arguments and returns nothing.
 */
static void cpupm_governor_initialize(void);
/*
 * Return the current value of cpupm_policy. Takes no arguments and has
 * no side effects.
 */
cpupm_get_policy(void)
{
return (cpupm_policy);
}
int
{
static int gov_init = 0;
int result = 0;
if (new_policy == cpupm_policy) {
return (result);
}
/*
* Pausing CPUs causes a high priority thread to be scheduled
* on all other CPUs (besides the current one). This locks out
* other CPUs from making CPUPM state transitions.
*/
switch (new_policy) {
case CPUPM_POLICY_DISABLED:
start_cpus();
/*
* Once PAD has been enabled, it should always be possible
* to disable it.
*/
/*
* Bring all the active power domains to the maximum
* performance state.
*/
break;
case CPUPM_POLICY_ELASTIC:
if (result < 0) {
/*
* Failed to enable PAD across the active power
* domains, which may well be because none were
* enumerated.
*/
break;
}
/*
* Initialize the governor parameters the first time through.
*/
if (gov_init == 0) {
gov_init = 1;
}
start_cpus();
break;
default:
ASSERT(0);
break;
}
return (result);
}
/*
* Look for an existing power domain
*/
static cpupm_domain_t *
{
dom = cpupm_domains;
return (dom);
}
return (NULL);
}
/*
* Create a new domain
*/
static cpupm_domain_t *
{
/* Link into the known domain list */
cpupm_domains = dom;
return (dom);
}
static void
{
/*
* In the event we're enumerating because the domain's state
* configuration has changed, toss any existing states.
*/
if (dom->cpd_nstates > 0) {
dom->cpd_nstates = 0;
}
/*
* Query to determine the number of states, allocate storage
* large enough to hold the state information, and pass it back
* to the platform driver to complete the enumeration.
*/
if (dom->cpd_nstates == 0)
return;
dom->cpd_states =
}
/*
* Initialize the specified type of power domain on behalf of the CPU
*/
{
/*
* Instantiate the domain if it doesn't already exist
* and enumerate its power states.
*/
}
/*
* Named state initialization
*/
if (type == CPUPM_DTYPE_ACTIVE) {
/*
* For active power domains, the highest performance
* state is defined as first state returned from
* the domain enumeration.
*/
&dom->cpd_states[0];
/*
* Begin by assuming CPU is running at the max perf state.
*/
}
return (dom);
}
/*
* Return the id associated with the given type of domain
* to which cp belongs
*/
{
}
/*
* Initiate a state change for the specified domain on behalf of cp
*/
int
{
return (-1);
cpupm_domain_t *, dom,
cpupm_state_t *, state);
return (0);
}
/*
* Interface into the CPU power manager to indicate a significant change
* in utilization of the specified active power domain
*/
void
{
if (cpupm_policy == CPUPM_POLICY_DISABLED) {
return;
}
/*
* What follows is a simple elastic power state management policy.
*
* If the utilization has become non-zero, and the domain was
* previously at its lowest power state, then transition it
* to the highest state in the spirit of "race to idle".
*
* If the utilization has dropped to zero, then transition the
* domain to its lowest power state.
*
* Statistics are maintained to implement a governor to reduce state
* transitions resulting from either transient work, or periods of
* transient idleness on the domain.
*/
switch (event) {
case CPUPM_DOM_REMAIN_BUSY:
/*
* We've received an event that the domain is running a thread
* that's made it to the end of its time slice. If we are at
* low power, then raise it. If the transient work governor
* is engaged, then remove it.
*/
}
}
break;
case CPUPM_DOM_BUSY_FROM_IDLE:
cpupm_domain_t *, dom,
/*
* There's non-zero utilization, and the domain is
* running in the lower power state. Before we
* consider raising power, check if the preceding
* idle period was transient in duration.
*
* If the domain is already transient work governed,
* then we don't bother maintaining transient idle
* statistics, as the presence of enough transient work
* can also make the domain frequently transiently idle.
* In this case, we still want to remain transient work
* governed.
*/
/*
* We're raising the domain power and
* we *just* lowered it. Consider
* this a mispredicted power state
* transition due to a transient
* idle period.
*/
/*
* There's enough transient
* idle transitions to
* justify governing future
* lowering requests.
*/
dom->cpd_governor =
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted the last
* lowering.
*/
}
}
/*
* Raise requests are governed due to
* transient work.
*/
cpupm_domain_t *, dom);
return;
}
/*
* Prepare to transition to the higher power state
*/
/*
* Utilization is non-zero, and we're already running
* in the higher power state. Take this opportunity to
* perform some book keeping if the last lowering
* request was governed.
*/
/*
* The domain is transient idle
* governed, and we mispredicted
* governing the last lowering request.
*/
/*
* There's enough non-transient
* idle periods to justify
* removing the governor.
*/
dom->cpd_governor =
cpupm_domain_t *, dom);
}
} else {
/*
* Correctly predicted governing the
* last lowering request.
*/
}
}
}
break;
case CPUPM_DOM_IDLE_FROM_BUSY:
cpupm_domain_t *, dom,
/*
* The domain is idle, and is running in the highest
* performance state. Before we consider lowering power,
* perform some book keeping for the transient work
* governor.
*/
/*
* We're lowering the domain power and
* we *just* raised it. Consider the
* last raise mispredicted due to
* transient work.
*/
/*
* There's enough transient work
* transitions to justify
* governing future raise
* requests.
*/
dom->cpd_governor =
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted during the
* last raise.
*/
}
}
/*
* Lowering requests are governed due to
* transient idleness.
*/
cpupm_domain_t *, dom);
return;
}
/*
* Prepare to transition to a lower power state.
*/
/*
* The domain is idle, and we're already running in
* the lower power state. Take this opportunity to
* perform some book keeping if the last raising
* request was governed.
*/
/*
* The domain is transient work
* governed, and we mispredicted
* governing the last raising request.
*/
/*
* There's enough non-transient
* work to justify removing
* the governor.
*/
dom->cpd_governor =
cpupm_domain_t *, dom);
}
} else {
/*
* We correctly predicted governing
* the last raise.
*/
}
}
}
break;
}
/*
* Change the power state
* Not much currently done if this doesn't succeed
*/
if (new_state)
}
/*
* Interface called by platforms to dynamically change the
* MAX performance cpupm state
*/
void
{
/*
* A lock could be used to avoid changing the power state of the cpu
* while CPUPM_STATE_MAX_PERF is being changed. Since events that change
* MAX_PERF are infrequent, it may not be a good idea to add the burden
* of locking. In the worst case, for one cycle the power may not get
* changed to the required level.
*/
}
/*
* If an out of range level is passed, use the lowest supported
* speed.
*/
}
/*
* If the current state is MAX_PERF, change the current state
* to the new MAX_PERF
*/
if (change_state) {
if (new_state) {
}
}
}
}
/*
* Initialize the parameters for the transience governor state machine
*/
static void
{
/*
* The default prediction intervals are specified in nanoseconds.
* Convert these to the equivalent in unscaled hrtime, which is the
* format of the timestamps passed to cpupm_utilization_event()
*/
}
/*
* Initiate a state change in all CPUPM domain instances of the specified type
*/
static void
{
switch (type) {
case CPUPM_DTYPE_ACTIVE:
break;
default:
/*
* Power domain types other than "active" unsupported.
*/
return;
}
return;
/*
* Iterate over the power domains
*/
/*
* Iterate over the CPUs in each domain
*/
}
}
}