/* fipe_pm.c, revision 90b1de135fcfa7ce4adc9138a885aa94bbcef04f */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, Intel Corporation.
* All rights reserved.
*/
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/cpu_event.h>
#include <sys/ddi.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/pci.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/synch.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/fipe.h>
#include <vm/hat.h>

/* Current PM policy, configurable through /etc/system and fipe.conf. */
fipe_pm_policy_t fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
int fipe_pm_throttle_level = 1;
/* Enable kstat support. */
#define FIPE_KSTAT_SUPPORT 1
/* Enable performance relative statistics. */
#define FIPE_KSTAT_DETAIL 1
/* Enable builtin IOAT driver if no IOAT driver is available. */
#define FIPE_IOAT_BUILTIN 0
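/*
 * Note: FIPE_IOAT_BUILTIN is defined to 0 above, so the #undef below removes
 * the symbol again and every FIPE_IOAT_BUILTIN conditional block takes the
 * dcopy path. Define it to a non-zero value to activate the builtin driver.
 */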
#if defined(FIPE_IOAT_BUILTIN) && (FIPE_IOAT_BUILTIN == 0)
#undef	FIPE_IOAT_BUILTIN
#endif
#ifdef FIPE_IOAT_BUILTIN
/* Use IOAT channel 3 to generate memory transactions. */
#define FIPE_IOAT_CHAN_CTRL 0x200
#define FIPE_IOAT_CHAN_STS_LO 0x204
#define FIPE_IOAT_CHAN_STS_HI 0x208
#define FIPE_IOAT_CHAN_ADDR_LO 0x20C
#define FIPE_IOAT_CHAN_ADDR_HI 0x210
#define FIPE_IOAT_CHAN_CMD 0x214
#define FIPE_IOAT_CHAN_ERR 0x228
#else	/* FIPE_IOAT_BUILTIN */
#include <sys/dcopy.h>
#endif	/* FIPE_IOAT_BUILTIN */
/* Memory controller relative PCI configuration constants. */
#define FIPE_MC_GBLACT 0x60
#define FIPE_MC_THRTLOW 0x64
#define FIPE_MC_THRTCTRL 0x67
#define FIPE_MC_THRTCTRL_HUNT 0x1
/* Hardware recommended values. */
#define FIPE_MC_MEMORY_OFFSET 1024
#define FIPE_MC_MEMORY_SIZE 128
/* Number of IOAT commands posted when entering idle. */
#define FIPE_IOAT_CMD_NUM 2
/* Resource allocation retry interval in microseconds (value assumed). */
#define	FIPE_IOAT_RETRY_INTERVAL	(15 * 1000 * 1000)

/* Statistics update interval in nanoseconds (value assumed). */
#define	FIPE_STAT_INTERVAL		(10 * 1000 * 1000)

/* Configuration profile support. */
#define	FIPE_PROFILE_FIELD(field)	(fipe_profile_curr->field)
#define	FIPE_PROF_IDLE_COUNT		FIPE_PROFILE_FIELD(idle_count)
#define	FIPE_PROF_BUSY_THRESHOLD	FIPE_PROFILE_FIELD(busy_threshold)
#define	FIPE_PROF_INTR_THRESHOLD	FIPE_PROFILE_FIELD(intr_threshold)
#define	FIPE_PROF_INTR_BUSY_THRESHOLD	FIPE_PROFILE_FIELD(intr_busy_threshold)
#define	FIPE_PROF_INTR_BUSY_THROTTLE	FIPE_PROFILE_FIELD(intr_busy_throttle)

/* Priority assigned to FIPE memory power management driver on x86. */
#define	FIPE_CB_CURR_PRIORITY		(CPU_IDLE_CB_PRIO_LOW_BASE + 0x4000000)
/* Structure to support power management profile. */
static struct fipe_profile {
	uint32_t	idle_count;
	uint32_t	busy_threshold;
	uint32_t	intr_threshold;
	uint32_t	intr_busy_threshold;
	uint32_t	intr_busy_throttle;
} fipe_profiles[FIPE_PM_POLICY_MAX] = {
{ 0, 0, 0, 0, 0 },
{ 5, 30, 20, 50, 5 },
{ 10, 40, 40, 75, 4 },
{ 15, 50, 60, 100, 2 },
};
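/*
 * Rows above correspond, in order, to the FIPE_PM_POLICY_DISABLE,
 * PERFORMANCE, BALANCE and POWERSAVE policies (policy ordering assumed),
 * with columns matching the field order of struct fipe_profile.
 */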
/* Structure to store memory controller relative data. */
static struct fipe_mc_ctrl {
	ddi_acc_handle_t	mc_pci_hdl;
	unsigned char		mc_thrtctrl;
	unsigned char		mc_thrtlow;
	unsigned char		mc_gblact;
	dev_info_t		*mc_dip;
	boolean_t		mc_initialized;
} fipe_mc_ctrl;
/* Structure to store IOAT relative information. */
static struct fipe_ioat_control {
	kmutex_t		ioat_lock;
	boolean_t		ioat_ready;
#ifdef	FIPE_IOAT_BUILTIN
	boolean_t		ioat_reg_mapped;
	ddi_acc_handle_t	ioat_reg_handle;
	uint8_t			*ioat_reg_addr;
	uint64_t		ioat_cmd_physaddr;
#else	/* FIPE_IOAT_BUILTIN */
	dcopy_handle_t		ioat_handle;
	dcopy_cmd_t		ioat_cmds[FIPE_IOAT_CMD_NUM + 1];
#endif	/* FIPE_IOAT_BUILTIN */
	dev_info_t		*ioat_dev_info;
	uint64_t		ioat_buf_physaddr;
	char			*ioat_buf_virtaddr;
	char			*ioat_buf_start;
	size_t			ioat_buf_size;
	timeout_id_t		ioat_timerid;
	boolean_t		ioat_failed;
	boolean_t		ioat_cancel;
	boolean_t		ioat_try_alloc;
} fipe_ioat_ctrl;
static struct fipe_idle_ctrl {
	boolean_t		idle_ready;
	cpu_idle_prop_handle_t	prop_enter;
	cpu_idle_prop_handle_t	prop_exit;
	cpu_idle_prop_handle_t	prop_busy;
	cpu_idle_prop_handle_t	prop_idle;
	cpu_idle_prop_handle_t	prop_intr;

	/* Put here for cache efficiency, it should be in fipe_global_ctrl. */
	hrtime_t		tick_interval;
} fipe_idle_ctrl;
/*
* Global control structure.
* Solaris idle thread has no reentrance issue, so it's enough to count CPUs
* in idle state. Otherwise cpuset_t bitmap should be used to track idle CPUs.
*/
static struct fipe_global_ctrl {
	kmutex_t		lock;
	boolean_t		pm_enabled;
	volatile boolean_t	pm_active;
	volatile uint32_t	cpu_cnt;
	volatile uint64_t	io_waiters;
	hrtime_t		enter_ts;
	hrtime_t		time_in_pm;
	size_t			state_size;
	char			*state_buf;
#ifdef	FIPE_KSTAT_SUPPORT
	kstat_t			*fipe_kstat;
#endif	/* FIPE_KSTAT_SUPPORT */
} fipe_gbl_ctrl;
#define	FIPE_CPU_STATE_PAD	(128 - \
	2 * sizeof (boolean_t) - 4 * sizeof (hrtime_t) - \
	2 * sizeof (uint64_t) - 2 * sizeof (uint32_t))

/* Per-CPU status. */
#pragma	pack(1)
typedef struct fipe_cpu_state {
	boolean_t		cond_ready;
	boolean_t		state_ready;
	uint32_t		idle_count;
	uint32_t		throttle_cnt;
	hrtime_t		throttle_ts;
	hrtime_t		next_ts;
	hrtime_t		last_busy;
	hrtime_t		last_idle;
	uint64_t		last_intr;
	uint64_t		last_iowait;
	char			pad1[FIPE_CPU_STATE_PAD];
} fipe_cpu_state_t;
#pragma	pack()

static fipe_cpu_state_t *fipe_cpu_states = NULL;
#ifdef FIPE_KSTAT_SUPPORT
static struct fipe_kstat_s {
	kstat_named_t		fipe_enabled;
	kstat_named_t		fipe_policy;
	kstat_named_t		fipe_pm_time;
#ifdef	FIPE_KSTAT_DETAIL
	kstat_named_t		ioat_ready;
	kstat_named_t		pm_tryenter_cnt;
	kstat_named_t		pm_success_cnt;
	kstat_named_t		pm_race_cnt;
	kstat_named_t		cpu_loop_cnt;
	kstat_named_t		cpu_busy_cnt;
	kstat_named_t		cpu_idle_cnt;
	kstat_named_t		cpu_intr_busy_cnt;
	kstat_named_t		cpu_intr_thrt_cnt;
	kstat_named_t		bio_busy_cnt;
	kstat_named_t		ioat_start_fail_cnt;
	kstat_named_t		ioat_stop_fail_cnt;
#endif	/* FIPE_KSTAT_DETAIL */
} fipe_kstat = {
{ "fipe_enabled", KSTAT_DATA_INT32 },
{ "fipe_policy", KSTAT_DATA_INT32 },
{ "fipe_pm_time", KSTAT_DATA_UINT64 },
#ifdef FIPE_KSTAT_DETAIL
{ "ioat_ready", KSTAT_DATA_INT32 },
{ "pm_tryenter_cnt", KSTAT_DATA_UINT64 },
{ "pm_success_cnt", KSTAT_DATA_UINT64 },
{ "pm_race_cnt", KSTAT_DATA_UINT64 },
{ "cpu_loop_cnt", KSTAT_DATA_UINT64 },
{ "cpu_busy_cnt", KSTAT_DATA_UINT64 },
{ "cpu_idle_cnt", KSTAT_DATA_UINT64 },
{ "cpu_intr_busy_cnt", KSTAT_DATA_UINT64 },
{ "cpu_intr_thrt_cnt", KSTAT_DATA_UINT64 },
{ "bio_busy_cnt", KSTAT_DATA_UINT64 },
{ "ioat_start_fail_cnt", KSTAT_DATA_UINT64 },
{ "ioat_stop_fail_cnt", KSTAT_DATA_UINT64 }
#endif /* FIPE_KSTAT_DETAIL */
};
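/*
 * The counters above can be inspected from userland with kstat(1M),
 * e.g. "kstat -m fipe" (assuming the kstat is created under module name
 * "fipe" in fipe_init below).
 */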
#define	FIPE_KSTAT_INC(v)		\
	atomic_inc_64(&fipe_kstat.v.value.ui64)
#ifdef	FIPE_KSTAT_DETAIL
#define	FIPE_KSTAT_DETAIL_INC(v)	\
	atomic_inc_64(&fipe_kstat.v.value.ui64)
#else /* FIPE_KSTAT_DETAIL */
#define FIPE_KSTAT_DETAIL_INC(v)
#endif /* FIPE_KSTAT_DETAIL */
#else /* FIPE_KSTAT_SUPPORT */
#define FIPE_KSTAT_INC(v)
#define FIPE_KSTAT_DETAIL_INC(v)
#endif /* FIPE_KSTAT_SUPPORT */
/*
 * There is no lock to protect fipe_profile_curr, so fipe_profile_curr
 * could change on threads in fipe_idle_enter. This is not an issue,
 * as it always points to a valid profile, and though it might make
 * an incorrect choice for the new profile, it will still be a valid
 * selection, and would do the correct operation for the new profile on
 * the next cpu_idle_enter cycle. Since the selections would always be
 * valid for some profile, the overhead of a lock is not justified.
 */
static struct fipe_profile *fipe_profile_curr = &fipe_profiles[0];
static void fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
    cpu_idle_check_wakeup_t check_func, void *check_arg);
static void fipe_idle_exit(void *arg, cpu_idle_callback_context_t ctx,
    int flags);

static cpu_idle_callback_t fipe_idle_cb = {
	CPU_IDLE_CALLBACK_VER0,
	fipe_idle_enter,
	fipe_idle_exit,
};
static cpu_idle_callback_handle_t fipe_idle_cb_handle = NULL;
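/*
 * fipe_idle_enter/fipe_idle_exit are invoked by the cpu_event framework
 * around each pass through a CPU's idle loop, with interrupts disabled.
 */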
/*
* Configure memory controller into power saving mode:
* 1) OLTT activation limit is set to unlimited
* 2) MC works in S-CLTT mode
*/
static int
fipe_mc_change(int throttle)
{
	/* Set OLTT activation limit to unlimited */
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT, 0);

	/*
	 * Set S-CLTT low throttling to desired value. The lower value,
	 * the more power saving and the less available memory bandwidth.
	 */
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW, throttle);

	/* Enable S-CLTT mode. */
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
	    fipe_mc_ctrl.mc_thrtctrl | FIPE_MC_THRTCTRL_HUNT);

	return (0);
}
/*
* Restore memory controller's original configuration.
*/
static void
fipe_mc_restore(void)
{
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
	    fipe_mc_ctrl.mc_thrtctrl);
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW,
	    fipe_mc_ctrl.mc_thrtlow);
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT,
	    fipe_mc_ctrl.mc_gblact);
}
/*
* Initialize memory controller's data structure and status.
*/
static int
fipe_mc_init(dev_info_t *dip)
{
	ddi_acc_handle_t handle;

	/* Hold one reference count and will be released in fipe_mc_fini. */
	ndi_hold_devi(dip);

	/* Setup pci configuration handler. */
	if (pci_config_setup(dip, &handle) != DDI_SUCCESS) {
		cmn_err(CE_WARN,
		    "!fipe: failed to setup pcicfg handler in mc_init.");
		ndi_rele_devi(dip);
		return (-1);
	}

	/* Save original configuration. */
	fipe_mc_ctrl.mc_thrtctrl = pci_config_get8(handle, FIPE_MC_THRTCTRL);
	fipe_mc_ctrl.mc_thrtlow = pci_config_get8(handle, FIPE_MC_THRTLOW);
	fipe_mc_ctrl.mc_gblact = pci_config_get8(handle, FIPE_MC_GBLACT);
	fipe_mc_ctrl.mc_dip = dip;
	fipe_mc_ctrl.mc_pci_hdl = handle;
	fipe_mc_ctrl.mc_initialized = B_TRUE;

	return (0);
}
/*
* Restore memory controller's configuration and release resources.
*/
static void
fipe_mc_fini(void)
{
	if (fipe_mc_ctrl.mc_initialized) {
		fipe_mc_restore();
		pci_config_teardown(&fipe_mc_ctrl.mc_pci_hdl);
		ndi_rele_devi(fipe_mc_ctrl.mc_dip);
		fipe_mc_ctrl.mc_initialized = B_FALSE;
	}
	bzero(&fipe_mc_ctrl, sizeof (fipe_mc_ctrl));
}
/* Search device with specific pci ids. */
struct fipe_pci_ioat_id {
	uint16_t	venid;
	uint16_t	devid;
	uint16_t	subvenid;
	uint16_t	subsysid;
	char		*unitaddr;
};

static struct fipe_pci_ioat_id fipe_pci_ioat_ids[] = {
	{ 0x8086, 0x1a38, 0xffff, 0xffff, NULL },
	{ 0x8086, 0x360b, 0xffff, 0xffff, NULL },
};
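/*
 * 0xffff fields act as wildcards in the match loop below. The device ids
 * listed are the IOAT DMA engines of the Intel 5000/5400 series chipsets
 * (ids reconstructed; verify against the hardware documentation).
 */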
/*ARGSUSED*/
static int
fipe_search_ioat_dev(dev_info_t *dip, void *arg)
{
	char *unit;
	struct fipe_pci_ioat_id *id;
	int i, max, venid, devid, subvenid, subsysid;

	/* Query PCI id properties. */
	venid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "vendor-id", 0xffffffff);
	if (venid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	devid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "device-id", 0xffffffff);
	if (devid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	subvenid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "subsystem-vendor-id", 0xffffffff);
	if (subvenid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	subsysid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "subsystem-id", 0xffffffff);
	if (subsysid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "unit-address", &unit) != DDI_PROP_SUCCESS) {
		return (DDI_WALK_CONTINUE);
	}

	max = sizeof (fipe_pci_ioat_ids) / sizeof (fipe_pci_ioat_ids[0]);
	for (i = 0; i < max; i++) {
		id = &fipe_pci_ioat_ids[i];
		if ((id->venid == 0xffffu || id->venid == venid) &&
		    (id->devid == 0xffffu || id->devid == devid) &&
		    (id->subvenid == 0xffffu || id->subvenid == subvenid) &&
		    (id->subsysid == 0xffffu || id->subsysid == subsysid) &&
		    (id->unitaddr == NULL ||
		    strcmp(id->unitaddr, unit) == 0)) {
			break;
		}
	}
	ddi_prop_free(unit);
	if (i >= max) {
		return (DDI_WALK_CONTINUE);
	}

	/* Found IOAT device, hold one reference count. */
	ndi_hold_devi(dip);
	fipe_ioat_ctrl.ioat_dev_info = dip;

	return (DDI_WALK_TERMINATE);
}
/*
 * To enable FBDIMM idle power enhancement mechanism, IOAT will be used to
 * generate enough memory traffic to trigger memory controller thermal throttle
 * circuitry.
 * If dcopy/ioat is available, the dcopy interface will be used to communicate
 * with IOAT. Otherwise the built-in driver will directly talk to IOAT
 * hardware.
 */
#ifdef	FIPE_IOAT_BUILTIN
static int
fipe_ioat_trigger(void)
{
	uint16_t ctrl;
	uint32_t err;
	uint8_t *addr = fipe_ioat_ctrl.ioat_reg_addr;
	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;

	/* Check channel in use flag. */
	ctrl = ddi_get16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL));
	if (ctrl & 0x100) {
		/*
		 * Channel is in use by somebody else. IOAT driver may have
		 * been loaded, forbid fipe from accessing IOAT hardware
		 * anymore.
		 */
		fipe_ioat_ctrl.ioat_ready = B_FALSE;
		fipe_ioat_ctrl.ioat_failed = B_TRUE;
		FIPE_KSTAT_DETAIL_INC(ioat_start_fail_cnt);
		return (-1);
	} else {
		/* Set channel in use flag. */
		ddi_put16(handle,
		    (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0x100);
	}

	/* Write command address. */
	ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_LO),
	    (uint32_t)(fipe_ioat_ctrl.ioat_cmd_physaddr & 0xffffffff));
	ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_HI),
	    (uint32_t)(fipe_ioat_ctrl.ioat_cmd_physaddr >> 32));

	/* Check and clear error flags. */
	err = ddi_get32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR));
	if (err != 0) {
		ddi_put32(handle,
		    (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR), err);
	}

	/* Start channel (start command value assumed). */
	ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x1);

	return (0);
}
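/*
 * The FIPE_IOAT_CHAN_* offsets used above address channel 3 registers
 * relative to the IOAT MMIO window mapped in fipe_ioat_alloc() below.
 */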
static void
fipe_ioat_cancel(void)
{
	uint32_t status;
	uint8_t *addr = fipe_ioat_ctrl.ioat_reg_addr;
	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;

	/*
	 * Reset channel. Sometimes reset is not reliable,
	 * so check completion or abort status after reset.
	 */
	/* LINTED: constant in conditional context */
	while (1) {
		/* Issue reset channel command (command value assumed). */
		ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x20);

		/* Query command status. */
		status = ddi_get32(handle,
		    (uint32_t *)(addr + FIPE_IOAT_CHAN_STS_LO));
		if (status & 0x1) {
			/* Reset channel completed. */
			break;
		} else {
			SMT_PAUSE();
		}
	}

	/* Put channel into "not in use" state. */
	ddi_put16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0);
}
/*ARGSUSED*/
static void
fipe_ioat_alloc(void *arg)
{
	int rc;
	dev_info_t *dip;
	ddi_device_acc_attr_t attr;
	boolean_t fatal = B_FALSE;

	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	/*
	 * fipe_ioat_alloc() is called in DEVICE ATTACH context when loaded.
	 * In DEVICE ATTACH context, it can't call ddi_walk_devs(), so just
	 * schedule a timer and exit.
	 */
	if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
		fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
		goto out_error;
	}

	/*
	 * Check whether the device has been initialized or has encountered
	 * a permanent error.
	 */
	if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
	    fipe_ioat_ctrl.ioat_cancel) {
		fipe_ioat_ctrl.ioat_timerid = 0;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		return;
	}

	if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
		/* Find dev_info_t for IOAT engine. */
		ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
		if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
			cmn_err(CE_NOTE,
			    "!fipe: no IOAT hardware found, disable pm.");
			fatal = B_TRUE;
			goto out_error;
		}
	}

	/* Map in IOAT control register window (register set 1 assumed). */
	dip = fipe_ioat_ctrl.ioat_dev_info;
	attr.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
	attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	rc = ddi_regs_map_setup(dip, 1,
	    (caddr_t *)&fipe_ioat_ctrl.ioat_reg_addr, 0, 0, &attr,
	    &fipe_ioat_ctrl.ioat_reg_handle);
	if (rc != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!fipe: failed to map IOAT registers.");
		fatal = B_TRUE;
		goto out_error;
	}

	/* Mark IOAT status. */
	fipe_ioat_ctrl.ioat_reg_mapped = B_TRUE;
	fipe_ioat_ctrl.ioat_ready = B_TRUE;
	fipe_ioat_ctrl.ioat_failed = B_FALSE;
	fipe_ioat_ctrl.ioat_timerid = 0;
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);

	return;

out_error:
	fipe_ioat_ctrl.ioat_timerid = 0;
	if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
		if (fatal) {
			/* Mark permanent error and give up. */
			fipe_ioat_ctrl.ioat_failed = B_TRUE;
			/* Release reference count hold by ddi_find_devinfo. */
			if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
				ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
				fipe_ioat_ctrl.ioat_dev_info = NULL;
			}
		} else {
			/*
			 * Schedule another timer to keep on trying.
			 * timeout() should always succeed, no need to check
			 * the return value.
			 */
			fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
			    NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
		}
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}
static void
fipe_ioat_free(void)
{
	mutex_enter(&fipe_ioat_ctrl.ioat_lock);

	/* Cancel timeout to avoid race condition. */
	if (fipe_ioat_ctrl.ioat_timerid != 0) {
		fipe_ioat_ctrl.ioat_cancel = B_TRUE;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		(void) untimeout(fipe_ioat_ctrl.ioat_timerid);
		mutex_enter(&fipe_ioat_ctrl.ioat_lock);
		fipe_ioat_ctrl.ioat_timerid = 0;
		fipe_ioat_ctrl.ioat_cancel = B_FALSE;
	}

	if (fipe_ioat_ctrl.ioat_reg_mapped) {
		ddi_regs_map_free(&fipe_ioat_ctrl.ioat_reg_handle);
		fipe_ioat_ctrl.ioat_reg_mapped = B_FALSE;
	}

	fipe_ioat_ctrl.ioat_ready = B_FALSE;
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}
#else /* FIPE_IOAT_BUILTIN */
/*
* Trigger IOAT memory copy operation when entering power saving state.
* A group of commands will be posted to IOAT driver and those commands
* will be placed into an IOAT ring buffer.
*/
static int
fipe_ioat_trigger(void)
{
	int idx;
	dcopy_cmd_t *cmds = fipe_ioat_ctrl.ioat_cmds;

	for (idx = FIPE_IOAT_CMD_NUM; idx > 0; idx--) {
		if (dcopy_cmd_post(cmds[idx]) == DCOPY_SUCCESS) {
			continue;
		} else {
			/*
			 * Don't rollback on failure, it doesn't hurt much more
			 * than some small memory copy operations.
			 */
			FIPE_KSTAT_DETAIL_INC(ioat_start_fail_cnt);
			return (-1);
		}
	}

	return (0);
}
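/*
 * Posting runs tail-to-head: the queued commands are handed to the channel
 * first and cmd[1], the ring head, starts the loop, so the engine keeps
 * copying until fipe_ioat_cancel() posts cmd[0] to break the ring (see the
 * ring setup in fipe_ioat_alloc below).
 */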
/*
* Cancel the memory copy operations posted by fipe_ioat_trigger.
* It's achieved by posting a new command which will break the ring
* created by fipe_ioat_trigger. If it fails, the best way to recover
* is to just let it go. IOAT will recover when posting next command
* on the same channel.
*/
static void
fipe_ioat_cancel(void)
{
	if (dcopy_cmd_post(fipe_ioat_ctrl.ioat_cmds[0]) != DCOPY_SUCCESS) {
		FIPE_KSTAT_DETAIL_INC(ioat_stop_fail_cnt);
	}
}
/*
 * This function will be called to allocate IOAT resources.
 * Allocation may fail due to the following reasons:
 * 1) IOAT driver hasn't been loaded yet. Keep on trying in this case.
 * 2) IOAT resources are temporarily unavailable. Keep on trying in this case.
 * 3) Other unrecoverable reasons. Disable power management function.
 */
*/
/*ARGSUSED*/
static void
fipe_ioat_alloc(void *arg)
{
	int idx, rc = 0;
	boolean_t fatal = B_FALSE;
	dcopy_query_t info;
	dcopy_handle_t handle = NULL;
	dcopy_cmd_t cmds[FIPE_IOAT_CMD_NUM + 1];

	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	/*
	 * fipe_ioat_alloc() is called in DEVICE ATTACH context when loaded.
	 * In DEVICE ATTACH context, it can't call ddi_walk_devs(), so just
	 * schedule a timer and exit.
	 */
	if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
		fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		goto out_error;
	}

	/*
	 * Check whether device has been initialized or if it encountered
	 * some permanent error.
	 */
	if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
	    fipe_ioat_ctrl.ioat_cancel) {
		fipe_ioat_ctrl.ioat_timerid = 0;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		return;
	}

	if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
		/* Find dev_info_t for IOAT engine. */
		ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
		if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
			cmn_err(CE_NOTE,
			    "!fipe: no IOAT hardware found, disable pm.");
			fatal = B_TRUE;
			mutex_exit(&fipe_ioat_ctrl.ioat_lock);
			goto out_error;
		}
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);

	/* Check, allocate and initialize IOAT resources with lock released. */
	bzero(cmds, sizeof (cmds));
	dcopy_query(&info);
	if (info.dq_version < DCOPY_QUERY_V0) {
		/* Permanent error, give up. */
		fatal = B_TRUE;
		goto out_error;
	} else if (info.dq_num_channels == 0) {
		/* IOAT driver hasn't been loaded, keep trying. */
		goto out_error;
	}

	/* Allocate IOAT channel. */
	rc = dcopy_alloc(DCOPY_NOSLEEP, &handle);
	if (rc == DCOPY_NORESOURCES) {
		/* Resource temporarily not available, keep trying. */
		goto out_error;
	} else if (rc != DCOPY_SUCCESS) {
		/* Permanent error, give up. */
		fatal = B_TRUE;
		goto out_error;
	}

	/*
	 * Allocate multiple IOAT commands and organize them into a ring to
	 * loop forever. Commands number is determined by IOAT descriptor size
	 * and memory interleave pattern.
	 * cmd[0] is used to break the loop and disable IOAT operation.
	 * cmd[1, FIPE_IOAT_CMD_NUM] are grouped into a ring and cmd[1] is the
	 * list head.
	 */
	for (idx = FIPE_IOAT_CMD_NUM; idx >= 0; idx--) {
		/* Allocate IOAT commands. */
		if (idx == 0 || idx == FIPE_IOAT_CMD_NUM) {
			rc = dcopy_cmd_alloc(handle, DCOPY_NOSLEEP,
			    &cmds[idx]);
		} else {
			/*
			 * To link commands into a list, the initial value of
			 * cmd need to be set to next cmd on list.
			 */
			cmds[idx] = cmds[idx + 1];
			rc = dcopy_cmd_alloc(handle,
			    DCOPY_NOSLEEP | DCOPY_ALLOC_LINK, &cmds[idx]);
		}
		if (rc == DCOPY_NORESOURCES) {
			goto out_freecmd;
		} else if (rc != DCOPY_SUCCESS) {
			/* Permanent error, give up. */
			fatal = B_TRUE;
			cmn_err(CE_WARN,
			    "!fipe: failed to allocate IOAT command.");
			goto out_freecmd;
		}

		/* Specially handle commands on the list. */
		if (idx != 0) {
			/* Disable IOAT status. */
			cmds[idx]->dp_flags = DCOPY_CMD_NOSTAT;
			/* Disable waiting for resources. */
			cmds[idx]->dp_flags |= DCOPY_CMD_NOWAIT;
			if (idx == 1) {
				/* The list head, chain command into loop. */
				cmds[idx]->dp_flags |= DCOPY_CMD_LOOP;
			} else {
				/* Queue all other commands except head. */
				cmds[idx]->dp_flags |= DCOPY_CMD_QUEUE;
			}
		}
		cmds[idx]->dp_cmd = DCOPY_CMD_COPY;
		cmds[idx]->dp.copy.cc_source =
		    fipe_ioat_ctrl.ioat_buf_physaddr;
		cmds[idx]->dp.copy.cc_dest =
		    fipe_ioat_ctrl.ioat_buf_physaddr + FIPE_MC_MEMORY_OFFSET;
		if (idx == 0) {
			/*
			 * Command 0 is used to cancel memory copy by breaking
			 * the ring created in fipe_ioat_trigger().
			 * For efficiency, use the smallest memory copy size.
			 */
			cmds[idx]->dp.copy.cc_size = 1;
		} else {
			cmds[idx]->dp.copy.cc_size = FIPE_MC_MEMORY_SIZE;
		}
	}

	/* Update IOAT control status if it hasn't been initialized yet. */
	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	if (fipe_ioat_ctrl.ioat_ready == B_FALSE &&
	    fipe_ioat_ctrl.ioat_cancel == B_FALSE) {
		fipe_ioat_ctrl.ioat_handle = handle;
		for (idx = 0; idx <= FIPE_IOAT_CMD_NUM; idx++) {
			fipe_ioat_ctrl.ioat_cmds[idx] = cmds[idx];
		}
		fipe_ioat_ctrl.ioat_ready = B_TRUE;
		fipe_ioat_ctrl.ioat_failed = B_FALSE;
		fipe_ioat_ctrl.ioat_timerid = 0;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		return;
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
	/* Initialized by another thread, fall through to free resources. */

out_freecmd:
	if (cmds[0] != NULL) {
		dcopy_cmd_free(&cmds[0]);
	}
	/* Only need to free head, dcopy will free all commands on the list. */
	for (idx = FIPE_IOAT_CMD_NUM; idx > 0; idx--) {
		if (cmds[idx] != NULL) {
			dcopy_cmd_free(&cmds[idx]);
			break;
		}
	}
	if (handle != NULL) {
		dcopy_free(&handle);
	}

out_error:
	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	fipe_ioat_ctrl.ioat_timerid = 0;
	if (fipe_ioat_ctrl.ioat_ready == B_FALSE &&
	    fipe_ioat_ctrl.ioat_cancel == B_FALSE) {
		if (fatal) {
			/* Mark permanent error and give up. */
			fipe_ioat_ctrl.ioat_failed = B_TRUE;
			/* Release reference count hold by ddi_find_devinfo. */
			if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
				ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
				fipe_ioat_ctrl.ioat_dev_info = NULL;
			}
		} else {
			/*
			 * Schedule another timer to keep on trying.
			 * timeout() should always succeed, no need to check
			 * the return value.
			 */
			fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
			    NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
		}
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}
/*
* Free resources allocated in fipe_ioat_alloc.
*/
static void
fipe_ioat_free(void)
{
	int idx = 0;
	dcopy_cmd_t *cmds = fipe_ioat_ctrl.ioat_cmds;

	mutex_enter(&fipe_ioat_ctrl.ioat_lock);

	/* Cancel timeout to avoid race condition. */
	if (fipe_ioat_ctrl.ioat_timerid != 0) {
		fipe_ioat_ctrl.ioat_cancel = B_TRUE;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		(void) untimeout(fipe_ioat_ctrl.ioat_timerid);
		mutex_enter(&fipe_ioat_ctrl.ioat_lock);
		fipe_ioat_ctrl.ioat_timerid = 0;
		fipe_ioat_ctrl.ioat_cancel = B_FALSE;
	}

	/* Free ioat resources. */
	if (fipe_ioat_ctrl.ioat_ready) {
		if (cmds[0] != NULL) {
			dcopy_cmd_free(&cmds[0]);
		}
		/* Free head; dcopy will free all commands on the list. */
		for (idx = FIPE_IOAT_CMD_NUM; idx > 0; idx--) {
			if (cmds[idx] != NULL) {
				dcopy_cmd_free(&cmds[idx]);
				break;
			}
		}
		bzero(fipe_ioat_ctrl.ioat_cmds,
		    sizeof (fipe_ioat_ctrl.ioat_cmds));
		dcopy_free(&fipe_ioat_ctrl.ioat_handle);
		fipe_ioat_ctrl.ioat_handle = NULL;
		fipe_ioat_ctrl.ioat_ready = B_FALSE;
	}

	/* Release reference count hold by ddi_find_devinfo. */
	if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
		ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
		fipe_ioat_ctrl.ioat_dev_info = NULL;
	}

	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}
#endif /* FIPE_IOAT_BUILTIN */
/*
* Initialize IOAT relative resources.
*/
static int
fipe_ioat_init(void)
{
	char *buf;
	size_t size;

	bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
	mutex_init(&fipe_ioat_ctrl.ioat_lock, NULL, MUTEX_DRIVER, NULL);

	/*
	 * Allocate memory for IOAT memory copy operation.
	 * The allocated memory should be page aligned to achieve better power
	 * savings.
	 * Don't use ddi_dma_mem_alloc here to keep things simple. This also
	 * makes quiesce easier.
	 */
	size = PAGESIZE;
	buf = kmem_zalloc(size, KM_SLEEP);
	if ((intptr_t)buf & PAGEOFFSET) {
		kmem_free(buf, size);
		size <<= 1;
		buf = kmem_zalloc(size, KM_SLEEP);
	}
	fipe_ioat_ctrl.ioat_buf_size = size;
	fipe_ioat_ctrl.ioat_buf_start = buf;
	buf = (char *)P2ROUNDUP((intptr_t)buf, PAGESIZE);
	fipe_ioat_ctrl.ioat_buf_virtaddr = buf;
	fipe_ioat_ctrl.ioat_buf_physaddr =
	    (uint64_t)hat_getpfnum(kas.a_hat, buf) << PAGESHIFT;

#ifdef	FIPE_IOAT_BUILTIN
	{
		uint64_t bufpa;
		/* IOAT descriptor data structure copied from ioat.h. */
		struct fipe_ioat_cmd_desc {
			uint32_t	dd_size;
			uint32_t	dd_ctrl;
			uint64_t	dd_src_paddr;
			uint64_t	dd_dest_paddr;
			uint64_t	dd_next_desc;
			uint64_t	dd_res[4];
		} *desc;

		/*
		 * Build two IOAT command descriptors and chain them into ring.
		 * Control flags as below:
		 *	0x2: disable source snoop
		 *	0x4: disable destination snoop
		 *	0x0 << 24: memory copy operation
		 * The layout for command descriptors and memory buffers are
		 * organized for power saving effect, please don't change it.
		 * (Buffer offsets below are reconstructed assumptions.)
		 */
		bufpa = fipe_ioat_ctrl.ioat_buf_physaddr;
		fipe_ioat_ctrl.ioat_cmd_physaddr = bufpa;
		desc = (struct fipe_ioat_cmd_desc *)buf;

		/* First command descriptor. */
		desc[0].dd_size = FIPE_MC_MEMORY_SIZE;
		desc[0].dd_ctrl = 0x6;
		desc[0].dd_src_paddr = bufpa + FIPE_MC_MEMORY_OFFSET;
		desc[0].dd_dest_paddr = bufpa + FIPE_MC_MEMORY_OFFSET * 2;
		/* Point to second descriptor. */
		desc[0].dd_next_desc = bufpa + sizeof (desc[0]);

		/* Second command descriptor. */
		desc[1].dd_size = FIPE_MC_MEMORY_SIZE;
		desc[1].dd_ctrl = 0x6;
		desc[1].dd_src_paddr = bufpa + FIPE_MC_MEMORY_OFFSET;
		desc[1].dd_dest_paddr = bufpa + FIPE_MC_MEMORY_OFFSET * 2;
		/* Point to first descriptor. */
		desc[1].dd_next_desc = bufpa;
	}
#endif	/* FIPE_IOAT_BUILTIN */

	return (0);
}
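/*
 * Note: the copy buffer is allocated with kmem_zalloc and is never paged
 * out, so the physical address captured via hat_getpfnum above remains
 * valid for the life of the driver.
 */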
static void
fipe_ioat_fini(void)
{
	fipe_ioat_free();

	/* Release reference count hold by ddi_find_devinfo. */
	if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
		ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
		fipe_ioat_ctrl.ioat_dev_info = NULL;
	}
	if (fipe_ioat_ctrl.ioat_buf_start != NULL) {
		kmem_free(fipe_ioat_ctrl.ioat_buf_start,
		    fipe_ioat_ctrl.ioat_buf_size);
	}
	mutex_destroy(&fipe_ioat_ctrl.ioat_lock);
	bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
}
static int
fipe_idle_start(void)
{
	int rc;

	if (fipe_idle_ctrl.idle_ready) {
		return (0);
	}

	if (cpu_idle_prop_get_handle(CPU_IDLE_PROP_ENTER_TIMESTAMP,
	    &fipe_idle_ctrl.prop_enter) != 0) {
		return (-1);
	}
	if (cpu_idle_prop_get_handle(CPU_IDLE_PROP_EXIT_TIMESTAMP,
	    &fipe_idle_ctrl.prop_exit) != 0) {
		return (-1);
	}
	if (cpu_idle_prop_get_handle(CPU_IDLE_PROP_TOTAL_IDLE_TIME,
	    &fipe_idle_ctrl.prop_idle) != 0) {
		return (-1);
	}
	if (cpu_idle_prop_get_handle(CPU_IDLE_PROP_TOTAL_BUSY_TIME,
	    &fipe_idle_ctrl.prop_busy) != 0) {
		return (-1);
	}
	if (cpu_idle_prop_get_handle(CPU_IDLE_PROP_INTERRUPT_COUNT,
	    &fipe_idle_ctrl.prop_intr) != 0) {
		return (-1);
	}

	/* Register idle state notification callback. */
	rc = cpu_idle_register_callback(FIPE_CB_CURR_PRIORITY,
	    &fipe_idle_cb, NULL, &fipe_idle_cb_handle);
	if (rc != 0) {
		cmn_err(CE_WARN,
		    "!fipe: failed to register cpuidle callback.");
		return (-1);
	}

	fipe_idle_ctrl.idle_ready = B_TRUE;

	return (0);
}
static int
fipe_idle_stop(void)
{
	int rc;

	if (fipe_idle_ctrl.idle_ready == B_FALSE) {
		return (0);
	}

	rc = cpu_idle_unregister_callback(fipe_idle_cb_handle);
	if (rc != 0) {
		cmn_err(CE_WARN,
		    "!fipe: failed to unregister cpuidle callback.");
		return (-1);
	}

	fipe_idle_cb_handle = NULL;
	fipe_idle_ctrl.idle_ready = B_FALSE;

	return (0);
}
#ifdef	FIPE_KSTAT_SUPPORT
static int
fipe_kstat_update(kstat_t *ksp, int rw)
{
	struct fipe_kstat_s *sp;
	hrtime_t hrt;

	if (rw == KSTAT_WRITE) {
		return (EACCES);
	}

	sp = ksp->ks_data;
	sp->fipe_enabled.value.i32 = fipe_gbl_ctrl.pm_enabled ? 1 : 0;
	sp->fipe_policy.value.i32 = fipe_pm_policy;

	hrt = fipe_gbl_ctrl.time_in_pm;
	scalehrtime(&hrt);
	sp->fipe_pm_time.value.ui64 = (uint64_t)hrt;

#ifdef	FIPE_KSTAT_DETAIL
	sp->ioat_ready.value.i32 = fipe_ioat_ctrl.ioat_ready ? 1 : 0;
#endif	/* FIPE_KSTAT_DETAIL */

	return (0);
}
#endif	/* FIPE_KSTAT_SUPPORT */
/*
* Initialize memory power management subsystem.
* Note: This function should only be called from ATTACH.
* Note: caller must ensure exclusive access to all fipe_xxx interfaces.
*/
int
fipe_init(dev_info_t *dip)
{
	size_t nsize;
	hrtime_t hrt;

	/* Initialize global control structure. */
	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));
	mutex_init(&fipe_gbl_ctrl.lock, NULL, MUTEX_DRIVER, NULL);

	/* Query power management policy from device property (name assumed). */
	fipe_pm_policy = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "fipe-pm-policy", fipe_pm_policy);
	if (fipe_pm_policy < 0 || fipe_pm_policy >= FIPE_PM_POLICY_MAX) {
		cmn_err(CE_CONT,
		    "?fipe: invalid power management policy %d.\n",
		    fipe_pm_policy);
		fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
	}
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	/*
	 * Compute unscaled hrtime value corresponding to FIPE_STAT_INTERVAL.
	 * (1 << 36) should be big enough here.
	 */
	hrt = 1ULL << 36;
	scalehrtime(&hrt);
	fipe_idle_ctrl.tick_interval = FIPE_STAT_INTERVAL * (1ULL << 36) / hrt;

	if (fipe_mc_init(dip) != 0) {
		cmn_err(CE_WARN, "!fipe: failed to initialize mc state.");
		goto out_mc_error;
	}
	if (fipe_ioat_init() != 0) {
		cmn_err(CE_NOTE, "!fipe: failed to initialize ioat state.");
		goto out_ioat_error;
	}

	/* Allocate per-CPU structure. */
	nsize = max_ncpus * sizeof (fipe_cpu_state_t);
	fipe_gbl_ctrl.state_buf = kmem_zalloc(nsize, KM_SLEEP);
	fipe_gbl_ctrl.state_size = nsize;
	fipe_cpu_states = (fipe_cpu_state_t *)fipe_gbl_ctrl.state_buf;

#ifdef	FIPE_KSTAT_SUPPORT
	/* kstat module/name assumed. */
	fipe_gbl_ctrl.fipe_kstat = kstat_create("fipe", 0, "fipe_pm", "misc",
	    KSTAT_TYPE_NAMED, sizeof (fipe_kstat) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (fipe_gbl_ctrl.fipe_kstat == NULL) {
		cmn_err(CE_CONT, "?fipe: failed to create kstat object.\n");
	} else {
		fipe_gbl_ctrl.fipe_kstat->ks_lock = &fipe_gbl_ctrl.lock;
		fipe_gbl_ctrl.fipe_kstat->ks_data = &fipe_kstat;
		fipe_gbl_ctrl.fipe_kstat->ks_update = fipe_kstat_update;
		kstat_install(fipe_gbl_ctrl.fipe_kstat);
	}
#endif	/* FIPE_KSTAT_SUPPORT */

	return (0);

out_ioat_error:
	fipe_mc_fini();
out_mc_error:
	mutex_destroy(&fipe_gbl_ctrl.lock);
	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));

	return (-1);
}
/*
* Destroy memory power management subsystem.
* Note: This function should only be called from DETACH.
* Note: caller must ensure exclusive access to all fipe_xxx interfaces.
*/
int
fipe_fini(void)
{
	if (fipe_gbl_ctrl.pm_enabled) {
		return (EBUSY);
	}

	fipe_ioat_fini();
	fipe_mc_fini();

#ifdef	FIPE_KSTAT_SUPPORT
	if (fipe_gbl_ctrl.fipe_kstat != NULL) {
		kstat_delete(fipe_gbl_ctrl.fipe_kstat);
		fipe_gbl_ctrl.fipe_kstat = NULL;
	}
#endif	/* FIPE_KSTAT_SUPPORT */

	if (fipe_gbl_ctrl.state_buf != NULL) {
		kmem_free(fipe_gbl_ctrl.state_buf, fipe_gbl_ctrl.state_size);
		fipe_gbl_ctrl.state_buf = NULL;
		fipe_cpu_states = NULL;
	}

	mutex_destroy(&fipe_gbl_ctrl.lock);
	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));

	return (0);
}
/*
* Start memory power management subsystem.
* Note: caller must ensure exclusive access to all fipe_xxx interfaces.
*/
int
fipe_start(void)
{
	if (fipe_gbl_ctrl.pm_enabled == B_TRUE) {
		return (0);
	}

	if (fipe_idle_start() != 0) {
		cmn_err(CE_NOTE, "!fipe: failed to start PM subsystem.");
		return (-1);
	}

	fipe_gbl_ctrl.pm_enabled = B_TRUE;

	return (0);
}
/*
* Stop memory power management subsystem.
* Note: caller must ensure exclusive access to all fipe_xxx interfaces.
*/
int
fipe_stop(void)
{
	if (fipe_gbl_ctrl.pm_enabled) {
		if (fipe_idle_stop() != 0) {
			cmn_err(CE_NOTE,
			    "!fipe: failed to stop PM subsystem.");
			return (-1);
		}
		fipe_gbl_ctrl.pm_enabled = B_FALSE;
	}

	return (0);
}
/* Policy saved by fipe_suspend and restored by fipe_resume. */
static fipe_pm_policy_t fipe_pm_policy_saved = FIPE_PM_POLICY_BALANCE;

int
fipe_suspend(void)
{
	/* Save current power management policy. */
	fipe_pm_policy_saved = fipe_pm_policy;

	/* Disable PM by setting profile to FIPE_PM_POLICY_DISABLE. */
	fipe_pm_policy = FIPE_PM_POLICY_DISABLE;
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	return (0);
}

int
fipe_resume(void)
{
	/* Restore saved power management policy. */
	fipe_pm_policy = fipe_pm_policy_saved;
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	return (0);
}

fipe_pm_policy_t
fipe_get_pmpolicy(void)
{
	return (fipe_pm_policy);
}

int
fipe_set_pmpolicy(fipe_pm_policy_t policy)
{
	if (policy < 0 || policy >= FIPE_PM_POLICY_MAX) {
		return (EINVAL);
	}
	fipe_pm_policy = policy;
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	return (0);
}
/*
 * Check condition (fipe_gbl_ctrl.cpu_cnt == ncpus) to make sure that
 * there is no other CPU trying to wake up system from memory power saving
 * state. If a CPU is waking up system, fipe_disable() will set
 * fipe_gbl_ctrl.pm_active to false as soon as possible and allow other CPUs
 * to continue, and it will take the responsibility to recover system from
 * memory power saving state.
 */
static void
fipe_enable(int throttle, cpu_idle_check_wakeup_t check_func, void *check_arg)
{
	extern void membar_sync(void);

	FIPE_KSTAT_DETAIL_INC(pm_tryenter_cnt);

	/*
	 * Check CPU wakeup events.
	 */
	if (check_func != NULL) {
		(*check_func)(check_arg);
	}

	/*
	 * Try to acquire mutex, which also implicitly has the same effect
	 * of calling membar_sync().
	 * If mutex_tryenter fails, that means other CPU is waking up.
	 */
	if (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
		FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
	/*
	 * Handle a special race condition for the case that a CPU wakes
	 * and then enters into idle state within a short period.
	 * This case can't be reliably detected by cpu_cnt mechanism.
	 */
	} else if (fipe_gbl_ctrl.pm_active) {
		FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
		mutex_exit(&fipe_gbl_ctrl.lock);
	} else {
		fipe_gbl_ctrl.pm_active = B_TRUE;
		membar_sync();
		if (fipe_gbl_ctrl.cpu_cnt != ncpus) {
			/* Another CPU woke up in the meantime. */
			FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
			fipe_gbl_ctrl.pm_active = B_FALSE;
		} else if (fipe_ioat_trigger() != 0) {
			fipe_gbl_ctrl.pm_active = B_FALSE;
		} else if (fipe_gbl_ctrl.cpu_cnt != ncpus ||
		    fipe_mc_change(throttle) != 0) {
			fipe_gbl_ctrl.pm_active = B_FALSE;
			fipe_ioat_cancel();
			if (fipe_gbl_ctrl.cpu_cnt != ncpus) {
				FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
			}
		} else if (fipe_gbl_ctrl.cpu_cnt != ncpus) {
			fipe_mc_restore();
			fipe_ioat_cancel();
			fipe_gbl_ctrl.pm_active = B_FALSE;
			FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
		} else {
			FIPE_KSTAT_DETAIL_INC(pm_success_cnt);
		}
		mutex_exit(&fipe_gbl_ctrl.lock);
	}
}
static void
fipe_disable(void)
{
	/*
	 * Try to acquire lock, which also implicitly has the same effect
	 * of calling membar_sync().
	 */
	while (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
		/*
		 * If power saving is inactive, just return and all dirty
		 * house-keeping work will be handled in fipe_enable().
		 */
		if (fipe_gbl_ctrl.pm_active == B_FALSE) {
			return;
		} else {
			(void) SMT_PAUSE();
		}
	}

	/* Disable power saving if it's active. */
	if (fipe_gbl_ctrl.pm_active) {
		/*
		 * Set pm_active to FALSE as soon as possible to prevent
		 * other CPUs from waiting on pm_active flag.
		 */
		fipe_gbl_ctrl.pm_active = B_FALSE;
		membar_producer();
		fipe_mc_restore();
		fipe_ioat_cancel();
	}

	mutex_exit(&fipe_gbl_ctrl.lock);
}
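/*
 * pm_active is cleared before the hardware is restored so that CPUs
 * spinning on the flag in fipe_disable() can make progress immediately;
 * the mutex acquire/release provides the needed memory barriers.
 */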
/*ARGSUSED*/
static boolean_t
fipe_check_cpu(struct fipe_cpu_state *sp, cpu_idle_callback_context_t ctx,
    hrtime_t ts)
{
	if (cpu_flagged_offline(CPU->cpu_flags)) {
		/* Treat CPU in offline state as ready. */
		sp->cond_ready = B_TRUE;
		return (B_TRUE);
	} else if (sp->next_ts <= ts) {
		uint64_t intr;
		hrtime_t idle, busy, diff;
		cpu_idle_prop_value_t val;

		/* Set default value. */
		sp->cond_ready = B_TRUE;
		sp->idle_count = 0;

		/* Calculate idle percent. */
		idle = sp->last_idle;
		sp->last_idle = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_idle, ctx);
		idle = sp->last_idle - idle;
		busy = sp->last_busy;
		sp->last_busy = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_busy, ctx);
		busy = sp->last_busy - busy;
		/* Check idle condition. */
		if (idle > 0 && busy > 0) {
			if (busy * (100 - FIPE_PROF_BUSY_THRESHOLD) >
			    idle * FIPE_PROF_BUSY_THRESHOLD) {
				FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
				sp->cond_ready = B_FALSE;
			} else {
				FIPE_KSTAT_DETAIL_INC(cpu_idle_cnt);
			}
		} else {
			FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
			sp->cond_ready = B_FALSE;
		}

		/* Calculate interrupt count, normalized to one tick. */
		diff = sp->next_ts;
		sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
		diff = sp->next_ts - diff;
		intr = sp->last_intr;
		if (cpu_idle_prop_get_value(fipe_idle_ctrl.prop_intr, ctx,
		    &val) == 0) {
			sp->last_intr = val.cipv_uint64;
			intr = sp->last_intr - intr;
			if (diff != 0) {
				intr = intr * fipe_idle_ctrl.tick_interval;
				intr /= diff;
			} else {
				intr = FIPE_PROF_INTR_THRESHOLD;
			}
		} else {
			intr = FIPE_PROF_INTR_THRESHOLD;
		}

		/*
		 * System is busy with interrupts, so disable all PM
		 * status checks for INTR_BUSY_THROTTLE ticks.
		 * Interrupts are disabled when FIPE callbacks are called,
		 * so this optimization will help to reduce interrupt
		 * latency.
		 */
		if (intr >= FIPE_PROF_INTR_BUSY_THRESHOLD) {
			sp->throttle_ts = ts + FIPE_PROF_INTR_BUSY_THROTTLE *
			    fipe_idle_ctrl.tick_interval;
			sp->cond_ready = B_FALSE;
		} else if (intr >= FIPE_PROF_INTR_THRESHOLD) {
			sp->cond_ready = B_FALSE;
		}
	} else if (++sp->idle_count >= FIPE_PROF_IDLE_COUNT) {
		/* Too many idle enter/exit loops in one tick, back off. */
		FIPE_KSTAT_DETAIL_INC(cpu_loop_cnt);
		sp->idle_count = 0;
		return (B_FALSE);
	}

	return (sp->cond_ready);
}
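/*
 * The busy check above avoids division: with T = busy_threshold in
 * percent, busy / (busy + idle) > T / 100 is equivalent to
 * busy * (100 - T) > idle * T, which needs only multiplications.
 */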
/*ARGSUSED*/
static void
fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
    cpu_idle_check_wakeup_t check_func, void *check_arg)
{
	hrtime_t ts;
	uint32_t cnt;
	uint64_t iowait;
	cpu_t *cp = CPU;
	struct fipe_cpu_state *sp;

	sp = &fipe_cpu_states[cp->cpu_id];
	ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_enter, ctx);

	if (fipe_pm_policy != FIPE_PM_POLICY_DISABLE &&
	    fipe_ioat_ctrl.ioat_ready &&
	    sp->state_ready && sp->throttle_ts <= ts) {
		/* Adjust iowait count for local CPU. */
		iowait = CPU_STATS(cp, sys.iowait);
		if (iowait != sp->last_iowait) {
			atomic_add_64(&fipe_gbl_ctrl.io_waiters,
			    iowait - sp->last_iowait);
			sp->last_iowait = iowait;
		}

		/* Check current CPU status. */
		if (fipe_check_cpu(sp, ctx, ts)) {
			/* Increase count of CPU ready for power saving. */
			do {
				cnt = fipe_gbl_ctrl.cpu_cnt;
			} while (atomic_cas_32(&fipe_gbl_ctrl.cpu_cnt,
			    cnt, cnt + 1) != cnt);

			/*
			 * Enable power saving if all CPUs are idle.
			 */
			if (cnt + 1 == ncpus) {
				if (fipe_gbl_ctrl.io_waiters == 0) {
					fipe_gbl_ctrl.enter_ts = ts;
					fipe_enable(fipe_pm_throttle_level,
					    check_func, check_arg);
				/* There are ongoing block io operations. */
				} else {
					FIPE_KSTAT_DETAIL_INC(bio_busy_cnt);
				}
			}
		}
	} else if (fipe_pm_policy == FIPE_PM_POLICY_DISABLE ||
	    fipe_ioat_ctrl.ioat_ready == B_FALSE) {
		sp->cond_ready = B_FALSE;
	} else if (sp->state_ready == B_FALSE) {
		/* Initialize per-CPU state on the first visit. */
		sp->cond_ready = B_FALSE;
		sp->state_ready = B_TRUE;
		sp->throttle_ts = 0;
		sp->idle_count = 0;
		sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
		sp->last_busy = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_busy, ctx);
		sp->last_idle = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_idle, ctx);
		sp->last_iowait = CPU_STATS(cp, sys.iowait);
	}
}
/*ARGSUSED*/
static void
fipe_idle_exit(void *arg, cpu_idle_callback_context_t ctx, int flags)
{
	uint32_t cnt;
	hrtime_t ts;
	struct fipe_cpu_state *sp;

	sp = &fipe_cpu_states[CPU->cpu_id];
	if (sp->cond_ready) {
		/* Decrease count of CPUs ready for power saving. */
		do {
			cnt = fipe_gbl_ctrl.cpu_cnt;
		} while (atomic_cas_32(&fipe_gbl_ctrl.cpu_cnt,
		    cnt, cnt - 1) != cnt);

		/*
		 * Try to disable power saving state.
		 * Only the first CPU waking from idle state will try to
		 * disable power saving state, all other CPUs will just go
		 * on and not try to wait for memory to recover from power
		 * saving state.
		 * So there are possible periods during which some CPUs are in
		 * active state but memory is in power saving state.
		 * This is OK, since it is an uncommon case, and it is
		 * better for performance to let them continue as their
		 * blocking latency is smaller than a mutex, and is only
		 * hit in the uncommon condition.
		 */
		if (cnt == ncpus) {
			fipe_disable();
			ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_exit,
			    ctx);
			fipe_gbl_ctrl.time_in_pm +=
			    ts - fipe_gbl_ctrl.enter_ts;
		}
	}
}