/* suspend.c revision 98b45ebecf42e6d81a4aa85f88ffcc06af817f34 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/promif.h>
#include <sys/varargs.h>
#include <sys/hsvc.h>
#include <sys/hypervisor_api.h>
#include <sys/mach_descrip.h>
#include <sys/cpu_module.h>
#include <sys/sysmacros.h>
#include <vm/hat_sfmmu.h>
/*
* Sun4v OS Suspend
*
* Provides a means to suspend a sun4v guest domain by pausing CPUs and then
* calling into the HV to initiate a suspension. Suspension is sequenced
* externally by calling suspend_pre, suspend_start, and suspend_post.
* suspend_pre and suspend_post are meant to perform any special operations
* that should be done before or after the suspend/resume operation, e.g.
* callbacks to cluster software to disable heartbeat monitoring before the
* system is suspended. suspend_start prepares kernel services to be suspended
* and then suspends the domain by calling hv_guest_suspend.
*
* Special Handling for %tick and %stick Registers
*
* After a suspend/resume operation, the %tick and %stick registers may have
* jumped forwards or backwards. The delta is assumed to be consistent across
* all CPUs, within the negligible level of %tick and %stick variation
* acceptable on a cold boot. In order to maintain increasing %tick and %stick
* counter values without exposing large positive or negative jumps to kernel
* or user code, a %tick and %stick offset is used. Kernel reads of these
* counters return the sum of the hardware register counter and offset
* variable. After a suspend and resume, user reads of %tick or %stick
* are emulated. Suspend code enables emulation by setting the
* %{tick,stick}.NPT fields which trigger a privileged instruction access
* trap whenever the registers are read from user mode. If emulation has been
* enabled, the trap handler emulates the instruction. Emulation is only
* enabled during the first suspend/resume and is never disabled. After it is
* enabled, CPUs that are DR'd into the system will have their
* %{tick,stick}.NPT bits set to 1 as well.
*/
extern uint64_t gettick_npt(void);
extern uint64_t getstick_npt(void);
extern int mach_descrip_update(void);
extern cpuset_t cpu_ready_set;
extern uint64_t native_tick_offset;
extern uint64_t native_stick_offset;
extern uint64_t sys_tick_freq;
/*
* Sun Cluster suspend hooks: an error-string decoder and pre/post suspend
* callback pointers, registered by cluster software when it is present.
*/
const char *(*cl_suspend_error_decode)(int);
int (*cl_suspend_pre_callback)(void);
int (*cl_suspend_post_callback)(void);
#define SC_PRE_FAIL_STR_FMT "Sun Cluster pre-suspend failure: %d"
#define SC_POST_FAIL_STR_FMT "Sun Cluster post-suspend failure: %d"
#define SC_FAIL_STR_MAX 256
/*
* The minimum major and minor version of the HSVC_GROUP_CORE API group
* required in order to use OS suspend.
*/
#define SUSPEND_CORE_MAJOR 1
#define SUSPEND_CORE_MINOR 2
/*
* By default, sun4v OS suspend is supported if the required HV version
* is present. suspend_disabled should be set on platforms that do not
* allow OS suspend regardless of whether or not the HV supports it.
*/
static int suspend_disabled = 0;
/*
* Controls whether or not user-land tick and stick register emulation
* will be enabled following a successful suspend operation.
*/
static int enable_user_tick_stick_emulation = 1;
/*
* Indicates whether or not tick and stick emulation is currently active.
* After a successful suspend operation, if emulation is enabled, this
* variable is set to B_TRUE. Global scope to allow emulation code to
* check if emulation is active.
*/
/*
* When non-zero, after a successful suspend and resume, cpunodes, CPU HW
* sharing data structures, and processor groups will be updated using
* information from the updated MD.
*/
static int suspend_update_cpu_mappings = 1;
/*
 * The maximum number of microseconds by which the %tick or %stick register
 * can vary between any two CPUs in the system. To calculate the
 * native_stick_offset and native_tick_offset, we measure the change in these
 * registers on one CPU over a suspend/resume. Other CPUs may experience
 * slightly larger or smaller changes. %tick and %stick should be synchronized
 * between CPUs, but there may be some variation. So we add an additional value
 * derived from this variable to ensure that these registers always increase
 * over a suspend/resume operation, assuming all %tick and %stick registers
 * are synchronized (within a certain limit) across CPUs in the system. The
 * delta between %sticks on different CPUs should be a small number of cycles,
 * not perceptible to readers of %stick that migrate between CPUs. We set this
 * value to 1 millisecond, which means that over a suspend/resume operation,
 * all CPU's %tick and %stick will advance forwards as long as, across all
 * CPUs, the %tick and %stick are synchronized to within 1 ms. This applies to
 * CPUs before the suspend and CPUs after the resume. 1 ms is conservative,
 * but small enough to not trigger TOD faults.
 */
static uint64_t suspend_tick_stick_max_delta = 1000;	/* microseconds */

/*
 * The number of times the system has been suspended and resumed.
 */
static uint64_t suspend_count = 0;
/*
 * DBG and DBG_PROM() macros. DBG logs through suspend_debug() (cmn_err);
 * DBG_PROM logs through prom_printf() and is safe while CPUs are paused
 * and interrupts are disabled. Both are no-ops unless suspend_debug_flag
 * is set (DEBUG kernels only).
 */
#ifdef	DEBUG

static int suspend_debug_flag = 0;

#define	DBG_PROM(...)	\
	if (suspend_debug_flag)	\
		prom_printf(__VA_ARGS__);

#define	DBG(...)	\
	if (suspend_debug_flag)	\
		suspend_debug(__VA_ARGS__);

/*
 * Format a printf-style debug message and emit it via cmn_err().
 */
static void
suspend_debug(const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	/* buf is large enough for all messages emitted by this file */
	(void) vsprintf(buf, fmt, ap);
	va_end(ap);

	cmn_err(CE_NOTE, "%s", buf);
}

#else /* DEBUG */
#define	DBG_PROM(...)
#define	DBG(...)
#endif /* DEBUG */
/*
* Return true if the HV supports OS suspend and if suspend has not been
* disabled on this platform.
*/
suspend_supported(void)
{
if (suspend_disabled)
return (B_FALSE);
return (B_FALSE);
(major > SUSPEND_CORE_MAJOR));
}
/*
* Memory DR is not permitted if the system has been suspended and resumed.
* It is the responsibility of the caller of suspend_start and the DR
* subsystem to serialize DR operations and suspend_memdr_allowed() checks.
*/
suspend_memdr_allowed(void)
{
return (suspend_count == 0);
}
/*
* Given a source tick, stick, and tod value, set the tick and stick offsets
* such that the (current physical register value) + offset == (source value)
* and in addition account for some variation between the %tick/%stick on
* different CPUs. We account for this variation by adding in double the value
* of suspend_tick_stick_max_delta. The following is an explanation of why
* suspend_tick_stick_max_delta must be multplied by two and added to
* native_stick_offset.
*
* Consider a guest instance that is yet to be suspended with CPUs p0 and p1
* with physical "source" %stick values s0 and s1 respectively. When the guest
* is first resumed, the physical "target" %stick values are t0 and t1
* respectively. The virtual %stick values after the resume are v0 and v1
* respectively. Let x be the maximum difference between any two CPU's %stick
* register at a given point in time and let the %stick values be assigned
* such that
*
* s1 = s0 + x and
* t1 = t0 - x
*
* Let us assume that p0 is driving the suspend and resume. Then, we will
* calculate the stick offset f and the virtual %stick on p0 after the
* resume as follows.
*
* f = s0 - t0 and
* v0 = t0 + f
*
* We calculate the virtual %stick v1 on p1 after the resume as
*
* v1 = t1 + f
*
* Substitution yields
*
* v1 = t1 + (s0 - t0)
* v1 = (t0 - x) + (s0 - t0)
* v1 = -x + s0
* v1 = s0 - x
* v1 = (s1 - x) - x
* v1 = s1 - 2x
*
* Therefore, in this scenario, without accounting for %stick variation in
* the calculation of the native_stick_offset f, the virtual %stick on p1
* is less than the value of the %stick on p1 before the suspend which is
* unacceptable. By adding 2x to v1, we guarantee it will be equal to s1
* which means the %stick on p1 after the resume will always be greater
* than or equal to the %stick on p1 before the suspend. Since v1 = t1 + f
* at any point in time, we can accomplish this by adding 2x to f. This
* guarantees any processes bound to CPU P0 or P1 will not see a %stick
* decrease across a suspend/resume operation. Hence, in the code below, we
* multiply suspend_tick_stick_max_delta by two in the calculation for
* native_stick_offset, native_tick_offset, and target_hrtime.
*/
/*
 * Re-derive the global %tick/%stick offsets from the pre-suspend ("source")
 * counter values so that kernel-visible counter reads never jump backwards
 * across a suspend/resume (see the large comment above for the 2x
 * suspend_tick_stick_max_delta rationale).
 * NOTE(review): this definition is truncated in this revision -- the function
 * name, parameter list, post-resume counter reads, offset assignments, and
 * the TOD adjustment are missing; recover them from source control before
 * relying on this file.
 */
static void
{
/*
* Temporarily set the offsets to zero so that the following reads
* of the registers will yield physical unadjusted counter values.
*/
native_tick_offset = 0;
native_stick_offset = 0;
/*
* Calculate the new offsets. In addition to the delta observed on
* this CPU, add an additional value. Multiply the %tick/%stick
* frequency by suspend_tick_stick_max_delta (us). Then, multiply by 2
* to account for a delta between CPUs before the suspend and a
* delta between CPUs after the resume.
*/
/*
* We've effectively increased %stick and %tick by twice the value
* of suspend_tick_stick_max_delta to account for variation across
* CPUs. Now adjust the preserved TOD by the same amount.
*/
}
/*
* Set the {tick,stick}.NPT field to 1 on this CPU.
*/
static void
enable_tick_stick_npt(void)
{
/* NPT=1 causes user-mode reads of these registers to trap for emulation */
(void) hv_stick_set_npt(1);
(void) hv_tick_set_npt(1);
}
/*
* Synchronize a CPU's {tick,stick}.NPT fields with the current state
* of the system. This is used when a CPU is DR'd into the system.
*/
void
{
if (tick_stick_emulation_active) {
(void) hv_stick_set_npt(1);
(void) hv_tick_set_npt(1);
} else {
ASSERT(gettick_npt() == 0);
ASSERT(getstick_npt() == 0);
}
}
/*
* Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
* sharing data structures, and processor groups.
*/
/*
 * NOTE(review): this definition is truncated in this revision -- the MD
 * handle acquisition, the CPU scan/update loops, the pause_cpus()/PG
 * bootstrap swaps, and all loop headers are missing; only comments, loop
 * tails, and the final cleanup remain. Recover the full body from source
 * control; the surviving comments below describe the intended sequence.
 */
static void
update_cpu_mappings(void)
{
DBG("suspend: md_get_handle failed");
return;
}
DBG("suspend: updating CPU mappings");
continue;
}
/*
* Re-calculate processor groups.
*
* First tear down all PG information before adding any new PG
* information derived from the MD we just downloaded. We must
* call pg_cpu_inactive and pg_cpu_active with CPUs paused and
* we want to minimize the number of times pause_cpus is called.
* Inactivating all CPUs would leave PGs without any active CPUs,
* so while CPUs are paused, call pg_cpu_inactive and swap in the
* bootstrap PG structure saving the original PG structure to be
* fini'd afterwards. This prevents the dispatcher from encountering
* PGs in which all CPUs are inactive. Offline CPUs are already
* inactive in their PGs and shouldn't be reactivated, so we must
* not call pg_cpu_inactive or pg_cpu_active for those CPUs.
*/
continue;
}
start_cpus();
/*
* pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
* not paused. Use two separate loops here so that we do not
* initialize PG data for CPUs until all the old PG data structures
* are torn down.
*/
continue;
}
/*
* Initialize PG data for each CPU, but leave the bootstrapped
* PG structure in place to avoid running with any PGs containing
* nothing but inactive CPUs.
*/
continue;
}
/*
* Now that PG data has been initialized for all CPUs in the
* system, replace the bootstrapped PG structure with the
* initialized PG structure and call pg_cpu_active for each CPU.
*/
continue;
}
start_cpus();
(void) md_fini_handle(mdp);
}
/*
* Wrapper for the Sun Cluster error decoding function.
*/
/*
 * Decode a Sun Cluster error code into a human-readable string copied into
 * the caller-supplied error_reason buffer. Returns 0 on success, -1 if no
 * decoder is registered or decoding fails.
 * NOTE(review): this definition is truncated in this revision -- the
 * function name, parameter list, the call through cl_suspend_error_decode,
 * the copy of the decoded string, and the terminator store are missing;
 * recover them from source control.
 */
static int
{
const char *decoded;
ASSERT(max_reason_len > 0);
if (cl_suspend_error_decode == NULL)
return (-1);
return (-1);
/* Get number of non-NULL bytes */
return (-1);
/*
* The error string returned from cl_suspend_error_decode
* should be NULL-terminated, but set the terminator here
* because we only copied non-NULL bytes. If the decoded
* string was not NULL-terminated, this guarantees that
* error_reason will be.
*/
return (0);
}
/*
* Wrapper for the Sun Cluster pre-suspend callback.
*/
/*
 * Invoke the registered Sun Cluster pre-suspend callback, if any, and on
 * failure format an error string into the caller's buffer. Returns the
 * callback's return value, or 0 if no callback is registered.
 * NOTE(review): this definition is truncated in this revision -- the
 * function name, parameters, the error-decode branch, and the snprintf
 * fallback are missing; recover them from source control.
 */
static int
{
int rv = 0;
if (cl_suspend_pre_callback != NULL) {
rv = (*cl_suspend_pre_callback)();
max_reason_len)) {
}
}
}
return (rv);
}
/*
* Wrapper for the Sun Cluster post-suspend callback.
*/
/*
 * Invoke the registered Sun Cluster post-suspend callback, if any, and on
 * failure format an error string into the caller's buffer. Returns the
 * callback's return value, or 0 if no callback is registered.
 * NOTE(review): this definition is truncated in this revision -- the
 * function name, parameters, the error-decode branch, and the remainder of
 * the snprintf call are missing; recover them from source control.
 */
static int
{
int rv = 0;
if (cl_suspend_post_callback != NULL) {
rv = (*cl_suspend_post_callback)();
max_reason_len)) {
(void) snprintf(error_reason,
}
}
}
return (rv);
}
/*
* Execute pre-suspend callbacks preparing the system for a suspend operation.
* Returns zero on success, non-zero on failure. Sets the recovered argument
* to indicate whether or not callbacks could be undone in the event of a
* failure--if callbacks were successfully undone, *recovered is set to B_TRUE,
* otherwise *recovered is set to B_FALSE. Must be called successfully before
* suspend_start can be called. Callers should first call suspend_support to
* determine if OS suspend is supported.
*/
/*
 * NOTE(review): this definition is truncated in this revision -- the
 * parameter list (error buffer, length, recovered flag per the comment
 * above) and the call into the cluster pre-suspend wrapper that assigns
 * rv are missing; recover them from source control.
 */
int
{
int rv;
/*
* Return an error if suspend_pre is erroneously called
* when OS suspend is not supported.
*/
if (!suspend_supported()) {
DBG("suspend: suspend_pre called without suspend support");
return (ENOTSUP);
}
/*
* At present, only one pre-suspend operation exists.
* If it fails, no recovery needs to be done.
*/
return (rv);
}
/*
* Execute post-suspend callbacks. Returns zero on success, non-zero on
* failure. Must be called after suspend_start is called, regardless of
* whether or not suspend_start is successful.
*/
/*
 * NOTE(review): this definition is truncated in this revision -- the
 * parameter list, the suspend_supported() guard, and the call into the
 * cluster post-suspend wrapper are all missing; only the signature
 * fragment and braces remain. Recover the body from source control.
 */
int
{
}
/*
* Suspends the OS by pausing CPUs and calling into the HV to initiate
* the suspend. When the HV routine hv_guest_suspend returns, the system
* will be resumed. Must be called after a successful call to suspend_pre.
* suspend_post must be called after suspend_start, whether or not
* suspend_start returns an error.
*/
/*
 * NOTE(review): this definition is truncated in this revision -- the
 * parameter list, local declarations (rv, source_tod, source_stick, ...),
 * the watchdog suspend/resume, pause_cpus()/cyclics calls, spl8()/splx(),
 * the NPT cross-call, TOD restore, and the update_cpu_mappings() call are
 * missing. The surviving statements and comments show the intended
 * sequence; recover the full body from source control.
 */
/*ARGSUSED*/
int
{
int spl;
/* Suspend the watchdog */
/* Record the TOD */
source_tod = tod_get();
/* Pause all other CPUs */
DBG_PROM("suspend: CPUs paused\n");
/* Suspend cyclics */
DBG_PROM("suspend: cyclics suspended\n");
/* Disable interrupts */
DBG_PROM("suspend: spl8()\n");
source_stick = gettick();
/*
* Call into the HV to initiate the suspend. hv_guest_suspend()
* returns after the guest has been resumed or if the suspend
* operation failed or was cancelled. After a successful suspend,
* the %tick and %stick registers may have changed by an amount
* that is not proportional to the amount of time that has passed.
* They may have jumped forwards or backwards. Some variation is
* allowed and accounted for using suspend_tick_stick_max_delta,
* but otherwise this jump must be uniform across all CPUs and we
* operate under the assumption that it is (maintaining two global
* offset variables--one for %tick and one for %stick.)
*/
DBG_PROM("suspend: suspending... \n");
rv = hv_guest_suspend();
if (rv != 0) {
start_cpus();
return (rv);
}
/* Update the global tick and stick offsets and the preserved TOD */
/* Ensure new offsets are globally visible before resuming CPUs */
membar_sync();
/* Enable interrupts */
/* Set the {%tick,%stick}.NPT bits on all CPUs */
ASSERT(gettick_npt() != 0);
ASSERT(getstick_npt() != 0);
}
/* If emulation is enabled, but not currently active, enable it */
}
/* Resume cyclics, unpause CPUs */
start_cpus();
/* Set the TOD */
/* Re-enable the watchdog */
/* Download the latest MD */
if ((rv = mach_descrip_update()) != 0)
rv);
/* Get new MD, update CPU mappings/relationships */
DBG("suspend: user %%tick/%%stick emulation is %d",
DBG("suspend: finished");
return (0);
}