memscrub.c revision 1a5e258f5471356ca102c7176637cdce45bac147
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* sun4u Memory Scrubbing
*
* On detection of a correctable memory ECC error, the sun4u kernel
* returns the corrected data to the requester and re-writes it
* to memory (DRAM). So if the correctable error was transient,
* the error has effectively been cleaned (scrubbed) from memory.
*
* Scrubbing thus reduces the likelihood that multiple transient errors
* will occur in the same memory word, making uncorrectable errors due
* to transients less likely.
*
* Thus is born the desire that every memory location be periodically
* accessed.
*
* This file implements a memory scrubbing thread. This scrubber
* guarantees that all of physical memory is accessed periodically
* (memscrub_period_sec -- 12 hours).
*
* It attempts to do this as unobtrusively as possible. The thread
* schedules itself to wake up at an interval such that if it reads
* memscrub_span_pages (32MB) on each wakeup, it will read all of physical
* memory in memscrub_period_sec (12 hours).
*
* The scrubber uses the block load and prefetch hardware to read memory
* @ 1300MB/s, so it reads spans of 32MB in 0.025 seconds. Unlike the
* original sun4d scrubber the sun4u scrubber does not read ahead if the
* system is idle because we can read memory very efficiently.
*
* The scrubber maintains a private copy of the phys_install memory list
* to keep track of what memory should be scrubbed.
*
* The global routines memscrub_add_span() and memscrub_delete_span() are
* used to add spans to and delete spans from this list. If hotplug memory
* supported these two routines can be used to notify the scrubber of
* memory configuration changes.
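*
* For example, a hypothetical hotplug handler that added the physical
* range [base, base + len) would call (mirroring the loop in
* memscrub_init()):
*
*	(void) memscrub_add_span((pfn_t)(base >> PAGESHIFT),
*	    (pgcnt_t)(len >> PAGESHIFT));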
*
* The following parameters can be set via /etc/system
*
* memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
* memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
* memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
* memscrub_delay_start_sec = (5 minutes)
* memscrub_verbose = (0)
* memscrub_override_ticks = (1 tick)
* disable_memscrub = (0)
* pause_memscrub = (0)
* read_all_memscrub = (0)
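*
* For example, to scrub 64MB per wakeup and cover all of memory every
* six hours (hypothetical values, assuming 8K pages), the following
* lines could be added to /etc/system:
*
*	set memscrub_span_pages=8192
*	set memscrub_period_sec=21600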
*
* The scrubber will print NOTICE messages describing what it is doing if
* "memscrub_verbose" is set.
*
* If the scrubber's sleep time calculation drops to zero ticks,
* memscrub_override_ticks will be used as the sleep time instead. The
* sleep time should only drop to zero on a system with over 131.84
* terabytes of memory, or where the default scrubber parameters have
* been adjusted. For example, reducing memscrub_span_pages or
* memscrub_period_sec causes the sleep time to drop to zero with less
* memory. Note that since the sleep time is calculated in clock ticks,
* using hires clock ticks allows for more memory before the sleep time
* becomes zero.
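*
* Roughly, the sleep interval between spans is (see
* compute_interval_ticks() below):
*
*	interval_ticks = (memscrub_period_sec * hz) /
*	    (memscrub_phys_pages / memscrub_span_pages)
*
* For example, assuming hz = 100 and the default parameters, a 32GB
* system contains 32GB / 32MB = 1024 spans, so the scrubber sleeps
* about (43200 * 100) / 1024 ~= 4218 ticks (roughly 42 seconds) between
* 32MB reads. The interval reaches zero once the span count exceeds
* 43200 * 100, which with 32MB spans is about 131.84 terabytes.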
*
* The scrubber will exit (or never be started) if it finds the variable
* "disable_memscrub" set.
*
* The scrubber will pause (not read memory) when "pause_memscrub"
* is set. It will check the state of pause_memscrub at each wakeup
* period. The scrubber will not make up for lost time. If you
* pause the scrubber for a prolonged period of time you can use
* the "read_all_memscrub" switch (see below) to catch up. In addition,
* pause_memscrub is used internally by the post memory DR callbacks.
* It is set for the small period of time during which the callbacks
* are executing. This ensures "memscrub_lock" will be released,
* allowing the callbacks to finish.
*
* The scrubber will read all memory if "read_all_memscrub" is set.
* The normal span read will also occur during the wakeup.
*
* MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
* must have before we'll start the scrubber.
*
* MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
* is a "good" amount of minimum time for the thread to run at a time.
*
* MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
* twice the frequency the hardware folk estimated would be necessary.
*
* MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
* that the scrubber should get its fair share of time (since its
* runs are short). At a priority of 0 the scrubber will be starved.
*/
#include <sys/systm.h> /* timeout, types, t_lock */
#include <sys/cmn_err.h>
#include <sys/sysmacros.h> /* MIN */
#include <sys/memlist.h> /* memlist */
#include <sys/mem_config.h> /* memory add/delete */
#include <sys/kmem.h> /* KMEM_NOSLEEP */
#include <sys/cpuvar.h> /* ncpus_online */
#include <sys/debug.h> /* ASSERTs */
#include <sys/machsystm.h> /* lddphys */
#include <sys/cpu_module.h> /* vtag_flushpage */
#include <sys/kstat.h>
#include <sys/atomic.h> /* atomic_add_32 */
#include <vm/hat.h>
#include <vm/seg_kmem.h>
#include <vm/hat_sfmmu.h> /* XXX FIXME - delete */
#include <sys/time.h>
#include <sys/callb.h> /* CPR callback */
#include <sys/ontrap.h>
/*
* Should really have paddr_t defined, but it is broken. Use
* ms_paddr_t in the meantime to make the code cleaner
*/
typedef uint64_t ms_paddr_t;
/*
* Global Routines:
*/
int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
int memscrub_init(void);
void memscrub_induced_error(void);
/*
* Global Data:
*/
/*
* scrub if we have at least this many pages
*/
#define MEMSCRUB_MIN_PAGES (32 * 1024 * 1024 / PAGESIZE)
/*
* scan all of physical memory at least once every MEMSCRUB_PERIOD_SEC
*/
#define MEMSCRUB_DFL_PERIOD_SEC (12 * 60 * 60) /* 12 hours */
/*
* scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
*/
#define MEMSCRUB_DFL_SPAN_PAGES ((32 * 1024 * 1024) / PAGESIZE)
/*
* almost anything is higher priority than scrubbing
*/
#define MEMSCRUB_DFL_THREAD_PRI MINCLSYSPRI
/*
* size used when scanning memory
*/
#define MEMSCRUB_BLOCK_SIZE 256
#define MEMSCRUB_BLOCK_SIZE_SHIFT 8 /* log2(MEMSCRUB_BLOCK_SIZE) */
#define MEMSCRUB_BLOCKS_PER_PAGE (PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define MEMSCRUB_BPP4M MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT
#define MEMSCRUB_BPP512K MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT
#define MEMSCRUB_BPP64K MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT
#define MEMSCRUB_BPP MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT
/*
* This message indicates that we have exceeded the limitations of
* the memscrubber. See the comments above regarding what would
* cause the sleep time to become zero. In DEBUG mode, this message
* is logged on the console and in the messages file. In non-DEBUG
* mode, it is only logged in the messages file.
*/
#ifdef DEBUG
#define MEMSCRUB_OVERRIDE_MSG "Memory scrubber sleep time is zero " \
"seconds, consuming entire CPU."
#else
#define MEMSCRUB_OVERRIDE_MSG "!Memory scrubber sleep time is zero " \
"seconds, consuming entire CPU."
#endif /* DEBUG */
/*
* we can patch these defaults in /etc/system if necessary
*/
uint_t disable_memscrub = 0;
uint_t pause_memscrub = 0;
uint_t read_all_memscrub = 0;
uint_t memscrub_verbose = 0;
uint_t memscrub_all_idle = 0;
uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
uint_t memscrub_delay_start_sec = 5 * 60;
uint_t memscrub_override_ticks = 1;
/*
* Static Routines
*/
static void memscrubber(void);
static void memscrub_cleanup(void);
static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
static void memscrub_scan(uint_t blks, ms_paddr_t src);
/*
* Static Data
*/
static struct memlist *memscrub_memlist;
static uint_t memscrub_phys_pages;
static kcondvar_t memscrub_cv;
static kmutex_t memscrub_lock;
/*
* memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
*/
static void memscrub_init_mem_config(void);
static void memscrub_uninit_mem_config(void);
/*
* Linked list of memscrub-aware spans that contain retired pages.
* Currently enabled only on sun4u USIII-based platforms.
*/
typedef struct memscrub_page_retire_span {
ms_paddr_t address;
struct memscrub_page_retire_span *next;
} memscrub_page_retire_span_t;
static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;
static void memscrub_page_retire_span_add(ms_paddr_t);
static void memscrub_page_retire_span_delete(ms_paddr_t);
static int memscrub_page_retire_span_search(ms_paddr_t);
static void memscrub_page_retire_span_list_update(void);
/*
* add_to_page_retire_list: Set by cpu_async_log_err() routine
* by calling memscrub_induced_error() when CE/UE occurs on a retired
* page due to memscrub reading. Cleared by memscrub after updating
* global page retire span list. Piggybacking on protection of
* memscrub_lock, which is held during set and clear.
* Note: When cpu_async_log_err() calls memscrub_induced_error(), it is
* running in softint context, fired on the CPU on which the memscrub
* thread is currently running. The memscrub thread has its affinity set
* during memscrub_read(), so migration to a new CPU is not expected.
*/
static int add_to_page_retire_list = 0;
/*
* Keep track of some interesting statistics
*/
static struct memscrub_kstats {
kstat_named_t done_early; /* ahead of schedule */
kstat_named_t early_sec; /* by cumulative num secs */
kstat_named_t done_late; /* behind schedule */
kstat_named_t late_sec; /* by cumulative num secs */
kstat_named_t interval_ticks; /* num ticks between intervals */
kstat_named_t force_run; /* forced to run, non-timeout */
kstat_named_t errors_found; /* num errors found by memscrub */
} memscrub_counts = {
{ "done_early", KSTAT_DATA_UINT32 },
{ "early_sec", KSTAT_DATA_UINT32 },
{ "done_late", KSTAT_DATA_UINT32 },
{ "late_sec", KSTAT_DATA_UINT32 },
{ "interval_ticks", KSTAT_DATA_UINT32 },
{ "force_run", KSTAT_DATA_UINT32 },
{ "errors_found", KSTAT_DATA_UINT32 },
};
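/*
* These counters are exported as a named kstat (created in
* memscrub_init() below) and can be examined from userland with,
* for example, "kstat -m unix -n memscrub_kstat".
*/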
#define MEMSCRUB_STAT_INC(stat) memscrub_counts.stat.value.ui32++
#define MEMSCRUB_STAT_SET(stat, val) memscrub_counts.stat.value.ui32 = (val)
#define MEMSCRUB_STAT_NINC(stat, val) memscrub_counts.stat.value.ui32 += (val)
static struct kstat *memscrub_ksp = (struct kstat *)NULL;
static timeout_id_t memscrub_tid = 0; /* keep track of timeout id */
/*
* create memscrub_memlist from phys_install list
* initialize locks, set memscrub_phys_pages.
*/
int
memscrub_init(void)
{
struct memlist *src;
/*
* only startup the scrubber if we have a minimum
* number of pages
*/
if (physinstalled >= MEMSCRUB_MIN_PAGES) {
/*
* initialize locks
*/
mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
/*
* copy phys_install to memscrub_memlist
*/
for (src = phys_install; src; src = src->ml_next) {
if (memscrub_add_span(
(pfn_t)(src->ml_address >> PAGESHIFT),
(pgcnt_t)(src->ml_size >> PAGESHIFT))) {
memscrub_cleanup();
return (-1);
}
}
/*
* initialize kstats
*/
memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
"misc", KSTAT_TYPE_NAMED,
sizeof (memscrub_counts) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
if (memscrub_ksp) {
memscrub_ksp->ks_data = (void *)&memscrub_counts;
kstat_install(memscrub_ksp);
} else {
cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
}
/*
* create memscrubber thread
*/
(void) thread_create(NULL, 0, (void (*)())memscrubber,
NULL, 0, &p0, TS_RUN, memscrub_thread_pri);
/*
* We don't want call backs changing the list
* if there is no thread running. We do not
* attempt to deal with stopping/starting scrubbing
* on memory size changes.
*/
memscrub_init_mem_config();
}
return (0);
}
static void
memscrub_cleanup(void)
{
memscrub_uninit_mem_config();
while (memscrub_memlist) {
(void) memscrub_delete_span(
(pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
(pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
}
if (memscrub_ksp)
kstat_delete(memscrub_ksp);
cv_destroy(&memscrub_cv);
mutex_destroy(&memscrub_lock);
}
#ifdef MEMSCRUB_DEBUG
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
struct memlist *list;
cmn_err(CE_CONT, "%s:\n", title);
for (list = listp; list; list = list->ml_next) {
cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
list->ml_address, list->ml_size);
}
}
#endif /* MEMSCRUB_DEBUG */
/* ARGSUSED */
static void
memscrub_wakeup(void *c)
{
/*
* grab mutex to guarantee that our wakeup call
* arrives after we go to sleep -- so we can't sleep forever.
*/
mutex_enter(&memscrub_lock);
cv_signal(&memscrub_cv);
mutex_exit(&memscrub_lock);
}
/*
* provide an interface external to the memscrubber
* which will force the memscrub thread to run vs.
* waiting for the timeout, if one is set
*/
void
memscrub_run(void)
{
MEMSCRUB_STAT_INC(force_run);
if (memscrub_tid) {
(void) untimeout(memscrub_tid);
memscrub_wakeup((void *)NULL);
}
}
/*
* this calculation doesn't account for the time
* that the actual scan consumes -- so we'd fall
* slightly behind schedule with this interval.
* The resulting drift is very small.
*/
static uint_t
compute_interval_ticks(void)
{
/*
* We use msp_safe and mpp_safe below to ensure nobody
* sets memscrub_span_pages or memscrub_phys_pages
* to 0 out from under us.
*/
static uint_t msp_safe, mpp_safe;
static uint_t interval_ticks, period_ticks;
msp_safe = memscrub_span_pages;
mpp_safe = memscrub_phys_pages;
period_ticks = memscrub_period_sec * hz;
interval_ticks = period_ticks;
ASSERT(mutex_owned(&memscrub_lock));
if ((msp_safe != 0) && (mpp_safe != 0)) {
if (memscrub_phys_pages <= msp_safe) {
interval_ticks = period_ticks;
} else {
interval_ticks = (period_ticks /
(mpp_safe / msp_safe));
}
}
return (interval_ticks);
}
void
memscrubber(void)
{
ms_paddr_t address, addr;
time_t deadline;
pgcnt_t pages;
uint_t reached_end = 1;
uint_t paused_message = 0;
uint_t interval_ticks = 0;
uint_t sleep_warn_printed = 0;
callb_cpr_t cprinfo;
/*
* notify CPR of our existence
*/
CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");
mutex_enter(&memscrub_lock);
if (memscrub_memlist == NULL) {
cmn_err(CE_WARN, "memscrub_memlist not initialized.");
goto memscrub_exit;
}
address = memscrub_memlist->ml_address;
deadline = gethrestime_sec() + memscrub_delay_start_sec;
for (;;) {
if (disable_memscrub)
break;
/*
* compute interval_ticks
*/
interval_ticks = compute_interval_ticks();
/*
* If the calculated sleep time is zero, and pause_memscrub
* has been set, make sure we sleep so that another thread
* can acquire memscrub_lock.
*/
if (interval_ticks == 0 && pause_memscrub) {
interval_ticks = hz;
}
/*
* And as a fail safe, under normal non-paused operation, do
* not allow the sleep time to be zero.
*/
if (interval_ticks == 0) {
interval_ticks = memscrub_override_ticks;
if (!sleep_warn_printed) {
cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
sleep_warn_printed = 1;
}
}
MEMSCRUB_STAT_SET(interval_ticks, interval_ticks);
/*
* Did we just reach the end of memory? If we are at the
* end of memory, delay end of memory processing until
* pause_memscrub is not set.
*/
if (reached_end && !pause_memscrub) {
time_t now = gethrestime_sec();
if (now >= deadline) {
MEMSCRUB_STAT_INC(done_late);
MEMSCRUB_STAT_NINC(late_sec, now - deadline);
/*
* past deadline, start right away
*/
interval_ticks = 0;
deadline = now + memscrub_period_sec;
} else {
/*
* we finished ahead of schedule.
* wait till previous deadline before re-start.
*/
interval_ticks = (deadline - now) * hz;
MEMSCRUB_STAT_INC(done_early);
MEMSCRUB_STAT_NINC(early_sec, deadline - now);
deadline += memscrub_period_sec;
}
reached_end = 0;
sleep_warn_printed = 0;
}
if (interval_ticks != 0) {
/*
* it is safe from our standpoint for CPR to
* suspend the system
*/
CALLB_CPR_SAFE_BEGIN(&cprinfo);
/*
* hit the snooze bar
*/
memscrub_tid = timeout(memscrub_wakeup, NULL,
interval_ticks);
/*
* go to sleep
*/
cv_wait(&memscrub_cv, &memscrub_lock);
/*
* at this point, no timeout should be set
*/
memscrub_tid = 0;
/*
* we need to go to work and will be modifying
* our internal state and mapping/unmapping
* TTEs
*/
CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
}
if (memscrub_phys_pages == 0) {
cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
goto memscrub_exit;
}
if (!pause_memscrub) {
if (paused_message) {
paused_message = 0;
if (memscrub_verbose)
cmn_err(CE_NOTE, "Memory scrubber "
"resuming");
}
if (read_all_memscrub) {
if (memscrub_verbose)
cmn_err(CE_NOTE, "Memory scrubber "
"reading all memory per request");
addr = memscrub_memlist->ml_address;
reached_end = 0;
while (!reached_end) {
if (disable_memscrub)
break;
pages = memscrub_phys_pages;
reached_end = memscrub_verify_span(
&addr, &pages);
memscrub_scan(pages *
MEMSCRUB_BLOCKS_PER_PAGE, addr);
addr += ((uint64_t)pages * PAGESIZE);
}
read_all_memscrub = 0;
}
/*
* read 1 span
*/
pages = memscrub_span_pages;
if (disable_memscrub)
break;
/*
* determine physical address range
*/
reached_end = memscrub_verify_span(&address,
&pages);
memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
address);
address += ((uint64_t)pages * PAGESIZE);
}
if (pause_memscrub && !paused_message) {
paused_message = 1;
if (memscrub_verbose)
cmn_err(CE_NOTE, "Memory scrubber paused");
}
}
memscrub_exit:
cmn_err(CE_NOTE, "Memory scrubber exiting");
CALLB_CPR_EXIT(&cprinfo);
memscrub_cleanup();
thread_exit();
/* NOTREACHED */
}
/*
* condition address and size
* such that they span legal physical addresses.
*
* when appropriate, address will be rounded up to the start of the next
* struct memlist, and pages will be trimmed so the span does not extend
* past the end of that memlist.
*
* returns 1 if reached end of list, else returns 0.
*/
static int
memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
{
struct memlist *mlp;
ms_paddr_t address = *addrp;
uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
uint64_t bytes_remaining;
int reached_end = 0;
ASSERT(mutex_owned(&memscrub_lock));
/*
* find the memlist struct that contains *addrp
* assumes memlist is sorted by ascending address.
*/
for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
/*
* if before this chunk, round up to beginning
*/
if (address < mlp->ml_address) {
address = mlp->ml_address;
break;
}
/*
* if before end of chunk, then we found it
*/
if (address < (mlp->ml_address + mlp->ml_size))
break;
/* else go to next struct memlist */
}
/*
* if we hit end of list, start at beginning
*/
if (mlp == NULL) {
mlp = memscrub_memlist;
address = mlp->ml_address;
}
/*
* now we have a legal address and its mlp; condition the byte count
*/
bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;
if (bytes > bytes_remaining)
bytes = bytes_remaining;
/*
* will this span take us to end of list?
*/
if ((mlp->ml_next == NULL) &&
((mlp->ml_address + mlp->ml_size) == (address + bytes)))
reached_end = 1;
/* return values */
*addrp = address;
*pagesp = bytes / PAGESIZE;
return (reached_end);
}
/*
* add a span to the memscrub list
* add to memscrub_phys_pages
*/
int
memscrub_add_span(pfn_t pfn, pgcnt_t pages)
{
#ifdef MEMSCRUB_DEBUG
ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
uint64_t bytes = (uint64_t)pages << PAGESHIFT;
#endif /* MEMSCRUB_DEBUG */
int retval;
mutex_enter(&memscrub_lock);
#ifdef MEMSCRUB_DEBUG
memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
" size: 0x%llx\n", address, bytes);
#endif /* MEMSCRUB_DEBUG */
retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
&memscrub_phys_pages);
#ifdef MEMSCRUB_DEBUG
memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */
mutex_exit(&memscrub_lock);
return (retval);
}
static int
memscrub_add_span_gen(
pfn_t pfn,
pgcnt_t pages,
struct memlist **list,
uint_t *npgs)
{
ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
uint64_t bytes = (uint64_t)pages << PAGESHIFT;
struct memlist *dst;
struct memlist *prev, *next;
int retval = 0;
/*
* allocate a new struct memlist
*/
dst = (struct memlist *)
kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
if (dst == NULL) {
retval = -1;
goto add_done;
}
dst->ml_address = address;
dst->ml_size = bytes;
/*
* first insert
*/
if (*list == NULL) {
dst->ml_prev = NULL;
dst->ml_next = NULL;
*list = dst;
goto add_done;
}
/*
* insert into sorted list
*/
for (prev = NULL, next = *list;
next != NULL;
prev = next, next = next->ml_next) {
if (address > (next->ml_address + next->ml_size))
continue;
/*
* else insert here
*/
/*
* prepend to next
*/
if ((address + bytes) == next->ml_address) {
kmem_free(dst, sizeof (struct memlist));
next->ml_address = address;
next->ml_size += bytes;
goto add_done;
}
/*
* append to next
*/
if (address == (next->ml_address + next->ml_size)) {
kmem_free(dst, sizeof (struct memlist));
if (next->ml_next) {
/*
* don't overlap with next->ml_next
*/
if ((address + bytes) >
next->ml_next->ml_address) {
retval = -1;
goto add_done;
}
/*
* concatenate next and next->ml_next
*/
if ((address + bytes) ==
next->ml_next->ml_address) {
struct memlist *mlp = next->ml_next;
if (next == *list)
*list = next->ml_next;
mlp->ml_address = next->ml_address;
mlp->ml_size += next->ml_size;
mlp->ml_size += bytes;
if (next->ml_prev)
next->ml_prev->ml_next = mlp;
mlp->ml_prev = next->ml_prev;
kmem_free(next,
sizeof (struct memlist));
goto add_done;
}
}
next->ml_size += bytes;
goto add_done;
}
/* don't overlap with next */
if ((address + bytes) > next->ml_address) {
retval = -1;
kmem_free(dst, sizeof (struct memlist));
goto add_done;
}
/*
* insert before next
*/
dst->ml_prev = prev;
dst->ml_next = next;
next->ml_prev = dst;
if (prev == NULL) {
*list = dst;
} else {
prev->ml_next = dst;
}
goto add_done;
} /* end for */
/*
* end of list, prev is valid and next is NULL
*/
prev->ml_next = dst;
dst->ml_prev = prev;
dst->ml_next = NULL;
add_done:
if (retval != -1)
*npgs += pages;
return (retval);
}
/*
* delete a span from the memscrub list
* subtract from memscrub_phys_pages
*/
int
memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
{
ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
uint64_t bytes = (uint64_t)pages << PAGESHIFT;
struct memlist *dst, *next;
int retval = 0;
mutex_enter(&memscrub_lock);
#ifdef MEMSCRUB_DEBUG
memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
address, bytes);
#endif /* MEMSCRUB_DEBUG */
/*
* find struct memlist containing page
*/
for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
if ((address >= next->ml_address) &&
(address < next->ml_address + next->ml_size))
break;
}
/*
* if start address not in list
*/
if (next == NULL) {
retval = -1;
goto delete_done;
}
/*
* error if size goes off end of this struct memlist
*/
if (address + bytes > next->ml_address + next->ml_size) {
retval = -1;
goto delete_done;
}
/*
* pages at beginning of struct memlist
*/
if (address == next->ml_address) {
/*
* if start & size match, delete from list
*/
if (bytes == next->ml_size) {
if (next == memscrub_memlist)
memscrub_memlist = next->ml_next;
if (next->ml_prev != NULL)
next->ml_prev->ml_next = next->ml_next;
if (next->ml_next != NULL)
next->ml_next->ml_prev = next->ml_prev;
kmem_free(next, sizeof (struct memlist));
} else {
/*
* increment start address by bytes
*/
next->ml_address += bytes;
next->ml_size -= bytes;
}
goto delete_done;
}
/*
* pages at end of struct memlist
*/
if (address + bytes == next->ml_address + next->ml_size) {
/*
* decrement size by bytes
*/
next->ml_size -= bytes;
goto delete_done;
}
/*
* delete a span in the middle of the struct memlist
*/
{
/*
* create a new struct memlist
*/
dst = (struct memlist *)
kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
if (dst == NULL) {
retval = -1;
goto delete_done;
}
/*
* new struct memlist covers the range starting
* just after the deleted span, through the end
* of the original chunk
*/
dst->ml_address = address + bytes;
dst->ml_size =
(next->ml_address + next->ml_size) - dst->ml_address;
/*
* existing struct memlist keeps the range up to
* the start of the deleted span
*/
next->ml_size = address - next->ml_address;
/*
* link in new memlist after old
*/
dst->ml_next = next->ml_next;
dst->ml_prev = next;
if (next->ml_next != NULL)
next->ml_next->ml_prev = dst;
next->ml_next = dst;
}
delete_done:
if (retval != -1) {
memscrub_phys_pages -= pages;
if (memscrub_phys_pages == 0)
disable_memscrub = 1;
}
#ifdef MEMSCRUB_DEBUG
memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */
mutex_exit(&memscrub_lock);
return (retval);
}
static void
memscrub_scan(uint_t blks, ms_paddr_t src)
{
uint_t psz, bpp, pgsread;
pfn_t pfn;
ms_paddr_t pa;
caddr_t va;
on_trap_data_t otd;
int scan_mmu_pagesize = 0;
int retired_pages = 0;
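/* memscrub_read() is the platform routine that block-loads the span */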
extern void memscrub_read(caddr_t src, uint_t blks);
ASSERT(mutex_owned(&memscrub_lock));
pgsread = 0;
pa = src;
if (memscrub_page_retire_span_list != NULL) {
if (memscrub_page_retire_span_search(src)) {
/* retired pages in current span */
scan_mmu_pagesize = 1;
}
}
#ifdef MEMSCRUB_DEBUG
cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n" scan_mmu_pagesize);
#endif /* MEMSCRUB_DEBUG */
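/*
* Walk the span, choosing the largest mapping size (4M, 512K, 64K, or
* the base page size) that the current PA alignment and the number of
* remaining blocks permit.
*/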
while (blks != 0) {
/* Ensure the PA is properly aligned */
if (((pa & MMU_PAGEMASK4M) == pa) &&
(blks >= MEMSCRUB_BPP4M)) {
psz = MMU_PAGESIZE4M;
bpp = MEMSCRUB_BPP4M;
} else if (((pa & MMU_PAGEMASK512K) == pa) &&
(blks >= MEMSCRUB_BPP512K)) {
psz = MMU_PAGESIZE512K;
bpp = MEMSCRUB_BPP512K;
} else if (((pa & MMU_PAGEMASK64K) == pa) &&
(blks >= MEMSCRUB_BPP64K)) {
psz = MMU_PAGESIZE64K;
bpp = MEMSCRUB_BPP64K;
} else if ((pa & MMU_PAGEMASK) == pa) {
psz = MMU_PAGESIZE;
bpp = MEMSCRUB_BPP;
} else {
if (memscrub_verbose) {
cmn_err(CE_NOTE, "Memory scrubber ignoring "
"non-page aligned block starting at 0x%"
PRIx64, src);
}
return;
}
if (blks < bpp)
bpp = blks;
#ifdef MEMSCRUB_DEBUG
cmn_err(CE_NOTE, "Going to run psz=%x, "
"bpp=%x pa=%llx\n", psz, bpp, pa);
#endif /* MEMSCRUB_DEBUG */
/*
* MEMSCRUBBASE is a 4MB aligned page in the
* kernel so that we can quickly map the PA
* to a VA for the block loads performed in
* memscrub_read.
*/
pfn = mmu_btop(pa);
va = (caddr_t)MEMSCRUBBASE;
hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
/*
* Can't allow the memscrubber to migrate across CPUs as
* we need to know whether CEEN is enabled for the current
* CPU to enable us to scrub the memory. Don't use
* kpreempt_disable as the time we take to scan a span (even
* without cpu_check_ce having to manually cpu_check_block)
* is too long to hold a higher priority thread (eg, RT)
* off cpu.
*/
thread_affinity_set(curthread, CPU_CURRENT);
/*
* Protect read scrub from async faults. For now, we simply
* maintain a count of such faults caught.
*/
if (!on_trap(&otd, OT_DATA_EC) && !scan_mmu_pagesize) {
memscrub_read(va, bpp);
/*
* Check if CEs require logging
*/
cpu_check_ce(SCRUBBER_CEEN_CHECK,
(uint64_t)pa, va, psz);
no_trap();
thread_affinity_clear(curthread);
} else {
no_trap();
thread_affinity_clear(curthread);
/*
* Got an async error, or scan_mmu_pagesize is set.
* Rescan at MMU_PAGESIZE granularity: if we were
* reading at a larger page size, this lets us
* continue scanning the rest of the span; if
* scan_mmu_pagesize is set, it lets us skip any
* retired pages within the span.
*/
if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
caddr_t vaddr = va;
ms_paddr_t paddr = pa;
int tmp = 0;
for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
/* Don't scrub retired pages */
if (page_retire_check(paddr, NULL)
== 0) {
vaddr += MMU_PAGESIZE;
paddr += MMU_PAGESIZE;
retired_pages++;
continue;
}
thread_affinity_set(curthread,
CPU_CURRENT);
if (!on_trap(&otd, OT_DATA_EC)) {
memscrub_read(vaddr,
MEMSCRUB_BPP);
cpu_check_ce(
SCRUBBER_CEEN_CHECK,
(uint64_t)paddr, vaddr,
MMU_PAGESIZE);
no_trap();
} else {
no_trap();
MEMSCRUB_STAT_INC(errors_found);
}
thread_affinity_clear(curthread);
vaddr += MMU_PAGESIZE;
paddr += MMU_PAGESIZE;
}
}
}
hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);
blks -= bpp;
pa += psz;
pgsread++;
}
/*
* If we just finished scrubbing at MMU_PAGESIZE granularity but found
* no retired pages, delete the span from the global list.
*/
if (scan_mmu_pagesize && retired_pages == 0)
memscrub_page_retire_span_delete(src);
/*
* Encountered a CE/UE on a retired page during the memscrub read of the
* current span. Add the span to the global list so it is not read again.
*/
if (add_to_page_retire_list) {
if (!memscrub_page_retire_span_search(src))
memscrub_page_retire_span_add(src);
add_to_page_retire_list = 0;
}
if (memscrub_verbose) {
cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
"at 0x%" PRIx64, pgsread, src);
}
}
/*
* Called by cpu_async_log_err() when memscrub read causes
* CE/UE on a retired page.
*/
void
memscrub_induced_error(void)
{
add_to_page_retire_list = 1;
}
/*
* Called by page_retire() when toxic pages cannot be retired
* immediately and are scheduled for retirement. Memscrubber stops
* scrubbing them to avoid further CE/UEs.
*/
void
memscrub_notify(ms_paddr_t pa)
{
mutex_enter(&memscrub_lock);
if (!memscrub_page_retire_span_search(pa))
memscrub_page_retire_span_add(pa);
mutex_exit(&memscrub_lock);
}
/*
* Called by memscrub_scan() and memscrub_notify().
* pa: physical address of span with CE/UE, add to global list.
*/
static void
memscrub_page_retire_span_add(ms_paddr_t pa)
{
memscrub_page_retire_span_t *new_span;
new_span = (memscrub_page_retire_span_t *)
kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);
if (new_span == NULL) {
#ifdef MEMSCRUB_DEBUG
cmn_err(CE_NOTE, "failed to allocate new span - span with"
" retired page/s not tracked.\n");
#endif /* MEMSCRUB_DEBUG */
return;
}
new_span->address = pa;
new_span->next = memscrub_page_retire_span_list;
memscrub_page_retire_span_list = new_span;
}
/*
* Called by memscrub_scan().
* pa: physical address of span to be removed from global list.
*/
static void
memscrub_page_retire_span_delete(ms_paddr_t pa)
{
memscrub_page_retire_span_t *prev_span, *next_span;
prev_span = memscrub_page_retire_span_list;
next_span = memscrub_page_retire_span_list->next;
if (pa == prev_span->address) {
memscrub_page_retire_span_list = next_span;
kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
return;
}
while (next_span) {
if (pa == next_span->address) {
prev_span->next = next_span->next;
kmem_free(next_span,
sizeof (memscrub_page_retire_span_t));
return;
}
prev_span = next_span;
next_span = next_span->next;
}
}
/*
* Called by memscrub_scan() and memscrub_notify().
* pa: physical address of span to be searched in global list.
*/
static int
memscrub_page_retire_span_search(ms_paddr_t pa)
{
memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;
while (next_span) {
if (pa == next_span->address)
return (1);
next_span = next_span->next;
}
return (0);
}
/*
* Called from new_memscrub() as a result of memory delete.
* Using page_numtopp_nolock() to determine if we have a valid PA.
*/
static void
memscrub_page_retire_span_list_update(void)
{
memscrub_page_retire_span_t *prev, *cur, *next;
if (memscrub_page_retire_span_list == NULL)
return;
prev = cur = memscrub_page_retire_span_list;
next = cur->next;
while (cur) {
if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
if (cur == memscrub_page_retire_span_list) {
memscrub_page_retire_span_list = next;
kmem_free(cur,
sizeof (memscrub_page_retire_span_t));
prev = cur = memscrub_page_retire_span_list;
} else {
prev->next = cur->next;
kmem_free(cur,
sizeof (memscrub_page_retire_span_t));
cur = next;
}
} else {
prev = cur;
cur = next;
}
if (cur != NULL)
next = cur->next;
}
}
/*
* The memory add/delete callback mechanism does not pass in the
* page ranges. The phys_install list has been updated though, so
* create a new scrub list from it.
*/
static int
new_memscrub(int update_page_retire_list)
{
struct memlist *src, *list, *old_list;
uint_t npgs;
/*
* copy phys_install to memscrub_memlist
*/
list = NULL;
npgs = 0;
memlist_read_lock();
for (src = phys_install; src; src = src->ml_next) {
if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
(pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
memlist_read_unlock();
while (list) {
struct memlist *el;
el = list;
list = list->ml_next;
kmem_free(el, sizeof (struct memlist));
}
return (-1);
}
}
memlist_read_unlock();
mutex_enter(&memscrub_lock);
memscrub_phys_pages = npgs;
old_list = memscrub_memlist;
memscrub_memlist = list;
if (update_page_retire_list)
memscrub_page_retire_span_list_update();
mutex_exit(&memscrub_lock);
while (old_list) {
struct memlist *el;
el = old_list;
old_list = old_list->ml_next;
kmem_free(el, sizeof (struct memlist));
}
return (0);
}
/*ARGSUSED*/
static void
memscrub_mem_config_post_add(
void *arg,
pgcnt_t delta_pages)
{
/*
* We increment pause_memscrub before entering new_memscrub(). This
* will force the memscrubber to sleep, allowing the DR callback
* thread to acquire memscrub_lock in new_memscrub(). The use of
* atomic_inc_32() and atomic_dec_32() allows concurrent memory DR
* operations to use the callbacks safely.
*/
atomic_inc_32(&pause_memscrub);
ASSERT(pause_memscrub != 0);
/*
* "Don't care" if we are not scrubbing new memory.
*/
(void) new_memscrub(0); /* retain page retire list */
/* Restore the pause setting. */
atomic_dec_32(&pause_memscrub);
}
/*ARGSUSED*/
static int
memscrub_mem_config_pre_del(
void *arg,
pgcnt_t delta_pages)
{
/* Nothing to do. */
return (0);
}
/*ARGSUSED*/
static void
memscrub_mem_config_post_del(
void *arg,
pgcnt_t delta_pages,
int cancelled)
{
/*
* We increment pause_memscrub before entering new_memscrub(). This
* will force the memscrubber to sleep, allowing the DR callback
* thread to acquire memscrub_lock in new_memscrub(). The use of
* atomic_inc_32() and atomic_dec_32() allows concurrent memory DR
* operations to use the callbacks safely.
*/
atomic_inc_32(&pause_memscrub);
ASSERT(pause_memscrub != 0);
/*
* Must stop scrubbing deleted memory as it may be disconnected.
*/
if (new_memscrub(1)) { /* update page retire list */
disable_memscrub = 1;
}
/* Restore the pause setting. */
atomic_dec_32(&pause_memscrub);
}
static kphysm_setup_vector_t memscrub_mem_config_vec = {
KPHYSM_SETUP_VECTOR_VERSION,
memscrub_mem_config_post_add,
memscrub_mem_config_pre_del,
memscrub_mem_config_post_del,
};
static void
memscrub_init_mem_config(void)
{
int ret;
ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
(void *)NULL);
ASSERT(ret == 0);
}
static void
memscrub_uninit_mem_config(void)
{
/* This call is OK if the register call was not done. */
kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
}