/* cpr_dump.c revision bf30efa4af94cd71664f6c1be0e6e950b1d7a0f4 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Fill in and write out the cpr state file
* 1. Allocate and write headers, ELF and cpr dump header
* 2. Allocate bitmaps according to phys_install
* 3. Tag kernel pages into corresponding bitmap
* 4. Write bitmaps to state file
* 5. Write actual physical page data to state file
*/
#include <vm/seg_kmem.h>
/* Local defines and variables */
static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;
static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
int cpr_flush_write(vnode_t *);
int cpr_contig_pages(vnode_t *, int);
void cpr_clear_bitmaps();
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;
char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks; /* size of cpr_buf in blocks */
int cpr_nbitmaps;
char *cpr_pagedata; /* page buffer for compression / tmp copy */
static char *cpr_wptr; /* keep track of where to write to next */
static int cpr_file_bn; /* cpr state-file block offset */
static int cpr_disk_writes_ok;
static size_t cpr_dev_space = 0;
/*
* On some platforms bcopy may modify the thread structure
* during bcopy (eg, to prevent cpu migration). If the
* range we are currently writing out includes our own
* thread structure then it will be snapshotted by bcopy
* including those modified members - and the updates made
* on exit from bcopy will no longer be seen when we later
* restore the mid-bcopy kthread_t. So if the range we
* need to copy overlaps with our thread structure we will
* use a simple byte copy.
*/
/*
 * NOTE(review): this fragment is truncated in this view -- the function
 * name, parameter list, and most of the body are missing.  Per the block
 * comment above, this is presumably the copy routine that falls back to a
 * byte-at-a-time loop when the range overlaps the current thread structure;
 * verify against the complete cpr_dump.c before relying on this.
 */
void
{
extern int curthreadremapped;
/* byte-at-a-time copy path; 'bytes' counts down to zero */
while (bytes-- > 0)
} else {
}
}
/*
* Allocate pages for buffers used in writing out the statefile
*/
/*
 * NOTE(review): body is truncated -- the actual buffer allocations
 * (cpr_buf, cpr_pagedata) and their size computations are missing from
 * this view; only the failure paths remain.  Verify against upstream.
 */
static int
cpr_alloc_bufs(void)
{
char *allocerr = "Unable to allocate memory for cpr buffer";
/*
* set the cpr write buffer size to at least the historic
* size (128k) or large enough to store the both the early
* set of statefile structures (well under 0x800) plus the
* bitmaps, and roundup to the next pagesize.
*/
/* NOTE(review): cpr_buf allocation and its NULL check are missing here. */
return (ENOMEM);
}
if (cpr_pagedata == NULL) {
/* NOTE(review): presumably 'allocerr' is reported before this return. */
return (ENOMEM);
}
return (0);
}
/*
* Set bitmap size in bytes based on phys_install.
*/
void
cpr_set_bitmap_size(void)
{
/* NOTE(review): body is empty in this view; the phys_install walk is missing. */
}
/*
* CPR dump header contains the following information:
* 1. header magic -- unique to cpr state file
* 2. kernel return pc & ppn for resume
* 3. current thread info
* 4. debug level and test mode
* 5. number of bitmaps allocated
* 6. number of page records
*/
/*
 * NOTE(review): function name line is missing (the errp() text below
 * suggests this is cpr_write_header); the page-counting and cdump field
 * assignments are also missing from this view.  Verify against upstream.
 */
static int
{
extern ushort_t cpr_mach_type;
struct cpr_dump_desc cdump;
/*
* Remember how many pages we plan to save to statefile.
* This information will be used for sanity checks.
* Untag those pages that will not be saved to statefile.
*/
"\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
/*
* Some pages contain volatile data (cpr_buf and storage area for
* sensitive kpages), which are no longer needed after the statefile
* is dumped to disk. We have already untagged them from regular
* bitmaps. Now tag them into the volatile bitmaps. The pages in
* volatile bitmaps will be claimed during resume, and the resumed
* kernel will free them.
*/
/*
* Export accurate statefile size for statefile allocation retry.
* statefile_size = all the headers + total pages +
* number of pages used by the bitmaps.
* Roundup will be done in the file allocation code.
*/
/*
* If the estimated statefile is not big enough,
* go retry now to save un-necessary operations.
*/
errp("cpr_write_header: STAT->cs_nocomp_statefsz > "
"STAT->cs_est_statefsz\n");
return (ENOSPC);
}
/* now write cpr dump descriptor */
}
/*
* CPR dump tail record contains the following information:
* 1. header magic -- unique to cpr state file
* 2. all misc info that needs to be passed to cprboot or resumed kernel
*/
/*
 * NOTE(review): two truncated static functions follow -- the statefile
 * tail-record writer (per the comment above) and the bitmap writer; both
 * are missing their name lines and most statements.  Verify upstream.
 */
static int
{
/* count the last one (flush) */
STAT->cs_real_statefsz));
}
/*
* Write bitmap descriptor array, followed by merged bitmaps.
*/
static int
{
int err;
/* NOTE(review): early-exit condition guarding this return is missing. */
return (err);
/*
* merge regular and volatile bitmaps into tmp space
* and write to disk
*/
break;
}
}
return (err);
}
/*
 * NOTE(review): name line missing; the 'str' prefix below indicates this
 * is cpr_write_statefile.  The calls that set 'error' before each check
 * are missing from this view.  Verify against the complete source.
 */
static int
{
extern int i_cpr_check_pgs_dumped();
void flush_windows(void);
char *str;
/*
* to get an accurate view of kas, we need to untag sensitive
* pages *before* dumping them because the disk driver makes
* allocations and changes kas along the way. The remaining
* pages referenced in the bitmaps are dumped out later as
* regular kpages.
*/
str = "cpr_write_statefile:";
/*
* now it's OK to call a driver that makes allocations
*/
cpr_disk_writes_ok = 1;
/*
* now write out the clean sensitive kpages
* according to the sensitive descriptors
*/
if (error) {
return (error);
}
/*
* cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
*/
if (error) {
return (error);
}
/*
* sanity check to verify the right number of pages were dumped
*/
if (error) {
#ifdef DEBUG
if (cpr_test_mode)
#endif
}
return (error);
}
/*
* creates the CPR state file, the following sections are
* written out in sequence:
* - writes the cpr dump header
* - writes the memory usage bitmaps
* - writes the platform dependent info
* - writes the remaining user pages
* - writes the kernel pages
*/
/*
 * NOTE(review): name line missing; per the comment above this is the
 * top-level statefile creation entry point (cpr_dump).  Many of the
 * calls that assign 'error' before the bare returns below are missing
 * from this view -- the control flow here is not compilable as shown.
 */
int
{
int error;
if (error = cpr_alloc_bufs())
return (error);
}
/* point to top of internal buffer */
/* initialize global variables used by the write operation */
cpr_dev_space = 0;
/* allocate bitmaps */
if (error = i_cpr_alloc_bitmaps()) {
return (error);
}
}
return (error);
return (error);
/*
* set internal cross checking; we dont want to call
* a disk driver that makes allocations until after
* sensitive pages are saved
*/
cpr_disk_writes_ok = 0;
/*
* 1253112: heap corruption due to memory allocation when dumpping
* statefile.
* Theoretically on Sun4u only the kernel data nucleus, kvalloc and
* kvseg segments can be contaminated should memory allocations happen
* during sddump, which is not supposed to happen after the system
* is quiesced. Let's call the kernel pages that tend to be affected
* 'sensitive kpages' here. To avoid saving inconsistent pages, we
* will allocate some storage space to save the clean sensitive pages
* aside before statefile dumping takes place. Since there may not be
* much memory left at this stage, the sensitive pages will be
* compressed before they are saved into the storage area.
*/
if (error = i_cpr_save_sensitive_kpages()) {
return (error);
}
/*
* since all cpr allocations are done (space for sensitive kpages,
* bitmaps, cpr_buf), kas is stable, and now we can accurately
* count regular and sensitive kpages.
*/
return (error);
}
return (error);
return (error);
return (error);
return (error);
}
return (error);
return (error);
return (error);
return (0);
}
/*
* cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
* a page-count from each range is accumulated at arg->pages.
*/
/*
 * NOTE(review): four truncated walker functions follow (per the adjacent
 * comments: cpr_xwalk, cpr_walk, a kvseg scanner, cpr_walk_kpm, and a
 * segkpm scanner).  All name lines and most statement bodies are missing.
 */
static void
{
}
/*
* cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
* a page-count from each range is accumulated at arg->pages.
*/
static void
{
/*
* If we are about to start walking the range of addresses we
* carved out of the kernel heap for the large page heap walk
* heap_lp_arena to find what segments are actually populated
*/
if (SEGKMEM_USE_LARGEPAGES &&
} else {
}
}
/*
* faster scan of kvseg using vmem_walk() to visit
* allocated ranges.
*/
{
struct cpr_walkinfo cwinfo;
errp("walked %d sub-ranges, total pages %ld\n",
}
}
/*
* cpr_walk_kpm() is called for every used area within the large
* segkpm virtual address window. A page-count is accumulated at
* arg->pages.
*/
static void
{
}
/*
* faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
*/
/*ARGSUSED*/
static pgcnt_t
{
struct cpr_walkinfo cwinfo;
/* segkpm disabled: nothing to scan */
if (kpm_enable == 0)
return (0);
errp("walked %d sub-ranges, total pages %ld\n",
}
}
/*
* Sparsely filled kernel segments are registered in kseg_table for
* easier lookup. See also block comment for cpr_count_seg_pages.
*/
/*
 * NOTE(review): the ksegtbl_entry_t typedef and kseg_table initializer
 * are truncated (missing fields/entries); the lookup function below is
 * missing its name line and loop.  Verify against the complete source.
 */
#define KSEG_SEG_ADDR 0 /* address of struct seg */
typedef struct {
int st_addrtype; /* address type in st_seg */
ksegtbl_entry_t kseg_table[] = {
{NULL, 0, 0}
};
/*
* Compare seg with each entry in kseg_table; when there is a match
* return the entry pointer, otherwise return NULL.
*/
static ksegtbl_entry_t *
{
return (ste);
}
return ((ksegtbl_entry_t *)NULL);
}
/*
* Count pages within each kernel segment; call cpr_sparse_seg_check()
* to find out whether a sparsely filled segment needs special
* treatment (e.g. kvseg).
* Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
* module shouldn't need to know segment details like if it is
* sparsely filled or not (makes kseg_table obsolete).
*/
/*
 * NOTE(review): two truncated counting functions (per the comments above:
 * cpr_count_seg_pages and the kas page counter).  Signature lines are
 * entirely missing from this view.
 */
{
pages = 0;
} else {
}
}
return (pages);
}
/*
* count kernel pages within kas and any special ranges
*/
{
/*
* Some pages need to be taken care of differently.
* eg: panicbuf pages of sun4m are not in kas but they need
* to be saved. On sun4u, the physical pages of panicbuf are
* allocated via prom_retain().
*/
return (kas_cnt);
}
/*
* Set a bit corresponding to the arg phys page number;
* returns 0 when the ppn is valid and the corresponding
* map bit was clear, otherwise returns 1.
*/
/*
 * NOTE(review): four truncated bitmap primitives follow (per the adjacent
 * comments: set-bit, clear-bit, a no-op variant, and a lookup).  All name
 * lines and the bitmap-indexing statements are missing.
 */
int
{
char *bitmap;
int clr;
/* returns 0 when the ppn was valid and the bit was previously clear */
return (clr == 0);
}
}
return (1);
}
/*
* Clear a bit corresponding to the arg phys page number.
*/
int
{
char *bitmap;
int set;
/* returns 0 when the ppn was valid and the bit was previously set */
return (set == 0);
}
}
return (1);
}
/* ARGSUSED */
int
{
/* no-op bitfunc variant: always succeeds */
return (0);
}
/*
* Lookup a bit corresponding to the arg phys page number.
*/
int
{
char *bitmap;
}
}
return (0);
}
/*
* Go thru all pages and pick up any page not caught during the invalidation
* stage. This is also used to save pages with cow lock or phys page lock held
* (none zero p_lckcnt or p_cowcnt)
*/
/*
 * NOTE(review): name line missing; per the forward declaration above this
 * is cpr_count_upages.  The page-list traversal that drives the do-loop
 * and sets 'pfn' is missing from this view.
 */
static int
{
do {
#if defined(__sparc)
extern struct vnode prom_ppages;
#else
#endif /* __sparc */
continue;
if (pf_is_memory(pfn)) {
tcnt++;
dcnt++; /* dirty count */
}
return (dcnt);
}
/*
* try compressing pages based on cflag,
* and for DEBUG kernels, verify uncompressed data checksum;
*
* this routine replaces common code from
* i_cpr_compress_and_save() and cpr_compress_and_write()
*/
/*
 * NOTE(review): name line missing; per the comment above this is the
 * shared compression helper for i_cpr_compress_and_save() and
 * cpr_compress_and_write().  The compress/checksum calls are missing.
 */
char *
{
char *datap;
/*
* set length to the original uncompressed data size;
* always init cpd_flag to zero
*/
#ifdef DEBUG
/*
* Make a copy of the uncompressed data so we can checksum it.
* Compress that copy so the checksum works at the other end
*/
#else
#endif
/*
* try compressing the raw data to cpr_pagedata;
* if there was a size reduction: record the new length,
* flag the compression, and point to the compressed data.
*/
if (cflag) {
#ifdef DEBUG
/*
* decompress the data back to a scratch area
* and compare the new checksum with the original
* checksum to verify the compression.
*/
clen, sizeof (cpr_pagecopy));
#endif
}
}
return (datap);
}
/*
* 1. Prepare cpr page descriptor and write it to file
* 2. Compress page data and write it out
*/
/*
 * NOTE(review): name line missing; per the comment above this writes a
 * cpr page descriptor followed by compressed page data.  The descriptor
 * fill, write calls, and unmap calls are missing from this view.
 */
static int
{
int error = 0;
char *datap;
/*
* Fill cpr page descriptor.
*/
/* Write cpr page descriptor */
/* Write compressed page data */
/*
* Unmap the pages for tlb and vac flushing
*/
if (error) {
} else {
}
return (error);
}
/*
 * NOTE(review): name line missing; the errp() text below indicates this
 * is cpr_write.  The bcopy into cpr_buf, the wbytes/size bookkeeping, and
 * the actual device write between the do_polled_io toggles are missing.
 */
int
{
int error;
if (cpr_dev_space == 0) {
} else
}
/*
* break the write into multiple part if request is large,
* calculate count up to buf page boundary, then write it out.
* repeat until done.
*/
while (size) {
if (cpr_wptr < cpr_buf_end)
return (0); /* buffer not full yet */
if (wbytes > cpr_dev_space)
return (ENOSPC);
} else {
return (ENOSPC);
}
/*
* cross check, this should not happen!
*/
if (cpr_disk_writes_ok == 0) {
errp("cpr_write: disk write too early!\n");
return (EINVAL);
}
do_polled_io = 1;
/* NOTE(review): the device write presumably happens between the toggles. */
do_polled_io = 0;
if (error) {
return (error);
}
}
return (0);
}
/*
 * NOTE(review): two truncated functions -- per the forward declarations
 * at the top of the file these are cpr_flush_write and cpr_clear_bitmaps.
 * Name lines, nblk computation, and the write call are missing; the
 * cpr_clear_bitmaps braces are unbalanced in this view.
 */
int
{
int nblk;
int error;
/*
* Calculate remaining blocks in buffer, rounded up to nearest
* disk block
*/
do_polled_io = 1;
/* NOTE(review): the flush write presumably happens between the toggles. */
do_polled_io = 0;
/* advance the state-file block offset past what was just flushed */
cpr_file_bn += nblk;
if (error)
return (error);
}
void
cpr_clear_bitmaps(void)
{
}
}
/*
 * NOTE(review): name line missing; per the forward declaration this is
 * cpr_contig_pages.  It scans a bitmap for runs of up to CPR_MAXCONTIG
 * tagged pages and dispatches each run according to 'flag'
 * (SAVE_TO_STORAGE / WRITE_TO_STATEFILE / STORAGE_DESC_ALLOC).  The
 * bit-test inside the inner loop and the save/write calls are missing.
 */
int
{
extern int i_cpr_compress_and_save();
i = 0; /* Beginning of bitmap */
j = 0;
/* outer loop: advance through all 'totbit' bitmap positions */
while (i < totbit) {
/* inner loop: extend the current contiguous run up to CPR_MAXCONTIG */
while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
j++;
else /* not contiguous anymore */
break;
}
if (j) {
chunks++;
if (flag == SAVE_TO_STORAGE) {
if (error)
return (error);
} else if (flag == WRITE_TO_STATEFILE) {
spfn + i, j);
if (error)
return (error);
else {
spin_cnt++;
}
}
}
i += j;
if (j != CPR_MAXCONTIG) {
/* Stopped on a non-tagged page */
i++;
}
j = 0;
}
}
/* in descriptor-allocation mode the caller wants the chunk count */
if (flag == STORAGE_DESC_ALLOC)
return (chunks);
else
return (0);
}
/*
 * NOTE(review): four truncated trailing functions -- a debug logger that
 * names the bitfunc action, two counting loops, and a final static int
 * function (possibly cpr_dump_regular_pages, the last remaining forward
 * declaration).  Name lines and most statements are missing.
 */
void
{
/* map the bitmap operation to a human-readable action for the log */
if (bitfunc == cpr_setbit)
action = "tag";
else if (bitfunc == cpr_clrbit)
action = "untag";
else
action = "none";
errp("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
}
{
count++;
}
}
return (count);
}
{
/* release the write buffer and page buffer if they were allocated */
if (cpr_buf) {
}
if (cpr_pagedata) {
}
return (count);
}
static int
{
int error;
if (!error)
return (error);
}