/* vm_pageout.c revision 06cfbf35ec2ffad381362c0b3886d50578d0b9f9 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <vm/seg_kmem.h>
/*
* The following parameters control operation of the page replacement
* algorithm. They are initialized to 0, and then computed at boot time
* based on the size of the system. If they are patched non-zero in
* a loaded vmunix they are left alone and may thus be changed per system
* using adb on the loaded system.
*/
static pgcnt_t handspreadpages = 0;	/* distance (pages) between front and back hands */
static int loopfraction = 2;	/* NOTE(review): referencing code stripped from this copy -- confirm usage */
static int min_percent_cpu = 4;	/* scanner CPU duty-cycle floor (%) */
static int max_percent_cpu = 80;	/* scanner CPU duty-cycle ceiling (%) */
static pgcnt_t maxfastscan = 0;	/* cap on fastscan; presumably MAXHANDSPREADPAGES-based -- confirm */
pgcnt_t throttlefree = 0;	/* freemem level below which PG_WAIT requests are throttled */
pgcnt_t pageout_reserve = 0;	/* pages kept in stock for pageout's own use */
/*
* Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
* are the number of ticks in each wakeup cycle that gives the
* equivalent of some underlying %CPU duty cycle.
* When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
* awakened every 25 clock ticks. So, converting from %CPU to ticks
* per wakeup cycle would be x% of 25, that is (x * 100) / 25.
* So, for example, 4% == 1 tick and 80% == 20 ticks.
*
* min_pageout_ticks:
* Minimum number of ticks the scanner may use per wakeup
* cycle; the tick equivalent of min_percent_cpu (e.g. 4% ==
* 1 tick when RATETOSCHEDPAGING is 4 and hz is 100).
*
* max_pageout_ticks:
* Maximum number of ticks the scanner may use per wakeup
* cycle; the tick equivalent of max_percent_cpu (e.g. 80% ==
* 20 ticks).
*
* pageout_ticks:
* Number of clock ticks budgeted for each wakeup cycle.
* Computed each time around by schedpaging().
* Varies between min_pageout_ticks .. max_pageout_ticks,
* depending on memory pressure.
*
* pageout_lbolt:
* Timestamp of the last time pageout_scanner woke up and started
* (or resumed) scanning for not recently referenced pages.
*/
static clock_t min_pageout_ticks;	/* per-wakeup tick budget at min_percent_cpu */
static clock_t max_pageout_ticks;	/* per-wakeup tick budget at max_percent_cpu */
static clock_t pageout_ticks;	/* current per-wakeup budget; set by schedpaging() */
static clock_t pageout_lbolt;	/* lbolt when the scanner last started/resumed */
static uint_t reset_hands;	/* nonzero => scanner must re-position its clock hands */
/* NOTE(review): presumably masks how often the scanner polls its CPU budget; usage stripped -- confirm */
#define PAGES_POLL_MASK 1023
/*
* pageout_sample_lim:
* The limit on the number of samples needed to establish a value
* for new pageout parameters, fastscan, slowscan, and handspreadpages.
*
* pageout_sample_cnt:
* Current sample number. Once the sample gets large enough,
* set new values for handspreadpages, fastscan and slowscan.
*
* pageout_sample_pages:
* The accumulated number of pages scanned during sampling.
*
* pageout_sample_ticks:
* The accumulated clock ticks for the sample.
*
* pageout_rate:
* Rate in pages/nanosecond, computed at the end of sampling.
*
* pageout_new_spread:
* The new value to use for fastscan and handspreadpages.
* Calculated after enough samples have been taken.
*/
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
/* NOTE(review): ticks consumed in the current scan cycle, compared against pageout_ticks -- confirm */
static clock_t pageout_cycle_ticks;
/* accumulated elapsed hrtime of the sample (pages/nsec rate is derived from it) */
static hrtime_t pageout_sample_etime = 0;
/*
* Record number of times a pageout_scanner wakeup cycle finished because it
* timed out (exceeded its CPU budget), rather than because it visited
* its budgeted number of pages.
*/
uint64_t pageout_timeouts = 0;
#ifdef VM_STATS
/* NOTE(review): struct body and closing brace are missing from this copy -- will not compile as-is */
static struct pageoutvmstats_str {
#endif /* VM_STATS */
/*
* Threads waiting for free memory use this condition variable and lock until
* memory becomes available.
*/
/*
* The size of the clock loop.
*/
#define LOOPPAGES total_pages
/*
* Set up the paging constants for the clock algorithm.
* Called after the system is initialized and the amount of memory
* and number of paging devices is known.
*
* lotsfree is 1/64 of memory, but at least 512K.
* desfree is 1/2 of lotsfree.
* minfree is 1/2 of desfree.
*
* Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
*
* lotsfree = btop(512K)
* desfree = btop(200K)
* minfree = btop(100K)
* throttlefree = INT_MIN
* max_percent_cpu = 4
*/
void
setupclock(int recalc)
{
/*
* NOTE(review): this copy of the file has had most executable statements
* stripped (dangling else/if fragments below, missing assignments).
* Do not edit the logic here; recover the complete function body from
* source control before making changes.
*/
/*
* setupclock can now be called to recalculate the paging
* parameters in the case of dynamic addition of memory.
* So to make sure we make the proper calculations, if such a
* situation should arise, we save away the initial values
* of each parameter so we can recall them when needed. This
* way we don't lose the settings an admin might have made.
*/
if (!recalc) {
}
/*
* Set up thresholds for paging:
*/
/*
* Lotsfree is threshold where paging daemon turns on.
*/
else
/*
* Desfree is amount of memory desired free.
* If less than this for extended period, start swapping.
*/
else
/*
* Minfree is minimal amount of free memory which is tolerable.
*/
else
/*
* Throttlefree is the point at which we start throttling
* PG_WAIT requests until enough memory becomes available.
*/
else
/*
* Pageout_reserve is the number of pages that we keep in
* stock for pageout's own use. Having a few such pages
* provides insurance against system deadlock due to
* pageout needing pages. When freemem < pageout_reserve,
* non-blocking allocations are denied to any threads
* other than pageout and sched. (At some point we might
* want to consider a per-thread flag like T_PUSHING_PAGES
* to indicate that a thread is part of the page-pushing
* dance (e.g. an interrupt thread) and thus is entitled
* to the same special dispensation we accord pageout.)
*/
else
/*
* Maxpgio thresholds how much paging is acceptable.
* This figures that 2/3 busy on an arm is all that is
* tolerable for paging. We assume one operation per disk rev.
*
* XXX - Does not account for multiple swap devices.
*/
if (init_mpgio == 0)
else
/*
* The clock scan rate varies between fastscan and slowscan
* based on the amount of free memory available. Fastscan
* rate should be set based on the number pages that can be
* scanned per sec using ~10% of processor time. Since this
* value depends on the processor, MMU, Mhz etc., it is
* difficult to determine it in a generic manner for all
* architectures.
*
* Instead of trying to determine the number of pages scanned
* per sec for every processor, fastscan is set to be the smaller
* of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
* time is limited to ~4% of processor time.
*
* Setting fastscan to be 1/2 of memory allows pageout to scan
* all of memory in ~2 secs. This implies that user pages not
* accessed within 1 sec (assuming, handspreadpages == fastscan)
* can be reclaimed when free memory is very low. Stealing pages
* not accessed within 1 sec seems reasonable and ensures that
* active user processes don't thrash.
*
* Smaller values of fastscan result in scanning fewer pages
* every second and consequently pageout may not be able to free
* sufficient memory to maintain the minimum threshold. Larger
* values of fastscan result in scanning a lot more pages which
* could lead to thrashing and higher CPU usage.
*
* Fastscan needs to be limited to a maximum value and should not
* scale with memory to prevent pageout from consuming too much
* time for scanning on slow CPU's and avoid thrashing, as a
* result of scanning too many pages, on faster CPU's.
* The value of 64 Meg was chosen for MAXHANDSPREADPAGES
* (the upper bound for fastscan) based on the average number
* of pages that can potentially be scanned in ~1 sec (using ~4%
* of the CPU) on some of the following machines that currently
* run Solaris 2.x:
*
* average memory scanned in ~1 sec
*
* 25 Mhz SS1+: 23 Meg
* LX: 37 Meg
* 50 Mhz SC2000: 68 Meg
*
* 40 Mhz 486: 26 Meg
* 66 Mhz 486: 42 Meg
*
* When free memory falls just below lotsfree, the scan rate
* goes from 0 to slowscan (i.e., pageout starts running). This
* transition needs to be smooth and is achieved by ensuring that
* pageout scans a small number of pages per wakeup to satisfy the
* transient memory demand, since scanning that many pages has no
* noticeable impact on system performance.
*
* In addition to setting fastscan and slowscan, pageout is
* limited to using ~4% of the CPU. This results in increasing
* the time taken to scan all of memory, which in turn means that
* user processes have a better opportunity of preventing their
* pages from being stolen. This has a positive effect on
* interactive and overall system performance when memory demand
* is high.
*
* Thus, the rate at which pages are scanned for replacement will
* vary linearly between slowscan and the number of pages that
* can be scanned using ~4% of processor time instead of varying
* linearly between slowscan and fastscan.
*
* Also, the processor time used by pageout will vary from ~1%
* at slowscan to ~4% at fastscan instead of varying between
* ~1% at slowscan and ~10% at fastscan.
*
* The values chosen for the various VM parameters (fastscan,
* handspreadpages, etc) are not universally true for all machines,
* but appear to be a good rule of thumb for the machines we've
* tested. They have the following ranges:
*
* cpu speed: 20 to 70 Mhz
* page size: 4K to 8K
* memory size: 16M to 5G
* page scan rate: 4000 - 17400 4K pages per sec
*
* The values need to be re-examined for machines which don't
* fall into the various ranges (e.g., slower or faster CPUs,
* smaller or larger pagesizes etc) shown above.
*
* On an MP machine, pageout is often unable to maintain the
* minimum paging thresholds under heavy load. This is due to
* the fact that user processes running on other CPU's can be
* dirtying memory at a much faster pace than pageout can find
* pages to free. The memory demands could be met by enabling
* more than one CPU to run the clock algorithm in such a manner
* that the various clock hands don't overlap. This also makes
* it more difficult to determine the values for fastscan, slowscan
* and handspreadpages.
*
* The swapper is currently used to free up memory when pageout
* is unable to meet memory demands by swapping out processes.
* In addition to freeing up memory, swapping also reduces the
* demand for memory by preventing user processes from running
* and thereby consuming memory.
*/
if (init_mfscan == 0) {
if (pageout_new_spread != 0)
else
} else {
}
if (init_fscan == 0)
else
/*
* Set slow scan time to 1/10 the fast scan time, but
* not to exceed maxslowscan.
*/
if (init_sscan == 0)
else
/*
* Handspreadpages is distance (in pages) between front and back
* pageout daemon hands. The amount of time to reclaim a page
* once pageout examines it increases with this distance and
* decreases as the scan rate rises. It must be < the amount
* of pageable memory.
*
* Since pageout is limited to ~4% of the CPU, setting handspreadpages
* to be "fastscan" results in the front hand being a few secs
* (varies based on the processor speed) ahead of the back hand
* at fastscan rates. This distance can be further reduced, if
* necessary, by increasing the processor time used by pageout
* to be more than ~4% and preferably not more than ~10%.
*
* As a result, user processes have a much better chance of
* referencing their pages before the back hand examines them.
* This also significantly lowers the number of reclaims from
* the freelist since pageout does not end up freeing pages which
* may be referenced a sec later.
*/
if (init_hspages == 0)
else
/*
* Make sure that back hand follows front hand by at least
* 1/RATETOSCHEDPAGING seconds. Without this test, it is possible
* for the back hand to look at a page during the same wakeup of
* the pageout daemon in which the front hand cleared its ref bit.
*/
if (handspreadpages >= looppages)
/*
* If we have been called to recalculate the parameters,
* set a flag to re-evaluate the clock hand pointers.
*/
if (recalc)
reset_hands = 1;
}
/*
* Pageout scheduling.
*
* Schedpaging controls the rate at which the page out daemon runs by
* setting the global variables nscan and desscan RATETOSCHEDPAGING
* times a second. Nscan records the number of pages pageout has examined
* in its current pass; schedpaging resets this value to zero each time
* it runs. Desscan records the number of pages pageout should examine
* in its next pass; schedpaging sets this value based on the amount of
* currently available memory.
*/
/*
* Pool of available async pageout putpage requests.
*/
static struct async_reqs *push_req;
static kcondvar_t push_cv;
static void pageout_scanner(void);
/*
* If a page is being shared more than "po_share" times
* then leave it alone- don't page it out.
*/
#define MIN_PO_SHARE (8)
/*
* Schedule rate for paging.
* Rate is linear interpolation between
* slowscan with lotsfree and fastscan when out of memory.
*/
static void
schedpaging(void *arg)
{
/*
* NOTE(review): executable statements have been stripped from this copy
* (e.g. the desscan computation and the cv_signal call whose TRACE
* argument survives below). Recover the full body from source control.
*/
kmem_reap();
seg_preap();
if (mutex_tryenter(&pageout_mutex)) {
/* pageout() not running */
nscan = 0;
if (pageout_new_spread != 0)
if (vavail < 0)
vavail = 0;
/*
* Fix for 1161438 (CRS SPR# 73922). All variables
* in the original calculation for desscan were 32 bit signed
* ints. As freemem approaches 0x0 on a system with 1 Gig or
* more of memory, the calculation can overflow. When this
* happens, desscan becomes negative and pageout_scanner()
* stops paging out.
*/
if ((needfree) && (pageout_new_spread == 0)) {
/*
* If we've not yet collected enough samples to
* calculate a spread, use the old logic of kicking
* into high gear anytime needfree is non-zero.
*/
} else {
/*
* Once we've calculated a spread based on system
* memory and usage, just treat needfree as another
* form of deficit.
*/
}
/* NOTE(review): orphaned TRACE argument -- the enclosing macro call is missing */
"pageout_cv_signal:freemem %ld", freemem);
} else {
/*
* There are enough free pages, no need to
* kick the scanner thread. And next time
* around, keep more of the `highly shared'
* pages.
*/
if (po_share > MIN_PO_SHARE) {
po_share >>= 1;
}
}
}
/*
* Signal threads waiting for available memory.
* NOTE: usually we need to grab memavail_lock before cv_broadcast, but
* in this case it is not needed - the waiters will be woken up during
* the next invocation of this function.
*/
if (kmem_avail() > 0)
}
#define FRONT 1
#define BACK 2
/*
* The page out daemon, which runs as process 2.
*
* As long as there are at least lotsfree pages,
* this process is not run. When the number of free
* pages stays in the range desfree to lotsfree,
* this daemon runs through the pages in the loop
* at a rate determined in schedpaging(). Pageout manages
* two hands on the clock. The front hand moves through
* memory, clearing the reference bit,
* and stealing pages from procs that are over maxrss.
* The back hand travels a distance behind the front hand,
* freeing the pages that have not been referenced in the time
* since the front hand passed. If modified, they are pushed to
* swap before being freed.
*
* There are 2 threads that act on behalf of the pageout process.
* One thread scans pages (pageout_scanner) and frees them up if
* they don't require any VOP_PUTPAGE operation. If a page must be
* written back to its backing store, the request is put on a list
* and the other (pageout) thread is signaled. The pageout thread
* grabs VOP_PUTPAGE requests from the list, and processes them.
* Some filesystems may require resources for the VOP_PUTPAGE
* operations (like memory) and hence can block the pageout
* thread, but the scanner thread can still operate. There is still
* no guarantee that memory deadlocks cannot occur.
*
* For now, this thing is in very rough form.
*/
void
pageout()
{
/*
* NOTE(review): most of this function's statements are missing from this
* copy (thread creation, allocation, the push-list service loop). The
* surviving fragments will not compile; recover the full body from
* source control before editing.
*/
struct async_reqs *arg;
int i;
/* zero out the pageout process's accumulated CPU times */
proc_pageout->p_cstime = 0;
proc_pageout->p_stime = 0;
proc_pageout->p_cutime = 0;
proc_pageout->p_utime = 0;
/*
* Create pageout scanner thread
*/
/*
* Allocate and initialize the async request structures
* for pageout.
*/
push_req = (struct async_reqs *)
for (i = 0; i < async_list_size - 1; i++)
/*
* kick off pageout scheduler.
*/
/*
* Create kernel cage thread.
* The kernel cage thread is started under the pageout process
* to take advantage of the less restricted page allocation
* in page_create_throttle().
*/
/*
* Limit pushes to avoid saturating pageout devices.
*/
for (;;) {
pushes = 0;
}
pushes++;
}
/* vp held by checkpage() */
req_freelist = arg;
}
}
/*
* Kernel thread that scans pages looking for ones to free
*/
static void
pageout_scanner(void)
{
/*
* NOTE(review): the scan loop's statements are largely missing from this
* copy (orphaned TRACE strings, dangling else fragments). Recover the
* complete function from source control before editing the logic.
*/
/*
* The restart case does not attempt to point the hands at roughly
* the right point on the assumption that after one circuit things
* will have settled down - and restarts shouldn't be that often.
*/
/*
* Set the two clock hands to be separated by a reasonable amount,
* but no more than 360 degrees apart.
*/
backhand = page_first();
if (handspreadpages >= total_pages)
else
loop:
if (!dopageout)
goto loop;
/* setupclock(1) requested a re-positioning of the hands */
if (reset_hands) {
reset_hands = 0;
backhand = page_first();
if (handspreadpages >= total_pages)
else
}
count = 0;
/* NOTE(review): orphaned TRACE argument -- the enclosing macro call is missing */
"pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
/* Kernel probe */
pcount = 0;
if (pageout_sample_cnt < pageout_sample_lim) {
} else {
}
sample_start = gethrtime();
/*
* Scan the appropriate number of pages for a single duty cycle.
* However, stop scanning as soon as there is enough free memory.
* For a short while, we will be sampling the performance of the
* scanner and need to keep running just to get sample data, in
* which case we keep going and don't pay attention to whether
* or not there is enough free memory.
*/
/*
* Check to see if we have exceeded our %CPU budget
* for this wakeup, but not on every single page visited,
* just every once in a while.
*/
if (pageout_cycle_ticks >= pageout_ticks) {
break;
}
}
/*
* If checkpage manages to add a page to the free list,
* we give ourselves another couple of trips around the loop.
*/
count = 0;
count = 0;
++pcount;
/*
* protected by pageout_mutex instead of cpu_stat_lock
*/
/*
* Don't include ineligible pages in the number scanned.
*/
nscan++;
/*
* backhand update and wraparound check are done separately
* because lint barks when it finds an empty "if" body
*/
/* NOTE(review): orphaned TRACE argument -- the enclosing macro call is missing */
"pageout_hand_wrap:freemem %ld whichhand %d",
/*
* protected by pageout_mutex instead of cpu_stat_lock
*/
if (++count > 1) {
/*
* Extremely unlikely, but it happens.
* We went around the loop at least once
* and didn't get far enough.
* If we are still skipping `highly shared'
* pages, skip fewer of them. Otherwise,
* give up till the next clock tick.
*/
if (po_share < MAX_PO_SHARE) {
po_share <<= 1;
} else {
/*
* Really a "goto loop", but
* if someone is TRACing or
* TNF_PROBE_ing, at least
* make records to show
* where we are.
*/
break;
}
}
}
}
sample_end = gethrtime();
/* NOTE(review): orphaned TRACE argument -- the enclosing macro call is missing */
"pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
/* Kernel probe */
if (pageout_sample_cnt < pageout_sample_lim) {
}
/* once sampling completes, recompute the paging parameters */
if (pageout_sample_cnt >= pageout_sample_lim &&
pageout_new_spread == 0) {
setupclock(1);
}
goto loop;
}
/*
* Look at the page at hand. If it is locked (e.g., for physical i/o),
* system (u., page table) or free, then leave it alone. Otherwise,
* if we are running the front hand, turn off the page's reference bit.
* If the proc is over maxrss, we take it. If running the back hand,
* check whether the page has been reclaimed. If not, free the page,
* pushing it to disk first if necessary.
*
* Return values:
* -1 if the page is not a candidate at all,
* 0 if not freed, or
* 1 if we freed it.
*/
static int
/*
* NOTE(review): the function's name/parameter line is missing from this
* copy (presumably checkpage(pp, whichhand) per the references at the
* callers' comments -- confirm), and most statements are stripped.
* Recover the full definition from source control before editing.
*/
{
int ppattr;
int isfs = 0;
int isexec = 0;
int pagesync_flag;
/*
* Skip pages:
* - associated with the kernel vnode since
* they are always "exclusively" locked.
* - that are free
* - that are shared more than po_share'd times
* - its already locked
*
* NOTE: These optimizations assume that reads are atomic.
*/
top:
return (-1);
}
/*
* Skip the page if we can't acquire the "exclusive" lock.
*/
return (-1);
/*
* It became free between the above check and our actually
* locking the page. Oh, well there will be other pages.
*/
return (-1);
}
/*
* Reject pages that cannot be freed. The page_struct_lock
* need not be acquired to examine these
* fields since the page has an "exclusive" lock.
*/
return (-1);
}
/*
* Maintain statistics for what we are freeing
*/
isexec = 1;
isfs = 1;
}
/*
* Turn off REF and MOD bits with the front hand.
* The back hand examines the REF bit and always considers
* SHARED pages as referenced.
*/
else
/*
* If page is referenced; make unreferenced but reclaimable.
* If this page is not referenced, then it must be reclaimable
* and we can add it to the free list.
*/
/*
* Checking of rss or madvise flags needed here...
*
* If not "well-behaved", fall through into the code
* for not referenced.
*/
hat_clrref(pp);
}
/*
* Somebody referenced the page since the front
* hand went by, so it's not a candidate for
* freeing up.
*/
return (0);
}
/*
* If large page, attempt to demote it. If successfully demoted,
* retry the checkpage.
*/
if (!page_try_demote_pages(pp)) {
return (-1);
}
/*
* since page_try_demote_pages() could have unloaded some
* mappings it makes sense to reload ppattr.
*/
}
/*
* If the page is currently dirty, we have to arrange
* to have it cleaned before it can be freed.
*
* XXX - ASSERT(pp->p_vnode != NULL);
*/
/*
* XXX - Test for process being swapped out or about to exit?
* [Can't get back to process(es) using the page.]
*/
/*
* Hold the vnode before releasing the page lock to
* prevent it from being freed and re-used by some
* other thread.
*/
/*
* Queue i/o request for the pageout thread.
*/
return (0);
}
return (1);
}
/*
* Now we unload all the translations,
* and put the page back on to the free list.
* If the page was used (referenced or modified) after
* the pagesync but before it was unloaded we catch it
* and handle the page properly.
*/
goto recheck;
/*LINTED: constant in conditional context*/
if (isfs) {
if (isexec) {
} else {
}
} else {
}
return (1); /* freed a page! */
}
/*
* Queue async i/o request from pageout_scanner and segment swapout
* routines on one common list. This ensures that pageout devices (swap)
* are not saturated by pageout_scanner or swapout requests.
* The pageout thread empties this list by initiating i/o operations.
*/
int
/*
* NOTE(review): the function's name/parameter line is missing from this
* copy (per the block comment above, this queues an async i/o request
* onto the common push list -- confirm the exact signature in source
* control), and most statements are stripped.
*/
{
struct async_reqs *arg;
/*
* If we cannot allocate an async request struct,
* skip this page.
*/
return (0);
}
/*
* Add to list of pending write requests.
*/
if (req_freelist == NULL) {
/*
* No free async requests left. The lock is held so we
* might as well signal the pusher thread now.
*/
}
return (1);
}
/*
* Wakeup pageout to initiate i/o if push_list is not empty.
*/
void
/*
* NOTE(review): the function's name/parameter line and body are missing
* from this copy; per the comment above it wakes the pageout thread when
* push_list is non-empty. The extra closing brace below suggests a
* stripped inner block -- recover the full definition from source control.
*/
{
}
}