/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2015, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* VM - segment management.
*/
#include <sys/inttypes.h>
#include <sys/sysmacros.h>
#include <sys/tuneable.h>
#include <sys/mem_config.h>
#include <vm/seg_kmem.h>
/*
* kstats for segment advise
*/
{ "MADV_FREE_hit", KSTAT_DATA_ULONG },
{ "MADV_FREE_miss", KSTAT_DATA_ULONG },
};
/*
* entry in the segment page cache
*/
struct seg_pcache {
};
struct seg_phash {
};
struct seg_phash_wired {
};
/*
* A parameter to control the maximum number of bytes that can be
* purged from pcache at a time.
*/
/*
* log2(fraction of pcache to reclaim at a time).
*/
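/*
 * The compiled-out sketch below shows how a log2 "fraction" tunable of this
 * kind is typically applied: a right shift of the currently cached page count
 * gives the per-pass purge target, and an independent byte limit caps how
 * much a single pass may reclaim.  The toy_* names and parameters are
 * illustrative placeholders, not the actual tunables used by this file.
 */
#if 0
#include <stddef.h>

static size_t
toy_purge_target(size_t cached_pages, unsigned int shrink_shift,
    size_t max_purge_bytes, size_t pagesize)
{
	size_t target = cached_pages >> shrink_shift;	/* 1/2^shift of cache */
	size_t cap = max_purge_bytes / pagesize;	/* per-pass byte cap */

	return (target < cap ? target : cap);
}
#endif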
/*
*/
/*
* Keep frequently used variables together in one cache line.
*/
static struct p_ctrl1 {
#ifdef _LP64
#endif /* _LP64 */
} pctrl1;
static struct p_ctrl2 {
} pctrl2;
static struct p_ctrl3 {
#ifdef _LP64
#endif /* _LP64 */
} pctrl3;
extern struct seg_ops segspt_shmops;
/*
* htag0 argument can be a seg or amp pointer.
*/
(IS_PFLAGS_WIRED((flags)) ? \
/*
* htag0 argument can be a seg or amp pointer.
*/
/*
* seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
* active hash bucket lists. We maintain active bucket lists to reduce the
* overhead of finding active buckets during asynchronous purging since there
* can be 10s of millions of buckets on a large system but only a small subset
* of them in actual use.
*
* There are 2 active bucket lists. The current active list (as per seg_pahcur)
* is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
* buckets. The other list is used by the asynchronous purge thread. This
* allows the purge thread to walk its active list without holding
* seg_pmem_mtx for a long time. When the asynchronous thread is done with its
* list it switches to the current active list and makes the list it just
* finished processing the new current active list.
*
* seg_padd_abuck() only adds the bucket to the current list if the bucket is
* not yet on any list. seg_premove_abuck() may remove the bucket from either
* list. If the bucket is on the current list it is always removed. Otherwise
* the bucket is only removed if the asynchronous purge thread is not currently
* running or seg_premove_abuck() is called by the asynchronous purge thread
* itself. A given bucket can only be on one of the active lists at a time.
* Both routines must be called with the per bucket lock held; they use
* seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
* the first entry is added to the bucket chain and seg_premove_abuck() must
* be called after the last pcp entry is deleted from its chain. Holding the
* per bucket lock prevents a race in which pcp entries are added to a bucket's
* chain after the caller of seg_premove_abuck() checked that the bucket was
* empty, and the bucket is then removed from the active lists even though it
* is no longer empty (which would lose an active bucket from the active
* lists).
*
* Both lists are circular doubly linked lists anchored at seg_pahhead heads.
* New entries are added to the end of the list since LRU is used as the
* purging policy.
*/
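/*
 * The compiled-out sketch below illustrates the list mechanics described
 * above: a circular doubly linked list anchored at a dummy head, with new
 * buckets appended at the tail so that the least recently added buckets sit
 * at the head end.  The toy_* names and types are illustrative only and are
 * not the actual pcache structures.
 */
#if 0
struct toy_bucket {
	struct toy_bucket *tb_lnext;	/* next bucket on the active list */
	struct toy_bucket *tb_lprev;	/* previous bucket on the active list */
};

/* An empty circular list: the anchor (list head) points at itself. */
static void
toy_list_init(struct toy_bucket *head)
{
	head->tb_lnext = head;
	head->tb_lprev = head;
}

/* Append at the tail (just before the anchor) so the head end stays LRU. */
static void
toy_list_add_tail(struct toy_bucket *head, struct toy_bucket *bp)
{
	bp->tb_lnext = head;
	bp->tb_lprev = head->tb_lprev;
	head->tb_lprev->tb_lnext = bp;
	head->tb_lprev = bp;
}

/* Unlink a bucket from whichever active list it is currently on. */
static void
toy_list_remove(struct toy_bucket *bp)
{
	bp->tb_lprev->tb_lnext = bp->tb_lnext;
	bp->tb_lnext->tb_lprev = bp->tb_lprev;
	bp->tb_lnext = bp->tb_lprev = bp;	/* self-linked == on no list */
}
#endif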
static void
{
int lix;
/*
* This bucket can already be on one of active lists
* since seg_premove_abuck() may have failed to remove it
* before.
*/
lix = seg_pahcur;
return;
}
/*
* If this bucket is still on list !lix the async thread can't remove
* it yet since we hold the per bucket lock here. In this case just
* return; the async thread will eventually find and process this
* bucket.
*/
return;
}
/*
* This bucket is not on any active bucket list yet.
* Add the bucket to the tail of current active list.
*/
}
static void
{
int lix;
if (athr) {
/*
* We are called by the asynchronous thread that found this bucket
* on the not currently active (i.e. !seg_pahcur) list. Remove it
* from there. The per bucket lock we are holding makes sure
* seg_pinsert() can't sneak in and add pcp entries to this
* bucket right before we remove the bucket from its list.
*/
lix = !seg_pahcur;
return;
}
lix = seg_pahcur;
/*
* If the bucket is on the currently active list just remove it from
* there.
*/
return;
}
/*
* If the asynchronous thread is not running we can remove the bucket
* from the not currently active list. The bucket must be on this list
* since we already checked that it's not on the other list, and the
* bucket from which we just deleted the last pcp entry must still be
* on one of the active bucket lists.
*/
if (!seg_pathr_on) {
}
}
/*
* Check if the bucket pointed to by hp already has a pcp entry that matches
* the request htag0, addr and len. Set *found to 1 if a match is found and to
* 0 otherwise. Also delete matching entries that cover a smaller address range
* but start at the same address as the addr argument. Return the list of
* deleted entries, if any. This is an internal helper function called from
* seg_pinsert() only for non wired shadow lists. The caller already holds the
* per seg/amp list lock.
*/
static struct seg_pcache *
{
*found = 0;
continue;
}
delcallb_list = pcp;
} else {
*found = 1;
break;
}
}
}
return (delcallb_list);
}
/*
* Look up an address range in the pagelock cache. Return the shadow list and
* bump up the active count. If amp is not NULL use amp as the lookup tag,
* otherwise use seg as the lookup tag.
*/
struct page **
{
void *htag0;
/*
* Skip pagelock cache, while DR is in progress or
* seg_pcache is off.
*/
if (seg_pdisabled) {
return (NULL);
}
ASSERT(seg_phashsize_win != 0);
/*
* If this request wants to write pages
* but write permissions starting from
* addr don't cover the entire length len
* return lookup failure back to the caller.
* It will check protections and fail this
* pagelock operation with an EACCES error.
*/
break;
}
break;
}
}
}
}
return (NULL);
}
/*
* Mark an address range inactive. If the cache is off, or the address range is
* not in the cache, or another shadow list that covers a bigger range is
* found, we call the segment driver to reclaim the pages. Otherwise just
* decrement the active count and set the ref bit. If amp is not NULL use amp
* as the lookup tag, otherwise use seg as the lookup tag.
*/
void
{
void *htag0;
int keep = 0;
/*
* Skip lookup if pcache is not configured.
*/
if (seg_phashsize_win == 0) {
goto out;
}
/*
* Grab the per seg/amp list lock before the hash lock if we may remove an
* inactive entry from pcache.
*/
} else {
}
}
if (keep) {
/*
* Don't remove this pcp entry
* if we didn't find duplicate
* shadow lists on second search.
* Somebody removed those duplicates
* since we dropped hash lock after first
* search.
*/
}
/*
* This entry is no longer active. Remove it
* now either because pcaching is temporarily
* disabled or there are other pcp entries that
* can match this pagelock request (i.e. this
* entry is a duplicate).
*/
}
if (!IS_PCP_WIRED(pcp) &&
/*
* We removed the last entry from this
* bucket. Now remove the bucket from
* its active list.
*/
seg_premove_abuck(hp, 0);
}
}
}
goto out;
} else {
/*
* We found a matching pcp entry but will not
* free it right away even if it's no longer
* active.
*/
/*
* Set the reference bit and mark the
* time of last access to this pcp
* so that asynchronous thread doesn't
* free it immediately since
* it may be reactivated very soon.
*/
}
}
return;
}
} else if (!IS_PFLAGS_WIRED(flags) &&
/*
* This is a duplicate pcp entry. This situation may
* happen if a bigger shadow list that covers our
* range was added while our entry was still active.
* Now we can free our pcp entry if it becomes
* inactive.
*/
/*
* Mark this entry as referenced just in case
* we'll free our own pcp entry soon.
*/
}
/*
* we are already holding pmtx and found a
* duplicate. Don't keep our own pcp entry.
*/
keep = 0;
continue;
}
/*
* We have to use mutex_tryenter to attempt to take the per seg/amp
* list lock since we already hold the hash lock and the list lock
* comes first in the lock order. If mutex_tryenter fails, drop the
* hash lock, retake both locks in the correct order and re-search
* this hash chain.
*/
} else {
}
if (!mutex_tryenter(pmtx)) {
/*
* If we don't find a bigger shadow list on the
* second search (which may happen since we
* dropped the bucket lock) keep the entry that
* matches our own shadow list.
*/
keep = 1;
goto again;
}
}
}
}
out:
if (npages) {
seg_plocked -= npages;
if (!IS_PFLAGS_WIRED(flags)) {
}
}
}
#ifdef DEBUG
#endif
/*
* The seg_pinsert_check() is used by segment drivers to predict whether
* a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
*/
/*ARGSUSED*/
int
{
#ifdef DEBUG
return (SEGP_FAIL);
}
#endif
if (seg_pdisabled) {
return (SEGP_FAIL);
}
ASSERT(seg_phashsize_win != 0);
if (IS_PFLAGS_WIRED(flags)) {
return (SEGP_SUCCESS);
}
return (SEGP_FAIL);
}
return (SEGP_FAIL);
}
return (SEGP_SUCCESS);
}
#ifdef DEBUG
#endif
/*
* Insert address range with shadow list into pagelock cache if there's no
* shadow list already cached for this address range. If the cache is off or
* caching is temporarily disabled or the allowed 'window' is exceeded return
* SEGP_FAIL. Otherwise return SEGP_SUCCESS.
*
* For non wired shadow lists (segvn case) include the address in the hashing
* function to avoid linking all the entries from the same segment or amp on
* the same bucket. amp is used instead of seg if amp is not NULL. Non wired
* pcache entries are also linked on a per segment/amp list so that all of
* them can be found quickly during seg/amp purges without walking the
* entire pcache hash table. For wired shadow lists (segspt case) we
* don't use address hashing and per segment linking because the caller
* currently inserts only one entry per segment that covers the entire
* segment. If we used per segment linking even for segspt it would complicate
* seg_ppurge_wiredpp() locking.
*
* Both the hash bucket lock and the per seg/amp list lock need to be held
* before adding a non wired entry to the hash and per seg/amp lists; the per
* seg/amp list lock should be taken first.
*
* This function will also remove from pcache old inactive shadow lists that
* overlap with this request but cover a smaller range for the same start
* address.
*/
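/*
 * The compiled-out sketch below shows the two hashing strategies described
 * above in toy form: non wired lookups fold the virtual address into the
 * bucket index so the many entries of one seg/amp spread across buckets,
 * while wired lookups hash on the seg/amp tag alone.  The mixing, constants
 * and toy_* names are illustrative only and differ from the actual hash
 * macros used by this file.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

#define	TOY_PAGESHIFT	12	/* assume 4K pages for the sketch */

/* Non wired: mix the tag and the page-aligned address. */
static size_t
toy_hash_nonwired(void *htag0, uintptr_t addr, size_t nbuckets)
{
	uintptr_t tag = (uintptr_t)htag0;

	/* nbuckets must be a power of two (cf. the ISP2 checks in seg_pinit) */
	return (((tag >> 4) ^ (addr >> TOY_PAGESHIFT)) & (nbuckets - 1));
}

/* Wired: only one entry per segment, so hash on the seg/amp tag alone. */
static size_t
toy_hash_wired(void *htag0, size_t nbuckets)
{
	uintptr_t tag = (uintptr_t)htag0;

	return (((tag >> 4) ^ (tag >> 12)) & (nbuckets - 1));
}
#endif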
int
{
#ifdef DEBUG
return (SEGP_FAIL);
}
#endif
if (seg_pdisabled) {
return (SEGP_FAIL);
}
ASSERT(seg_phashsize_win != 0);
if (!IS_PFLAGS_WIRED(flags)) {
return (SEGP_FAIL);
}
}
seg_plocked += npages;
/*
* If amp is not NULL set htag0 to amp otherwise set it to seg.
*/
} else {
}
if (!IS_PFLAGS_WIRED(flags)) {
int found;
void *htag0;
} else {
}
if (found) {
seg_plocked -= npages;
goto out;
}
} else {
}
if (!IS_PFLAGS_WIRED(flags) &&
}
if (!IS_PFLAGS_WIRED(flags)) {
}
out:
npages = 0;
while (delcallb_list != NULL) {
pcp = delcallb_list;
}
if (npages) {
seg_plocked -= npages;
}
return (SEGP_SUCCESS);
}
/*
* Purge entries from the pagelock cache that are neither active
* nor recently used.
*/
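/*
 * The compiled-out sketch below gives one plausible shape of the "not active
 * and not recently used" test: an entry is eligible only when it is inactive,
 * and a forced purge skips the age checks while a normal pass grants one
 * grace pass to entries whose reference bit is set, clock style.  The toy_*
 * names, fields and exact aging policy are illustrative assumptions, not the
 * actual pcache implementation.
 */
#if 0
#include <stddef.h>

struct toy_pcp {
	size_t	tp_active;	/* active users of the shadow list */
	int	tp_ref;		/* set when the entry is released */
	long	tp_lastuse;	/* tick of the most recent release */
};

static int
toy_purge_candidate(struct toy_pcp *pcp, long now, long maxage, int force)
{
	if (pcp->tp_active != 0)
		return (0);		/* in use; never reclaim */
	if (force)
		return (1);		/* DR or cache disable: reclaim anyway */
	if (pcp->tp_ref) {
		pcp->tp_ref = 0;	/* clear and revisit on a later pass */
		return (0);
	}
	return (now - pcp->tp_lastuse > maxage);
}
#endif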
static void
{
int hlinks = 0;
int hlix;
int lowmem;
int trim;
ASSERT(seg_phashsize_win != 0);
/*
* if the cache is off or empty, return
*/
return;
}
if (!force) {
lowmem = 0;
trim = 0;
lowmem = 1;
if (seg_plocked_window >=
(availrmem_initial >> 1)) {
lowmem = 1;
}
if (seg_plocked_window >=
lowmem = 1;
}
}
}
trim = 1;
}
return;
}
if (lowmem) {
} else {
}
if (npgs_to_purge == 0) {
return;
}
} else {
ASSERT(seg_phashsize_wired != 0);
for (hpw = seg_phashtab_wired;
continue;
}
continue;
}
delcallb_list = pcp;
}
}
}
if (seg_pathr_on) {
goto runcb;
}
seg_pathr_on = 1;
hlix = !seg_pahcur;
continue;
}
void *htag0;
continue;
}
continue;
}
} else {
}
if (!mutex_tryenter(pmtx)) {
continue;
}
delcallb_list = pcp;
}
}
if (npgs_purged >= seg_plocked_window) {
break;
}
if (!force) {
if (npgs_purged >= npgs_to_purge) {
break;
}
break;
}
}
}
}
/*
* We processed the entire hlix active bucket list
* but didn't find enough pages to reclaim.
* Switch the lists and walk the other list
* if we haven't done it yet.
*/
seg_pahcur = hlix;
if (++hlinks < 2) {
goto again;
}
/*
* Reinsert the header to point to hlinkp
* so that we start from hlinkp bucket next time around.
*/
}
seg_pathr_on = 0;
/*
* Run the delayed callback list. segments/amps can't go away until the
* callback is executed since they must have non 0 softlockcnt. That's
* why we don't need to hold as/seg/amp locks while running the callbacks.
*/
while (delcallb_list != NULL) {
pcp = delcallb_list;
if (!IS_PCP_WIRED(pcp)) {
}
}
if (npages) {
seg_plocked -= npages;
}
}
/*
* Remove cached pages for segment(s) entries from the hashtable. The segments
* are identified by the pp array. This is useful for multiple segs cached on
* behalf of a dummy segment (ISM/DISM) with a common pp array.
*/
void
{
/*
* if the cache is empty, return
*/
if (seg_plocked == 0) {
return;
}
ASSERT(seg_phashsize_wired != 0);
for (hp = seg_phashtab_wired;
continue;
}
/*
* purge entries which are not active
*/
delcallb_list = pcp;
}
}
/*
* segments can't go away until the callback is executed since
* they must have non 0 softlockcnt. That's why we don't
* need to hold as/seg locks while running the callback.
*/
while (delcallb_list != NULL) {
int done;
pcp = delcallb_list;
if (done) {
goto out;
}
}
}
out:
seg_plocked -= npages;
}
/*
* Purge all entries for a given segment. Since we
* call back into the segment driver directly for page
* reclaim the caller needs to hold the right locks.
*/
void
{
void *htag0;
if (seg_plocked == 0) {
return;
}
ASSERT(seg_phashsize_win != 0);
/*
* If amp is not NULL use amp as a lookup tag otherwise use seg
* as a lookup tag.
*/
if (IS_PFLAGS_WIRED(flags)) {
break;
}
delcallb_list = pcp;
}
}
} else {
} else {
}
break;
}
delcallb_list = pcp;
seg_premove_abuck(hp, 0);
}
}
}
while (delcallb_list != NULL) {
pcp = delcallb_list;
}
seg_plocked -= npages;
if (!IS_PFLAGS_WIRED(flags)) {
}
}
static void seg_pinit_mem_config(void);
/*
* Set up the pagelock cache.
*/
static void
seg_pinit(void)
{
ulong_t i;
seg_plocked = 0;
seg_plocked_window = 0;
if (segpcache_enabled == 0) {
seg_phashsize_win = 0;
seg_phashsize_wired = 0;
seg_pdisabled = 1;
return;
}
seg_pdisabled = 0;
if (segpcache_pcp_maxage_ticks <= 0) {
}
seg_pathr_empty_ahb = 0;
seg_pathr_full_ahb = 0;
/*
* If segpcache_hashsize_win was not set in /etc/system or it has an
* absurd value, set it to a default.
*/
/*
* Create one bucket per 32K (or at least per 8 pages) of
* available memory.
*/
}
if (!ISP2(segpcache_hashsize_win)) {
segpcache_hashsize_win = 1 <<
}
seg_phashsize_win * sizeof (struct seg_phash),
KM_SLEEP);
for (i = 0; i < seg_phashsize_win; i++) {
hp = &seg_phashtab_win[i];
}
seg_pahcur = 0;
seg_pathr_on = 0;
/*
* If segpcache_hashsize_wired was not set in /etc/system or it has an
* absurd value, set it to a default.
*/
if (segpcache_hashsize_wired == 0 ||
/*
* Choose segpcache_hashsize_wired based on physmem.
* Create a bucket per 128K bytes, up to 256K buckets.
*/
} else {
}
}
if (!ISP2(segpcache_hashsize_wired)) {
segpcache_hashsize_wired = 1 <<
}
for (i = 0; i < seg_phashsize_wired; i++) {
}
if (segpcache_maxwindow == 0) {
if (physmegs < 64) {
/* 3% of memory */
} else if (physmegs < 512) {
/* 12% of memory */
} else if (physmegs < 1024) {
/* 25% of memory */
} else if (physmegs < 2048) {
/* 50% of memory */
} else {
/* no limit */
}
}
}
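/*
 * The compiled-out sketch below illustrates the sizing heuristic used by
 * seg_pinit() above: roughly one hash bucket per 32K of memory, rounded to a
 * power of two so that bucket indexing can mask rather than divide.  The
 * toy_* helpers round up, which may differ from the exact rounding done
 * above; the names are illustrative placeholders.
 */
#if 0
#include <stddef.h>

/* Round up to the next power of two. */
static size_t
toy_roundup_pow2(size_t n)
{
	size_t p = 1;

	while (p < n)
		p <<= 1;
	return (p);
}

/* One bucket per 32K of memory, with a floor of a single bucket. */
static size_t
toy_pcache_buckets(size_t mem_bytes)
{
	size_t nbuckets = mem_bytes / (32 * 1024);

	if (nbuckets == 0)
		nbuckets = 1;
	return (toy_roundup_pow2(nbuckets));
}
#endif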
/*
* called by pageout if memory is low
*/
void
seg_preap(void)
{
/*
* if the cache is off or empty, return
*/
if (seg_plocked_window == 0) {
return;
}
ASSERT(seg_phashsize_win != 0);
/*
* If somebody is already purging pcache
* just return.
*/
if (seg_pdisabled) {
return;
}
}
/*
* Run as a background thread to reclaim pagelock
* pages which have not been used recently.
*/
void
seg_pasync_thread(void)
{
if (seg_phashsize_win == 0) {
thread_exit();
/*NOTREACHED*/
}
callb_generic_cpr, "seg_pasync");
if (segpcache_reap_ticks <= 0) {
}
for (;;) {
if (seg_pdisabled == 0) {
seg_ppurge_async(0);
}
}
}
/*
* Initialize segment management data structures.
*/
void
seg_init(void)
{
if (ksp) {
}
seg_pinit();
}
/*
* Allocate a segment to cover [base, base+size]
* and attach it to the specified address space.
*/
struct seg *
{
}
/* caller must fill in ops, data */
return (new);
}
/*
* Attach a segment to the address space. Used by seg_alloc()
* and for kernel startup to attach to static segments.
*/
int
{
/*
* as_addseg() will add the segment at the appropriate point
* in the list. It will return -1 if there is overlap with
* an already existing segment.
*/
}
/*
* Unmap a segment and free it from its associated address space.
* This should be called by anybody who's finished with a whole segment's
* mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
* responsibility of the segment driver to unlink the segment
* from the address space, and to free public and private data structures
* associated with the segment. (This is typically done by a call to
* seg_free()).
*/
void
{
#ifdef DEBUG
int ret;
#endif /* DEBUG */
/* Shouldn't have called seg_unmap if mapping isn't yet established */
/* Unmap the whole mapping */
#ifdef DEBUG
#else
#endif /* DEBUG */
}
/*
* Free the segment from its associated as. This should only be called
* if a mapping to the segment has not yet been established (e.g., if
* an error occurs in the middle of doing an as_map when the segment
* has already been partially set up) or if it has already been deleted
* (e.g., from a segment driver unmap routine if the unmap applies to the
* entire segment). If the mapping is currently set up then seg_unmap() should
* be called instead.
*/
void
{
/*
* If the segment private data field is NULL,
* then segment driver is not attached yet.
*/
}
/*ARGSUSED*/
static void
void *arg,
{
/* Nothing to do. */
}
void
seg_p_enable(void)
{
ASSERT(seg_pdisabled != 0);
}
/*
* seg_p_disable - disables seg_pcache, and then attempts to empty the
* cache.
* Returns SEGP_SUCCESS if the cache was successfully emptied, or
* SEGP_FAIL if the cache could not be emptied.
*/
int
seg_p_disable(void)
{
int stall_count = 0;
ASSERT(seg_pdisabled != 0);
/*
* Attempt to empty the cache. Terminate if seg_plocked does not
* diminish with SEGP_STALL_THRESHOLD consecutive attempts.
*/
while (seg_plocked != 0) {
ASSERT(seg_phashsize_win != 0);
seg_ppurge_async(1);
if (seg_plocked == old_plocked) {
if (stall_count++ > SEGP_STALL_THRESHOLD) {
return (SEGP_FAIL);
}
} else
stall_count = 0;
if (seg_plocked != 0)
}
return (SEGP_SUCCESS);
}
/*
* Attempt to purge seg_pcache. May need to return before this has
* completed to allow other pre_del callbacks to unlock pages. This is
* ok because:
* 1) The seg_pdisabled flag has been set so at least we won't
* cache any more locks and the locks we couldn't purge
* will not be held if they do get released by a subsequent
* pre-delete callback.
*
* 2) The rest of the memory delete thread processing does not
* depend on the changes made in this pre-delete callback. No
* panics will result, the worst that will happen is that the
* DR code will timeout and cancel the delete.
*/
/*ARGSUSED*/
static int
void *arg,
{
if (seg_phashsize_win == 0) {
return (0);
}
if (seg_p_disable() != SEGP_SUCCESS)
"!Pre-delete couldn't purge"" pagelock cache - continuing");
return (0);
}
/*ARGSUSED*/
static void
void *arg,
int cancelled)
{
if (seg_phashsize_win == 0) {
return;
}
seg_p_enable();
}
};
static void
seg_pinit_mem_config(void)
{
int ret;
/*
* Want to catch this in the debug kernel. At run time, if the
* callbacks don't get run all will be OK as the disable just makes
* it more likely that the pages can be collected.
*/
}
/*
* Verify that segment is not a shared anonymous segment which reserves
* swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
* from one zone to another if any segments are shared. This is because the
* last process to exit will credit the swap reservation. This could lead
* to the swap being reserved by one zone, and credited to another.
*/
{
return (B_FALSE);
return (B_FALSE);
}
return (B_TRUE);
}
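/*
 * The compiled-out sketch below shows the shape of the check described above:
 * a shared, anon backed segment that holds a swap reservation pins the zone's
 * swap accounting, so a zone change must be refused for it.  The toy_* type
 * and field names are illustrative placeholders, not the actual segment
 * driver private data layout.
 */
#if 0
#include <stddef.h>

struct toy_seg_data {
	int	td_shared;	/* nonzero for MAP_SHARED mappings */
	int	td_has_anon;	/* nonzero if backed by anonymous memory */
	size_t	td_swresv;	/* bytes of swap reserved by this segment */
};

/* Nonzero means the segment does not block a zone change. */
static int
toy_can_change_zones(const struct toy_seg_data *td)
{
	if (td->td_shared && td->td_has_anon && td->td_swresv > 0)
		return (0);	/* swap could end up credited to another zone */
	return (1);
}
#endif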
/*
* Return swap reserved by a segment backing a private mapping.
*/
{
}
return (swap);
}
/*
* Generic not-supported function for SEGOP_INHERIT.
*/
/* ARGSUSED */
int
{
return (ENOTSUP);
}