/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2015, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* VM - segment management.
*/
#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>
/*
* kstats for segment advise
*/
segadvstat_t segadvstat = {
{ "MADV_FREE_hit", KSTAT_DATA_ULONG },
{ "MADV_FREE_miss", KSTAT_DATA_ULONG },
};
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
/*
* entry in the segment page cache
*/
struct seg_pcache {
struct seg_pcache *p_hnext; /* list for hashed blocks */
struct seg_pcache *p_hprev;
pcache_link_t p_plink; /* per segment/amp list */
void *p_htag0; /* segment/amp pointer */
caddr_t p_addr; /* base address/anon_idx */
size_t p_len; /* total bytes */
	size_t p_wlen; /* writable bytes at p_addr */
struct page **p_pp; /* pp shadow list */
seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
clock_t p_lbolt; /* lbolt from last use */
struct seg_phash *p_hashp; /* our pcache hash bucket */
uint_t p_active; /* active count */
uchar_t p_write; /* true if S_WRITE */
uchar_t p_ref; /* reference byte */
ushort_t p_flags; /* bit flags */
};
struct seg_phash {
struct seg_pcache *p_hnext; /* list for hashed blocks */
struct seg_pcache *p_hprev;
kmutex_t p_hmutex; /* protects hash bucket */
pcache_link_t p_halink[2]; /* active bucket linkages */
};
struct seg_phash_wired {
struct seg_pcache *p_hnext; /* list for hashed blocks */
struct seg_pcache *p_hprev;
kmutex_t p_hmutex; /* protects hash bucket */
};
/*
 * A parameter to control the maximum number of bytes that can be
* purged from pcache at a time.
*/
#define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
/*
* log2(fraction of pcache to reclaim at a time).
*/
#define P_SHRINK_SHFT (5)
/*
* The following variables can be tuned via /etc/system.
*/
int segpcache_enabled = 1; /* if 1, shadow lists are cached */
pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
int segpcache_reap_sec = 1; /* reap check rate in secs */
clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
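/*
 * For illustration only: the tunables above are normally adjusted from
 * /etc/system.  A hypothetical configuration that turns shadow list caching
 * off entirely, or alternatively caps the cache window and slows down the
 * reaper, might look like this (values are examples, not recommendations):
 *
 *	set segpcache_enabled = 0
 * or
 *	set segpcache_maxwindow = 8192
 *	set segpcache_reap_sec = 5
 */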
static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;
#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)
/*
* Keep frequently used variables together in one cache line.
*/
static struct p_ctrl1 {
uint_t p_disabled; /* if not 0, caching temporarily off */
pgcnt_t p_maxwin; /* max # of pages that can be cached */
size_t p_hashwin_sz; /* # of non wired buckets */
struct seg_phash *p_htabwin; /* hash table for non wired entries */
size_t p_hashwired_sz; /* # of wired buckets */
struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
#ifdef _LP64
ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;
static struct p_ctrl2 {
kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
pgcnt_t p_locked_win; /* # pages from window */
pgcnt_t p_locked; /* # of pages cached by pagelock */
uchar_t p_ahcur; /* current active links for insert/delete */
uchar_t p_athr_on; /* async reclaim thread is running. */
pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;
static struct p_ctrl3 {
clock_t p_pcp_maxage; /* max pcp age in ticks */
ulong_t p_athr_empty_ahb; /* athread walk stats */
ulong_t p_athr_full_ahb; /* athread walk stats */
pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
int p_shrink_shft; /* reap shift factor */
#ifdef _LP64
ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;
#define seg_pdisabled pctrl1.p_disabled
#define seg_pmaxwindow pctrl1.p_maxwin
#define seg_phashsize_win pctrl1.p_hashwin_sz
#define seg_phashtab_win pctrl1.p_htabwin
#define seg_phashsize_wired pctrl1.p_hashwired_sz
#define seg_phashtab_wired pctrl1.p_htabwired
#define seg_pkmcache pctrl1.p_kmcache
#define seg_pmem_mtx pctrl2.p_mem_mtx
#define seg_plocked_window pctrl2.p_locked_win
#define seg_plocked pctrl2.p_locked
#define seg_pahcur pctrl2.p_ahcur
#define seg_pathr_on pctrl2.p_athr_on
#define seg_pahhead pctrl2.p_ahhead
#define seg_pmax_pcpage pctrl3.p_pcp_maxage
#define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
#define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
#define seg_pshrink_shift pctrl3.p_shrink_shft
#define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
#define P_HASHWIN_MASK (seg_phashsize_win - 1)
#define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
#define P_BASESHIFT (6)
kthread_t *seg_pasync_thr;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;
#define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
#define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
#define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
#define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
/*
* htag0 argument can be a seg or amp pointer.
*/
#define P_HASHBP(seg, htag0, addr, flags) \
(IS_PFLAGS_WIRED((flags)) ? \
((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
(&seg_phashtab_win[P_HASHWIN_MASK & \
(((uintptr_t)(htag0) >> 3) ^ \
((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
(flags >> 16) : page_get_shift((seg)->s_szc))))]))
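/*
 * Illustrative note on the flags encoding expected by P_HASHBP() for non
 * wired entries (derived from the macro above; "pshift" is a hypothetical
 * caller variable): when SEGP_PSHIFT is set, the upper 16 bits of flags
 * carry the page shift used for bucket selection, e.g.
 *
 *	flags = SEGP_PSHIFT | (pshift << 16);
 *	hp = P_HASHBP(seg, htag0, addr, flags);
 *
 * With SEGP_PSHIFT clear the shift falls back to page_get_shift(seg->s_szc).
 * Note that seg_pinsert() keeps only the low 16 bits in pcp->p_flags.
 */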
/*
* htag0 argument can be a seg or amp pointer.
*/
#define P_MATCH(pcp, htag0, addr, len) \
((pcp)->p_htag0 == (htag0) && \
(pcp)->p_addr == (addr) && \
(pcp)->p_len >= (len))
#define P_MATCH_PP(pcp, htag0, addr, len, pp) \
((pcp)->p_pp == (pp) && \
(pcp)->p_htag0 == (htag0) && \
(pcp)->p_addr == (addr) && \
(pcp)->p_len >= (len))
#define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
offsetof(struct seg_pcache, p_plink)))
#define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
offsetof(struct seg_phash, p_halink[l])))
/*
* seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
* active hash bucket lists. We maintain active bucket lists to reduce the
* overhead of finding active buckets during asynchronous purging since there
* can be 10s of millions of buckets on a large system but only a small subset
* of them in actual use.
*
 * There are two active bucket lists. The current active list (selected by
 * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
 * and delete buckets. The other list is used by the asynchronous purge
 * thread. This allows the purge thread to walk its active list without
 * holding seg_pmem_mtx for a long time. When the asynchronous thread is
 * done with its list it switches to the current active list and makes the
 * list it just finished processing the new current active list.
 *
 * seg_padd_abuck() only adds the bucket to the current list if the bucket
 * is not yet on any list. seg_premove_abuck() may remove the bucket from
 * either list. If the bucket is on the current list it will always be
 * removed. Otherwise the bucket is only removed if the asynchronous purge
 * thread is not currently running or seg_premove_abuck() is called by the
 * asynchronous purge thread itself. A given bucket can only be on one of
 * the active lists at a time. Both routines must be called with the per
 * bucket lock held; they use seg_pmem_mtx to protect the list updates
 * themselves. seg_padd_abuck() must be called after the first entry is
 * added to the bucket chain and seg_premove_abuck() must be called after
 * the last pcp entry is deleted from its chain. Holding the per bucket
 * lock avoids a potential race where seg_premove_abuck() removes a bucket
 * while pcp entries are added to its chain right after the caller checked
 * that the bucket has no entries (that race would lose an active bucket
 * from the active lists).
*
* Both lists are circular doubly linked lists anchored at seg_pahhead heads.
* New entries are added to the end of the list since LRU is used as the
* purging policy.
*/
static void
seg_padd_abuck(struct seg_phash *hp)
{
int lix;
ASSERT(MUTEX_HELD(&hp->p_hmutex));
ASSERT((struct seg_phash *)hp->p_hnext != hp);
ASSERT((struct seg_phash *)hp->p_hprev != hp);
ASSERT(hp->p_hnext == hp->p_hprev);
ASSERT(!IS_PCP_WIRED(hp->p_hnext));
ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
ASSERT(hp >= seg_phashtab_win &&
hp < &seg_phashtab_win[seg_phashsize_win]);
/*
	 * This bucket may already be on one of the active lists
	 * since seg_premove_abuck() may have failed to remove it
	 * earlier.
*/
mutex_enter(&seg_pmem_mtx);
lix = seg_pahcur;
ASSERT(lix >= 0 && lix <= 1);
if (hp->p_halink[lix].p_lnext != NULL) {
ASSERT(hp->p_halink[lix].p_lprev != NULL);
ASSERT(hp->p_halink[!lix].p_lnext == NULL);
ASSERT(hp->p_halink[!lix].p_lprev == NULL);
mutex_exit(&seg_pmem_mtx);
return;
}
ASSERT(hp->p_halink[lix].p_lprev == NULL);
/*
	 * If this bucket is still on list !lix the async thread can't yet
	 * remove it since we hold the per bucket lock here. In this case just
	 * return since the async thread will eventually find and process this
	 * bucket.
*/
if (hp->p_halink[!lix].p_lnext != NULL) {
ASSERT(hp->p_halink[!lix].p_lprev != NULL);
mutex_exit(&seg_pmem_mtx);
return;
}
ASSERT(hp->p_halink[!lix].p_lprev == NULL);
/*
* This bucket is not on any active bucket list yet.
* Add the bucket to the tail of current active list.
*/
hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
mutex_exit(&seg_pmem_mtx);
}
static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
int lix;
ASSERT(MUTEX_HELD(&hp->p_hmutex));
ASSERT((struct seg_phash *)hp->p_hnext == hp);
ASSERT((struct seg_phash *)hp->p_hprev == hp);
ASSERT(hp >= seg_phashtab_win &&
hp < &seg_phashtab_win[seg_phashsize_win]);
if (athr) {
ASSERT(seg_pathr_on);
ASSERT(seg_pahcur <= 1);
/*
		 * We are called by the asynchronous thread that found this
		 * bucket on the not currently active (i.e. !seg_pahcur) list.
		 * Remove it from there. The per bucket lock we are holding
		 * makes sure seg_pinsert() can't sneak in and add pcp entries
		 * to this bucket right before we remove it from its list.
*/
lix = !seg_pahcur;
ASSERT(hp->p_halink[lix].p_lnext != NULL);
ASSERT(hp->p_halink[lix].p_lprev != NULL);
ASSERT(hp->p_halink[!lix].p_lnext == NULL);
ASSERT(hp->p_halink[!lix].p_lprev == NULL);
hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
hp->p_halink[lix].p_lnext = NULL;
hp->p_halink[lix].p_lprev = NULL;
return;
}
mutex_enter(&seg_pmem_mtx);
lix = seg_pahcur;
ASSERT(lix >= 0 && lix <= 1);
/*
* If the bucket is on currently active list just remove it from
* there.
*/
if (hp->p_halink[lix].p_lnext != NULL) {
ASSERT(hp->p_halink[lix].p_lprev != NULL);
ASSERT(hp->p_halink[!lix].p_lnext == NULL);
ASSERT(hp->p_halink[!lix].p_lprev == NULL);
hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
hp->p_halink[lix].p_lnext = NULL;
hp->p_halink[lix].p_lprev = NULL;
mutex_exit(&seg_pmem_mtx);
return;
}
ASSERT(hp->p_halink[lix].p_lprev == NULL);
/*
	 * If the asynchronous thread is not running we can remove the bucket
	 * from the not currently active list. The bucket must be on that list
	 * since we already checked that it's not on the other list and a
	 * bucket from which we just deleted the last pcp entry must still be
	 * on one of the active bucket lists.
*/
lix = !lix;
ASSERT(hp->p_halink[lix].p_lnext != NULL);
ASSERT(hp->p_halink[lix].p_lprev != NULL);
if (!seg_pathr_on) {
hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
hp->p_halink[lix].p_lnext = NULL;
hp->p_halink[lix].p_lprev = NULL;
}
mutex_exit(&seg_pmem_mtx);
}
/*
 * Check if the bucket pointed to by hp already has a pcp entry that matches
 * the requested htag0, addr and len. Set *found to 1 if a match is found and
 * to 0 otherwise.
* Also delete matching entries that cover smaller address range but start
* at the same address as addr argument. Return the list of deleted entries if
* any. This is an internal helper function called from seg_pinsert() only
* for non wired shadow lists. The caller already holds a per seg/amp list
* lock.
*/
static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
caddr_t addr, size_t len, int *found)
{
struct seg_pcache *pcp;
struct seg_pcache *delcallb_list = NULL;
ASSERT(MUTEX_HELD(&hp->p_hmutex));
*found = 0;
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
ASSERT(pcp->p_hashp == hp);
if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
ASSERT(!IS_PCP_WIRED(pcp));
if (pcp->p_len < len) {
pcache_link_t *plinkp;
if (pcp->p_active) {
continue;
}
plinkp = &pcp->p_plink;
plinkp->p_lprev->p_lnext = plinkp->p_lnext;
plinkp->p_lnext->p_lprev = plinkp->p_lprev;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
} else {
*found = 1;
break;
}
}
}
return (delcallb_list);
}
/*
* lookup an address range in pagelock cache. Return shadow list and bump up
* active count. If amp is not NULL use amp as a lookup tag otherwise use seg
* as a lookup tag.
*/
struct page **
seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
enum seg_rw rw, uint_t flags)
{
struct seg_pcache *pcp;
struct seg_phash *hp;
void *htag0;
ASSERT(seg != NULL);
ASSERT(rw == S_READ || rw == S_WRITE);
/*
	 * Skip the pagelock cache while DR is in progress or
* seg_pcache is off.
*/
if (seg_pdisabled) {
return (NULL);
}
ASSERT(seg_phashsize_win != 0);
htag0 = (amp == NULL ? (void *)seg : (void *)amp);
hp = P_HASHBP(seg, htag0, addr, flags);
mutex_enter(&hp->p_hmutex);
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
ASSERT(pcp->p_hashp == hp);
if (P_MATCH(pcp, htag0, addr, len)) {
ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
/*
* If this request wants to write pages
* but write permissions starting from
* addr don't cover the entire length len
* return lookup failure back to the caller.
* It will check protections and fail this
			 * pagelock operation with an EACCES error.
*/
if (rw == S_WRITE && pcp->p_wlen < len) {
break;
}
if (pcp->p_active == UINT_MAX) {
break;
}
pcp->p_active++;
if (rw == S_WRITE && !pcp->p_write) {
pcp->p_write = 1;
}
mutex_exit(&hp->p_hmutex);
return (pcp->p_pp);
}
}
mutex_exit(&hp->p_hmutex);
return (NULL);
}
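/*
 * Illustrative sketch of the pagelock fast path as seen from a hypothetical
 * segment driver (not part of this file; pplist/ppp are placeholders). On a
 * hit the cached shadow list is handed straight back to the caller:
 *
 *	pplist = seg_plookup(seg, amp, addr, len, rw, flags);
 *	if (pplist != NULL) {
 *		*ppp = pplist;
 *		return (0);
 *	}
 *	... on a miss, fall through to the slow path: lock the pages,
 *	... build a shadow list and offer it to the cache with
 *	... seg_pinsert() (see the sketch after seg_pinsert() below).
 */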
/*
* mark address range inactive. If the cache is off or the address range is
* not in the cache or another shadow list that covers bigger range is found
* we call the segment driver to reclaim the pages. Otherwise just decrement
* active count and set ref bit. If amp is not NULL use amp as a lookup tag
* otherwise use seg as a lookup tag.
*/
void
seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
seg_preclaim_cbfunc_t callback)
{
struct seg_pcache *pcp;
struct seg_phash *hp;
kmutex_t *pmtx = NULL;
pcache_link_t *pheadp;
void *htag0;
pgcnt_t npages = 0;
int keep = 0;
ASSERT(seg != NULL);
ASSERT(rw == S_READ || rw == S_WRITE);
htag0 = (amp == NULL ? (void *)seg : (void *)amp);
/*
* Skip lookup if pcache is not configured.
*/
if (seg_phashsize_win == 0) {
goto out;
}
/*
* Grab per seg/amp lock before hash lock if we are going to remove
* inactive entry from pcache.
*/
if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
if (amp == NULL) {
pheadp = &seg->s_phead;
pmtx = &seg->s_pmtx;
} else {
pheadp = &amp->a_phead;
pmtx = &amp->a_pmtx;
}
mutex_enter(pmtx);
}
hp = P_HASHBP(seg, htag0, addr, flags);
mutex_enter(&hp->p_hmutex);
again:
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
ASSERT(pcp->p_hashp == hp);
if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
ASSERT(pcp->p_active);
if (keep) {
/*
* Don't remove this pcp entry
* if we didn't find duplicate
* shadow lists on second search.
* Somebody removed those duplicates
* since we dropped hash lock after first
* search.
*/
ASSERT(pmtx != NULL);
ASSERT(!IS_PFLAGS_WIRED(flags));
mutex_exit(pmtx);
pmtx = NULL;
}
pcp->p_active--;
if (pcp->p_active == 0 && (pmtx != NULL ||
(seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
/*
* This entry is no longer active. Remove it
* now either because pcaching is temporarily
* disabled or there're other pcp entries that
* can match this pagelock request (i.e. this
* entry is a duplicate).
*/
ASSERT(callback == pcp->p_callback);
if (pmtx != NULL) {
pcache_link_t *plinkp = &pcp->p_plink;
ASSERT(!IS_PCP_WIRED(pcp));
ASSERT(pheadp->p_lnext != pheadp);
ASSERT(pheadp->p_lprev != pheadp);
plinkp->p_lprev->p_lnext =
plinkp->p_lnext;
plinkp->p_lnext->p_lprev =
plinkp->p_lprev;
}
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
if (!IS_PCP_WIRED(pcp) &&
hp->p_hnext == (struct seg_pcache *)hp) {
/*
* We removed the last entry from this
* bucket. Now remove the bucket from
* its active list.
*/
seg_premove_abuck(hp, 0);
}
mutex_exit(&hp->p_hmutex);
if (pmtx != NULL) {
mutex_exit(pmtx);
}
len = pcp->p_len;
npages = btop(len);
if (rw != S_WRITE && pcp->p_write) {
rw = S_WRITE;
}
kmem_cache_free(seg_pkmcache, pcp);
goto out;
} else {
/*
* We found a matching pcp entry but will not
* free it right away even if it's no longer
* active.
*/
if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
/*
* Set the reference bit and mark the
* time of last access to this pcp
* so that asynchronous thread doesn't
* free it immediately since
* it may be reactivated very soon.
*/
pcp->p_lbolt = ddi_get_lbolt();
pcp->p_ref = 1;
}
mutex_exit(&hp->p_hmutex);
if (pmtx != NULL) {
mutex_exit(pmtx);
}
return;
}
} else if (!IS_PFLAGS_WIRED(flags) &&
P_MATCH(pcp, htag0, addr, len)) {
/*
* This is a duplicate pcp entry. This situation may
* happen if a bigger shadow list that covers our
* range was added while our entry was still active.
* Now we can free our pcp entry if it becomes
* inactive.
*/
if (!pcp->p_active) {
/*
* Mark this entry as referenced just in case
* we'll free our own pcp entry soon.
*/
pcp->p_lbolt = ddi_get_lbolt();
pcp->p_ref = 1;
}
if (pmtx != NULL) {
/*
* we are already holding pmtx and found a
* duplicate. Don't keep our own pcp entry.
*/
keep = 0;
continue;
}
/*
* We have to use mutex_tryenter to attempt to lock
* seg/amp list lock since we already hold hash lock
* and seg/amp list lock is above hash lock in lock
* order. If mutex_tryenter fails drop hash lock and
			 * retake both locks in correct order and re-search
* this hash chain.
*/
ASSERT(keep == 0);
if (amp == NULL) {
pheadp = &seg->s_phead;
pmtx = &seg->s_pmtx;
} else {
pheadp = &amp->a_phead;
pmtx = &amp->a_pmtx;
}
if (!mutex_tryenter(pmtx)) {
mutex_exit(&hp->p_hmutex);
mutex_enter(pmtx);
mutex_enter(&hp->p_hmutex);
/*
* If we don't find bigger shadow list on
* second search (it may happen since we
* dropped bucket lock) keep the entry that
* matches our own shadow list.
*/
keep = 1;
goto again;
}
}
}
mutex_exit(&hp->p_hmutex);
if (pmtx != NULL) {
mutex_exit(pmtx);
}
out:
(*callback)(htag0, addr, len, pp, rw, 0);
if (npages) {
mutex_enter(&seg_pmem_mtx);
ASSERT(seg_plocked >= npages);
seg_plocked -= npages;
if (!IS_PFLAGS_WIRED(flags)) {
ASSERT(seg_plocked_window >= npages);
seg_plocked_window -= npages;
}
mutex_exit(&seg_pmem_mtx);
}
}
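/*
 * For reference, a hypothetical skeleton of the reclaim callback a segment
 * driver passes in (seg_preclaim_cbfunc_t); xx_reclaim() and its body are
 * placeholders, not an actual driver routine. The callback gets the seg/amp
 * tag, the cached range, the shadow list, the effective rw mode and an
 * "async" flag, and is expected to unlock the pages and drop the shadow
 * list. Within this file only seg_ppurge_wiredpp() looks at the return
 * value, using a nonzero result to stop its walk early.
 *
 *	static int
 *	xx_reclaim(void *tag, caddr_t addr, size_t len, struct page **pplist,
 *	    enum seg_rw rw, int async)
 *	{
 *		... page_unlock() every page on pplist, free the list,
 *		... drop the driver's softlockcnt ...
 *		return (0);
 *	}
 */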
#ifdef DEBUG
static uint32_t p_insert_chk_mtbf = 0;
#endif
/*
* The seg_pinsert_check() is used by segment drivers to predict whether
* a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
*/
/*ARGSUSED*/
int
seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
size_t len, uint_t flags)
{
ASSERT(seg != NULL);
#ifdef DEBUG
if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
return (SEGP_FAIL);
}
#endif
if (seg_pdisabled) {
return (SEGP_FAIL);
}
ASSERT(seg_phashsize_win != 0);
if (IS_PFLAGS_WIRED(flags)) {
return (SEGP_SUCCESS);
}
if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
return (SEGP_FAIL);
}
if (freemem < desfree) {
return (SEGP_FAIL);
}
return (SEGP_SUCCESS);
}
#ifdef DEBUG
static uint32_t p_insert_mtbf = 0;
#endif
/*
* Insert address range with shadow list into pagelock cache if there's no
* shadow list already cached for this address range. If the cache is off or
* caching is temporarily disabled or the allowed 'window' is exceeded return
* SEGP_FAIL. Otherwise return SEGP_SUCCESS.
*
* For non wired shadow lists (segvn case) include address in the hashing
* function to avoid linking all the entries from the same segment or amp on
* the same bucket. amp is used instead of seg if amp is not NULL. Non wired
* pcache entries are also linked on a per segment/amp list so that all
* entries can be found quickly during seg/amp purge without walking the
* entire pcache hash table. For wired shadow lists (segspt case) we
* don't use address hashing and per segment linking because the caller
* currently inserts only one entry per segment that covers the entire
* segment. If we used per segment linking even for segspt it would complicate
* seg_ppurge_wiredpp() locking.
*
* Both hash bucket and per seg/amp locks need to be held before adding a non
 * wired entry to hash and per seg/amp lists. The per seg/amp lock should be
 * taken first.
*
* This function will also remove from pcache old inactive shadow lists that
* overlap with this request but cover smaller range for the same start
* address.
*/
int
seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
seg_preclaim_cbfunc_t callback)
{
struct seg_pcache *pcp;
struct seg_phash *hp;
pgcnt_t npages;
pcache_link_t *pheadp;
kmutex_t *pmtx;
struct seg_pcache *delcallb_list = NULL;
ASSERT(seg != NULL);
ASSERT(rw == S_READ || rw == S_WRITE);
ASSERT(rw == S_READ || wlen == len);
ASSERT(rw == S_WRITE || wlen <= len);
ASSERT(amp == NULL || wlen == len);
#ifdef DEBUG
if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
return (SEGP_FAIL);
}
#endif
if (seg_pdisabled) {
return (SEGP_FAIL);
}
ASSERT(seg_phashsize_win != 0);
ASSERT((len & PAGEOFFSET) == 0);
npages = btop(len);
mutex_enter(&seg_pmem_mtx);
if (!IS_PFLAGS_WIRED(flags)) {
if (seg_plocked_window + npages > seg_pmaxwindow) {
mutex_exit(&seg_pmem_mtx);
return (SEGP_FAIL);
}
seg_plocked_window += npages;
}
seg_plocked += npages;
mutex_exit(&seg_pmem_mtx);
pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
/*
* If amp is not NULL set htag0 to amp otherwise set it to seg.
*/
if (amp == NULL) {
pcp->p_htag0 = (void *)seg;
pcp->p_flags = flags & 0xffff;
} else {
pcp->p_htag0 = (void *)amp;
pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
}
pcp->p_addr = addr;
pcp->p_len = len;
pcp->p_wlen = wlen;
pcp->p_pp = pp;
pcp->p_write = (rw == S_WRITE);
pcp->p_callback = callback;
pcp->p_active = 1;
hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
if (!IS_PFLAGS_WIRED(flags)) {
int found;
void *htag0;
if (amp == NULL) {
pheadp = &seg->s_phead;
pmtx = &seg->s_pmtx;
htag0 = (void *)seg;
} else {
pheadp = &amp->a_phead;
pmtx = &amp->a_pmtx;
htag0 = (void *)amp;
}
mutex_enter(pmtx);
mutex_enter(&hp->p_hmutex);
delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
len, &found);
if (found) {
mutex_exit(&hp->p_hmutex);
mutex_exit(pmtx);
mutex_enter(&seg_pmem_mtx);
seg_plocked -= npages;
seg_plocked_window -= npages;
mutex_exit(&seg_pmem_mtx);
kmem_cache_free(seg_pkmcache, pcp);
goto out;
}
pcp->p_plink.p_lnext = pheadp->p_lnext;
pcp->p_plink.p_lprev = pheadp;
pheadp->p_lnext->p_lprev = &pcp->p_plink;
pheadp->p_lnext = &pcp->p_plink;
} else {
mutex_enter(&hp->p_hmutex);
}
pcp->p_hashp = hp;
pcp->p_hnext = hp->p_hnext;
pcp->p_hprev = (struct seg_pcache *)hp;
hp->p_hnext->p_hprev = pcp;
hp->p_hnext = pcp;
if (!IS_PFLAGS_WIRED(flags) &&
hp->p_hprev == pcp) {
seg_padd_abuck(hp);
}
mutex_exit(&hp->p_hmutex);
if (!IS_PFLAGS_WIRED(flags)) {
mutex_exit(pmtx);
}
out:
npages = 0;
while (delcallb_list != NULL) {
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
npages += btop(pcp->p_len);
kmem_cache_free(seg_pkmcache, pcp);
}
if (npages) {
ASSERT(!IS_PFLAGS_WIRED(flags));
mutex_enter(&seg_pmem_mtx);
ASSERT(seg_plocked >= npages);
ASSERT(seg_plocked_window >= npages);
seg_plocked -= npages;
seg_plocked_window -= npages;
mutex_exit(&seg_pmem_mtx);
}
return (SEGP_SUCCESS);
}
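/*
 * Illustrative sketch of the slow path that complements the seg_plookup()
 * example above (hypothetical driver code; xx_reclaim() and the pplist
 * handling are placeholders). The driver checks the window first, builds
 * the shadow list itself and only then offers it to the cache:
 *
 *	if (seg_pinsert_check(seg, amp, addr, len, flags) == SEGP_FAIL)
 *		... skip caching and do a plain pagelock ...
 *
 *	... lock the pages and build pplist ...
 *
 *	if (seg_pinsert(seg, amp, addr, len, wlen, pplist, rw, flags,
 *	    xx_reclaim) == SEGP_FAIL)
 *		... the cache refused the entry; the driver keeps pplist
 *		... and releases it itself on unlock ...
 *
 * On unlock the driver calls seg_pinactive(seg, amp, addr, len, pplist, rw,
 * flags, xx_reclaim); the cache then either keeps the shadow list around
 * for reuse or invokes xx_reclaim() to drop it.
 */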
/*
* purge entries from the pagelock cache if not active
* and not recently used.
*/
static void
seg_ppurge_async(int force)
{
struct seg_pcache *delcallb_list = NULL;
struct seg_pcache *pcp;
struct seg_phash *hp;
pgcnt_t npages = 0;
pgcnt_t npages_window = 0;
pgcnt_t npgs_to_purge;
pgcnt_t npgs_purged = 0;
int hlinks = 0;
int hlix;
pcache_link_t *hlinkp;
pcache_link_t *hlnextp = NULL;
int lowmem;
int trim;
ASSERT(seg_phashsize_win != 0);
/*
* if the cache is off or empty, return
*/
if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
return;
}
if (!force) {
lowmem = 0;
trim = 0;
if (freemem < lotsfree + needfree) {
spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
if (fmem <= 5 * (desfree >> 2)) {
lowmem = 1;
} else if (fmem <= 7 * (lotsfree >> 3)) {
if (seg_plocked_window >=
(availrmem_initial >> 1)) {
lowmem = 1;
}
} else if (fmem < lotsfree) {
if (seg_plocked_window >=
3 * (availrmem_initial >> 2)) {
lowmem = 1;
}
}
}
if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
trim = 1;
}
if (!lowmem && !trim) {
return;
}
npgs_to_purge = seg_plocked_window >>
seg_pshrink_shift;
if (lowmem) {
npgs_to_purge = MIN(npgs_to_purge,
MAX(seg_pmaxapurge_npages, desfree));
} else {
npgs_to_purge = MIN(npgs_to_purge,
seg_pmaxapurge_npages);
}
if (npgs_to_purge == 0) {
return;
}
} else {
struct seg_phash_wired *hpw;
ASSERT(seg_phashsize_wired != 0);
for (hpw = seg_phashtab_wired;
hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
if (hpw->p_hnext == (struct seg_pcache *)hpw) {
continue;
}
mutex_enter(&hpw->p_hmutex);
for (pcp = hpw->p_hnext;
pcp != (struct seg_pcache *)hpw;
pcp = pcp->p_hnext) {
ASSERT(IS_PCP_WIRED(pcp));
ASSERT(pcp->p_hashp ==
(struct seg_phash *)hpw);
if (pcp->p_active) {
continue;
}
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
}
mutex_exit(&hpw->p_hmutex);
}
}
mutex_enter(&seg_pmem_mtx);
if (seg_pathr_on) {
mutex_exit(&seg_pmem_mtx);
goto runcb;
}
seg_pathr_on = 1;
mutex_exit(&seg_pmem_mtx);
ASSERT(seg_pahcur <= 1);
hlix = !seg_pahcur;
again:
for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
hlinkp = hlnextp) {
hlnextp = hlinkp->p_lnext;
ASSERT(hlnextp != NULL);
hp = hlink2phash(hlinkp, hlix);
if (hp->p_hnext == (struct seg_pcache *)hp) {
seg_pathr_empty_ahb++;
continue;
}
seg_pathr_full_ahb++;
mutex_enter(&hp->p_hmutex);
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
pcache_link_t *pheadp;
pcache_link_t *plinkp;
void *htag0;
kmutex_t *pmtx;
ASSERT(!IS_PCP_WIRED(pcp));
ASSERT(pcp->p_hashp == hp);
if (pcp->p_active) {
continue;
}
if (!force && pcp->p_ref &&
PCP_AGE(pcp) < seg_pmax_pcpage) {
pcp->p_ref = 0;
continue;
}
plinkp = &pcp->p_plink;
htag0 = pcp->p_htag0;
if (pcp->p_flags & SEGP_AMP) {
pheadp = &((amp_t *)htag0)->a_phead;
pmtx = &((amp_t *)htag0)->a_pmtx;
} else {
pheadp = &((seg_t *)htag0)->s_phead;
pmtx = &((seg_t *)htag0)->s_pmtx;
}
if (!mutex_tryenter(pmtx)) {
continue;
}
ASSERT(pheadp->p_lnext != pheadp);
ASSERT(pheadp->p_lprev != pheadp);
plinkp->p_lprev->p_lnext =
plinkp->p_lnext;
plinkp->p_lnext->p_lprev =
plinkp->p_lprev;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
mutex_exit(pmtx);
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
npgs_purged += btop(pcp->p_len);
}
if (hp->p_hnext == (struct seg_pcache *)hp) {
seg_premove_abuck(hp, 1);
}
mutex_exit(&hp->p_hmutex);
if (npgs_purged >= seg_plocked_window) {
break;
}
if (!force) {
if (npgs_purged >= npgs_to_purge) {
break;
}
if (!trim && !(seg_pathr_full_ahb & 15)) {
ASSERT(lowmem);
if (freemem >= lotsfree + needfree) {
break;
}
}
}
}
if (hlinkp == &seg_pahhead[hlix]) {
/*
* We processed the entire hlix active bucket list
* but didn't find enough pages to reclaim.
* Switch the lists and walk the other list
* if we haven't done it yet.
*/
mutex_enter(&seg_pmem_mtx);
ASSERT(seg_pathr_on);
ASSERT(seg_pahcur == !hlix);
seg_pahcur = hlix;
mutex_exit(&seg_pmem_mtx);
if (++hlinks < 2) {
hlix = !hlix;
goto again;
}
} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
seg_pahhead[hlix].p_lnext != hlinkp) {
ASSERT(hlinkp != NULL);
ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
/*
* Reinsert the header to point to hlinkp
* so that we start from hlinkp bucket next time around.
*/
seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
seg_pahhead[hlix].p_lnext = hlinkp;
seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
hlinkp->p_lprev = &seg_pahhead[hlix];
}
mutex_enter(&seg_pmem_mtx);
ASSERT(seg_pathr_on);
seg_pathr_on = 0;
mutex_exit(&seg_pmem_mtx);
runcb:
/*
* Run the delayed callback list. segments/amps can't go away until
* callback is executed since they must have non 0 softlockcnt. That's
* why we don't need to hold as/seg/amp locks to execute the callback.
*/
while (delcallb_list != NULL) {
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
ASSERT(!pcp->p_active);
(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
npages += btop(pcp->p_len);
if (!IS_PCP_WIRED(pcp)) {
npages_window += btop(pcp->p_len);
}
kmem_cache_free(seg_pkmcache, pcp);
}
if (npages) {
mutex_enter(&seg_pmem_mtx);
ASSERT(seg_plocked >= npages);
ASSERT(seg_plocked_window >= npages_window);
seg_plocked -= npages;
seg_plocked_window -= npages_window;
mutex_exit(&seg_pmem_mtx);
}
}
/*
 * Remove cached pages for a segment's (or segments') entries from the
 * hashtable. The segments are identified by the pp array. This is useful
 * when multiple segs are cached on behalf of a dummy segment (ISM/DISM)
 * with a common pp array.
*/
void
seg_ppurge_wiredpp(struct page **pp)
{
struct seg_pcache *pcp;
struct seg_phash_wired *hp;
pgcnt_t npages = 0;
struct seg_pcache *delcallb_list = NULL;
/*
* if the cache is empty, return
*/
if (seg_plocked == 0) {
return;
}
ASSERT(seg_phashsize_wired != 0);
for (hp = seg_phashtab_wired;
hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
if (hp->p_hnext == (struct seg_pcache *)hp) {
continue;
}
mutex_enter(&hp->p_hmutex);
pcp = hp->p_hnext;
while (pcp != (struct seg_pcache *)hp) {
ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
ASSERT(IS_PCP_WIRED(pcp));
/*
* purge entries which are not active
*/
if (!pcp->p_active && pcp->p_pp == pp) {
ASSERT(pcp->p_htag0 != NULL);
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
}
pcp = pcp->p_hnext;
}
mutex_exit(&hp->p_hmutex);
/*
* segments can't go away until callback is executed since
* they must have non 0 softlockcnt. That's why we don't
* need to hold as/seg locks to execute the callback.
*/
while (delcallb_list != NULL) {
int done;
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
ASSERT(!pcp->p_active);
done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
pcp->p_len, pcp->p_pp,
pcp->p_write ? S_WRITE : S_READ, 1);
npages += btop(pcp->p_len);
ASSERT(IS_PCP_WIRED(pcp));
kmem_cache_free(seg_pkmcache, pcp);
if (done) {
ASSERT(delcallb_list == NULL);
goto out;
}
}
}
out:
mutex_enter(&seg_pmem_mtx);
ASSERT(seg_plocked >= npages);
seg_plocked -= npages;
mutex_exit(&seg_pmem_mtx);
}
/*
* purge all entries for a given segment. Since we
 * call back into the segment driver directly for page
* reclaim the caller needs to hold the right locks.
*/
void
seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
{
struct seg_pcache *delcallb_list = NULL;
struct seg_pcache *pcp;
struct seg_phash *hp;
pgcnt_t npages = 0;
void *htag0;
if (seg_plocked == 0) {
return;
}
ASSERT(seg_phashsize_win != 0);
/*
* If amp is not NULL use amp as a lookup tag otherwise use seg
* as a lookup tag.
*/
htag0 = (amp == NULL ? (void *)seg : (void *)amp);
ASSERT(htag0 != NULL);
if (IS_PFLAGS_WIRED(flags)) {
hp = P_HASHBP(seg, htag0, 0, flags);
mutex_enter(&hp->p_hmutex);
pcp = hp->p_hnext;
while (pcp != (struct seg_pcache *)hp) {
ASSERT(pcp->p_hashp == hp);
ASSERT(IS_PCP_WIRED(pcp));
if (pcp->p_htag0 == htag0) {
if (pcp->p_active) {
break;
}
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
}
pcp = pcp->p_hnext;
}
mutex_exit(&hp->p_hmutex);
} else {
pcache_link_t *plinkp;
pcache_link_t *pheadp;
kmutex_t *pmtx;
if (amp == NULL) {
ASSERT(seg != NULL);
pheadp = &seg->s_phead;
pmtx = &seg->s_pmtx;
} else {
pheadp = &amp->a_phead;
pmtx = &amp->a_pmtx;
}
mutex_enter(pmtx);
while ((plinkp = pheadp->p_lnext) != pheadp) {
pcp = plink2pcache(plinkp);
ASSERT(!IS_PCP_WIRED(pcp));
ASSERT(pcp->p_htag0 == htag0);
hp = pcp->p_hashp;
mutex_enter(&hp->p_hmutex);
if (pcp->p_active) {
mutex_exit(&hp->p_hmutex);
break;
}
ASSERT(plinkp->p_lprev == pheadp);
pheadp->p_lnext = plinkp->p_lnext;
plinkp->p_lnext->p_lprev = pheadp;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
if (hp->p_hnext == (struct seg_pcache *)hp) {
seg_premove_abuck(hp, 0);
}
mutex_exit(&hp->p_hmutex);
}
mutex_exit(pmtx);
}
while (delcallb_list != NULL) {
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
ASSERT(!pcp->p_active);
(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
npages += btop(pcp->p_len);
kmem_cache_free(seg_pkmcache, pcp);
}
mutex_enter(&seg_pmem_mtx);
ASSERT(seg_plocked >= npages);
seg_plocked -= npages;
if (!IS_PFLAGS_WIRED(flags)) {
ASSERT(seg_plocked_window >= npages);
seg_plocked_window -= npages;
}
mutex_exit(&seg_pmem_mtx);
}
static void seg_pinit_mem_config(void);
/*
* setup the pagelock cache
*/
static void
seg_pinit(void)
{
struct seg_phash *hp;
ulong_t i;
pgcnt_t physmegs;
seg_plocked = 0;
seg_plocked_window = 0;
if (segpcache_enabled == 0) {
seg_phashsize_win = 0;
seg_phashsize_wired = 0;
seg_pdisabled = 1;
return;
}
seg_pdisabled = 0;
seg_pkmcache = kmem_cache_create("seg_pcache",
sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
if (segpcache_pcp_maxage_ticks <= 0) {
segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
}
seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
seg_pathr_empty_ahb = 0;
seg_pathr_full_ahb = 0;
seg_pshrink_shift = segpcache_shrink_shift;
seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
physmegs = physmem >> (20 - PAGESHIFT);
/*
	 * If segpcache_hashsize_win was not set in /etc/system or has an
	 * absurd value, set it to a default.
*/
if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
/*
* Create one bucket per 32K (or at least per 8 pages) of
* available memory.
*/
pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
}
if (!ISP2(segpcache_hashsize_win)) {
ulong_t rndfac = ~(1UL <<
(highbit(segpcache_hashsize_win) - 1));
rndfac &= segpcache_hashsize_win;
segpcache_hashsize_win += rndfac;
segpcache_hashsize_win = 1 <<
(highbit(segpcache_hashsize_win) - 1);
}
seg_phashsize_win = segpcache_hashsize_win;
seg_phashtab_win = kmem_zalloc(
seg_phashsize_win * sizeof (struct seg_phash),
KM_SLEEP);
for (i = 0; i < seg_phashsize_win; i++) {
hp = &seg_phashtab_win[i];
hp->p_hnext = (struct seg_pcache *)hp;
hp->p_hprev = (struct seg_pcache *)hp;
mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
}
seg_pahcur = 0;
seg_pathr_on = 0;
seg_pahhead[0].p_lnext = &seg_pahhead[0];
seg_pahhead[0].p_lprev = &seg_pahhead[0];
seg_pahhead[1].p_lnext = &seg_pahhead[1];
seg_pahhead[1].p_lprev = &seg_pahhead[1];
/*
	 * If segpcache_hashsize_wired was not set in /etc/system or has an
	 * absurd value, set it to a default.
*/
if (segpcache_hashsize_wired == 0 ||
segpcache_hashsize_wired > physmem / 4) {
/*
* Choose segpcache_hashsize_wired based on physmem.
		 * Create a bucket per 128K bytes, up to 256K buckets.
*/
if (physmegs < 20 * 1024) {
segpcache_hashsize_wired = MAX(1024, physmegs << 3);
} else {
segpcache_hashsize_wired = 256 * 1024;
}
}
if (!ISP2(segpcache_hashsize_wired)) {
segpcache_hashsize_wired = 1 <<
highbit(segpcache_hashsize_wired);
}
seg_phashsize_wired = segpcache_hashsize_wired;
seg_phashtab_wired = kmem_zalloc(
seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
for (i = 0; i < seg_phashsize_wired; i++) {
hp = (struct seg_phash *)&seg_phashtab_wired[i];
hp->p_hnext = (struct seg_pcache *)hp;
hp->p_hprev = (struct seg_pcache *)hp;
mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
}
if (segpcache_maxwindow == 0) {
if (physmegs < 64) {
/* 3% of memory */
segpcache_maxwindow = availrmem >> 5;
} else if (physmegs < 512) {
/* 12% of memory */
segpcache_maxwindow = availrmem >> 3;
} else if (physmegs < 1024) {
/* 25% of memory */
segpcache_maxwindow = availrmem >> 2;
} else if (physmegs < 2048) {
/* 50% of memory */
segpcache_maxwindow = availrmem >> 1;
} else {
/* no limit */
segpcache_maxwindow = (pgcnt_t)-1;
}
}
seg_pmaxwindow = segpcache_maxwindow;
seg_pinit_mem_config();
}
/*
* called by pageout if memory is low
*/
void
seg_preap(void)
{
/*
* if the cache is off or empty, return
*/
if (seg_plocked_window == 0) {
return;
}
ASSERT(seg_phashsize_win != 0);
/*
* If somebody is already purging pcache
* just return.
*/
if (seg_pdisabled) {
return;
}
cv_signal(&seg_pasync_cv);
}
/*
 * run as a background thread and reclaim pagelock
* pages which have not been used recently
*/
void
seg_pasync_thread(void)
{
callb_cpr_t cpr_info;
if (seg_phashsize_win == 0) {
thread_exit();
/*NOTREACHED*/
}
seg_pasync_thr = curthread;
CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
callb_generic_cpr, "seg_pasync");
if (segpcache_reap_ticks <= 0) {
segpcache_reap_ticks = segpcache_reap_sec * hz;
}
mutex_enter(&seg_pasync_mtx);
for (;;) {
CALLB_CPR_SAFE_BEGIN(&cpr_info);
(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
segpcache_reap_ticks, TR_CLOCK_TICK);
CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
if (seg_pdisabled == 0) {
seg_ppurge_async(0);
}
}
}
static struct kmem_cache *seg_cache;
/*
* Initialize segment management data structures.
*/
void
seg_init(void)
{
kstat_t *ksp;
seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
0, NULL, NULL, NULL, NULL, NULL, 0);
ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *)segadvstat_ptr;
kstat_install(ksp);
}
seg_pinit();
}
/*
* Allocate a segment to cover [base, base+size]
* and attach it to the specified address space.
*/
struct seg *
seg_alloc(struct as *as, caddr_t base, size_t size)
{
struct seg *new;
caddr_t segbase;
size_t segsize;
segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
(uintptr_t)segbase;
if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
return ((struct seg *)NULL); /* bad virtual addr range */
if (as != &kas &&
valid_usr_range(segbase, segsize, 0, as,
as->a_userlimit) != RANGE_OKAY)
return ((struct seg *)NULL); /* bad virtual addr range */
new = kmem_cache_alloc(seg_cache, KM_SLEEP);
new->s_ops = NULL;
new->s_data = NULL;
new->s_szc = 0;
new->s_flags = 0;
mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
new->s_phead.p_lnext = &new->s_phead;
new->s_phead.p_lprev = &new->s_phead;
if (seg_attach(as, segbase, segsize, new) < 0) {
kmem_cache_free(seg_cache, new);
return ((struct seg *)NULL);
}
/* caller must fill in ops, data */
return (new);
}
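/*
 * A minimal sketch of the expected calling sequence (hypothetical driver
 * create path; segfoo_ops, segfoo_create() and the error values are
 * placeholders). seg_alloc() attaches the segment but leaves s_ops and
 * s_data NULL, so the caller must fill them in before the segment is used:
 *
 *	seg = seg_alloc(as, addr, len);
 *	if (seg == NULL)
 *		return (ENOMEM);
 *	seg->s_ops = &segfoo_ops;
 *	if (segfoo_create(seg, args) != 0) {
 *		seg_free(seg);
 *		return (EIO);
 *	}
 */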
/*
* Attach a segment to the address space. Used by seg_alloc()
* and for kernel startup to attach to static segments.
*/
int
seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
{
seg->s_as = as;
seg->s_base = base;
seg->s_size = size;
/*
	 * as_addseg() will add the segment at the appropriate point
* in the list. It will return -1 if there is overlap with
* an already existing segment.
*/
return (as_addseg(as, seg));
}
/*
* Unmap a segment and free it from its associated address space.
* This should be called by anybody who's finished with a whole segment's
 * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
 * responsibility of the segment driver to unlink the segment
* from the address space, and to free public and private data structures
* associated with the segment. (This is typically done by a call to
* seg_free()).
*/
void
seg_unmap(struct seg *seg)
{
#ifdef DEBUG
int ret;
#endif /* DEBUG */
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
/* Shouldn't have called seg_unmap if mapping isn't yet established */
ASSERT(seg->s_data != NULL);
/* Unmap the whole mapping */
#ifdef DEBUG
ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
ASSERT(ret == 0);
#else
SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
#endif /* DEBUG */
}
/*
* Free the segment from its associated as. This should only be called
* if a mapping to the segment has not yet been established (e.g., if
* an error occurs in the middle of doing an as_map when the segment
* has already been partially set up) or if it has already been deleted
* (e.g., from a segment driver unmap routine if the unmap applies to the
* entire segment). If the mapping is currently set up then seg_unmap() should
* be called instead.
*/
void
seg_free(struct seg *seg)
{
register struct as *as = seg->s_as;
struct seg *tseg = as_removeseg(as, seg);
ASSERT(tseg == seg);
/*
* If the segment private data field is NULL,
* then segment driver is not attached yet.
*/
if (seg->s_data != NULL)
SEGOP_FREE(seg);
mutex_destroy(&seg->s_pmtx);
ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
kmem_cache_free(seg_cache, seg);
}
/*ARGSUSED*/
static void
seg_p_mem_config_post_add(
void *arg,
pgcnt_t delta_pages)
{
/* Nothing to do. */
}
void
seg_p_enable(void)
{
mutex_enter(&seg_pcache_mtx);
ASSERT(seg_pdisabled != 0);
seg_pdisabled--;
mutex_exit(&seg_pcache_mtx);
}
/*
* seg_p_disable - disables seg_pcache, and then attempts to empty the
* cache.
* Returns SEGP_SUCCESS if the cache was successfully emptied, or
* SEGP_FAIL if the cache could not be emptied.
*/
int
seg_p_disable(void)
{
pgcnt_t old_plocked;
int stall_count = 0;
mutex_enter(&seg_pcache_mtx);
seg_pdisabled++;
ASSERT(seg_pdisabled != 0);
mutex_exit(&seg_pcache_mtx);
/*
* Attempt to empty the cache. Terminate if seg_plocked does not
* diminish with SEGP_STALL_THRESHOLD consecutive attempts.
*/
while (seg_plocked != 0) {
ASSERT(seg_phashsize_win != 0);
old_plocked = seg_plocked;
seg_ppurge_async(1);
if (seg_plocked == old_plocked) {
if (stall_count++ > SEGP_STALL_THRESHOLD) {
return (SEGP_FAIL);
}
} else
stall_count = 0;
if (seg_plocked != 0)
delay(hz/SEGP_PREDEL_DELAY_FACTOR);
}
return (SEGP_SUCCESS);
}
/*
* Attempt to purge seg_pcache. May need to return before this has
* completed to allow other pre_del callbacks to unlock pages. This is
* ok because:
* 1) The seg_pdisabled flag has been set so at least we won't
 * cache any more locks and the locks we couldn't purge
* will not be held if they do get released by a subsequent
* pre-delete callback.
*
* 2) The rest of the memory delete thread processing does not
* depend on the changes made in this pre-delete callback. No
* panics will result, the worst that will happen is that the
* DR code will timeout and cancel the delete.
*/
/*ARGSUSED*/
static int
seg_p_mem_config_pre_del(
void *arg,
pgcnt_t delta_pages)
{
if (seg_phashsize_win == 0) {
return (0);
}
if (seg_p_disable() != SEGP_SUCCESS)
cmn_err(CE_NOTE,
"!Pre-delete couldn't purge"" pagelock cache - continuing");
return (0);
}
/*ARGSUSED*/
static void
seg_p_mem_config_post_del(
void *arg,
pgcnt_t delta_pages,
int cancelled)
{
if (seg_phashsize_win == 0) {
return;
}
seg_p_enable();
}
static kphysm_setup_vector_t seg_p_mem_config_vec = {
KPHYSM_SETUP_VECTOR_VERSION,
seg_p_mem_config_post_add,
seg_p_mem_config_pre_del,
seg_p_mem_config_post_del,
};
static void
seg_pinit_mem_config(void)
{
int ret;
ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
/*
* Want to catch this in the debug kernel. At run time, if the
* callbacks don't get run all will be OK as the disable just makes
* it more likely that the pages can be collected.
*/
ASSERT(ret == 0);
}
/*
* Verify that segment is not a shared anonymous segment which reserves
 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
* from one zone to another if any segments are shared. This is because the
* last process to exit will credit the swap reservation. This could lead
* to the swap being reserved by one zone, and credited to another.
*/
boolean_t
seg_can_change_zones(struct seg *seg)
{
struct segvn_data *svd;
if (seg->s_ops == &segspt_shmops)
return (B_FALSE);
if (seg->s_ops == &segvn_ops) {
svd = (struct segvn_data *)seg->s_data;
if (svd->type == MAP_SHARED &&
svd->amp != NULL &&
svd->amp->swresv > 0)
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Return swap reserved by a segment backing a private mapping.
*/
size_t
seg_swresv(struct seg *seg)
{
struct segvn_data *svd;
size_t swap = 0;
if (seg->s_ops == &segvn_ops) {
svd = (struct segvn_data *)seg->s_data;
if (svd->type == MAP_PRIVATE && svd->swresv > 0)
swap = svd->swresv;
}
return (swap);
}
/*
* General not supported function for SEGOP_INHERIT
*/
/* ARGSUSED */
int
seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
{
return (ENOTSUP);
}