seg_vn.c revision 4b31bd29a1057d2aa04778c28370b6211b993555
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2015, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
*/
#include <sys/tuneable.h>
#include <sys/sysmacros.h>
#include <sys/shm_impl.h>
/*
* segvn_fault needs a temporary page list array. To avoid calling kmem all
* the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
* it can. In the rare case when this page list is not large enough, it
* goes and gets a large enough array from kmem.
*
* This small page list array covers either 8 pages or 64kB worth of pages -
* whichever is smaller.
*/
#define	PVN_MAX_GETPAGE_SZ	0x10000
#define	PVN_MAX_GETPAGE_NUM	0x8

#if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
#define	PVN_GETPAGE_SZ	ptob(PVN_MAX_GETPAGE_NUM)
#define	PVN_GETPAGE_NUM	PVN_MAX_GETPAGE_NUM
#else
#define	PVN_GETPAGE_NUM	btop(PVN_MAX_GETPAGE_SZ)
#define	PVN_GETPAGE_SZ	PVN_MAX_GETPAGE_SZ
#endif
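/*
 * Illustrative sketch (not part of the original source) of the pattern the
 * comment above describes: segvn_fault() keeps a small fixed-size page list
 * on the stack and only falls back to kmem for larger requests.  The names
 * "npages", "plp" and "plsz" below are placeholders for this sketch.
 */
#if 0
	page_t *plarr[PVN_GETPAGE_NUM + 1];	/* small common-case array */
	page_t **plp = plarr;
	size_t plsz = 0;

	if (npages > PVN_GETPAGE_NUM) {
		/* rare case: range needs more entries than the small array */
		plsz = (npages + 1) * sizeof (page_t *);
		plp = kmem_alloc(plsz, KM_SLEEP);
	}

	/* ... use plp as the page list for the fault ... */

	if (plp != plarr)
		kmem_free(plp, plsz);
#endif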
/*
* Private seg op routines.
*/
char *vec);
};
/*
* Common zfod structures, provided as a shorthand for others to use.
*/
static segvn_crargs_t zfod_segvn_crargs =
static segvn_crargs_t kzfod_segvn_crargs =
static segvn_crargs_t stack_noexec_crargs =
struct segvn_crargs *, size_t);
struct segvn_crargs *, size_t);
static void segvn_pagelist_rele(page_t **);
static void segvn_setvnode_mpss(vnode_t *);
enum fault_type, enum seg_rw, int);
static void segvn_vpage(struct seg *);
enum seg_rw, int);
enum seg_rw, int);
static int segvn_clrszc(struct seg *);
size_t, void *, u_offset_t);
static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;
#ifdef VM_STATS
static struct segvnvmstats_str {
} segvnvmstats;
#endif /* VM_STATS */
if ((len) != 0) { \
} else { \
} \
}
/*ARGSUSED*/
static int
{
return (0);
}
/*ARGSUSED1*/
static void
{
}
/*ARGSUSED*/
static int
{
return (0);
}
/*
* Patching this variable to non-zero allows the system to run with
* stacks marked as "not executable". It's a bit of a kludge, but is
* provided as a tweakable for platforms that export those ABIs
* (e.g. sparc V8) that have executable stacks enabled by default.
* There are also some restrictions for platforms that don't actually
* implement 'noexec' protections.
*
* Once enabled, the system is (therefore) unable to provide a fully
* ABI-compliant execution environment, though practically speaking,
* most everything works. The exceptions are generally some interpreters
* and debuggers that create executable code on the stack and jump
* into it (without explicitly mprotecting the address range to include
* PROT_EXEC).
*
 * One important class of applications that gets disabled is those that
 * have been transformed into malicious agents using one of the numerous
 * "buffer overflow" attacks. See 4007890.
*/
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;
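/*
 * For example (an assumption about typical administration, not taken from
 * this file): on platforms where the ABI leaves stacks executable, adding
 *
 *	set noexec_user_stack = 1
 *	set noexec_user_stack_log = 1
 *
 * to /etc/system and rebooting marks user stacks non-executable and logs
 * attempts to execute code from them.
 */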
int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;
int segvn_use_regions = 1;
/*
 * Segvn supports a text replication optimization for NUMA platforms. Text
 * replicas are represented by anon maps (amp). There's one amp per text file
 * region per lgroup. A process chooses the amp for each of its text mappings
 * based on the lgroup assignment of its main thread (t_tid = 1). All
 * processes that want a replica on a particular lgroup for the same text file
 * mapping share the same amp. amps are looked up in the svntr_hashtab hash
 * table with vp,off,size,szc used as the key. Text replication segments are
 * read only MAP_PRIVATE|MAP_TEXT segments that map a vnode. Replication is
 * achieved by forcing COW faults from vnode to amp and mapping amp pages
 * instead of vnode pages. A replication amp is assigned to a segment when it
 * gets its first pagefault. To handle main thread lgroup rehoming,
 * segvn_trasync_thread periodically rechecks whether the process still maps
 * an amp local to the main thread. If not, the async thread forces the
 * process to remap to an amp in the new home lgroup of the main thread. The
 * current text replication implementation only benefits workloads that do
 * most of their work in the main thread of a process, or whose threads all
 * run in the same lgroup. To extend the text replication benefit to other
 * types of multithreaded workloads, further work would be needed in the hat
 * layer to allow the same virtual address in the same hat to simultaneously
 * map different physical addresses (i.e. page table replication would be
 * needed for x86).
 *
 * amp pages are used instead of vnode pages only as long as the segment has
 * a very simple life cycle: it's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped. If anything more complicated
 * happens, such as a protection change, a real COW fault, a pagesize change,
 * an MC_LOCK request or a partial unmap, we turn off text replication by
 * converting the segment back to a vnode-only segment (unmap the segment's
 * address range and set svd->amp to NULL).
 *
 * The original file can be changed after an amp is inserted into
 * svntr_hashtab. Processes that are launched after the file has already
 * changed can't use the replicas created prior to the file change. To
 * implement this, hash entries are timestamped. Replicas can only be used if
 * the current file modification time is the same as the timestamp saved when
 * the hash entry was created. However, timestamps alone are not sufficient
 * to detect file modification via mmap(MAP_SHARED) mappings, so we deal with
 * file changes via MAP_SHARED mappings differently. When writable MAP_SHARED
 * mappings are created to vnodes marked as executable, we mark all existing
 * replicas for this vnode as not usable for future text mappings. And we
 * don't create new replicas for files that currently have potentially
 * writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is true).
*/
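/*
 * Illustrative sketch (not part of the original source) of the hash entry
 * shape the comment above describes: replicas are keyed by <vp, off, size,
 * szc> and hold one replica amp per lgroup.  The field names below are
 * simplified placeholders, not the real svntr_t layout.
 */
#if 0
typedef struct textrepl_entry_sketch {
	vnode_t		*te_vp;		/* text file being replicated */
	u_offset_t	te_off;		/* file offset of the text mapping */
	size_t		te_size;	/* length of the text mapping */
	uint_t		te_szc;		/* page size code of the mapping */
	timestruc_t	te_mtime;	/* file mtime when entry was created */
	int		te_valid;	/* cleared when the file changes */
	struct anon_map	*te_amp[NLGRPS_MAX];	/* one replica amp per lgroup */
} textrepl_entry_sketch_t;
#endif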
#define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20)
static struct kmem_cache *svntr_cache;
static svntr_stats_t *segvn_textrepl_stats;
static ksema_t segvn_trasync_sem;
int segvn_disable_textrepl = 1;
int segvn_update_tr_time = 10;
int segvn_disable_textrepl_update = 0;
static void segvn_textrepl(struct seg *);
static void segvn_textunrepl(struct seg *, int);
static void segvn_inval_trcache(vnode_t *);
static void segvn_trasync_thread(void);
static void segvn_trupdate_wakeup(void *);
static void segvn_trupdate(void);
ulong_t);
/*
* Initialize segvn data structures
*/
void
segvn_init(void)
{
sizeof (struct segvn_data), 0,
if (segvn_lpg_disable == 0) {
if (szc == 0) {
segvn_lpg_disable = 1;
}
if (page_get_pagesize(0) != PAGESIZE) {
panic("segvn_init: bad szc 0");
/*NOTREACHED*/
}
while (szc != 0) {
/*NOTREACHED*/
}
szc--;
}
}
if (segvn_maxpgszc) {
KM_SLEEP);
}
char str[32];
}
segvn_use_regions = 0;
/*
* For now shared regions and text replication segvn support
* are mutually exclusive. This is acceptable because
* currently significant benefit from text replication was
* only observed on AMD64 NUMA platforms (due to relatively
* small L2$ size) and currently we don't support shared
* regions on x86.
*/
if (segvn_use_regions && !segvn_disable_textrepl) {
}
#if defined(_LP64)
ulong_t i;
for (i = 0; i < svntr_hashtab_sz; i++) {
}
sizeof (svntr_stats_t), KM_SLEEP);
}
#endif
if (!ISP2(segvn_pglock_comb_balign) ||
}
}
#define SEGVN_PAGEIO ((void *)0x1)
#define SEGVN_NOPAGEIO ((void *)0x2)
static void
{
int err;
if (vn_vmpss_usepageio(vp)) {
} else {
}
/*
* set v_mpssdata just once per vnode life
* so that it never changes.
*/
} else {
}
}
}
}
int
{
struct segvn_data *svd;
int error = 0;
int use_rgn = 0;
int trok = 0;
panic("segvn_create type");
/*NOTREACHED*/
}
/*
* Check arguments. If a shared anon structure is given then
* it is illegal to also specify a vp.
*/
panic("segvn_create anon_map");
/*NOTREACHED*/
}
use_rgn = 1;
}
/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
if (a->type == MAP_SHARED)
a->flags &= ~MAP_NORESERVE;
if (a->szc != 0) {
a->szc = 0;
} else {
if (a->szc > segvn_maxpgszc)
a->szc = segvn_maxpgszc;
a->szc = 0;
/*
* paranoid check.
* hat_page_demote() is not supported
* on swapfs pages.
*/
a->szc = 0;
a->szc = 0;
}
a->szc = 0;
}
}
}
}
/*
* If segment may need private pages, reserve them now.
*/
return (EAGAIN);
}
/*
* Reserve any mapping structures that may be required.
*
* Don't do it for segments that may use regions. It's currently a
* noop in the hat implementations anyway.
*/
if (!use_rgn) {
}
if (a->cred) {
} else {
}
/* Inform the vnode of the new mapping */
if (error) {
if (swresv != 0) {
}
if (!use_rgn) {
}
return (error);
}
/*
* svntr_hashtab will be NULL if we support shared regions.
*/
(a->flags & _MAP_TEXTREPL)) &&
!(a->flags & MAP_NORESERVE) &&
}
/*
* MAP_NORESERVE mappings don't count towards the VSZ of a process
* until we fault the pages in.
*/
a->flags & MAP_NORESERVE) {
}
/*
* If more than one segment in the address space, and they're adjacent
* virtually, try to concatenate them. Don't concatenate if an
* explicit anon_map structure was supplied (e.g., SystemV shared
* memory) or if we'll use text replication for this segment.
*/
/*
* Memory policy flags (lgrp_mem_policy_flags) is valid when
*/
} else {
/*
* Get policy when not extending it from another segment
*/
}
/*
* First, try to concatenate the previous and new segments
*/
/*
* Get memory allocation policy from previous segment.
* When extension is specified (e.g. for heap) apply
* this policy to the new segment regardless of the
* outcome of segment concatenation. Extension occurs
* for non-default policy otherwise default policy is
* used and is based on extended segment size.
*/
if (lgrp_mem_policy_flags ==
if (ppolicy != lgrp_mem_default_policy) {
} else {
a->type);
}
}
/*
* success! now try to concatenate
* with following seg
*/
return (0);
}
}
/*
* Failed, so try to concatenate with following seg
*/
/*
* Get memory allocation policy from next segment.
* When extension is specified (e.g. for stack) apply
* this policy to the new segment regardless of the
* outcome of segment concatenation. Extension occurs
* for non-default policy otherwise default policy is
* used and is based on extended segment size.
*/
if (lgrp_mem_policy_flags ==
if (npolicy != lgrp_mem_default_policy) {
} else {
a->type);
}
}
return (0);
}
}
}
if (a->type == MAP_SHARED)
}
/*
* Anonymous mappings have no backing file so the offset is meaningless.
*/
svd->pageadvice = 0;
svd->softlockcnt = 0;
svd->softlockcnt_sbase = 0;
svd->softlockcnt_send = 0;
segvn_setvnode_mpss(a->vp);
}
}
svd->anon_index = 0;
/*
* Shared mappings to a vp need no other setup.
* If we have a shared mapping to an anon_map object
* which hasn't been allocated yet, allocate the
* struct now so that it will be properly shared
* by remembering the swap reservation there.
*/
}
} else {
/*
* Private mapping (with or without a vp).
* Allocate anon_map when needed.
*/
}
} else {
/*
* Mapping to an existing anon_map structure without a vp.
 * For now we will ensure that the segment size isn't larger
* than the size - offset gives us. Later on we may wish to
* have the anon array dynamically allocated itself so that
* we don't always have to allocate all the anon pointer slots.
* This of course involves adding extra code to check that we
* aren't trying to use an anon pointer slot beyond the end
* of the currently allocated anon array.
*/
panic("segvn_create anon_map size");
/*NOTREACHED*/
}
if (a->type == MAP_SHARED) {
/*
* SHARED mapping to a given anon_map.
*/
}
} else {
/*
* PRIVATE mapping to a given anon_map.
* Make sure that all the needed anon
* structures are created (so that we will
* share the underlying pages if nothing
* is written by this mapping) and then
* duplicate the anon array as is done
* when a privately mapped segment is dup'ed.
*/
}
svd->anon_index = 0;
/*
* Prevent 2 threads from allocating anon
* slots simultaneously.
*/
continue;
/*
* Allocate the anon struct now.
* Might as well load up translation
* to the page while we're at it...
*/
panic("segvn_create anon_zero");
/*NOTREACHED*/
}
/*
* Re-acquire the anon_map lock and
* initialize the anon array entry.
*/
}
}
}
/*
* Set default memory allocation policy for segment
*
* Always set policy for private memory at least for initialization
* even if this is a shared memory segment
*/
if (use_rgn) {
}
return (0);
}
/*
* Concatenate two existing segments, if possible.
* Return 0 on success, -1 if two segments are not compatible
* or -2 on memory allocation failure.
 * If amp_cat == 1 then try to concatenate segments with anon maps.
*/
static int
{
return (-1);
}
/* both segments exist, try to merge them */
return (-1);
/*
* vp == NULL implies zfod, offset doesn't matter
*/
return (-1);
}
/*
* Don't concatenate if either segment uses text replication.
*/
return (-1);
}
/*
* Fail early if we're not supposed to concatenate
* segments with non NULL amp.
*/
return (-1);
}
return (-1);
}
svd2->anon_index) {
return (-1);
}
}
/*
* If either seg has vpages, create a new merged vpage array.
*/
return (-2);
}
} else {
}
}
} else {
}
}
}
}
}
}
}
/*
* If either segment has private pages, create a new merged anon
 * array. If merging shared anon segments just decrement the anon map's
* refcnt.
*/
}
return (-2);
}
/*
* XXX anon rwlock is not really needed because
* this is a private segment and we are writers.
*/
}
return (-2);
}
}
ANON_NOSLEEP)) {
}
}
return (-2);
}
}
}
} else {
}
}
svd1->anon_index = 0;
}
/*
* Now free the old vpage structures.
*/
}
}
}
if (svd2->pageadvice) {
}
}
}
/* all looks ok, merge segments */
return (0);
}
/*
* Extend the previous segment (seg1) to include the
* new segment (seg2 + a), if possible.
* Return 0 on success.
*/
static int
struct segvn_crargs *a;
{
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
return (-1);
}
/* second segment is new, try to extend first */
/* XXX - should also check cred */
return (-1);
/* vp == NULL implies zfod, offset doesn't matter */
return (-1);
return (-1);
}
if (amp1) {
/*
* Segment has private pages, can data structures
* be expanded?
*
* Acquire the anon_map lock to prevent it from changing,
 * if it is shared.  This ensures that the anon_map
 * will not change while a thread which has a read/write
 * lock on an address space references it.
 * XXX - Don't need the anon_map lock at all if "refcnt"
 * is 1.
 *
 * Can't grow a MAP_SHARED segment with an anonmap because
 * there may be existing anon slots where we want to extend
 * the segment and we wouldn't know what to do with them
 * (e.g., for tmpfs the right thing is to just leave them
 * there, for /dev/zero they should be cleared out).
*/
return (-1);
return (-1);
}
if (newpgs == 0) {
return (-1);
}
}
return (-1);
}
}
}
}
return (0);
}
/*
* Extend the next segment (seg2) to include the
* new segment (seg1 + a), if possible.
* Return 0 on success.
*/
static int
struct segvn_crargs *a,
{
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
return (-1);
}
/* first segment is new, try to extend second */
/* XXX - should also check cred */
return (-1);
/* vp == NULL implies zfod, offset doesn't matter */
return (-1);
return (-1);
}
if (amp2) {
/*
* Segment has private pages, can data structures
* be expanded?
*
* Acquire the anon_map lock to prevent it from changing,
 * if it is shared.  This ensures that the anon_map
 * will not change while a thread which has a read/write
 * lock on an address space references it.
*
* XXX - Don't need the anon_map lock at all if "refcnt"
* is 1.
*/
return (-1);
return (-1);
}
if (newpgs == 0) {
return (-1);
}
}
/* Not merging segments so adjust anon_index back */
if (amp2)
return (-1);
}
}
}
}
}
return (0);
}
/*
* Duplicate all the pages in the segment. This may break COW sharing for a
* given page. If the page is marked with inherit zero set, then instead of
* duplicating the page, we zero the page.
*/
static int
{
int error;
size_t i;
/*
* XXX break cow sharing using PAGESIZE
* pages. They will be relocated into larger
* pages at fault time.
*/
while (i-- > 0) {
/*
* prot need not be computed below 'cause anon_private
* is going to ignore it anyway as child doesn't inherit
* pagelock from parent.
*/
/*
* Check whether we should zero this or dup it.
*/
VPP_ISINHZERO(vpp))) {
} else {
if (error != 0)
return (error);
}
return (ENOMEM);
}
}
old_idx++;
new_idx++;
}
return (0);
}
static int
{
struct segvn_data *newsvd;
int error = 0;
/*
* If segment has anon reserved, reserve more for the new seg.
* For a MAP_NORESERVE segment swresv will be a count of all the
* allocated anon slots; thus we reserve for the child as many slots
* as the parent has allocated. This semantic prevents the child or
 * parent from dying during a copy-on-write fault caused by trying
* to write a shared pre-existing anon page.
*/
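/*
 * Minimal sketch (not part of the original source) of the reservation rule
 * described above, assuming anon_resv_zone() returns non-zero on success;
 * the zone argument used here is an assumption for this sketch.
 */
#if 0
	if (svd->swresv != 0) {
		if (anon_resv_zone(svd->swresv,
		    seg->s_as->a_proc->p_zone) == 0)
			return (ENOMEM);	/* mirror parent's reservation */
		newsvd->swresv = svd->swresv;
	}
#endif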
return (ENOMEM);
}
}
newsvd->softlockcnt = 0;
newsvd->softlockcnt_sbase = 0;
newsvd->softlockcnt_send = 0;
/*
* Not attaching to a shared anon object.
*/
} else {
}
newsvd->anon_index = 0;
} else {
/* regions for now are only used on pure vnode segments */
} else {
int reclaim = 1;
/*
* Allocate and initialize new anon_map structure.
*/
newsvd->anon_index = 0;
/*
* We don't have to acquire the anon_map lock
* for the new segment (since it belongs to an
* address space that is still not associated
* with any process), or the segment in the old
* address space (since all threads in it
* are stopped while duplicating the address space).
*/
/*
* The goal of the following code is to make sure that
* softlocked pages do not end up as copy on write
* pages. This would cause problems where one
* thread writes to a page that is COW and a different
* thread in the same process has softlocked it. The
* softlock lock would move away from this process
* because the write would cause this process to get
* a copy (without the softlock).
*
* The strategy here is to just break the
* sharing on pages that could possibly be
* softlocked.
*
* In addition, if any pages have been marked that they
* should be inherited as zero, then we immediately go
* ahead and break COW and zero them. In the case of a
* softlocked page that should be inherited zero, we
* break COW and just get a zero page.
*/
if (svd->softlockcnt ||
/*
* The softlock count might be non zero
* because some pages are still stuck in the
* cache for lazy reclaim. Flush the cache
* now. This should drop the count to zero.
* [or there is really I/O going on to these
* pages]. Note, we have the writers lock so
* nothing gets inserted during the flush.
*/
reclaim = 0;
goto retry;
}
if (error != 0) {
goto out;
}
} else { /* common case */
/*
 * If at least one of the anon slots of a
 * large page exists then make sure
 * all anon slots of that large page
 * exist to avoid partial COW sharing
 * of the large page in the future.
*/
} else {
}
}
}
}
/*
* If necessary, create a vpage structure for the new segment.
* Do not copy any page lock indications.
*/
uint_t i;
for (i = 0; i < npages; i++) {
VPP_CLRPPLOCK(nvp++);
}
} else
/* Inform the vnode of the new mapping */
}
out:
}
return (error);
}
/*
* callback function to invoke free_vp_pages() for only those pages actually
* processed by the HAT when a shared region is destroyed.
*/
extern int free_pages;
static void
{
if (!free_pages) {
return;
}
}
/*
* callback function used by segvn_unmap to invoke free_vp_pages() for only
* those pages actually processed by the HAT
*/
static void
{
}
/*
* This function determines the number of bytes of swap reserved by
* a segment for which per-page accounting is present. It is used to
* calculate the correct value of a segvn_data's swresv.
*/
static size_t
segvn_count_swap_by_vpages(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;
	size_t nswappages = 0;

	evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
	for (vp = svd->vpage; vp < evp; vp++) {
		if (VPP_ISSWAPRES(vp))
			nswappages++;
	}
	return (nswappages << PAGESHIFT);
}
static int
{
struct segvn_data *nsvd;
int reclaim = 1;
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
/*
* Fail the unmap if pages are SOFTLOCKed through this mapping.
* softlockcnt is protected from change by the as write lock.
*/
if (svd->softlockcnt > 0) {
/*
 * If this is a shared segment, a non-zero softlockcnt
* means locked pages are still in use.
*/
return (EAGAIN);
}
/*
* since we do have the writers lock nobody can fill
 * the cache during the purge. The flush either succeeds
 * or we still have pending I/Os.
 */
if (reclaim == 1) {
reclaim = 0;
goto retry;
}
return (EAGAIN);
}
/*
* Check for bad sizes
*/
panic("segvn_unmap");
/*NOTREACHED*/
}
int err;
/*
* could pass a flag to segvn_demote_range()
* below to tell it not to do any unloads but
* this case is rare enough to not bother for
* now.
*/
}
if (err == 0) {
return (IE_RETRY);
}
return (err);
}
}
/* Inform the vnode of the unmapping. */
int error;
return (error);
}
/*
* Remove any page locks set through this mapping.
 * If text replication is not off, no page locks could have been
* established via this mapping.
*/
}
} else {
}
/*
* Unload any hardware translations in the range to be taken
* out. Use a callback to invoke free_vp_pages() effectively.
*/
}
}
}
/*
* Check for entire segment
*/
return (0);
}
/*
* Check for beginning of segment
*/
/* free up old vpage */
}
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
}
/*
* Free up now unused parts of anon_map array.
*/
} else {
len);
}
} else {
}
/*
* Unreserve swap space for the
* unmapped chunk of this segment in
* case it's MAP_SHARED
*/
}
}
}
if (SEG_IS_PARTIAL_RESV(seg))
} else {
} else {
}
}
}
return (0);
}
/*
* Check for end of segment
*/
/* free up old vpage */
}
/*
* Free up now unused parts of anon_map array.
*/
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
}
} else {
len);
}
} else {
}
/*
* Unreserve swap space for the
* unmapped chunk of this segment in
* case it's MAP_SHARED
*/
}
}
}
if (SEG_IS_PARTIAL_RESV(seg))
} else {
} else {
}
}
}
return (0);
}
/*
* The section to go is in the middle of the segment,
* have to make it into two segments. nseg is made for
* the high end while seg is cut down at the low end.
*/
panic("segvn_unmap seg_alloc");
/*NOTREACHED*/
}
nsvd->softlockcnt = 0;
nsvd->softlockcnt_sbase = 0;
nsvd->softlockcnt_send = 0;
}
} else {
/* need to split vpage into two arrays */
/* free up old vpage */
}
nsvd->anon_index = 0;
} else {
/*
* Need to create a new anon map for the new segment.
* We'll also allocate a new smaller array for the old
* smaller segment to save space.
*/
/*
* Free up now unused parts of anon_map array.
*/
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
}
} else {
len);
}
} else {
}
/*
* Unreserve swap space for the
* unmapped chunk of this segment in
* case it's MAP_SHARED
*/
}
}
} else {
svd->anon_index = 0;
nsvd->anon_index = 0;
}
}
if (SEG_IS_PARTIAL_RESV(seg))
} else {
} else {
panic("segvn_unmap: cannot split "
"swap reservation");
/*NOTREACHED*/
}
}
}
}
return (0); /* I'm glad that's all over with! */
}
static void
{
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
/*
* Be sure to unlock pages. XXX Why do things get free'ed instead
* of unmapped? XXX
*/
/*
* Deallocate the vpage and anon pointers if necessary and possible.
*/
}
/*
* If there are no more references to this anon_map
* structure, then deallocate the structure after freeing
* up all the anon slot pointers that we can.
*/
/*
* Private - we only need to anon_free
* the part that this segment refers to.
*/
} else {
}
} else {
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
/*
* Shared - anon_free the entire
* anon_map's worth of stuff and
* release any swap reservation.
*/
} else {
}
}
}
/*
* We had a private mapping which still has
* a held anon_map so just free up all the
* anon slot pointers that we were using.
*/
} else {
}
} else {
}
}
/*
* Release swap reservation.
*/
if (SEG_IS_PARTIAL_RESV(seg))
}
/*
* Release claim on vnode, credentials, and finally free the
* private data.
*/
}
svd->pageadvice = 0;
/*
* Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
* still working with this segment without holding as lock (in case
* it's called by pcache async thread).
*/
}
/*
 * Do an F_SOFTUNLOCK call over the range requested. The range must have
* already been F_SOFTLOCK'ed.
* Caller must always match addr and len of a softunlock with a previous
* softlock with exactly the same addr and len.
*/
static void
{
} else {
}
!= NULL) {
} else {
}
} else {
}
/*
* Use page_find() instead of page_lookup() to
* find the page since we know that it is locked.
*/
"segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
/*NOTREACHED*/
}
hat_setref(pp);
}
}
/*
 * All SOFTLOCKS are gone. Wake up any waiting
* unmappers so they can try again to unmap.
* Check for waiters first without the mutex
* held so we don't always grab the mutex on
* softunlocks.
*/
}
}
}
}
/*
* Release all the pages in the NULL terminated ppp list
* which haven't already been converted to PAGE_HANDLED.
*/
static void
{
if (*ppp != PAGE_HANDLED)
page_unlock(*ppp);
}
}
static int stealcow = 1;
/*
* Workaround for viking chip bug. See bug id 1220902.
* To fix this down in pagefault() would require importing so
 * much as-layer and segvn code as to be unmaintainable.
*/
int enable_mbit_wa = 0;
/*
* Handles all the dirty work of getting the right
* anonymous pages and loading up the translations.
* This routine is called only from segvn_fault()
* when looping over the range of addresses requested.
*
* The basic algorithm here is:
* If this is an anon_zero case
* Call anon_zero to allocate page
* Load up translation
* Return
* endif
* If this is an anon page
* Use anon_getpage to get the page
* else
* Find page in pl[] list passed in
* endif
* If not a cow
* Load up the translation to the page
* return
* endif
* Call anon_private to handle cow
* Load up (writable) translation to new page
*/
static faultcode_t
int brkcow) /* we may need to break cow */
{
int err;
int cow;
int claim;
int steal = 0;
int anon_lock = 0;
}
/*
* Initialize protection value for this page.
* If we have per page protection values check it now.
*/
switch (rw) {
case S_READ:
break;
case S_WRITE:
break;
case S_EXEC:
break;
case S_OTHER:
default:
break;
}
return (FC_PROT); /* illegal access type */
} else {
}
if (type == F_SOFTLOCK) {
}
/*
* Always acquire the anon array lock to prevent 2 threads from
* allocating separate anon slots for the same "addr".
*/
anon_lock = 1;
}
/*
* Allocate a (normally) writable anonymous page of
* zeroes. If no advance reservations, reserve now.
*/
ptob(1));
} else {
goto out;
}
}
goto out; /* out of swap space */
}
/*
* Re-acquire the anon_map lock and
* initialize the anon array entry.
*/
/*
* Handle pages that have been marked for migration
*/
if (lgrp_optimizations())
if (enable_mbit_wa) {
hat_setmod(pp);
prot &= ~PROT_WRITE;
}
/*
* If AS_PAGLCK is set in a_flags (via memcntl(2)
* with MC_LOCKAS, MCL_FUTURE) and this is a
* MAP_NORESERVE segment, we may need to
* permanently lock the page as it is being faulted
* for the first time. The following text applies
* only to MAP_NORESERVE segments:
*
* As per memcntl(2), if this segment was created
* after MCL_FUTURE was applied (a "future"
* segment), its pages must be locked. If this
* segment existed at MCL_FUTURE application (a
* "past" segment), the interface is unclear.
*
* We decide to lock only if vpage is present:
*
* - "future" segments will have a vpage array (see
* as_map), and so will be locked as required
*
* - "past" segments may not have a vpage array,
* depending on whether events (such as
* mprotect) have occurred. Locking if vpage
* exists will preserve legacy behavior. Not
* locking if vpage is absent, will not break
* the interface or legacy behavior. Note that
* allocating vpage here if it's absent requires
* upgrading the segvn reader lock, the cost of
* which does not seem worthwhile.
*
* Usually testing and setting VPP_ISPPLOCK and
* VPP_SETPPLOCK requires holding the segvn lock as
* writer, but in this case all readers are
* serializing on the anon array lock.
*/
!VPP_ISPPLOCK(vpage)) {
mutex_enter(&p->p_lock);
1) == 0) {
} else {
PAGESIZE, 1);
}
}
mutex_exit(&p->p_lock);
}
if (!(hat_flag & HAT_LOAD_LOCK))
return (0);
}
}
/*
* Obtain the page structure via anon_getpage() if it is
* a private copy of an object (the result of a previous
* copy-on-write).
*/
if (err)
goto out;
/*
* If this is a shared mapping to an
* anon_map, then ignore the write
* permissions returned by anon_getpage().
* They apply to the private mappings
* of this anon_map.
*/
vpprot |= PROT_WRITE;
}
}
}
/*
* Search the pl[] list passed in if it is from the
* original object (i.e., not a private copy).
*/
/*
* Find original page. We must be bringing it in
* from the list in pl[].
*/
if (opp == PAGE_HANDLED)
continue;
break;
}
panic("segvn_faultpage not found");
/*NOTREACHED*/
}
*ppp = PAGE_HANDLED;
}
/*
* The fault is treated as a copy-on-write fault if a
* write occurs on a private segment and the object
* page (i.e., mapping) is write protected. We assume
* that fatal protection checks have already been made.
*/
if (brkcow) {
/*
* If we are doing text replication COW on first touch.
*/
} else {
cow = 0;
}
/*
* If not a copy-on-write case load the translation
* and return.
*/
if (cow == 0) {
/*
* Handle pages that have been marked for migration
*/
if (lgrp_optimizations())
prot &= ~PROT_WRITE;
}
if (!(hat_flag & HAT_LOAD_LOCK))
if (anon_lock) {
}
return (0);
}
/*
* Steal the page only if it isn't a private page
* since stealing a private page is not worth the effort.
*/
steal = 1;
/*
* Steal the original page if the following conditions are true:
*
* We are low on memory, the page is not private, page is not large,
* not shared, not modified, not `locked' or if we have it `locked'
* (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
* that the page is not shared) and if it doesn't have any
* translations. page_struct_lock isn't needed to look at p_cowcnt
* and p_lckcnt because we first get exclusive lock on page.
*/
/*
* Check if this page has other translations
* after unloading our translation.
*/
if (hat_page_is_mapped(opp)) {
}
/*
* hat_unload() might sync back someone else's recent
* modification, so check again.
*/
pageflags |= STEAL_PAGE;
}
/*
* If we have a vpage pointer, see if it indicates that we have
* ``locked'' the page we map -- if so, tell anon_private to
* transfer the locking resource to the new page.
*
 * See Statement at the beginning of segvn_lockop regarding
 * the way cowcnts and lckcnts are handled.
*/
/*
* Allocate a private page and perform the copy.
* For MAP_NORESERVE reserve swap space now, unless this
* is a cow fault on an existing anon page in which case
* MAP_NORESERVE will have made advance reservations.
*/
} else {
goto out;
}
}
goto out;
}
/*
* If we copied away from an anonymous page, then
* we are one step closer to freeing up an anon slot.
*
* NOTE: The original anon slot must be released while
* holding the "anon_map" lock. This is necessary to prevent
* other threads from obtaining a pointer to the anon slot
* which may be freed if its "refcnt" is 1.
*/
/*
* Handle pages that have been marked for migration
*/
if (lgrp_optimizations())
if (enable_mbit_wa) {
hat_setmod(pp);
prot &= ~PROT_WRITE;
}
if (!(hat_flag & HAT_LOAD_LOCK))
return (0);
out:
if (anon_lock)
if (type == F_SOFTLOCK) {
}
return (FC_MAKE_ERR(err));
}
/*
* relocate a bunch of smaller targ pages into one large repl page. all targ
* pages must be complete pages smaller than replacement pages.
* it's assumed that no page's szc can change since they are all PAGESIZE or
* complete large pages locked SHARED.
*/
static void
{
pgcnt_t i;
i = 0;
while (repl_npgs) {
int err;
if (curnpgs == 1) {
repl = replacement;
} else {
int j;
for (j = 0; j < curnpgs; j++) {
repl = replacement;
page_pptonum(targ[i]) + j);
}
repl = repl_savepp;
}
panic("segvn_relocate_pages: "
"page_relocate failed err=%d curnpgs=%ld "
}
i += curnpgs;
}
repl = first_repl;
for (i = 0; i < repl_npgs; i++) {
page_downgrade(targ[i]);
repl++;
}
}
/*
* Check if all pages in ppa array are complete smaller than szc pages and
* their roots will still be aligned relative to their current size if the
* entire ppa array is relocated into one szc page. If these conditions are
* not met return 0.
*
* If all pages are properly aligned attempt to upgrade their locks
* to exclusive mode. If it fails set *upgrdfail to 1 and return 0.
* upgrdfail was set to 0 by caller.
*
* Return 1 if all pages are aligned and locked exclusively.
*
* If all pages in ppa array happen to be physically contiguous to make one
* szc page and all exclusive locks are successfully obtained promote the page
* size to szc and set *pszc to szc. Return 1 with pages locked shared.
*/
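/*
 * Sketch (not part of the original source) of how a caller consumes the
 * contract described above.  "ppa", "szc" and the argument order shown are
 * assumptions for this sketch.
 */
#if 0
	int upgrdfail = 0;
	uint_t pszc;

	if (!segvn_full_szcpages(ppa, szc, &upgrdfail, &pszc)) {
		if (upgrdfail) {
			/* couldn't lock all pages SE_EXCL: retry smaller */
		} else {
			/* misaligned: map this range with small pages */
		}
	} else {
		/* all pages locked; p_szc may have been promoted to szc */
	}
#endif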
static int
{
int contig = 1;
pgcnt_t i;
pgcnt_t j;
int root = 0;
for (i = 0; i < totnpgs; i++) {
if (i == 0) {
contig = 0;
} else {
}
contig = 0;
}
if (root) {
return (0);
}
} else if (!root) {
return (0);
}
if (curszc == 0) {
/*
* p_szc changed means we don't have all pages
* locked. return failure.
*/
return (0);
}
!IS_P2ALIGNED(i, curnpgs)) {
return (0);
}
root = 1;
} else {
ASSERT(i > 0);
return (0);
}
panic("segvn_full_szcpages: "
"large page not physically contiguous");
}
root = 0;
}
}
}
for (i = 0; i < totnpgs; i++) {
if (!page_tryupgrade(ppa[i])) {
for (j = 0; j < i; j++) {
page_downgrade(ppa[j]);
}
*upgrdfail = 1;
return (0);
}
}
/*
 * When a page is put on a free cachelist its szc is set to 0. If the file
 * system reclaimed pages from the cachelist, the targ pages will be
 * physically contiguous with p_szc == 0. In this case just upgrade the szc
 * of the targ pages without any relocations.
* To avoid any hat issues with previous small mappings
* hat_pageunload() the target pages first.
*/
if (contig) {
for (i = 0; i < totnpgs; i++) {
}
for (i = 0; i < totnpgs; i++) {
}
for (i = 0; i < totnpgs; i++) {
page_downgrade(ppa[i]);
}
}
}
return (1);
}
/*
* Create physically contiguous pages for [vp, off] - [vp, off +
* page_size(szc)) range and for private segment return them in ppa array.
* Pages are created either via IO or relocations.
*
* Return 1 on success and 0 on failure.
*
* If physically contiguous pages already exist for this range return 1 without
* filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa
* array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE().
*/
static int
int *downsize)
{
int io_err = 0;
int i;
int nios = 0;
/*
* downsize will be set to 1 only if we fail to lock pages. this will
* allow subsequent faults to try to relocate the page again. If we
* fail due to misalignment don't downsize and let the caller map the
* whole region with small mappings to avoid more faults into the area
* where we can't get large pages anyway.
*/
*downsize = 0;
/*
* we pass NULL for nrelocp to page_lookup_create()
* so that it doesn't relocate. We relocate here
* later only after we make sure we can lock all
* pages in the range we handle and they are all
* aligned.
*/
continue;
}
return (1);
}
goto out;
}
/*
* sizing down to pszc won't help.
*/
goto out;
}
/*
* sizing down to pszc won't help.
*/
goto out;
}
*downsize = 1;
goto out;
}
/*
* Some file systems like NFS don't check EOF
* conditions in VOP_PAGEIO(). Check it here
* now that pages are locked SE_EXCL. Any file
* truncation will wait until the pages are
* unlocked so no need to worry that file will
* be truncated after we check its size here.
* XXX fix NFS to remove this check.
*/
goto out;
}
*downsize = 1;
*ret_pszc = 0;
goto out;
}
if (io_err) {
}
goto out;
}
nios++;
}
}
*downsize = 1;
goto out;
}
/*
 * page szc could have changed before the entire group was
 * locked. Reread page szc.
*/
/* link just the roots */
while (--ppages != 0) {
}
}
goto out;
}
*downsize = 1;
*ret_pszc = 0;
goto out;
}
if (io_err) {
}
goto out;
}
nios++;
}
}
/*
* we're now bound to succeed or panic.
* remove pages from done_pplist. it's not needed anymore.
*/
while (done_pplist != NULL) {
pp = done_pplist;
}
while (targ_pplist != NULL) {
int ret;
pp = targ_pplist;
newpp = repl_pplist;
#ifdef DEBUG
#endif
nreloc = 0;
panic("segvn_fill_vp_pages: "
"page_relocate failed");
}
while (nreloc-- != 0) {
pp++;
}
}
for (i = 0; i < pages; i++) {
page_downgrade(ppa[i]);
}
} else {
/*
* the caller will still call VOP_GETPAGE() for shared segments
 * to check FS write permissions. For private segments we map the
 * file read-only anyway, so no VOP_GETPAGE() is needed.
*/
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
return (1);
out:
/*
* Do the cleanup. Unlock target pages we didn't relocate. They are
* linked on targ_pplist by root pages. reassemble unused replacement
* and io pages back to pplist.
*/
do {
}
tmp_pplist = NULL;
while (targ_pplist != NULL) {
pp = targ_pplist;
if (pszc != 0) {
}
pp = repl_pplist;
/* relink replacement page */
while (--ppages != 0) {
pp++;
}
}
if (tmp_pplist != NULL) {
pplist = tmp_pplist;
}
/*
* at this point all pages are either on done_pplist or
* pplist. They can't be all on done_pplist otherwise
* we'd've been done.
*/
if (nios != 0) {
do {
pp = done_pplist;
do {
}
while (done_pplist != NULL) {
pp = done_pplist;
}
return (0);
}
if (io_err) {
/*
* don't downsize on io error.
* see if vop_getpage succeeds.
* pplist may still be used in this case
* for relocations.
*/
return (0);
}
return (0);
}
int segvn_anypgsz = 0;
if ((type) == F_SOFTLOCK) { \
-(pages)); \
}
for (i = 0; i < (pages); i++) { \
hat_setmod((ppa)[i]); \
} \
for (i = 0; i < (pages); i++) { \
prot &= ~PROT_WRITE; \
break; \
} \
} \
} \
}
#ifdef VM_STATS
#define	SEGVN_VMSTAT_FLTVNPAGES(idx)					\
		VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]);
#else /* VM_STATS */
#define SEGVN_VMSTAT_FLTVNPAGES(idx)
#endif
static faultcode_t
{
faultcode_t err = 0;
int ierr;
int vop_size_err = 0;
ulong_t i;
int alloc_failed = 0;
int adjszc_chk;
int physcontig;
int upgrdfail;
int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
}
switch (rw) {
case S_READ:
break;
case S_WRITE:
break;
case S_EXEC:
break;
case S_OTHER:
default:
break;
}
} else {
/* caller has already done segment level protection check. */
}
} else {
}
for (;;) {
adjszc_chk = 0;
if (adjszc_chk) {
uintptr_t e;
if (!IS_P2ALIGNED(a, ppgsz) ||
break;
}
}
}
if (err != 0) {
goto out;
}
}
goto next;
} else {
maxpages) == 0);
}
}
goto out;
}
}
if (type == F_SOFTLOCK) {
pages);
}
physcontig = 0;
pszc = 0;
ierr = -1;
break;
}
int downsize;
if (!physcontig && downsize &&
type != F_SOFTLOCK) {
ierr = -1;
break;
}
ASSERT(!physcontig ||
segtype == MAP_PRIVATE ||
physcontig = 0;
}
}
physcontig = 1;
}
if (!physcontig) {
#ifdef DEBUG
if (ierr == 0) {
for (i = 0; i < pages; i++) {
}
}
#endif /* DEBUG */
if (segtype == MAP_PRIVATE) {
vpprot &= ~PROT_WRITE;
}
} else {
ierr = 0;
}
if (ierr != 0) {
}
goto out;
}
goto out;
}
goto out;
}
goto out;
}
/* can't reduce map area */
vop_size_err = 1;
goto out;
}
pszc = 0;
ierr = -1;
break;
}
}
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
/*
* For private segments SOFTLOCK
* either always breaks cow (any rw
* type except S_READ_NOCOW) or
* address space is locked as writer
* (S_READ_NOCOW case) and anon slots
* can't show up on second check.
* Therefore if we are here for
* SOFTLOCK case it must be a cow
* break but cow break never reduces
* szc. text replication (tron) in
* this case works as cow break.
* Thus the assert below.
*/
type != F_SOFTLOCK);
ierr = -2;
break;
}
goto again;
}
#ifdef DEBUG
}
#endif /* DEBUG */
if (ierr != 0) {
goto out;
}
/*
* p_szc can't be changed for locked
* swapfs pages.
*/
hat_flag);
if (!(hat_flag & HAT_LOAD_LOCK)) {
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
goto next;
}
/*
* hat_page_demote() needs an SE_EXCL lock on one of
* constituent page_t's and it decreases root's p_szc
 * last. This means that if root's p_szc is equal to szc and
 * all its constituent pages are locked, any
 * hat_page_demote() that could have changed p_szc to
 * szc is already done and no new hat_page_demote()
 * can start for this large page.
*/
/*
* we need to make sure same mapping size is used for
* the same address range if there's a possibility the
 * address is already mapped because the hat layer panics
* when translation is loaded for the range already
* mapped with a different page size. We achieve it
* by always using largest page size possible subject
* to the constraints of page size, segment page size
* and page alignment. Since mappings are invalidated
* when those constraints change and make it
* impossible to use previously used mapping size no
* mapping size conflicts should happen.
*/
#ifdef DEBUG
for (i = 0; i < pages; i++) {
pfn + i);
}
#endif /* DEBUG */
/*
* All pages are of szc we need and they are
* all locked so they can't change szc. load
* translations.
*
* if page got promoted since last check
* we don't need pplist.
*/
}
if (PP_ISMIGRATE(ppa[0])) {
}
if (!(hat_flag & HAT_LOAD_LOCK)) {
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
}
goto next;
}
/*
* See if upsize is possible.
*/
} else {
}
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
ierr = -2;
break;
}
}
/*
* check if we should use smallest mapping size.
*/
upgrdfail = 0;
if (szc == 0 ||
&pszc))) {
/*
* segvn_full_szcpages failed to lock
* all pages EXCL. Size down.
*/
}
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
ierr = -1;
break;
}
}
}
if (upgrdfail && segvn_anypgsz_vnode) {
/* SOFTLOCK case */
} else {
for (i = 0; i < pages; i++) {
a + (i << PAGESHIFT),
}
}
if (!(hat_flag & HAT_LOAD_LOCK)) {
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
}
goto next;
}
/*
* segvn_full_szcpages() upgraded pages szc.
*/
goto chkszc;
}
/*
* p_szc of ppa[0] can change since we haven't
* locked all constituent pages. Call
* page_lock_szc() to prevent szc changes.
* This should be a rare case that happens when
* multiple segments use a different page size
* to map the same file offsets.
*/
}
goto chkszc;
}
/*
* page got promoted since last check.
* we don't need preaalocated large
* page.
*/
}
if (!(hat_flag & HAT_LOAD_LOCK)) {
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
}
goto next;
}
/*
* if page got demoted since last check
* we could have not allocated larger page.
* allocate now.
*/
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
ierr = -1;
break;
}
#ifdef DEBUG
} else {
#endif /* DEBUG */
}
for (i = 0; i < pages; i++) {
a + (i << PAGESHIFT),
}
} else {
}
if (!(hat_flag & HAT_LOAD_LOCK)) {
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
}
}
next:
}
adjszc_chk = 1;
}
if (a == lpgeaddr)
break;
/*
* ierr == -1 means we failed to map with a large page.
 * (either due to allocation/relocation failures or
 * misalignment with other mappings to this file).
 *
 * ierr == -2 means some other thread allocated a large page
 * after we gave up trying to map with a large page. Retry with
* larger mapping.
*/
if (ierr == -2) {
} else if (segvn_anypgsz_vnode) {
szc--;
} else {
/*
* other process created pszc large page.
* but we still have to drop to 0 szc.
*/
szc = 0;
}
if (ierr == -2) {
/*
* Size up case. Note lpgaddr may only be needed for
* softlock case so we don't adjust it here.
*/
} else {
/*
* Size down case. Note lpgaddr may only be needed for
* softlock case so we don't adjust it here.
*/
if (a < addr) {
/*
* The beginning of the large page region can
* be pulled to the right to make a smaller
* region. We haven't yet faulted a single
* page.
*/
}
}
}
out:
if (!err && !vop_size_err) {
return (0);
}
}
if (!vop_size_err) {
return (err);
}
/*
* Large page end is mapped beyond the end of file and it's a cow
* fault (can be a text replication induced cow) or softlock so we can't
* reduce the map area. For now just demote the segment. This should
* really only happen if the end of the file changed after the mapping
* was established since when large page segments are created we make
* sure they don't extend beyond the end of the file.
*/
err = 0;
if (err != 0) {
}
}
/* segvn_fault will do its job as if szc had been zero to begin with */
}
/*
* This routine will attempt to fault in one large page.
 * It will use smaller pages if that fails.
* It should only be called for pure anonymous segments.
*/
static faultcode_t
{
int ierr;
ulong_t i;
int adjszc_chk;
}
switch (rw) {
case S_READ:
break;
case S_WRITE:
break;
case S_EXEC:
break;
case S_OTHER:
default:
break;
}
} else {
/* caller has already done segment level protection check. */
}
for (;;) {
adjszc_chk = 0;
goto error;
}
}
pgsz);
}
if (type == F_SOFTLOCK) {
pages);
}
if (ierr != 0) {
if (type == F_SOFTLOCK) {
-pages);
}
if (ierr > 0) {
goto error;
}
break;
}
/*
* Handle pages that have been marked for migration
*/
if (lgrp_optimizations())
if (segtype == MAP_SHARED) {
vpprot |= PROT_WRITE;
}
if (hat_flag & HAT_LOAD_LOCK) {
} else {
for (i = 0; i < pages; i++)
page_unlock(ppa[i]);
}
adjszc_chk = 1;
}
if (a == lpgeaddr)
break;
/*
* ierr == -1 means we failed to allocate a large page.
* so do a size down operation.
*
* ierr == -2 means some other process that privately shares
* pages with this process has allocated a larger page and we
* need to retry with larger pages. So do a size up
* operation. This relies on the fact that large pages are
* never partially shared i.e. if we share any constituent
* page of a large page with another process we must share the
* entire large page. Note this cannot happen for SOFTLOCK
* case, unless current address (a) is at the beginning of the
* next page size boundary because the other process couldn't
* have relocated locked pages.
*/
if (segvn_anypgsz) {
} else {
/*
* For non COW faults and segvn_anypgsz == 0
* we need to be careful not to loop forever
* if existing page is found with szc other
* than 0 or seg->s_szc. This could be due
* to page relocations on behalf of DR or
* more likely large page creation. For this
* case simply re-size to existing page's szc
* if returned by anon_map_getpages().
*/
} else {
}
}
if (type == F_SOFTLOCK) {
/*
* For softlocks we cannot reduce the fault area
* (calculated based on the largest page size for this
* segment) for size down and a is already next
 * page size aligned as asserted above for size
* ups. Therefore just continue in case of softlock.
*/
continue; /* keep lint happy */
} else if (ierr == -2) {
/*
* Size up case. Note lpgaddr may only be needed for
* softlock case so we don't adjust it here.
*/
} else {
/*
* Size down case. Note lpgaddr may only be needed for
* softlock case so we don't adjust it here.
*/
if (a < addr) {
/*
* The beginning of the large page region can
* be pulled to the right to make a smaller
* region. We haven't yet faulted a single
* page.
*/
}
}
}
return (0);
}
return (err);
}
/*
* This routine is called via a machine specific fault handling routine.
* It is also called by software routines wishing to lock or unlock
* a range of addresses.
*
* Here is the basic algorithm:
* If unlocking
* Call segvn_softunlock
* Return
* endif
* Checking and set up work
* If we will need some non-anonymous pages
* Call VOP_GETPAGE over the range of non-anonymous pages
* endif
* Loop over all addresses requested
* Call segvn_faultpage passing in page list
* to load up translations and handle anonymous pages
* endloop
* Load up translation to any additional pages in page list not
* already handled that fit into this segment
*/
static faultcode_t
{
caddr_t a;
int err;
int dogetpage = 0;
/*
* First handle the easy stuff
*/
if (type == F_SOFTUNLOCK) {
if (rw == S_READ_NOCOW) {
}
return (0);
}
if (brkcow == 0) {
}
}
return (FC_PROT);
}
segvn_textunrepl(seg, 0);
}
}
top:
/*
* If we have the same protections for the entire segment,
* insure that the access being attempted is legitimate.
*/
switch (rw) {
case S_READ:
case S_READ_NOCOW:
break;
case S_WRITE:
break;
case S_EXEC:
break;
case S_OTHER:
default:
break;
}
return (FC_PROT); /* illegal access type */
}
}
/* this must be SOFTLOCK S_READ fault */
/*
* this must be the first ever non S_READ_NOCOW
* softlock for this segment.
*/
}
goto top;
}
/*
* We can't allow the long term use of softlocks for vmpss segments,
* because in some file truncation cases we should be able to demote
* the segment, which requires that there are no softlocks. The
* only case where it's ok to allow a SOFTLOCK fault against a vmpss
* segment is S_READ_NOCOW, where the caller holds the address space
* locked as writer and calls softunlock before dropping the as lock.
* S_READ_NOCOW is used by /proc to read memory from another user.
*
* Another deadlock between SOFTLOCK and file truncation can happen
* because segvn_fault_vnodepages() calls the FS one pagesize at
* a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages()
* can cause a deadlock because the first set of page_t's remain
* locked SE_SHARED. To avoid this, we demote segments on a first
* SOFTLOCK if they have a length greater than the segment's
* page size.
*
* So for now, we only avoid demoting a segment on a SOFTLOCK when
* the access type is S_READ_NOCOW and the fault length is less than
* or equal to the segment's page size. While this is quite restrictive,
* it should be the most common case of SOFTLOCK against a vmpss
* segment.
*
* For S_READ_NOCOW, it's safe not to do a copy on write because the
* caller makes sure no COW will be caused by another thread for a
* softlocked page.
*/
int demote = 0;
if (rw != S_READ_NOCOW) {
demote = 1;
}
lpgeaddr);
demote = 1;
}
}
if (demote) {
if (err) {
return (FC_MAKE_ERR(err));
}
}
goto top;
}
}
/*
* Check to see if we need to allocate an anon_map structure.
*/
/*
* Drop the "read" lock on the segment and acquire
* the "write" version since we have to allocate the
* anon_map.
*/
}
/*
* Start all over again since segment protections
* may have changed after we dropped the "read" lock.
*/
goto top;
}
/*
* S_READ_NOCOW vs S_READ distinction was
* only needed for the code above. After
* that we treat it as S_READ.
*/
if (rw == S_READ_NOCOW) {
}
/*
* MADV_SEQUENTIAL work is ignored for large page segments.
*/
} else {
goto top;
}
}
return (err);
}
/*
* The fast path could apply to S_WRITE also, except
* that the protection fault could be caused by lazy
* tlb flush when ro->rw. In this case, the pte is
* RW already. But RO in the other cpu's tlb causes
* the fault. Since hat_chgprot won't do anything if
* pte doesn't change, we may end up faulting
* indefinitely until the RO tlb entry gets replaced.
*/
goto slow;
}
}
return (0);
}
}
slow:
else
/*
* If MADV_SEQUENTIAL has been set for the particular page we
* are faulting on, free behind all pages in the segment and put
* them on the free list.
*/
(svd->pageadvice &&
break;
/*
* If this is an anon page, we must find the
* correct <vp, offset> for it
*/
&cookie);
} else {
}
} else {
}
break; /* XXX */
/*
* Skip pages that are free or have an
* "exclusive" lock.
*/
break;
/*
* We don't need the page_struct_lock to test
* as this is only advisory; even if we
* acquire it someone might race in and lock
* the page after we unlock and before the
* PUTPAGE, then VOP_PUTPAGE will do nothing.
*/
/*
* Hold the vnode before releasing
* the page lock to prevent it from
* being freed and re-used by some
* other thread.
*/
/*
* We should build a page list
* to kluster putpages XXX
*/
(void) VOP_PUTPAGE(fvp,
} else {
/*
* XXX - Should the loop terminate if
* the page is `locked'?
*/
}
--vpp;
--fanon_index;
}
}
}
pl_alloc_sz = 0;
/*
* See if we need to call VOP_GETPAGE for
* *any* of the range being faulted on.
* We can skip all of this work if there
* was no original vnode.
*/
dogetpage = 1;
else {
/*
* Only acquire reader lock to prevent amp->ahp
* from being changed. It's ok to miss pages,
* hence we don't do anon_array_enter
*/
/* inline non_anon() */
else
}
if (dogetpage) {
/*
* Page list won't fit in local array,
* allocate one of the needed size.
*/
/*
* Ask VOP_GETPAGE to return the exact number
* of pages if
* (a) this is a COW fault, or
* (b) this is a software fault, or
* (c) next page is already mapped.
*/
} else {
/*
* Ask VOP_GETPAGE to return adjacent pages
* within the segment.
*/
}
/*
* Need to get some non-anonymous pages.
* We need to make only one call to GETPAGE to do
* this to prevent certain deadlocking conditions
* when we are doing locking. In this case
* non_anon() should have picked up the smallest
* range which includes all the non-anonymous
* pages in the requested range. We have to
* be careful regarding which rw flag to pass in
* because on a private mapping, the underlying
* object is never allowed to be written.
*/
} else {
}
"segvn_getpage:seg %p addr %p vp %p",
if (err) {
if (pl_alloc_sz)
return (FC_MAKE_ERR(err));
}
vpprot &= ~PROT_WRITE;
}
}
/*
* N.B. at this time the plp array has all the needed non-anon
* pages in addition to (possibly) having some adjacent pages.
*/
/*
* Always acquire the anon_array_lock to prevent
* 2 threads from allocating separate anon slots for
* the same "addr".
*
* If this is a copy-on-write fault and we don't already
* have the anon_array_lock, acquire it to prevent the
* fault routine from handling multiple copy-on-write faults
* on the same "addr" in the same address space.
*
* Only one thread should deal with the fault since after
* it is handled, the other threads can acquire a translation
* to the newly created private page. This prevents two or
* more threads from creating different private pages for the
* same fault.
*
* We grab "serialization" lock here if this is a MAP_PRIVATE segment
* to prevent deadlock between this thread and another thread
* which has soft-locked this page and wants to acquire serial_lock.
* ( bug 4026339 )
*
* The fix for bug 4026339 becomes unnecessary when using the
* locking scheme with per amp rwlock and a global set of hash
* lock, anon_array_lock. If we steal a vnode page when low
 * on memory and upgrade the page lock through page_rename,
* then the page is PAGE_HANDLED, nothing needs to be done
* for this page after returning from segvn_faultpage.
*
* But really, the page lock should be downgraded after
* the stolen page is page_rename'd.
*/
/*
* Ok, now loop over the address range and handle faults
*/
if (err) {
S_OTHER);
}
if (pl_alloc_sz)
return (err);
}
if (vpage) {
vpage++;
}
}
/* Didn't get pages from the underlying fs so we're done */
if (!dogetpage)
goto done;
/*
* Now handle any other pages in the list returned.
* If the page can be used, load up the translations now.
* Note that the for loop will only be entered if "plp"
* is pointing to a non-NULL page pointer which means that
* VOP_GETPAGE() was called and vpprot has been initialized.
*/
/*
* Large Files: diff should be unsigned value because we started
* supporting > 2GB segment sizes from 2.5.1 and when a
* large file of size > 2GB gets mapped to address space
* the diff value can be > 2GB.
*/
int anon_index;
int hat_flag = HAT_LOAD_ADV;
}
if (pp == PAGE_HANDLED)
continue;
/*
* Large Files: Following is the assertion
* validating the above cast.
*/
/*
* Prevent other threads in the address space from
* creating private pages (i.e., allocating anon slots)
* while we are in the process of loading translations
* to additional pages returned by the underlying
* object.
*/
}
hat_setmod(pp);
prot &= ~PROT_WRITE;
}
/*
* Skip mapping read ahead pages marked
* for migration, so they will get migrated
* properly on fault
*/
}
}
}
}
done:
if (pl_alloc_sz)
return (0);
}
/*
* This routine is used to start I/O on pages asynchronously. XXX it will
* only create PAGESIZE pages. At fault time they will be relocated into
* larger pages.
*/
static faultcode_t
{
int err;
/*
* Reader lock to prevent amp->ahp from being changed.
* This is advisory, it's ok to miss a page, so
* we don't do anon_array_enter lock.
*/
if (err)
return (FC_MAKE_ERR(err));
return (0);
}
}
return (0); /* zfod page - do nothing now */
}
if (err)
return (FC_MAKE_ERR(err));
return (0);
}
static int
{
int unload_done = 0;
return (EACCES); /* violated maxprot */
/* return if prot is the same */
return (0);
}
/*
* Since we change protections we first have to flush the cache.
* This makes sure all the pagelock calls have to recheck
* protections.
*/
if (svd->softlockcnt > 0) {
/*
 * If this is a shared segment, a non-zero softlockcnt
* means locked pages are still in use.
*/
return (EAGAIN);
}
/*
* Since we do have the segvn writers lock nobody can fill
* the cache with entries belonging to this seg during
* the purge. The flush either succeeds or we still have
* pending I/Os.
*/
if (svd->softlockcnt > 0) {
return (EAGAIN);
}
}
unload_done = 1;
segvn_textunrepl(seg, 0);
unload_done = 1;
}
}
int err;
/*
* If we are holding the as lock as a reader then
* we need to return IE_RETRY and let the as
* layer drop and re-acquire the lock as a writer.
*/
return (IE_RETRY);
SDR_END, 0);
} else {
}
if (err == 0)
return (IE_RETRY);
return (IE_NOMEM);
return (err);
}
}
/*
* If it's a private mapping and we're making it writable then we
* may have to reserve the additional swap space now. If we are
* making writable only a part of the segment then we use its vpage
* array to keep a record of the pages for which we have reserved
* swap. In this case we set the pageswap field in the segment's
* segvn structure to record this.
*
* If it's a private mapping to a file (i.e., vp != NULL) and we're
* removing write permission on the entire segment and we haven't
* modified any pages, we can release the swap space.
*/
if (prot & PROT_WRITE) {
/*
* Start by determining how much swap
* space is required.
*/
/* The whole segment */
} else {
/*
* Make sure that the vpage array
* exists, and make a note of the
* range of elements corresponding
* to len.
*/
return (ENOMEM);
}
/*
* This is the first time we've
* asked for a part of this
* segment, so we need to
* reserve everything we've
* been asked for.
*/
} else {
/*
* We have to count the number
* of pages required.
*/
cvp++) {
if (!VPP_ISSWAPRES(cvp))
sz++;
}
}
}
/* Try to reserve the necessary swap. */
if (anon_resv_zone(sz,
return (IE_NOMEM);
}
/*
* Make a note of how much swap space
* we've reserved.
*/
} else {
if (!VPP_ISSWAPRES(cvp))
}
}
}
} else {
/*
* Swap space is released only if this segment
* does not map anonymous memory, since read faults
* on such segments still need an anon slot to read
* in the data.
*/
"anon proc:%p %lu %u", seg, 0, 0);
}
}
}
return (0); /* all done */
}
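/*
 * Illustrative sketch (not part of segvn): counting how many pages in a range
 * still need swap reserved before write access is granted, modeled with a
 * plain per-page flag array in place of the vpage array and VPP_ISSWAPRES.
 * All names here are hypothetical.
 */
#include <stddef.h>

#define	SWAP_RESERVED	0x1

static size_t
pages_needing_swap(const unsigned char *vpages, size_t first, size_t npages)
{
	size_t i, need = 0;

	for (i = first; i < first + npages; i++) {
		if ((vpages[i] & SWAP_RESERVED) == 0)
			need++;		/* no reservation for this page yet */
	}
	return (need);			/* reserve need * PAGESIZE of swap */
}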
/*
* A vpage structure exists or else the change does not
* involve the entire segment. Establish a vpage structure
* if none is there. Then, for each page in the range,
* adjust its individual permissions. Note that write-
* enabling a MAP_PRIVATE page can affect the claims for
* locked down memory. Overcommitting memory terminates
* the operation.
*/
return (ENOMEM);
}
}
/*
* See Statement at the beginning of segvn_lockop regarding
* the way cowcnts and lckcnts are handled.
*/
&cookie);
}
}
break;
}
}
anon_idx++;
} else {
&cookie);
}
if (VPP_ISPPLOCK(svp) &&
} else
panic("segvn_setprot: no page");
/*NOTREACHED*/
}
PROT_WRITE) {
if (prot & PROT_WRITE) {
if (!page_addclaim(
pp)) {
break;
}
} else {
if (!page_subclaim(
pp)) {
break;
}
}
}
}
}
/*
* Did we terminate prematurely? If so, simply unload
* the translations to the things we've updated so far.
*/
if (unload_done) {
return (IE_NOMEM);
}
if (len != 0)
len, HAT_UNLOAD);
return (IE_NOMEM);
}
} else {
return (ENOMEM);
}
}
}
if (unload_done) {
return (0);
}
if (((prot & PROT_WRITE) != 0 &&
/*
* Either private or shared data with write access (in
* which case we need to throw out all former translations
* so that we get the right translations set up on fault
* and we don't allow write access to any copy-on-write pages
* that might be around or to prevent write access to pages
* representing holes in a file), or we don't have permission
* to access the memory at all (in which case we have to
* unload any current translations that might exist).
*/
} else {
/*
* A shared mapping or a private mapping in which write
* protection is going to be denied - just change all the
* protections over the range of addresses in question.
* segvn does not support any attributes other than
* prot, so we can use hat_chgattr.
*/
}
return (0);
}
/*
* segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize,
* to determine if the seg is capable of mapping the requested szc.
*/
static int
{
struct segvn_data *nsvd;
int err;
return (0);
}
/*
* addr should always be pgsz aligned but eaddr may be misaligned if
* it's at the end of the segment.
*
* XXX we should assert this condition since as_setpagesize() logic
* guarantees it.
*/
return (EINVAL);
}
return (EINVAL);
}
}
szc > segvn_maxpgszc) {
return (EINVAL);
}
/* paranoid check */
return (EINVAL);
}
return (EINVAL);
}
/*
* Check that protections are the same within new page
* size boundaries.
*/
return (EINVAL);
}
} else {
return (EINVAL);
}
}
}
}
/*
* Since we are changing page size we first have to flush
* the cache. This makes sure all the pagelock calls have
* to recheck protections.
*/
if (svd->softlockcnt > 0) {
/*
* If this is a shared segment, a non-zero softlockcnt
* means locked pages are still in use.
*/
return (EAGAIN);
}
/*
* Since we do have the segvn writers lock nobody can fill
* the cache with entries belonging to this seg during
* the purge. The flush either succeeds or we still have
* pending I/Os.
*/
if (svd->softlockcnt > 0) {
return (EAGAIN);
}
}
}
/*
* Operation for sub range of existing segment.
*/
if (err == 0) {
return (IE_RETRY);
}
return (IE_NOMEM);
}
return (err);
}
/* eaddr is szc aligned */
}
return (IE_RETRY);
}
/* eaddr is szc aligned */
}
return (IE_RETRY);
}
/*
* Break any low level sharing and reset seg->s_szc to 0.
*/
}
return (err);
}
/*
* If the end of the current segment is not pgsz aligned
* then attempt to concatenate with the next segment.
*/
return (ENOMEM);
}
return (EINVAL);
}
if (nsvd->softlockcnt > 0) {
/*
* If this is a shared segment, a non-zero softlockcnt
* means locked pages are still in use.
*/
return (EAGAIN);
}
if (nsvd->softlockcnt > 0) {
return (EAGAIN);
}
}
}
if (err != 0) {
return (err);
}
if (err == -1) {
return (EINVAL);
}
if (err == -2) {
return (IE_NOMEM);
}
return (IE_RETRY);
}
/*
* May need to re-align anon array to
* new szc.
*/
return (IE_NOMEM);
}
return (IE_NOMEM);
}
svd->anon_index = 0;
}
}
return (EINVAL);
}
return (EINVAL);
}
/*
* anon_fill_cow_holes() may call VOP_GETPAGE().
* don't take anon map lock here to avoid holding it
* across VOP_GETPAGE() calls that may call back into
* segvn for klustering checks. We don't really need
* anon map lock here since it's a private segment and
* we hold as level lock as writers.
*/
return (EINVAL);
}
}
}
}
}
return (0);
}
static int
{
int err = 0;
int pageflag = 0;
return (0);
}
} else {
}
/*
* do HAT_UNLOAD_UNMAP since we are changing the pagesize.
* unload argument is 0 when we are freeing the segment
* and unload was already done.
*/
}
return (0);
}
/*
* XXX anon rwlock is not really needed because this is a
* private segment and we are writers.
*/
}
goto out;
}
} else {
continue;
}
goto out;
}
goto out;
}
}
}
}
out:
return (err);
}
static int
{
int err = 0;
return (1);
return (1);
anon = 0;
}
}
if (!VPP_ISPPLOCK(svp))
continue;
if (anon) {
panic("segvn_claim_pages: no anon slot");
}
}
panic("segvn_claim_pages: no page");
}
}
return (1);
}
/* Find each large page within ppa, and adjust its claim */
/* Does ppa cover a single large page? */
if (prot & PROT_WRITE)
else
} else {
if (prot & PROT_WRITE)
else
if (err == 0)
break;
}
}
for (i = 0; i < pg_idx; i++) {
page_unlock(ppa[i]);
}
return (err);
}
/*
* Returns right (upper address) segment if split occurred.
* If the address is equal to the beginning or end of its segment it returns
* the current segment.
*/
static struct seg *
{
struct segvn_data *nsvd;
return (seg);
} else {
/*
* The offset for an anonymous segment has no significance in
* terms of an offset into a file. If we were to use the above
* calculation instead, the structures read out of
* /proc/<pid>/xmap would be more difficult to decipher since
* it would be unclear whether two seemingly contiguous
* prxmap_t structures represented different segments or a
* single segment that had been split up into multiple prxmap_t
* structures (e.g. if some part of the segment had not yet
* been faulted in).
*/
}
}
svd->anon_index = 0;
nsvd->anon_index = 0;
}
/*
* Split the amount of swap reserved.
*/
/*
* For MAP_NORESERVE, only allocate swap reserve for pages
* being used. Other segments get enough to cover whole
* segment.
*/
} else {
} else {
}
}
}
return (nseg);
}
/*
* called on memory operations (unmap, setprot, setpagesize) for a subset
* of a large page segment to either demote the memory range (SDR_RANGE)
* or the ends (SDR_END) by addr/len.
*
* returns 0 on success. returns errno, including ENOMEM, on failure.
*/
static int
int flag,
{
int err;
/* demote entire range */
} else {
}
}
} else {
}
return (err);
}
} else {
}
if (err != 0) {
return (err);
}
} else {
}
}
}
return (0);
return (err);
}
}
}
return (0);
}
static int
{
/*
* If segment protection can be used, simply check against them.
*/
int err;
return (err);
}
/*
* Have to check down to the vpage level.
*/
return (EACCES);
}
}
return (0);
}
static int
{
if (pgno != 0) {
do {
} while (pgno != 0);
} else {
do {
pgno--;
} while (pgno != 0);
}
}
return (0);
}
static u_offset_t
{
}
/*ARGSUSED*/
static int
{
MAP_INITDATA)));
}
/*ARGSUSED*/
static int
{
return (0);
}
/*
* Check to see if it makes sense to do kluster/read ahead to
* addr + delta relative to the mapping at addr. We assume here
* that delta is a signed PAGESIZE'd multiple (which can be negative).
*
* For segvn, we currently "approve" of the action if we are
* still in the segment and it maps from the same vp/off,
* or if the advice stored in segvn_data or vpages allows it.
* Currently, klustering is not allowed only if MADV_RANDOM is set.
*/
static int
{
return (-1); /* exceeded segment bounds */
/*
* Check to see if either of the pages addr or addr + delta
* have advice set that prevents klustering (if MADV_RANDOM advice
* is set for entire segment, or MADV_SEQUENTIAL is set and delta
* is negative).
*/
return (-1);
return (-1);
return (-1);
}
return (0); /* shared mapping - all ok */
return (0); /* off original vnode */
return (-1); /* one with and one without an anon */
}
return (0); /* off original vnode */
}
/*
* Now we know we have two anon pointers - check to
* see if they happen to be properly allocated.
*/
/*
* XXX We cheat here and don't lock the anon slots. We can't because
* we may have been called from the anon layer which might already
* have locked them. We are holding a refcnt on the slots so they
* can't disappear. The worst that will happen is we'll get the wrong
* names (vp, off) for the slots and make a poor klustering decision.
*/
return (-1);
return (0);
}
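/*
 * Illustrative sketch (not part of segvn): the per-page decision made by the
 * swap-out path described in the comment below, reduced to its two cases:
 * dirty pages are written back and freed on I/O completion, clean pages are
 * freed immediately. The types and callbacks are hypothetical stand-ins.
 */
typedef struct swapout_pg {
	int mapped;	/* still mapped by some segment */
	int dirty;	/* modified since it was last written */
} swapout_pg_t;

/* Returns 1 if the page can be credited as swapped out. */
static int
swapout_one(swapout_pg_t *pp,
    void (*writeback_and_free)(swapout_pg_t *),
    void (*free_page)(swapout_pg_t *))
{
	if (pp->mapped)
		return (0);		/* some other segment still uses it */
	if (pp->dirty)
		writeback_and_free(pp);	/* push to swap, free on completion */
	else
		free_page(pp);		/* clean: just toss it */
	return (1);
}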
/*
* Swap the pages of seg out to secondary storage, returning the
* number of bytes of storage freed.
*
* The basic idea is first to unload all translations and then to call
* VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
* swap device. Pages to which other segments have mappings will remain
* mapped and won't be swapped. Our caller (as_swapout) has already
* performed the unloading step.
*
* The value returned is intended to correlate well with the process's
* memory requirements. However, there are some caveats:
* 1) When given a shared segment as argument, this routine will
* only succeed in swapping out pages for the last sharer of the
* segment. (Previous callers will only have decremented mapping
* reference counts.)
* 2) We assume that the hat layer maintains a large enough translation
* cache to capture process reference patterns.
*/
static size_t
{
/*
* Find pages unmapped by our caller and force them
* out to the virtual swap device.
*/
/*
* Obtain <vp, off> pair for the page, then look it up.
*
* Note that this code is willing to consider regular
* pages as well as anon pages. Is this appropriate here?
*/
&cookie)) {
continue;
}
} else {
}
} else {
}
continue;
}
continue;
/*
* Examine the page to see whether it can be tossed out,
* keeping track of how many we've found.
*/
if (!page_tryupgrade(pp)) {
/*
* If the page has an i/o lock and no mappings,
* it's very likely that the page is being
* written out as a result of klustering.
* Assume this is so and take credit for it here.
*/
if (!page_io_trylock(pp)) {
if (!hat_page_is_mapped(pp))
pgcnt++;
} else {
}
continue;
}
/*
* Skip if page is locked or has mappings.
* We don't need the page_struct_lock to look at lckcnt
* and cowcnt because the page is exclusive locked.
*/
hat_page_is_mapped(pp)) {
continue;
}
/*
* dispose skips large pages so try to demote first.
*/
/*
* XXX should skip the remaining page_t's of this
* large page.
*/
continue;
}
/*
* No longer mapped -- we can toss it out. How
* we do so depends on whether or not it's dirty.
*/
/*
* We must clean the page before it can be
* freed. Setting B_FREE will cause pvn_done
* to free the page when the i/o completes.
* XXX: This also causes it to be accounted
* as a pageout instead of a swap: need
* B_SWAPOUT bit to use instead of B_FREE.
*
* Hold the vnode before releasing the page lock
* to prevent it from being freed and re-used by
* some other thread.
*/
/*
* Queue all i/o requests for the pageout thread
* to avoid saturating the pageout devices.
*/
} else {
/*
* The page was clean, free it.
*
* XXX: Can we ever encounter modified pages
* with no associated vnode here?
*/
/*LINTED: constant in conditional context*/
}
/*
* Credit now even if i/o is in progress.
*/
pgcnt++;
}
/*
* Wakeup pageout to initiate i/o on all queued requests.
*/
}
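/*
 * Illustrative user-level example (not part of segvn): the synchronization
 * routine below is typically driven by msync(3C) on a mapped region. Minimal
 * error handling; treat this as a sketch only.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>

static void
sync_region(caddr_t addr, size_t len)
{
	/* Write back dirty pages and wait for the I/O to complete. */
	if (msync(addr, len, MS_SYNC) != 0)
		perror("msync(MS_SYNC)");

	/* Drop cached copies so the next access rereads from the file. */
	if (msync(addr, len, MS_INVALIDATE) != 0)
		perror("msync(MS_INVALIDATE)");
}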
/*
* Synchronize primary storage cache with real object in virtual memory.
*
* XXX - Anonymous pages should not be sync'ed out at all.
*/
static int
{
int bflags;
int err = 0;
int segtype;
int pageprot;
int prot;
if (svd->softlockcnt > 0) {
/*
* If this is a shared segment, a non-zero softlockcnt
* means locked pages are still in use.
*/
return (EAGAIN);
}
/*
* flush all pages from seg cache
* otherwise we may deadlock in swap_putpage
* for B_INVAL page (4175402).
*
* Even if we grab segvn WRITER's lock
* here, there might be another thread which could've
* successfully performed lookup/insert just before
* we acquired the lock here. So, grabbing either
* lock here is of not much use. Until we devise
* a strategy at upper layers to solve the
* synchronization issues completely, we expect
* applications to handle this appropriately.
*/
if (svd->softlockcnt > 0) {
return (EAGAIN);
}
/*
* Try to purge this amp's entries from pcache. It will
* succeed only if other segments that share the amp have no
* outstanding softlock's.
*/
return (EAGAIN);
}
}
if (attr) {
/*
* We are done if the segment types don't match
* or if we have segment level protections and
* they don't match.
*/
return (0);
}
return (0);
}
} else
(flags & MS_INVALIDATE) == 0) {
/*
* No attributes, no anonymous pages and MS_INVALIDATE flag
* is not on, just use one big request.
*/
return (err);
}
} else {
}
} else {
}
continue;
if (attr) {
if (vpp) {
vpp++;
}
continue;
}
}
/*
* See if any of these pages are locked -- if so, then we
* will have to truncate an invalidate request at the first
* locked one. We don't need the page_struct_lock to test
* as this is only advisory; even if we acquire it someone
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
if (flags & MS_INVALIDATE) {
return (EBUSY);
}
page_tryupgrade(pp)) {
/*
* swapfs VN_DISPOSE() won't
* invalidate large pages.
* Attempt to demote.
* XXX can't help it if it
* fails. But for swapfs
* pages it is no big deal.
*/
(void) page_try_demote_pages(
pp);
}
}
}
/*
* Avoid writing out to disk ISM's large pages
* because segspt_free_pages() relies on NULL an_pvp
* of anon slots of such pages.
*/
/*
* swapfs uses page_lookup_nowait if not freeing or
* invalidating and skips a page if
* page_lookup_nowait returns NULL.
*/
continue;
}
continue;
}
/*
* Note ISM pages are created large so (vp, off)'s
* page cannot suddenly become large after we unlock
* pp.
*/
}
/*
* XXX - Should ultimately try to kluster
* calls to VOP_PUTPAGE() for performance.
*/
if (err)
break;
}
return (err);
}
/*
* Determine if we have data corresponding to pages in the
* primary storage virtual memory cache (i.e., "in core").
*/
static size_t
{
int ret;
return (len); /* no anonymous pages created yet */
}
}
}
/* A page exists for the anon slot */
ret |= SEG_PAGE_INCORE;
/*
* If page is mapped and writable
*/
ret |= SEG_PAGE_ANON;
}
/*
* Don't get page_struct lock for lckcnt and cowcnt,
* since this is purely advisory.
*/
ret |= SEG_PAGE_SOFTLOCK;
ret |= SEG_PAGE_HASCOW;
}
}
/* Gather vnode statistics */
/*
* Try to obtain a "shared" lock on the page
* without blocking. If this fails, determine
* if the page is in memory.
*/
/* Page is incore, and is named */
}
/*
* Don't get page_struct lock for lckcnt and cowcnt,
* since this is purely advisory.
*/
ret |= SEG_PAGE_SOFTLOCK;
ret |= SEG_PAGE_HASCOW;
}
}
/* Gather virtual page information */
if (vpp) {
if (VPP_ISPPLOCK(vpp))
ret |= SEG_PAGE_LOCKED;
vpp++;
}
}
return (len);
}
/*
* Statement for p_cowcnts/p_lckcnts.
*
* p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region
* irrespective of the following factors or anything else:
*
* (1) anon slots are populated or not
* (2) cow is broken or not
* (3) refcnt on ap is 1 or greater than 1
*
* If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock
* and munlock.
*
*
* Handling p_cowcnts/p_lckcnts during copy-on-write fault:
*
* if vpage has PROT_WRITE
* transfer cowcnt on the oldpage -> cowcnt on the newpage
* else
* transfer lckcnt on the oldpage -> lckcnt on the newpage
*
* During copy-on-write, decrement p_cowcnt on the oldpage and increment
* p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
*
* We may also break COW if softlocking on read access in the physio case.
* In this case, vpage may not have PROT_WRITE. So, we need to decrement
* p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
* vpage doesn't have PROT_WRITE.
*
*
* Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
*
* If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
* increment p_lckcnt by calling page_subclaim() which takes care of
* availrmem accounting and p_lckcnt overflow.
*
* If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
* increment p_cowcnt by calling page_addclaim() which takes care of
* availrmem availability and p_cowcnt overflow.
*/
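/*
 * Illustrative sketch (not part of segvn): a tiny model of moving a locked
 * page's claim between a lock count and a cow count, in the direction the
 * statement above describes for page_addclaim()/page_subclaim(). availrmem
 * accounting and overflow handling are omitted; the names are hypothetical.
 */
#include <stdint.h>

typedef struct claim_pg {
	uint32_t lckcnt;	/* stands in for p_lckcnt */
	uint32_t cowcnt;	/* stands in for p_cowcnt */
} claim_pg_t;

/* MAP_PRIVATE region gains PROT_WRITE: move one claim lckcnt -> cowcnt. */
static int
model_addclaim(claim_pg_t *pp)
{
	if (pp->lckcnt == 0)
		return (0);	/* nothing to transfer */
	pp->lckcnt--;
	pp->cowcnt++;
	return (1);
}

/* MAP_PRIVATE region loses PROT_WRITE: move one claim cowcnt -> lckcnt. */
static int
model_subclaim(claim_pg_t *pp)
{
	if (pp->cowcnt == 0)
		return (0);
	pp->cowcnt--;
	pp->lckcnt++;
	return (1);
}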
/*
* Lock down (or unlock) pages mapped by this segment.
*
* XXX only creates PAGESIZE pages if anon slots are not initialized.
* At fault time they will be relocated into larger pages.
*/
static int
{
int segtype;
int pageprot;
int claim;
int chargeproc = 1;
size_t locked_bytes = 0;
size_t unlocked_bytes = 0;
int err = 0;
/*
* Hold write lock on address space because may split or concatenate
* segments
*/
/*
* If this is a shm, use shm's project and zone, else use
* project and zone of calling process
*/
/* Determine if this segment backs a sysV shm */
chargeproc = 0;
}
if (attr) {
/*
* We are done if the segment types don't match
* or if we have segment level protections and
* they don't match.
*/
return (0);
}
return (0);
}
}
segvn_textunrepl(seg, 0);
}
}
/*
* If we're locking, then we must create a vpage structure if
* none exists. If we're unlocking, then check to see if there
* is a vpage -- if not, then we could not have locked anything.
*/
return (ENOMEM);
}
} else {
return (0);
}
}
/*
* The anonymous data vector (i.e., previously
* unreferenced mapping to swap space) can be allocated
* by lazily testing for its existence.
*/
}
}
/* determine number of unlocked bytes in range for lock operation */
vpp++) {
if (!VPP_ISPPLOCK(vpp))
}
} else {
/* Only count sysV pages once for locked memory */
continue;
}
continue;
}
}
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
if (err) {
return (err);
}
}
/*
* Loop over all pages in the range. Process if we're locking and
* page has not already been locked in this mapping; or if we're
* unlocking and the page has been locked.
*/
/*
* If this isn't a MAP_NORESERVE segment and
* we're locking, allocate anon slots if they
* don't exist. The page is brought in later on.
*/
== NULL)) {
anon_index)) == NULL) {
goto out;
}
anon_index) == NULL);
}
}
/*
* Get name for page, accounting for
* existence of private copy.
*/
} else {
continue;
}
}
}
} else {
}
/*
* Get page frame. It's ok if the page is
* not available when we're unlocking, as this
* may simply mean that a page we locked got
* truncated out of existence after we locked it.
*
* Invoke VOP_GETPAGE() to obtain the page struct
* since we may need to read it from disk if its
* been paged out.
*/
else {
int error;
}
/*
* If the error is EDEADLK then we must bounce
* up and drop all vm subsystem locks and then
* retry the operation later
* This behavior is a temporary measure because
* ufs/sds logging is badly designed and will
* deadlock if we don't allow this bounce to
* happen. The real solution is to re-design
* the logging code to work properly. See bug
* 4125102 for details of the problem.
*/
goto out;
}
/*
* Quit if we fail to fault in the page. Treat
* the failure as an error, unless the addr
* is mapped beyond the end of a file.
*/
goto out;
}
goto out;
}
goto out;
} else if (error) {
goto out;
}
}
/*
* See Statement at the beginning of this routine.
*
* claim is always set if MAP_PRIVATE and PROT_WRITE
* irrespective of following factors:
*
* (1) anon slots are populated or not
* (2) cow is broken or not
* (3) refcnt on ap is 1 or greater than 1
*
* See 4140683 for details
*/
/*
* Perform page-level operation appropriate to
* operation. If locking, undo the SOFTLOCK
* performed to bring the page into memory
* after setting the lock. If unlocking,
* and no page was found, account for the claim
* separately.
*/
}
}
if (ret == 0) {
/* locking page failed */
goto out;
}
locked_bytes += PAGESIZE;
} else
locked_bytes += PAGESIZE;
} else {
/* sysV pages should be locked */
+= PAGESIZE;
} else
} else {
}
}
}
}
out:
/* Credit back bytes that did not get locked */
if ((unlocked_bytes - locked_bytes) > 0) {
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
}
} else {
/* Account bytes that were unlocked */
if (unlocked_bytes > 0) {
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
}
}
return (err);
}
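/*
 * Illustrative user-level example (not part of segvn): the advice values
 * enumerated in the comment that follows normally reach this layer via
 * madvise(3C) on a mapped region. Minimal error handling; a sketch only.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>

static void
advise_region(caddr_t addr, size_t len)
{
	/* Hint that the region will be walked sequentially. */
	if (madvise(addr, len, MADV_SEQUENTIAL) != 0)
		perror("madvise(MADV_SEQUENTIAL)");

	/* Later, tell the system the contents may be discarded. */
	if (madvise(addr, len, MADV_FREE) != 0)
		perror("madvise(MADV_FREE)");
}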
/*
* Set advice from user for specified pages
* There are 10 types of advice:
* MADV_NORMAL - Normal (default) behavior (whatever that is)
* MADV_RANDOM - Random page references
* do not allow readahead or 'klustering'
* MADV_SEQUENTIAL - Sequential page references
* Pages previous to the one currently being
* accessed (determined by fault) are 'not needed'
* and are freed immediately
* MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl)
* MADV_DONTNEED - Pages are not needed (synced out in mctl)
* MADV_FREE - Contents can be discarded
* MADV_ACCESS_DEFAULT- Default access
* MADV_ACCESS_LWP - Next LWP will access heavily
* MADV_ACCESS_MANY- Many LWPs or processes will access heavily
* MADV_PURGE - Contents will be immediately discarded
*/
static int
{
int err = 0;
int already_set;
/*
* In case of MADV_FREE/MADV_PURGE, we won't be modifying any segment
* private data structures; so, we only need to grab READER's lock
*/
return (0);
}
} else {
}
/*
* Large pages are assumed to be only turned on when accesses to the
* segment's address range have spatial and temporal locality. That
* justifies ignoring MADV_SEQUENTIAL for large page segments.
* Also, ignore advice affecting lgroup memory allocation
* if we don't need to do lgroup optimizations on this system
*/
if ((behav == MADV_SEQUENTIAL &&
return (0);
}
/*
* Since we are going to unload hat mappings
* we first have to flush the cache. Otherwise
* this might lead to system panic if another
* thread is doing physio on the range whose
* mappings are unloaded by madvise(3C).
*/
if (svd->softlockcnt > 0) {
/*
* If this is a shared segment, a non-zero softlockcnt
* means locked pages are still in use.
*/
return (EAGAIN);
}
/*
* Since we do have the segvn writers lock
* nobody can fill the cache with entries
* belonging to this seg during the purge.
* The flush either succeeds or we still
* have pending I/Os. In the latter case,
* madvise(3C) fails.
*/
if (svd->softlockcnt > 0) {
/*
* Since madvise(3C) is advisory and
* it's not part of UNIX98, madvise(3C)
* failure here doesn't cause any hardship.
* Note that we don't block in "as" layer.
*/
return (EAGAIN);
}
/*
* Try to purge this amp's entries from pcache. It
* will succeed only if other segments that share the
* amp have no outstanding softlock's.
*/
}
}
/*
* MADV_FREE is not supported for segments with an
* underlying object; if anonmap is NULL, anon slots
* are not yet populated and there is nothing for us
* to do. As MADV_FREE is advisory, we don't return an
* error in either case.
*/
return (0);
}
/*
* If we're here with a NULL anonmap, it's because we
* are doing a MADV_PURGE. We have nothing to do, but
* because MADV_PURGE isn't merely advisory, we return
* an error in this case.
*/
return (EBUSY);
}
/*
* If we purged pages on a MAP_NORESERVE mapping, we
* need to be sure to now unreserve our reserved swap.
* (We use the atomic operations to manipulate our
* segment and address space counters because we only
* have the corresponding locks held as reader, not
* writer.)
*/
}
/*
* MADV_PURGE and MADV_FREE differ in their return semantics:
* because MADV_PURGE is designed to be bug-for-bug compatible
* with its clumsy Linux forebear, it will fail where MADV_FREE
* does not.
*/
}
/*
* If advice is to be applied to entire segment,
* use advice field in seg_data structure
* otherwise use appropriate vpage entry.
*/
switch (behav) {
case MADV_ACCESS_LWP:
case MADV_ACCESS_MANY:
case MADV_ACCESS_DEFAULT:
/*
* Set memory allocation policy for this segment
*/
else {
/*
* For private memory, need writers lock on
* address space because the segment may be
* split or concatenated when changing policy
*/
return (IE_RETRY);
}
}
/*
* If policy set already and it shouldn't be reapplied,
* don't do anything.
*/
if (already_set &&
break;
/*
* Mark any existing pages in given range for
* migration
*/
/*
* If same policy set already or this is a shared
* memory segment, don't need to try to concatenate
* segment with adjacent ones.
*/
break;
/*
* Try to concatenate this segment with previous
* one and next one, since we changed policy for
* this one and it may be compatible with adjacent
* ones now.
*/
/*
* Drop lock for private data of current
* segment before concatenating (deleting) it
* and return IE_REATTACH to tell as_ctl() that
* current segment has changed
*/
err = IE_REATTACH;
return (err);
}
break;
case MADV_SEQUENTIAL:
/*
* unloading mapping guarantees
* detection in segvn_fault
*/
/* FALLTHROUGH */
case MADV_NORMAL:
case MADV_RANDOM:
svd->pageadvice = 0;
break;
case MADV_WILLNEED: /* handled in memcntl */
case MADV_DONTNEED: /* handled in memcntl */
case MADV_FREE: /* handled above */
case MADV_PURGE: /* handled above */
break;
default:
}
} else {
struct segvn_data *new_svd;
return (ENOMEM);
}
switch (behav) {
case MADV_ACCESS_LWP:
case MADV_ACCESS_MANY:
case MADV_ACCESS_DEFAULT:
/*
* Set memory allocation policy for portion of this
* segment
*/
/*
* Align address and length of advice to page
* boundaries for large pages
*/
}
/*
* Check to see whether policy is set already
*/
else
/*
* If policy set already and it shouldn't be reapplied,
* don't do anything.
*/
if (already_set &&
break;
/*
* For private memory, need writers lock on
* address space because the segment may be
* split or concatenated when changing policy
*/
return (IE_RETRY);
}
/*
* Mark any existing pages in given range for
* migration
*/
/*
* Don't need to try to split or concatenate
* segments, since policy is same or this is a shared
* memory segment
*/
break;
}
/*
* Split off new segment if advice only applies to a
* portion of existing segment starting in middle
*/
/*
* Must flush I/O page cache
* before splitting segment
*/
if (svd->softlockcnt > 0)
/*
* Split segment and return IE_REATTACH to tell
* as_ctl() that current segment changed
*/
err = IE_REATTACH;
/*
* If new segment ends where old one
* did, try to concatenate the new
* segment with next one.
*/
/*
* Set policy for new segment
*/
(void) lgrp_privm_policy_set(policy,
new_seg);
if (next &&
(void) segvn_concat(new_seg,
next, 1);
}
}
/*
* Split off end of existing segment if advice only
* applies to a portion of segment ending before
* end of the existing segment
*/
/*
* Must flush I/O page cache
* before splitting segment
*/
if (svd->softlockcnt > 0)
/*
* If beginning of old segment was already
* split off, use new segment to split end off
* from.
*/
/*
* Split segment
*/
/*
* Set policy for new segment
*/
(void) lgrp_privm_policy_set(policy,
} else {
/*
* Split segment and return IE_REATTACH
* to tell as_ctl() that current
* segment changed
*/
err = IE_REATTACH;
(void) lgrp_privm_policy_set(policy,
/*
* If new segment starts where old one
* did, try to concatenate it with
* previous segment.
*/
seg);
/*
* Drop lock for private data
* of current segment before
* concatenating (deleting) it
*/
if (prev &&
&segvn_ops &&
(void) segvn_concat(
return (err);
}
}
}
}
break;
case MADV_SEQUENTIAL:
/* FALLTHROUGH */
case MADV_NORMAL:
case MADV_RANDOM:
break;
case MADV_WILLNEED: /* handled in memcntl */
case MADV_DONTNEED: /* handled in memcntl */
case MADV_FREE: /* handled above */
case MADV_PURGE: /* handled above */
break;
default:
}
}
return (err);
}
/*
* There is one kind of inheritance that can be specified for pages:
*
* SEGP_INH_ZERO - Pages should be zeroed in the child
*/
static int
{
int ret = 0;
/* Can't support something we don't know about */
if (behav != SEGP_INH_ZERO)
return (ENOTSUP);
/*
* This must be a straightforward anonymous segment that is mapped
* privately and is not backed by a vnode.
*/
goto out;
}
/*
* If the entire segment has been marked as inherit zero, then there is no
* reason to do anything else.
*/
ret = 0;
goto out;
}
/*
* If this applies to the entire segment, simply mark it and we're done.
*/
ret = 0;
goto out;
}
/*
* We've been asked to mark a subset of this segment as inherit zero,
* therefore we need to manipulate its vpages.
*/
goto out;
}
}
ret = 0;
out:
return (ret);
}
/*
* Create a vpage structure for this seg.
*/
static void
{
static pgcnt_t page_limit = 0;
/*
* If no vpage structure exists, allocate one. Copy the protections
* and the advice from the segment itself to the individual pages.
*/
/*
* Start by calculating the number of pages we must allocate to
* track the per-page vpage structs needed for this entire
* segment. If we know now that it will require more than our
* heuristic for the maximum amount of kmem we can consume then
* fail. We do this here, instead of trying to detect this deep
* in page_resv and propagating the error up, since the entire
* memory allocation stack is not amenable to passing this
* back. Instead, it wants to keep trying.
*
* As a heuristic we set a page limit of 5/8s of total_pages
* for this allocation. We use shifts so that no floating
* point conversion takes place and only need to do the
* calculation once.
*/
if (page_limit == 0)
	page_limit = (total_pages >> 1) + (total_pages >> 3);
if (npages > page_limit)
return;
}
}
}
/*
* Dump the pages belonging to this segvn segment.
*/
static void
{
struct segvn_data *svd;
}
int we_own_it = 0;
} else {
}
/*
* If pp == NULL, the page either does not exist
* or is exclusively locked. So determine if it
* exists before searching for it.
*/
we_own_it = 1;
else
if (pp) {
if (we_own_it)
}
}
}
#ifdef DEBUG
static uint32_t segvn_pglock_mtbf = 0;
#endif
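/*
 * Illustrative sketch (not part of segvn): a simplified model of the
 * softlockcnt accounting contrast described in the comment below. For a
 * shared segment the count moves on every pagelock/pageunlock call; for a
 * private segment it moves only when a shadow list is created or later
 * reclaimed from pcache. All names are hypothetical.
 */
#include <stdint.h>

typedef struct sl_seg {
	int		shared;		/* MAP_SHARED style accounting? */
	uint64_t	softlockcnt;	/* outstanding pagelock claims */
} sl_seg_t;

static void
model_pagelock(sl_seg_t *seg, int created_new_shadow_list)
{
	if (seg->shared || created_new_shadow_list)
		seg->softlockcnt++;
}

static void
model_unlock_or_reclaim(sl_seg_t *seg, int reclaimed_shadow_list)
{
	if (seg->shared || reclaimed_shadow_list)
		seg->softlockcnt--;
}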
/*
* uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages
* to avoid the overhead of per page locking, unlocking for subsequent IOs to
* the same parts of the segment. Currently shadow list creation is only
* supported for pure anon segments. MAP_PRIVATE segment pcache entries are
* tagged with segment pointer, starting virtual address and length. This
* approach for MAP_SHARED segments may add many pcache entries for the same
* set of pages and lead to long hash chains that decrease pcache lookup
* performance. To avoid this issue for shared segments shared anon map and
* starting anon index are used for pcache entry tagging. This allows all
* segments to share pcache entries for the same anon range and reduces pcache
* chain's length as well as memory overhead from duplicate shadow lists and
* pcache entries.
*
* softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd
* pages via segvn_fault() and pagelock'd pages via this routine. But pagelock
* part of softlockcnt accounting is done differently for private and shared
* segments. In private segment case softlock is only incremented when a new
* shadow list is created but not when an existing one is found via
* seg_plookup(). pcache entries have reference count incremented/decremented
* by each seg_plookup()/seg_pinactive() operation. Only entries that have 0
* reference count can be purged (and purging is needed before segment can be
* freed). When a private segment pcache entry is purged segvn_reclaim() will
* decrement softlockcnt. Since in private segment case each of its pcache
* entries only belongs to this segment we can expect that when
* segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
* segment purge will succeed and softlockcnt will drop to 0. In shared
* segment case reference count in pcache entry counts active locks from many
* different segments so we can't expect segment purging to succeed even when
* segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
* segment. To be able to determine when there're no pending pagelocks in
* shared segment case we don't rely on purging to make softlockcnt drop to 0
* but instead softlockcnt is incremented and decremented for every
* segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow
* list was created or an existing one was found. When softlockcnt drops to 0
* this segment no longer has any claims for pcached shadow lists and the
* segment can be freed even if there're still active pcache entries
* shared by this segment anon map. Shared segment pcache entries belong to
* anon map and are typically removed when anon map is freed after all
* processes destroy the segments that use this anon map.
*/
static int
{
caddr_t a;
int anlock;
int use_pcache;
int sftlck_sbase = 0;
int sftlck_send = 0;
#ifdef DEBUG
if ((ts % segvn_pglock_mtbf) == 0) {
return (ENOTSUP);
}
return (EFAULT);
}
}
#endif
/*
* for now we only support pagelock to anon memory. We would have to
* check protections for vnode objects and call into the vnode driver.
* That's too much for a fast path. Let the fault entry point handle
* it.
*/
if (type == L_PAGELOCK) {
goto out;
}
panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL");
}
if (type == L_PAGELOCK) {
goto out;
}
panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL");
}
if (type == L_PAGELOCK) {
goto out;
}
panic("segvn_pagelock(L_PAGEUNLOCK): bad rw");
}
/*
* We are adjusting the pagelock region to the large page size
* boundary because the unlocked part of a large page cannot
* be freed anyway unless all constituent pages of a large
* page are locked. Bigger regions reduce pcache chain length
* and improve lookup performance. The tradeoff is that the
* very first segvn_pagelock() call for a given page is more
* expensive if only 1 page_t is needed for IO. This is only
* an issue if pcache entry doesn't get reused by several
* subsequent calls. We optimize here for the case when pcache
* is heavily used by repeated IOs to the same address range.
*
* Note segment's page size cannot change while we are holding
* as lock. And then it cannot change while softlockcnt is
* not 0. This will allow us to correctly recalculate large
* page size region for the matching pageunlock/reclaim call
* since as_pageunlock() caller must always match
* as_pagelock() call's addr and len.
*
* For pageunlock *ppp points to the pointer of page_t that
* corresponds to the real unadjusted start address. Similar
* for pagelock *ppp must point to the pointer of page_t that
* corresponds to the real unadjusted start address.
*/
} else if (len < segvn_pglock_comb_thrshld) {
adjustpages = 0;
} else {
/*
* Align the address range of large enough requests to allow
* combining of different shadow lists into 1 to reduce memory
* overhead from potentially overlapping large shadow lists
* (worst case is we have a 1MB IO into buffers with start
* addresses separated by 4K). Alignment is only possible if
* padded chunks have sufficient access permissions. Note
* permissions won't change between L_PAGELOCK and
* L_PAGEUNLOCK calls since non 0 softlockcnt will force
* segvn_setprot() to wait until softlockcnt drops to 0. This
* allows us to determine in L_PAGEUNLOCK the same range we
* computed in L_PAGELOCK.
*
* If alignment is limited by segment ends set
* sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when
* these flags are set bump softlockcnt_sbase/softlockcnt_send
* per segment counters. In L_PAGEUNLOCK case decrease
* softlockcnt_sbase/softlockcnt_send counters if
* sftlck_sbase/sftlck_send flags are set. When
* softlockcnt_sbase/softlockcnt_send are non 0
* segvn_concat()/segvn_extend_prev()/segvn_extend_next()
* won't merge the segments. This restriction combined with
* restriction on segment unmapping and splitting for segments
* that have non 0 softlockcnt allows L_PAGEUNLOCK to
* correctly determine the same range that was previously
* locked by matching L_PAGELOCK.
*/
sftlck_sbase = 1;
}
} else {
sftlck_sbase = 1;
} else {
}
}
break;
}
vp++;
}
pflags = 0;
}
}
if (pflags) {
} else {
lpgeaddr = 0;
} else {
}
}
if (lpgeaddr == 0 ||
sftlck_send = 1;
}
}
break;
}
vp++;
}
}
}
}
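/*
 * Illustrative sketch (not part of segvn): rounding a pagelock request
 * outward to large-page boundaries, as the adjustment above describes. pgsz
 * is assumed to be a power of two; names are hypothetical.
 */
#include <stdint.h>
#include <stddef.h>

static void
lpg_align(uintptr_t addr, size_t len, size_t pgsz,
    uintptr_t *lpgaddr, uintptr_t *lpgeaddr)
{
	uintptr_t mask = ~((uintptr_t)pgsz - 1);

	*lpgaddr = addr & mask;				/* round start down */
	*lpgeaddr = (addr + len + pgsz - 1) & mask;	/* round end up */
}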
/*
* For MAP_SHARED segments we create pcache entries tagged by amp and
* anon index so that we can share pcache entries with other segments
* that map this amp. For private segments pcache entries are tagged
* with segment and virtual address.
*/
} else {
}
if (type == L_PAGEUNLOCK) {
/*
* update hat ref bits for /proc. We need to make sure
* that threads tracing the ref and mod bits of the
* address space get the right data.
* Note: page ref and mod bits are updated at reclaim time
*/
} else {
}
}
}
/*
* Check the shadow list entry after the last page used in
* this IO request. If it's NOPCACHE_SHWLIST the shadow list
* was not inserted into pcache and is not large page
* adjusted. In this case call reclaim callback directly and
* don't adjust the shadow list start and size for large
* pages.
*/
void *ptag;
} else {
}
} else {
}
}
if (sftlck_sbase) {
}
if (sftlck_send) {
}
/*
* If someone is blocked while unmapping, we purge
* segment page cache and thus reclaim pplist synchronously
* without waiting for seg_pasync_thread. This speeds up
* unmapping in cases where munmap(2) is called, while
* raw async i/o is still in progress or where a thread
* exits on data fault in a multithreaded application.
*/
if (svd->softlockcnt == 0) {
}
/*
* softlockcnt is not 0 and this is a
* MAP_PRIVATE segment. Try to purge its
* pcache entries to reduce softlockcnt.
* If it drops to 0 segvn_reclaim()
* will wake up a thread waiting on
* unmapwait flag.
*
* We don't purge MAP_SHARED segments with non
* 0 softlockcnt since IO is still in progress
* for such segments.
*/
}
}
return (0);
}
/* The L_PAGELOCK case ... */
/*
* For MAP_SHARED segments we have to check protections before
* seg_plookup() since pcache entries may be shared by many segments
* with potentially different page protections.
*/
goto out;
}
} else {
/*
* check page protections
*/
a = lpgaddr;
} else {
a = addr;
}
goto out;
}
}
}
}
/*
* try to find pages in segment page cache
*/
npages);
}
if (sftlck_sbase) {
}
if (sftlck_send) {
}
return (0);
}
/*
* For MAP_SHARED segments we already verified above that segment
* protections allow this pagelock operation.
*/
goto out;
}
} else {
wlen = 0;
}
} else {
int wcont = 1;
/*
* check page protections
*/
goto out;
}
} else {
wcont = 0;
}
}
}
}
/*
* Only build large page adjusted shadow list if we expect to insert
* it into pcache. For large enough pages it's a big overhead to
* create a shadow list of the entire large page. But this overhead
* should be amortized over repeated pcache hits on subsequent reuse
* of this shadow list (IO into any range within this shadow list will
* find it in pcache since we large page align the request for pcache
* lookups). pcache performance is improved with bigger shadow lists
* as it reduces the time to pcache the entire big segment and reduces
* pcache chain length.
*/
use_pcache = 1;
} else {
use_pcache = 0;
/*
* Since this entry will not be inserted into the pcache, we
* will not do any adjustments to the starting address or
* size of the memory to be locked.
*/
adjustpages = 0;
}
/*
* If use_pcache is 0 this shadow list is not large page adjusted.
* Record this info in the last entry of shadow array so that
* L_PAGEUNLOCK can determine if it should large page adjust the
* address range to find the real range that was locked.
*/
anlock = 0;
/*
* Lock and unlock anon array only once per large page.
* anon_array_enter() locks the root anon slot according to
* a_szc which can't change while anon map is locked. We lock
* anon the first time through this loop and each time we
* reach anon index that corresponds to a root of a large
* page.
*/
anlock = 1;
}
/*
* We must never use seg_pcache for COW pages
* because we might end up with original page still
* lying in seg_pcache even after private page is
* created. This leads to data corruption as
* aio_write refers to the page still in cache
* while all other accesses refer to the private
* page.
*/
break;
}
} else {
}
anlock = 0;
if (error) {
break;
}
anlock = 1;
break;
}
}
break;
}
}
/*
* Unlock anon if this is the last slot in a large page.
*/
anlock = 0;
}
}
if (anlock) { /* Ensure the lock is dropped */
}
npages);
}
if (sftlck_sbase) {
}
if (sftlck_send) {
}
if (use_pcache) {
}
return (0);
}
np--;
pplist++;
}
out:
return (error);
}
/*
* purge any cached pages in the I/O page cache
*/
static void
{
/*
* pcache is only used by pure anon segments.
*/
return;
}
/*
* For MAP_SHARED segments non 0 segment's softlockcnt means
* active IO is still in progress via this segment. So we only
* purge MAP_SHARED segments when their softlockcnt is 0.
*/
if (svd->softlockcnt) {
}
}
}
/*
* If async argument is not 0 we are called from pcache async thread and don't
* hold AS lock.
*/
/*ARGSUSED*/
static int
{
} else {
hat_setref(*pplist);
}
np--;
pplist++;
}
/*
* If we are pcache async thread we don't hold AS lock. This means if
* softlockcnt drops to 0 after the decrement below address space may
* get freed. We can't allow it since after softlock decrement to 0 we
* still need to access the as structure for possible wakeup of unmap
* waiters. To prevent the disappearance of as we take this segment
* segfree_syncmtx. segvn_free() also takes this mutex as a barrier to
* make sure this routine completes before segment is freed.
*
* The second complication we have to deal with in async case is a
* possibility of missed wake up of unmap wait thread. When we don't
* hold as lock here we may take a_contents lock before unmap wait
* thread that was first to see softlockcnt was still not 0. As a
* result we'll fail to wake up an unmap wait thread. To avoid this
* race we set nounmapwait flag in as structure if we drop softlockcnt
* to 0 when we were called by pcache async thread. unmapwait thread
* will not block if this flag is set.
*/
if (async) {
}
if (async) {
}
}
}
}
if (async) {
}
return (0);
}
/*ARGSUSED*/
static int
{
} else {
hat_setref(*pplist);
}
np--;
pplist++;
}
/*
* If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt
* drops to 0. anon map can't be freed until a_softlockcnt drops to 0
* and anonmap_purge() acquires a_purgemtx.
*/
amp->a_purgewait) {
amp->a_purgewait = 0;
}
return (0);
}
/*
* get a memory ID for an addr in a given segment
*
* XXX only creates PAGESIZE pages if anon slots are not initialized.
* At fault time they will be relocated into larger pages.
*/
static int
{
return (0);
}
return (0);
} else {
}
return (ENOMEM);
}
== NULL);
ap, ANON_SLEEP);
}
return (0);
}
}
return (EINVAL);
}
static int
{
return (1);
vpage++;
pages--;
while (pages-- > 0) {
return (0);
vpage++;
}
return (1);
}
/*
* Get memory allocation policy info for specified address in given segment
*/
static lgrp_mem_policy_info_t *
{
struct segvn_data *svn_data;
return (NULL);
/*
* Get policy info for private or shared memory
*/
} else {
}
} else {
}
return (policy_info);
}
/*ARGSUSED*/
static int
{
return (0);
}
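/*
 * Illustrative sketch (not part of segvn): the kind of lookup key used by the
 * text replication cache described in the comment below -- one replica amp
 * per (vnode, offset range, page size) per lgroup. The structure and the
 * comparison helper are hypothetical.
 */
#include <stdint.h>

typedef struct repl_key {
	const void	*vp;	/* identity of the text vnode */
	uint64_t	off;	/* starting file offset of the mapping */
	uint64_t	eoff;	/* ending file offset */
	uint32_t	szc;	/* page size code */
} repl_key_t;

/* Two mappings may share a replica only if their keys match exactly. */
static int
repl_key_match(const repl_key_t *a, const repl_key_t *b)
{
	return (a->vp == b->vp && a->off == b->off &&
	    a->eoff == b->eoff && a->szc == b->szc);
}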
/*
* Bind text vnode segment to an amp. If we bind successfully mappings will be
* established to per vnode mapping per lgroup amp pages instead of to vnode
* pages. There's one amp per vnode text mapping per lgroup. Many processes
* may share the same text replication amp. If a suitable amp doesn't already
* exist in svntr hash table create a new one. We may fail to bind to amp if
* segment is not eligible for text replication. Code below first checks for
* these conditions. If binding is successful segment tr_state is set to on
* and svd->amp points to the amp to use. Otherwise tr_state is set to off and
* svd->amp remains as NULL.
*/
static void
{
int first;
/*
* If numa optimizations are no longer desired bail out.
*/
if (!lgrp_optimizations()) {
return;
}
/*
* Avoid creating anon maps with size bigger than the file size.
* If VOP_GETATTR() call fails bail out.
*/
return;
}
return;
}
/*
* VVMEXEC may not be set yet if exec() prefaults text segment. Set
* this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED
* mapping that checks if trcache for this vnode needs to be
* invalidated can't miss us.
*/
}
/*
* Bail out if potentially MAP_SHARED writable mappings exist to this
* vnode. We don't want to use old file contents from existing
* replicas if this mapping was established after the original file
* was changed.
*/
return;
}
continue;
}
/*
* Bail out if the file or its attributes were changed after
* this replication entry was created since we need to use the
* latest file contents. Note that mtime test alone is not
* sufficient because a user can explicitly change mtime via
* utimes(2) interfaces back to the old value after modifying
* the file contents. To detect this case we also have to test
* ctime which among other things records the time of the last
* mtime change by utimes(2). ctime is not changed when the file
* is only read or executed so we expect that typically existing
* replication amp's can be used most of the time.
*/
return;
}
/*
* if off, eoff and szc match current segment we found the
* existing entry we can use.
*/
break;
}
/*
* Don't create different but overlapping in file offsets
* entries to avoid replication of the same file pages more
* than once per lgroup.
*/
return;
}
}
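/*
 * Illustrative user-level sketch (not part of segvn): why both mtime and
 * ctime are compared, as explained above. utimes(2) can roll mtime back, but
 * that call itself bumps ctime, so a change to either timestamp invalidates a
 * cached view of the file. The helper name is hypothetical.
 */
#include <sys/stat.h>

static int
file_unchanged(const struct stat *cached, const struct stat *now)
{
	return (cached->st_mtim.tv_sec == now->st_mtim.tv_sec &&
	    cached->st_mtim.tv_nsec == now->st_mtim.tv_nsec &&
	    cached->st_ctim.tv_sec == now->st_ctim.tv_sec &&
	    cached->st_ctim.tv_nsec == now->st_ctim.tv_nsec);
}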
/*
* If we didn't find existing entry create a new one.
*/
return;
}
#ifdef DEBUG
{
lgrp_id_t i;
for (i = 0; i < NLGRPS_MAX; i++) {
}
}
#endif /* DEBUG */
}
first = 1;
/*
* We want to pick a replica with pages on main thread's (t_tid = 1,
* aka T1) lgrp. Currently text replication is only optimized for
* workloads that either have all threads of a process on the same
* lgrp or execute their large text primarily on main thread.
*/
lgrp_id = p->p_t1_lgrpid;
/*
* In case exec() prefaults text on non main thread use
* current thread lgrpid. It will become main thread anyway
* soon.
*/
}
/*
* Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise
* just set it to NLGRPS_MAX if it's different from current process T1
* home lgrp. p_tr_lgrpid is used to detect if process uses text
* replication and T1 new home is different from lgrp used for text
* replication. When this happens the asynchronous segvn thread rechecks if
* segments should change lgrps used for text replication. If we fail
* to set p_tr_lgrpid with atomic_cas_32 then set it to NLGRPS_MAX
* without cas if it's not already NLGRPS_MAX and not equal lgrp_id
* we want to use. We don't need to use cas in this case because
* another thread that races in between our non atomic check and set
* may only change p_tr_lgrpid to NLGRPS_MAX at this point.
*/
olid = p->p_tr_lgrpid;
olid) {
olid = p->p_tr_lgrpid;
p->p_tr_lgrpid = NLGRPS_MAX;
}
}
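/*
 * Illustrative sketch (not part of segvn): the check-then-set pattern
 * described above for p_tr_lgrpid, expressed with C11 atomics. The
 * FAKE_LGRP_NONE/FAKE_NLGRPS_MAX values and names are hypothetical stand-ins.
 */
#include <stdatomic.h>
#include <stdint.h>

#define	FAKE_LGRP_NONE	(-1)
#define	FAKE_NLGRPS_MAX	64

static void
record_tr_lgrp(_Atomic int32_t *tr_lgrpid, int32_t lgrp_id)
{
	int32_t expected = FAKE_LGRP_NONE;

	/* First setter wins; a different later lgrp poisons the field. */
	if (!atomic_compare_exchange_strong(tr_lgrpid, &expected, lgrp_id) &&
	    expected != lgrp_id && expected != FAKE_NLGRPS_MAX)
		atomic_store(tr_lgrpid, FAKE_NLGRPS_MAX);
}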
/*
* lgrp_move_thread() won't schedule async recheck after
* p->p_t1_lgrpid update unless p->p_tr_lgrpid is not
* LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid
* is not LGRP_NONE.
*/
p->p_t1_lgrpid != lgrp_id) {
first = 0;
goto again;
}
}
/*
* If no amp was created yet for lgrp_id create a new one as long as
* we have enough memory to afford it.
*/
if (trmem > segvn_textrepl_max_bytes) {
goto fail;
}
goto fail;
}
goto fail;
}
}
}
svd->anon_index = 0;
return;
fail:
} else {
}
}
/*
* Convert seg back to regular vnode mapping seg by unbinding it from its text
* replication amp. This routine is most typically called when segment is
* unmapped but can also be called when segment no longer qualifies for text
* replication (e.g. due to protection changes). If unload_unmap is set use
* HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of
* svntr free all its anon maps and remove it from the hash table.
*/
static void
{
lgrp_id_t i;
break;
}
}
panic("segvn_textunrepl: svntr record not found");
}
panic("segvn_textunrepl: amp mismatch");
}
}
} else {
}
}
goto done;
}
for (i = 0; i < NLGRPS_MAX; i++) {
continue;
}
} else {
}
}
done:
}
/*
* This is called when a MAP_SHARED writable mapping is created to a vnode
* that is currently used for execution (VVMEXEC flag is set). In this case we
* need to prevent further use of existing replicas.
*/
static void
{
if (svntr_hashtab == NULL) {
return;
}
}
}
}
static void
segvn_trasync_thread(void)
{
callb_generic_cpr, "segvn_async");
if (segvn_update_textrepl_interval == 0) {
} else {
}
for (;;) {
}
}
static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;
static void
segvn_trupdate_wakeup(void *dummy)
{
}
if (!segvn_disable_textrepl_update &&
segvn_update_textrepl_interval != 0) {
}
}
static void
segvn_trupdate(void)
{
hash);
}
}
}
}
static void
{
proc_t *p;
lgrp_id = p->p_t1_lgrpid;
return;
}
return;
}
/*
* lock in reverse from synchronous thread order.
*/
}
return;
}
}
return;
}
if (trmem > segvn_textrepl_max_bytes) {
return;
}
return;
}
return;
}
}
/*
* We don't need to drop the bucket lock but here we give other
* threads a chance. svntr and svd can't be unlinked as long as
* segment lock is held as a writer and AS held as well. After we
* retake bucket lock we'll continue from where we left. We'll be able
* to reach the end of either list since new entries are always added
* to the beginning of the lists.
*/
p->p_tr_lgrpid = NLGRPS_MAX;
}