vm_as.c revision 406882169e00272f14067d948324d690893e6fe3
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* VM - address spaces.
*/
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>
#include <vm/hat.h>
#include <vm/xhat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>
clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
static struct kmem_cache *as_cache;
static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
/*
* Verifying the segment lists is very time-consuming; it may not be
* desirable always to define VERIFY_SEGLIST when DEBUG is set.
*/
#ifdef DEBUG
#define VERIFY_SEGLIST
int do_as_verify = 0;
#endif
/*
* Allocate a new callback data structure entry and fill in the events of
* interest, the address range of interest, and the callback argument.
* Link the entry on the as->a_callbacks list. A callback entry for the
* entire address space may be specified with vaddr = 0 and size = -1.
*
* CALLER'S RESPONSIBILITY: If not calling from within the process context for
* the specified as, the caller must guarantee persistence of the specified as
* for the duration of this function (e.g. pages being locked within the as
* will guarantee persistence).
*/
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
caddr_t vaddr, size_t size, int sleepflag)
{
struct as_callback *current_head, *cb;
caddr_t saddr;
size_t rsize;
/* callback function and an event are mandatory */
if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
return (EINVAL);
/* Adding a callback after as_free has been called is not allowed */
if (as == &kas)
return (ENOMEM);
/*
* vaddr = 0 and size = -1 is used to indicate that the callback range
* is the entire address space so no rounding is done in that case.
*/
if (size != -1) {
saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)saddr;
/* check for wraparound */
if (saddr + rsize < saddr)
return (ENOMEM);
} else {
if (vaddr != 0)
return (EINVAL);
saddr = vaddr;
rsize = size;
}
/* Allocate and initialize a callback entry */
cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
if (cb == NULL)
return (EAGAIN);
cb->ascb_func = cb_func;
cb->ascb_arg = arg;
cb->ascb_events = events;
cb->ascb_saddr = saddr;
cb->ascb_len = rsize;
/* Add the entry to the list */
mutex_enter(&as->a_contents);
current_head = as->a_callbacks;
as->a_callbacks = cb;
cb->ascb_next = current_head;
/*
* The call to this function may lose a race with a pertinent
* event - e.g. a thread does long-term memory locking, but before
* the callback is added another thread executes as_unmap.
* A broadcast here resolves that.
*/
if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
AS_CLRUNMAPWAIT(as);
cv_broadcast(&as->a_cv);
}
mutex_exit(&as->a_contents);
return (0);
}
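/*
 * Illustrative usage sketch (not part of this file): a driver that has
 * long-term-locked pages in this address space might register for unmap
 * events roughly as follows; the names my_unmap_cb and my_cookie are
 * hypothetical.  The callback releases the driver's page locks and then
 * deletes its own entry so that as_execute_callback() can proceed:
 *
 *	static void
 *	my_unmap_cb(struct as *as, void *arg, uint_t events)
 *	{
 *		... release the driver's page locks for the range ...
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	error = as_add_callback(as, my_unmap_cb, my_cookie,
 *	    AS_UNMAP_EVENT | AS_UNMAPWAIT_EVENT, vaddr, size, KM_SLEEP);
 *
 * When the driver no longer needs the pages it removes the entry itself:
 *
 *	(void) as_delete_callback(as, my_cookie);
 */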
/*
* Search the callback list for an entry which pertains to arg.
*
* This is called from within the client upon completion of the callback.
* RETURN VALUES:
* AS_CALLBACK_DELETED (callback entry found and deleted)
* AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
* AS_CALLBACK_DELETE_DEFERRED (callback is in progress; this entry
* will be deleted in as_do_callbacks)
*
* If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
* set, it indicates that as_do_callbacks is processing this entry. The
* AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
* to unblock as_do_callbacks, in case it is blocked.
*
* CALLER'S RESPONSIBILITY: If not calling from within the process context for
* the specified as, the caller must guarantee persistence of the specified as
* for the duration of this function (e.g. pages being locked within the as
* will guarantee persistence).
*/
uint_t
as_delete_callback(struct as *as, void *arg)
{
struct as_callback **prevcb = &as->a_callbacks;
struct as_callback *cb;
uint_t rc = AS_CALLBACK_NOTFOUND;
mutex_enter(&as->a_contents);
for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
if (cb->ascb_arg != arg)
continue;
/*
* If the events indicate AS_CALLBACK_CALLED, just clear
* AS_ALL_EVENT in the events field and wakeup the thread
* that may be waiting in as_do_callbacks. as_do_callbacks
* will take care of removing this entry from the list. In
* that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
* (AS_CALLBACK_CALLED not set), just remove it from the
* list, return the memory and return AS_CALLBACK_DELETED.
*/
if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
/* leave AS_CALLBACK_CALLED */
cb->ascb_events &= ~AS_ALL_EVENT;
rc = AS_CALLBACK_DELETE_DEFERRED;
cv_broadcast(&as->a_cv);
} else {
*prevcb = cb->ascb_next;
kmem_free(cb, sizeof (struct as_callback));
rc = AS_CALLBACK_DELETED;
}
break;
}
mutex_exit(&as->a_contents);
return (rc);
}
/*
* Searches the as callback list for a matching entry.
* Returns a pointer to the first matching callback, or NULL if
* nothing is found.
* This function never sleeps so it is ok to call it with more
* locks held than the (required) a_contents mutex.
*
* See also comment on as_do_callbacks below.
*/
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
size_t event_len)
{
struct as_callback *cb;
ASSERT(MUTEX_HELD(&as->a_contents));
for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
/*
* If the callback has not already been called, then
* check if events or address range pertains. An event_len
* of zero means do an unconditional callback.
*/
if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
((event_len != 0) && (((cb->ascb_events & events) == 0) ||
(event_addr + event_len < cb->ascb_saddr) ||
(event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
continue;
}
break;
}
return (cb);
}
/*
* Executes a given callback and removes it from the callback list for
* this address space.
* This function may sleep so the caller must drop all locks except
* a_contents before calling this func.
*
* See also comments on as_do_callbacks below.
*/
static void
as_execute_callback(struct as *as, struct as_callback *cb,
uint_t events)
{
struct as_callback **prevcb;
void *cb_arg;
ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
cb->ascb_events |= AS_CALLBACK_CALLED;
mutex_exit(&as->a_contents);
(*cb->ascb_func)(as, cb->ascb_arg, events);
mutex_enter(&as->a_contents);
/*
* the callback function is required to delete the callback
* when the callback function determines it is OK for
* this thread to continue. as_delete_callback will clear
* the AS_ALL_EVENT in the events field when it is deleted.
* If the callback function called as_delete_callback,
* events will already be cleared and there will be no blocking.
*/
while ((cb->ascb_events & events) != 0) {
cv_wait(&as->a_cv, &as->a_contents);
}
/*
* This entry needs to be taken off the list. Normally, the
* callback func itself does that, but unfortunately the list
* may have changed while the callback was running because the
* a_contents mutex was dropped and someone else other than the
* callback func itself could have called as_delete_callback,
* so we have to search to find this entry again. The entry
* must have AS_CALLBACK_CALLED, and have the same 'arg'.
*/
cb_arg = cb->ascb_arg;
prevcb = &as->a_callbacks;
for (cb = as->a_callbacks; cb != NULL;
prevcb = &cb->ascb_next, cb = *prevcb) {
if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
(cb_arg != cb->ascb_arg)) {
continue;
}
*prevcb = cb->ascb_next;
kmem_free(cb, sizeof (struct as_callback));
break;
}
}
/*
* Check the callback list for a matching event and intersection of
* address range. If there is a match invoke the callback. Skip an entry if:
* - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
* - the event is not of interest
* - the address range is not of interest
*
* An event_len of zero indicates a request for an unconditional callback
* (regardless of event); in that case only AS_CALLBACK_CALLED is checked. The
* a_contents lock must be dropped before a callback, so only one callback
* can be done before returning. Return -1 (true) if a callback was
* executed and removed from the list, else return 0 (false).
*
* The logically separate parts, i.e. finding a matching callback and
* executing a given callback have been separated into two functions
* so that they can be called with different sets of locks held beyond
* the always-required a_contents. as_find_callback does not sleep so
* it is ok to call it if more locks than a_contents (i.e. the a_lock
* rwlock) are held. as_execute_callback, on the other hand, may sleep,
* so all locks beyond a_contents must be dropped by the caller if one
* does not want to end up comatose.
*/
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
size_t event_len)
{
struct as_callback *cb;
if ((cb = as_find_callback(as, events, event_addr, event_len))) {
as_execute_callback(as, cb, events);
return (-1);
}
return (0);
}
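/*
 * A caller that must drain every pending callback therefore loops until
 * as_do_callbacks() returns zero (see as_free() below for the real use):
 *
 *	mutex_enter(&as->a_contents);
 *	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 *		;
 *	mutex_exit(&as->a_contents);
 */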
/*
* Search for the segment containing addr. If a segment containing addr
* exists, that segment is returned. If no such segment exists, and
* the list spans addresses greater than addr, then the first segment
* whose base is greater than addr is returned; otherwise, NULL is
* returned unless tail is true, in which case the last element of the
* list is returned.
*
* a_seglast is used to cache the last found segment for repeated
* searches to the same addr (which happens frequently).
*/
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
struct seg *seg = as->a_seglast;
avl_index_t where;
ASSERT(AS_LOCK_HELD(as, &as->a_lock));
if (seg != NULL &&
seg->s_base <= addr &&
addr < seg->s_base + seg->s_size)
return (seg);
seg = avl_find(&as->a_segtree, &addr, &where);
if (seg != NULL)
return (as->a_seglast = seg);
seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
if (seg == NULL && tail)
seg = avl_last(&as->a_segtree);
return (as->a_seglast = seg);
}
#ifdef VERIFY_SEGLIST
/*
* verify that the linked list is coherent
*/
static void
as_verify(struct as *as)
{
struct seg *seg, *seglast, *p, *n;
uint_t nsegs = 0;
if (do_as_verify == 0)
return;
seglast = as->a_seglast;
for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
ASSERT(seg->s_as == as);
p = AS_SEGPREV(as, seg);
n = AS_SEGNEXT(as, seg);
ASSERT(p == NULL || p->s_as == as);
ASSERT(p == NULL || p->s_base < seg->s_base);
ASSERT(n == NULL || n->s_base > seg->s_base);
ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
if (seg == seglast)
seglast = NULL;
nsegs++;
}
ASSERT(seglast == NULL);
ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */
/*
* Add a new segment to the address space. The avl_find()
* may be expensive so we attempt to use the last segment accessed
* in as_gap() as an insertion point.
*/
int
as_addseg(struct as *as, struct seg *newseg)
{
struct seg *seg;
caddr_t addr;
caddr_t eaddr;
avl_index_t where;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
as->a_updatedir = 1; /* inform /proc */
gethrestime(&as->a_updatetime);
if (as->a_lastgaphl != NULL) {
struct seg *hseg = NULL;
struct seg *lseg = NULL;
if (as->a_lastgaphl->s_base > newseg->s_base) {
hseg = as->a_lastgaphl;
lseg = AVL_PREV(&as->a_segtree, hseg);
} else {
lseg = as->a_lastgaphl;
hseg = AVL_NEXT(&as->a_segtree, lseg);
}
if (hseg && lseg && lseg->s_base < newseg->s_base &&
hseg->s_base > newseg->s_base) {
avl_insert_here(&as->a_segtree, newseg, lseg,
AVL_AFTER);
as->a_lastgaphl = NULL;
as->a_seglast = newseg;
return (0);
}
as->a_lastgaphl = NULL;
}
addr = newseg->s_base;
eaddr = addr + newseg->s_size;
again:
seg = avl_find(&as->a_segtree, &addr, &where);
if (seg == NULL)
seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
if (seg == NULL)
seg = avl_last(&as->a_segtree);
if (seg != NULL) {
caddr_t base = seg->s_base;
/*
* If top of seg is below the requested address, then
* the insertion point is at the end of the linked list,
* and seg points to the tail of the list. Otherwise,
* the insertion point is immediately before seg.
*/
if (base + seg->s_size > addr) {
if (addr >= base || eaddr > base) {
#ifdef __sparc
extern struct seg_ops segnf_ops;
/*
* no-fault segs must disappear if overlaid.
* XXX need new segment type so
* we don't have to check s_ops
*/
if (seg->s_ops == &segnf_ops) {
seg_unmap(seg);
goto again;
}
#endif
return (-1); /* overlapping segment */
}
}
}
as->a_seglast = newseg;
avl_insert(&as->a_segtree, newseg, where);
#ifdef VERIFY_SEGLIST
as_verify(as);
#endif
return (0);
}
struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
avl_tree_t *t;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
as->a_updatedir = 1; /* inform /proc */
gethrestime(&as->a_updatetime);
if (seg == NULL)
return (NULL);
t = &as->a_segtree;
if (as->a_seglast == seg)
as->a_seglast = NULL;
as->a_lastgaphl = NULL;
/*
* if this segment is at or above a_lastgap, set a_lastgap to the
* next segment (NULL if this is the last segment)
*/
if (as->a_lastgap &&
(seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
as->a_lastgap = AVL_NEXT(t, seg);
/*
* remove the segment from the seg tree
*/
avl_remove(t, seg);
#ifdef VERIFY_SEGLIST
as_verify(as);
#endif
return (seg);
}
/*
* Find a segment containing addr.
*/
struct seg *
as_segat(struct as *as, caddr_t addr)
{
struct seg *seg = as->a_seglast;
ASSERT(AS_LOCK_HELD(as, &as->a_lock));
if (seg != NULL && seg->s_base <= addr &&
addr < seg->s_base + seg->s_size)
return (seg);
seg = avl_find(&as->a_segtree, &addr, NULL);
return (seg);
}
/*
* Serialize all searches for holes in an address space to
* prevent two or more threads from allocating the same virtual
* address range. The address space must not be "read/write"
* locked by the caller since we may block.
*/
void
as_rangelock(struct as *as)
{
mutex_enter(&as->a_contents);
while (AS_ISCLAIMGAP(as))
cv_wait(&as->a_cv, &as->a_contents);
AS_SETCLAIMGAP(as);
mutex_exit(&as->a_contents);
}
/*
* Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
*/
void
as_rangeunlock(struct as *as)
{
mutex_enter(&as->a_contents);
AS_CLRCLAIMGAP(as);
cv_signal(&as->a_cv);
mutex_exit(&as->a_contents);
}
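/*
 * Sketch of the usual pattern around hole selection and mapping, as on
 * the mmap()/shmat() paths; map_addr() picks a hole (typically via
 * as_gap()), and segvn_create()/crargs live outside this file:
 *
 *	as_rangelock(as);
 *	map_addr(&addr, len, off, vacalign, flags);
 *	if (addr == NULL) {
 *		as_rangeunlock(as);
 *		return (ENOMEM);
 *	}
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */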
/*
* Compare segments (or just an address) by segment address range
*/
static int
as_segcompar(const void *x, const void *y)
{
struct seg *a = (struct seg *)x;
struct seg *b = (struct seg *)y;
if (a->s_base < b->s_base)
return (-1);
if (a->s_base >= b->s_base + b->s_size)
return (1);
return (0);
}
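/*
 * Note that as_findseg() and as_segat() pass a pointer to a bare caddr_t
 * as the avl_find() key.  This relies on s_base being the first member
 * of struct seg: the key's "s_base" is then the address itself, and the
 * comparator never reads the key's s_size.
 */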
void
as_avlinit(struct as *as)
{
avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
offsetof(struct seg, s_tree));
avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
offsetof(struct watched_page, wp_link));
}
/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
struct as *as = buf;
mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
as_avlinit(as);
return (0);
}
/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
struct as *as = buf;
avl_destroy(&as->a_segtree);
mutex_destroy(&as->a_contents);
cv_destroy(&as->a_cv);
rw_destroy(&as->a_lock);
}
void
as_init(void)
{
as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
as_constructor, as_destructor, NULL, NULL, NULL, 0);
}
/*
* Allocate and initialize an address space data structure.
* We call hat_alloc to allow any machine dependent
* information in the hat structure to be initialized.
*/
struct as *
as_alloc(void)
{
struct as *as;
as = kmem_cache_alloc(as_cache, KM_SLEEP);
as->a_flags = 0;
as->a_vbits = 0;
as->a_hrm = NULL;
as->a_seglast = NULL;
as->a_size = 0;
as->a_resvsize = 0;
as->a_updatedir = 0;
gethrestime(&as->a_updatetime);
as->a_objectdir = NULL;
as->a_sizedir = 0;
as->a_userlimit = (caddr_t)USERLIMIT;
as->a_lastgap = NULL;
as->a_lastgaphl = NULL;
as->a_callbacks = NULL;
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
as->a_hat = hat_alloc(as); /* create hat for default system mmu */
AS_LOCK_EXIT(as, &as->a_lock);
as->a_xhat = NULL;
return (as);
}
/*
* Free an address space data structure.
* Need to free the hat first and then
* all the segments on this as and finally
* the space for the as struct itself.
*/
void
as_free(struct as *as)
{
struct hat *hat = as->a_hat;
struct seg *seg, *next;
int called = 0;
top:
/*
* Invoke ALL callbacks. as_do_callbacks will do one callback
* per call, and not return (-1) until the callback has completed.
* When as_do_callbacks returns zero, all callbacks have completed.
*/
mutex_enter(&as->a_contents);
while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
;
/* This will prevent new XHATs from attaching to as */
if (!called)
AS_SETBUSY(as);
mutex_exit(&as->a_contents);
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
if (!called) {
called = 1;
hat_free_start(hat);
if (as->a_xhat != NULL)
xhat_free_start_all(as);
}
for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
int err;
next = AS_SEGNEXT(as, seg);
retry:
err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
if (err == EAGAIN) {
mutex_enter(&as->a_contents);
if (as->a_callbacks) {
AS_LOCK_EXIT(as, &as->a_lock);
} else if (!AS_ISNOUNMAPWAIT(as)) {
/*
* Memory is currently locked. Wait for a
* cv_signal that it has been unlocked, then
* try the operation again.
*/
if (AS_ISUNMAPWAIT(as) == 0)
cv_broadcast(&as->a_cv);
AS_SETUNMAPWAIT(as);
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as))
cv_wait(&as->a_cv, &as->a_contents);
} else {
/*
* We may have raced with
* segvn_reclaim()/segspt_reclaim(). In this
* case clean nounmapwait flag and retry since
* softlockcnt in this segment may be already
* 0. We don't drop as writer lock so our
* number of retries without sleeping should
* be very small. See segvn_reclaim() for
* more comments.
*/
AS_CLRNOUNMAPWAIT(as);
mutex_exit(&as->a_contents);
goto retry;
}
mutex_exit(&as->a_contents);
goto top;
} else {
/*
* We do not expect any other error return at this
* time. This is similar to an ASSERT in seg_unmap()
*/
ASSERT(err == 0);
}
}
hat_free_end(hat);
if (as->a_xhat != NULL)
xhat_free_end_all(as);
AS_LOCK_EXIT(as, &as->a_lock);
/* /proc stuff */
ASSERT(avl_numnodes(&as->a_wpage) == 0);
if (as->a_objectdir) {
kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
as->a_objectdir = NULL;
as->a_sizedir = 0;
}
/*
* Free the struct as back to kmem. Assert it has no segments.
*/
ASSERT(avl_numnodes(&as->a_segtree) == 0);
kmem_cache_free(as_cache, as);
}
int
as_dup(struct as *as, struct proc *forkedproc)
{
struct as *newas;
struct seg *seg, *newseg;
size_t purgesize = 0;
int error;
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
as_clearwatch(as);
newas = as_alloc();
newas->a_userlimit = as->a_userlimit;
newas->a_proc = forkedproc;
AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
/* This will prevent new XHATs from attaching */
mutex_enter(&as->a_contents);
AS_SETBUSY(as);
mutex_exit(&as->a_contents);
mutex_enter(&newas->a_contents);
AS_SETBUSY(newas);
mutex_exit(&newas->a_contents);
(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
if (seg->s_flags & S_PURGE) {
purgesize += seg->s_size;
continue;
}
newseg = seg_alloc(newas, seg->s_base, seg->s_size);
if (newseg == NULL) {
AS_LOCK_EXIT(newas, &newas->a_lock);
as_setwatch(as);
mutex_enter(&as->a_contents);
AS_CLRBUSY(as);
mutex_exit(&as->a_contents);
AS_LOCK_EXIT(as, &as->a_lock);
as_free(newas);
return (-1);
}
if ((error = SEGOP_DUP(seg, newseg)) != 0) {
/*
* We call seg_free() on the new seg
* because the segment is not set up
* completely; i.e. it has no ops.
*/
as_setwatch(as);
mutex_enter(&as->a_contents);
AS_CLRBUSY(as);
mutex_exit(&as->a_contents);
AS_LOCK_EXIT(as, &as->a_lock);
seg_free(newseg);
AS_LOCK_EXIT(newas, &newas->a_lock);
as_free(newas);
return (error);
}
newas->a_size += seg->s_size;
}
newas->a_resvsize = as->a_resvsize - purgesize;
error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
if (as->a_xhat != NULL)
error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
mutex_enter(&newas->a_contents);
AS_CLRBUSY(newas);
mutex_exit(&newas->a_contents);
AS_LOCK_EXIT(newas, &newas->a_lock);
as_setwatch(as);
mutex_enter(&as->a_contents);
AS_CLRBUSY(as);
mutex_exit(&as->a_contents);
AS_LOCK_EXIT(as, &as->a_lock);
if (error != 0) {
as_free(newas);
return (error);
}
forkedproc->p_as = newas;
return (0);
}
/*
* Handle a ``fault'' at addr for size bytes.
*/
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
enum fault_type type, enum seg_rw rw)
{
struct seg *seg;
caddr_t raddr; /* rounded down addr */
size_t rsize; /* rounded up size */
size_t ssize;
faultcode_t res = 0;
caddr_t addrsav;
struct seg *segsav;
int as_lock_held;
klwp_t *lwp = ttolwp(curthread);
int is_xhat = 0;
int holding_wpage = 0;
extern struct seg_ops segdev_ops;
if (as->a_hat != hat) {
/* This must be an XHAT then */
is_xhat = 1;
if ((type != F_INVAL) || (as == &kas))
return (FC_NOSUPPORT);
}
retry:
if (!is_xhat) {
/*
* Indicate that the lwp is not to be stopped while waiting
* for a pagefault. This is to avoid deadlock while debugging
* a process via /proc over NFS (in particular).
*/
if (lwp != NULL)
lwp->lwp_nostop++;
/*
* same length must be used when we softlock and softunlock.
* We don't support softunlocking lengths less than
* the original length when there is largepage support.
* See seg_dev.c for more comments.
*/
switch (type) {
case F_SOFTLOCK:
CPU_STATS_ADD_K(vm, softlock, 1);
break;
case F_SOFTUNLOCK:
break;
case F_PROT:
CPU_STATS_ADD_K(vm, prot_fault, 1);
break;
case F_INVAL:
CPU_STATS_ENTER_K();
CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
if (as == &kas)
CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
CPU_STATS_EXIT_K();
break;
}
}
/* Kernel probe */
TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
tnf_opaque, address, addr,
tnf_fault_type, fault_type, type,
tnf_seg_access, access, rw);
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
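/*
 * Example of this rounding, assuming 8K pages: addr = 0x1234 and
 * size = 0x2000 yield raddr = 0 and rsize = 0x4000, i.e. the smallest
 * page-aligned range covering [addr, addr + size).
 */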
/*
* XXX -- Don't grab the as lock for segkmap. We should grab it for
* correctness, but then we could be stuck holding this lock for
* a LONG time if the fault needs to be resolved on a slow
* filesystem, and then no-one will be able to exec new commands,
* as exec'ing requires the write lock on the as.
*/
if (as == &kas && segkmap && segkmap->s_base <= raddr &&
raddr + size < segkmap->s_base + segkmap->s_size) {
/*
* if (as==&kas), this can't be XHAT: we've already returned
* FC_NOSUPPORT.
*/
seg = segkmap;
as_lock_held = 0;
} else {
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
/*
* Grab and hold the writers' lock on the as
* if the fault is to a watched page.
* This will keep CPUs from "peeking" at the
* address range while we're temporarily boosting
* the permissions for the XHAT device to
* resolve the fault in the segment layer.
*
* We could check whether faulted address
* is within a watched page and only then grab
* the writer lock, but this is simpler.
*/
AS_LOCK_EXIT(as, &as->a_lock);
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
}
seg = as_segat(as, raddr);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
if ((lwp != NULL) && (!is_xhat))
lwp->lwp_nostop--;
return (FC_NOMAP);
}
as_lock_held = 1;
}
addrsav = raddr;
segsav = seg;
for (; rsize != 0; rsize -= ssize, raddr += ssize) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
res = FC_NOMAP;
break;
}
}
if (raddr + rsize > seg->s_base + seg->s_size)
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = rsize;
if (!is_xhat || (seg->s_ops != &segdev_ops)) {
if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
pr_is_watchpage_as(raddr, rw, as)) {
/*
* Handle watch pages. If we're faulting on a
* watched page from an X-hat, we have to
* restore the original permissions while we
* handle the fault.
*/
as_clearwatch(as);
holding_wpage = 1;
}
res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
/* Restore watchpoints */
if (holding_wpage) {
as_setwatch(as);
holding_wpage = 0;
}
if (res != 0)
break;
} else {
/* XHAT does not support seg_dev */
res = FC_NOSUPPORT;
break;
}
}
/*
* If we were SOFTLOCKing and encountered a failure,
* we must SOFTUNLOCK the range we already did. (Maybe we
* should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
* right here...)
*/
if (res != 0 && type == F_SOFTLOCK) {
for (seg = segsav; addrsav < raddr; addrsav += ssize) {
if (addrsav >= seg->s_base + seg->s_size)
seg = AS_SEGNEXT(as, seg);
ASSERT(seg != NULL);
/*
* Now call the fault routine again to perform the
* unlock using S_OTHER instead of the rw variable
* since we never got a chance to touch the pages.
*/
if (raddr > seg->s_base + seg->s_size)
ssize = seg->s_base + seg->s_size - addrsav;
else
ssize = raddr - addrsav;
(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
F_SOFTUNLOCK, S_OTHER);
}
}
if (as_lock_held)
AS_LOCK_EXIT(as, &as->a_lock);
if ((lwp != NULL) && (!is_xhat))
lwp->lwp_nostop--;
/*
* If the lower levels returned EDEADLK for a fault,
* it means that we should retry the fault. Let's also wait
* a bit to let the deadlock-causing condition clear.
* This is part of a gross hack to work around a design flaw
* in the ufs/sds logging code and should go away when the
* logging code is re-designed to fix the problem. See bug
* 4125102 for details of the problem.
*/
if (FC_ERRNO(res) == EDEADLK) {
delay(deadlk_wait);
res = 0;
goto retry;
}
return (res);
}
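/*
 * Usage sketch (hypothetical trap-level caller): resolve a read fault on
 * a single address and extract an errno only when the underlying object
 * reported an error:
 *
 *	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, S_READ);
 *	if (res != 0 && FC_CODE(res) == FC_OBJERR)
 *		error = FC_ERRNO(res);
 */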
/*
* Asynchronous ``fault'' at addr for size bytes.
*/
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
struct seg *seg;
caddr_t raddr; /* rounded down addr */
size_t rsize; /* rounded up size */
faultcode_t res = 0;
klwp_t *lwp = ttolwp(curthread);
retry:
/*
* Indicate that the lwp is not to be stopped while waiting
* for a pagefault. This is to avoid deadlock while debugging
* a process via /proc over NFS (in particular).
*/
if (lwp != NULL)
lwp->lwp_nostop++;
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
seg = as_segat(as, raddr);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
if (lwp != NULL)
lwp->lwp_nostop--;
return (FC_NOMAP);
}
for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
res = FC_NOMAP;
break;
}
}
res = SEGOP_FAULTA(seg, raddr);
if (res != 0)
break;
}
AS_LOCK_EXIT(as, &as->a_lock);
if (lwp != NULL)
lwp->lwp_nostop--;
/*
* If the lower levels returned EDEADLK for a fault,
* it means that we should retry the fault. Let's also wait
* a bit to let the deadlock-causing condition clear.
* This is part of a gross hack to work around a design flaw
* in the ufs/sds logging code and should go away when the
* logging code is re-designed to fix the problem. See bug
* 4125102 for details of the problem.
*/
if (FC_ERRNO(res) == EDEADLK) {
delay(deadlk_wait);
res = 0;
goto retry;
}
return (res);
}
/*
* Set the virtual mapping for the interval from [addr : addr + size)
* in address space `as' to have the specified protection.
* It is ok for the range to cross over several segments,
* as long as they are contiguous.
*/
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
struct seg *seg;
struct as_callback *cb;
size_t ssize;
caddr_t raddr; /* rounded down addr */
size_t rsize; /* rounded up size */
int error = 0, writer = 0;
caddr_t saveraddr;
size_t saversize;
setprot_top:
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
if (raddr + rsize < raddr) /* check for wraparound */
return (ENOMEM);
saveraddr = raddr;
saversize = rsize;
/*
* Normally we only lock the as as a reader. But
* if due to setprot the segment driver needs to split
* a segment it will return IE_RETRY. Therefore we re-acquire
* the as lock as a writer so the segment driver can change
* the seg list. Also the segment driver will return IE_RETRY
* after it has changed the segment list so we therefore keep
* locking as a writer. Since these operations should be rare,
* we want to lock as a writer only when necessary.
*/
if (writer || avl_numnodes(&as->a_wpage) != 0) {
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
} else {
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
}
as_clearwatchprot(as, raddr, rsize);
seg = as_segat(as, raddr);
if (seg == NULL) {
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
for (; rsize != 0; rsize -= ssize, raddr += ssize) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
error = ENOMEM;
break;
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size))
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = rsize;
retry:
error = SEGOP_SETPROT(seg, raddr, ssize, prot);
if (error == IE_NOMEM) {
error = EAGAIN;
break;
}
if (error == IE_RETRY) {
AS_LOCK_EXIT(as, &as->a_lock);
writer = 1;
goto setprot_top;
}
if (error == EAGAIN) {
/*
* Make sure we have a_lock as writer.
*/
if (writer == 0) {
AS_LOCK_EXIT(as, &as->a_lock);
writer = 1;
goto setprot_top;
}
/*
* Memory is currently locked. It must be unlocked
* before this operation can succeed through a retry.
* The possible reasons for locked memory and
* corresponding strategies for unlocking are:
* (1) Normal I/O
* wait for a signal that the I/O operation
* has completed and the memory is unlocked.
* (2) Asynchronous I/O
* The aio subsystem does not unlock pages when
* the I/O is completed. Those pages are unlocked
* when the application calls aiowait/aioerror.
* So, to prevent blocking forever, cv_broadcast()
* is done to wake up aio_cleanup_thread.
* Subsequently, segvn_reclaim will be called, and
* that will do AS_CLRUNMAPWAIT() and wake us up.
* (3) Long term page locking:
* Drivers intending to have pages locked for a
* period considerably longer than for normal I/O
* (essentially forever) may have registered for a
* callback so they may unlock these pages on
* request. This is needed to allow this operation
* to succeed. Each entry on the callback list is
* examined. If the event or address range pertains
* the callback is invoked (unless it already is in
* progress). The a_contents lock must be dropped
* before the callback, so only one callback can
* be done at a time. Go to the top and do more
* until zero is returned. If zero is returned,
* either there were no callbacks for this event
* or they were already in progress.
*/
mutex_enter(&as->a_contents);
if (as->a_callbacks &&
(cb = as_find_callback(as, AS_SETPROT_EVENT,
seg->s_base, seg->s_size))) {
AS_LOCK_EXIT(as, &as->a_lock);
as_execute_callback(as, cb, AS_SETPROT_EVENT);
} else if (!AS_ISNOUNMAPWAIT(as)) {
if (AS_ISUNMAPWAIT(as) == 0)
cv_broadcast(&as->a_cv);
AS_SETUNMAPWAIT(as);
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as))
cv_wait(&as->a_cv, &as->a_contents);
} else {
/*
* We may have raced with
* segvn_reclaim()/segspt_reclaim(). In this
* case clean nounmapwait flag and retry since
* softlockcnt in this segment may be already
* 0. We don't drop as writer lock so our
* number of retries without sleeping should
* be very small. See segvn_reclaim() for
* more comments.
*/
AS_CLRNOUNMAPWAIT(as);
mutex_exit(&as->a_contents);
goto retry;
}
mutex_exit(&as->a_contents);
goto setprot_top;
} else if (error != 0)
break;
}
if (error != 0) {
as_setwatch(as);
} else {
as_setwatchprot(as, saveraddr, saversize, prot);
}
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
/*
* Check to make sure that the interval [addr, addr + size)
* in address space `as' has at least the specified protection.
* It is ok for the range to cross over several segments, as long
* as they are contiguous.
*/
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
struct seg *seg;
size_t ssize;
caddr_t raddr; /* rounded down addr */
size_t rsize; /* rounded up size */
int error = 0;
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
if (raddr + rsize < raddr) /* check for wraparound */
return (ENOMEM);
/*
* This is ugly as sin...
* Normally, we only acquire the address space readers lock.
* However, if the address space has watchpoints present,
* we must acquire the writer lock on the address space for
* the benefit of as_clearwatchprot() and as_setwatchprot().
*/
if (avl_numnodes(&as->a_wpage) != 0)
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
else
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
as_clearwatchprot(as, raddr, rsize);
seg = as_segat(as, raddr);
if (seg == NULL) {
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
for (; rsize != 0; rsize -= ssize, raddr += ssize) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
error = ENOMEM;
break;
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size))
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = rsize;
error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
if (error != 0)
break;
}
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
struct seg *seg, *seg_next;
struct as_callback *cb;
caddr_t raddr, eaddr;
size_t ssize, rsize = 0;
int err;
top:
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
(uintptr_t)PAGEMASK);
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
as->a_updatedir = 1; /* inform /proc */
gethrestime(&as->a_updatetime);
/*
* Use as_findseg to find the first segment in the range, then
* step through the segments in order, following s_next.
*/
as_clearwatchprot(as, raddr, eaddr - raddr);
for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
if (eaddr <= seg->s_base)
break; /* eaddr was in a gap; all done */
/* this is implied by the test above */
ASSERT(raddr < eaddr);
if (raddr < seg->s_base)
raddr = seg->s_base; /* raddr was in a gap */
if (eaddr > (seg->s_base + seg->s_size))
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = eaddr - raddr;
/*
* Save next segment pointer since seg can be
* destroyed during the segment unmap operation.
*/
seg_next = AS_SEGNEXT(as, seg);
/*
* We didn't count /dev/null mappings, so ignore them here.
* We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
* we have to do this check here while we have seg.)
*/
if (!SEG_IS_DEVNULL_MAPPING(seg) &&
!SEG_IS_PARTIAL_RESV(seg))
rsize = ssize;
retry:
err = SEGOP_UNMAP(seg, raddr, ssize);
if (err == EAGAIN) {
/*
* Memory is currently locked. It must be unlocked
* before this operation can succeed through a retry.
* The possible reasons for locked memory and
* corresponding strategies for unlocking are:
* (1) Normal I/O
* wait for a signal that the I/O operation
* has completed and the memory is unlocked.
* (2) Asynchronous I/O
* The aio subsystem does not unlock pages when
* the I/O is completed. Those pages are unlocked
* when the application calls aiowait/aioerror.
* So, to prevent blocking forever, cv_broadcast()
* is done to wake up aio_cleanup_thread.
* Subsequently, segvn_reclaim will be called, and
* that will do AS_CLRUNMAPWAIT() and wake us up.
* (3) Long term page locking:
* Drivers intending to have pages locked for a
* period considerably longer than for normal I/O
* (essentially forever) may have registered for a
* callback so they may unlock these pages on
* request. This is needed to allow this operation
* to succeed. Each entry on the callback list is
* examined. If the event or address range pertains
* the callback is invoked (unless it already is in
* progress). The a_contents lock must be dropped
* before the callback, so only one callback can
* be done at a time. Go to the top and do more
* until zero is returned. If zero is returned,
* either there were no callbacks for this event
* or they were already in progress.
*/
mutex_enter(&as->a_contents);
if (as->a_callbacks &&
(cb = as_find_callback(as, AS_UNMAP_EVENT,
seg->s_base, seg->s_size))) {
AS_LOCK_EXIT(as, &as->a_lock);
as_execute_callback(as, cb, AS_UNMAP_EVENT);
} else if (!AS_ISNOUNMAPWAIT(as)) {
if (AS_ISUNMAPWAIT(as) == 0)
cv_broadcast(&as->a_cv);
AS_SETUNMAPWAIT(as);
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as))
cv_wait(&as->a_cv, &as->a_contents);
} else {
/*
* We may have raced with
* segvn_reclaim()/segspt_reclaim(). In this
* case clean nounmapwait flag and retry since
* softlockcnt in this segment may be already
* 0. We don't drop as writer lock so our
* number of retries without sleeping should
* be very small. See segvn_reclaim() for
* more comments.
*/
AS_CLRNOUNMAPWAIT(as);
mutex_exit(&as->a_contents);
goto retry;
}
mutex_exit(&as->a_contents);
goto top;
} else if (err == IE_RETRY) {
AS_LOCK_EXIT(as, &as->a_lock);
goto top;
} else if (err) {
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (-1);
}
as->a_size -= ssize;
as->a_resvsize -= rsize;
raddr += ssize;
}
AS_LOCK_EXIT(as, &as->a_lock);
return (0);
}
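/*
 * Usage sketch (hypothetical caller on an munmap()-style path):
 *
 *	if (as_unmap(p->p_as, addr, len) != 0)
 *		return (EINVAL);
 *
 * as_unmap() returns 0 on success and -1 if a segment driver failed the
 * unmap with anything other than EAGAIN or IE_RETRY.
 */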
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
uint_t szc;
uint_t nszc;
int error;
caddr_t a;
caddr_t eaddr;
size_t segsize;
struct seg *seg;
size_t pgsz;
int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
uint_t save_szcvec;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
ASSERT(IS_P2ALIGNED(size, PAGESIZE));
ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
if (!do_off) {
vn_a->offset = 0;
}
if (szcvec <= 1) {
seg = seg_alloc(as, addr, size);
if (seg == NULL) {
return (ENOMEM);
}
vn_a->szc = 0;
error = (*crfp)(seg, vn_a);
if (error != 0) {
seg_free(seg);
} else {
as->a_size += size;
/*
* We'll count MAP_NORESERVE mappings as we fault
* pages in.
*/
if (!SEG_IS_PARTIAL_RESV(seg))
as->a_resvsize += size;
}
return (error);
}
eaddr = addr + size;
save_szcvec = szcvec;
szcvec >>= 1;
szc = 0;
nszc = 0;
while (szcvec) {
if ((szcvec & 0x1) == 0) {
nszc++;
szcvec >>= 1;
continue;
}
nszc++;
pgsz = page_get_pagesize(nszc);
a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
if (a != addr) {
ASSERT(a < eaddr);
segsize = a - addr;
seg = seg_alloc(as, addr, segsize);
if (seg == NULL) {
return (ENOMEM);
}
vn_a->szc = szc;
error = (*crfp)(seg, vn_a);
if (error != 0) {
seg_free(seg);
return (error);
}
as->a_size += segsize;
/*
* We'll count MAP_NORESERVE mappings as we fault
* pages in. We don't count /dev/null mappings at all.
*/
if (!SEG_IS_DEVNULL_MAPPING(seg) &&
!SEG_IS_PARTIAL_RESV(seg))
as->a_resvsize += segsize;
*segcreated = 1;
if (do_off) {
vn_a->offset += segsize;
}
addr = a;
}
szc = nszc;
szcvec >>= 1;
}
ASSERT(addr < eaddr);
szcvec = save_szcvec | 1; /* add 8K pages */
while (szcvec) {
a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
ASSERT(a >= addr);
if (a != addr) {
segsize = a - addr;
seg = seg_alloc(as, addr, segsize);
if (seg == NULL) {
return (ENOMEM);
}
vn_a->szc = szc;
error = (*crfp)(seg, vn_a);
if (error != 0) {
seg_free(seg);
return (error);
}
as->a_size += segsize;
/*
* We'll count MAP_NORESERVE mappings as we fault
* pages in. We don't count /dev/null mappings at all.
*/
if (!SEG_IS_DEVNULL_MAPPING(seg) &&
!SEG_IS_PARTIAL_RESV(seg))
as->a_resvsize += segsize;
*segcreated = 1;
if (do_off) {
vn_a->offset += segsize;
}
addr = a;
}
szcvec &= ~(1 << szc);
if (szcvec) {
szc = highbit(szcvec) - 1;
pgsz = page_get_pagesize(szc);
}
}
ASSERT(addr == eaddr);
return (0);
}
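/*
 * Example (sketch) of the carving done above: with szcvec = 0x9 (page
 * size codes 0 and 3, e.g. 8K and 4M) and an unaligned range, the first
 * loop creates a szc 0 segment from addr up to the first 4M boundary,
 * and the second loop then creates a szc 3 segment over the 4M-aligned
 * middle followed by a szc 0 segment for the remainder below eaddr.
 */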
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
type, 0);
int error;
struct seg *seg;
struct vattr va;
u_offset_t eoff;
size_t save_size = 0;
extern size_t textrepl_size_thresh;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
ASSERT(IS_P2ALIGNED(size, PAGESIZE));
ASSERT(vn_a->vp != NULL);
ASSERT(vn_a->amp == NULL);
again:
if (szcvec <= 1) {
seg = seg_alloc(as, addr, size);
if (seg == NULL) {
return (ENOMEM);
}
vn_a->szc = 0;
error = (*crfp)(seg, vn_a);
if (error != 0) {
seg_free(seg);
} else {
as->a_size += size;
/*
* We'll count MAP_NORESERVE mappings as we fault
* pages in.
*/
if (!SEG_IS_PARTIAL_RESV(seg))
as->a_resvsize += size;
}
return (error);
}
va.va_mask = AT_SIZE;
if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
szcvec = 0;
goto again;
}
eoff = vn_a->offset & PAGEMASK;
if (eoff >= va.va_size) {
szcvec = 0;
goto again;
}
eoff += size;
if (btopr(va.va_size) < btopr(eoff)) {
save_size = size;
size = va.va_size - (vn_a->offset & PAGEMASK);
size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
type, 0);
if (szcvec <= 1) {
size = save_size;
goto again;
}
}
if (size > textrepl_size_thresh) {
vn_a->flags |= _MAP_TEXTREPL;
}
error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
segcreated);
if (error != 0) {
return (error);
}
if (save_size) {
addr += size;
size = save_size - size;
szcvec = 0;
goto again;
}
return (0);
}
/*
* as_map_ansegs: shared or private anonymous memory. Note that the flags
* passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
*/
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
uint_t szcvec;
uchar_t type;
ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
if (vn_a->type == MAP_SHARED) {
type = MAPPGSZC_SHM;
} else if (vn_a->type == MAP_PRIVATE) {
if (vn_a->szc == AS_MAP_HEAP) {
type = MAPPGSZC_HEAP;
} else if (vn_a->szc == AS_MAP_STACK) {
type = MAPPGSZC_STACK;
} else {
type = MAPPGSZC_PRIVM;
}
}
szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
(uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
(vn_a->flags & MAP_TEXT), type, 0);
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
ASSERT(IS_P2ALIGNED(size, PAGESIZE));
ASSERT(vn_a->vp == NULL);
return (as_map_segvn_segs(as, addr, size, szcvec,
crfp, vn_a, segcreated));
}
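/*
 * as_map() enters a_lock as writer and calls as_map_locked(), which
 * drops a_lock on every return path, so both routines return with the
 * address space unlocked.
 */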
int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
return (as_map_locked(as, addr, size, crfp, argsp));
}
int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
void *argsp)
{
struct seg *seg = NULL;
caddr_t raddr; /* rounded down addr */
size_t rsize; /* rounded up size */
int error;
int unmap = 0;
struct proc *p = curproc;
struct segvn_crargs crargs;
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
/*
* check for wrap around
*/
if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
as->a_updatedir = 1; /* inform /proc */
gethrestime(&as->a_updatetime);
if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
AS_LOCK_EXIT(as, &as->a_lock);
(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
RCA_UNSAFE_ALL);
return (ENOMEM);
}
if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
crargs = *(struct segvn_crargs *)argsp;
error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
if (error != 0) {
AS_LOCK_EXIT(as, &as->a_lock);
if (unmap) {
(void) as_unmap(as, addr, size);
}
return (error);
}
} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
crargs = *(struct segvn_crargs *)argsp;
error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
if (error != 0) {
AS_LOCK_EXIT(as, &as->a_lock);
if (unmap) {
(void) as_unmap(as, addr, size);
}
return (error);
}
} else {
seg = seg_alloc(as, addr, size);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
error = (*crfp)(seg, argsp);
if (error != 0) {
seg_free(seg);
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
/*
* Add size now so as_unmap will work if as_ctl fails.
*/
as->a_size += rsize;
/*
* We'll count MAP_NORESERVE mappings as we fault
* pages in. We don't count /dev/null mappings at all.
*/
if (!SEG_IS_DEVNULL_MAPPING(seg) &&
!SEG_IS_PARTIAL_RESV(seg))
as->a_resvsize += rsize;
}
as_setwatch(as);
/*
* If the address space is locked,
* establish memory locks for the new segment.
*/
mutex_enter(&as->a_contents);
if (AS_ISPGLCK(as)) {
mutex_exit(&as->a_contents);
AS_LOCK_EXIT(as, &as->a_lock);
error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
if (error != 0)
(void) as_unmap(as, addr, size);
} else {
mutex_exit(&as->a_contents);
AS_LOCK_EXIT(as, &as->a_lock);
}
return (error);
}
/*
* Delete all segments in the address space marked with S_PURGE.
* This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
* These segments are deleted as a first step before calls to as_gap(), so
* that they don't affect mmap() or shmat().
*/
void
as_purge(struct as *as)
{
struct seg *seg;
struct seg *next_seg;
/*
* the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
* no need to grab a_contents mutex for this check
*/
if ((as->a_flags & AS_NEEDSPURGE) == 0)
return;
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
next_seg = NULL;
seg = AS_SEGFIRST(as);
while (seg != NULL) {
next_seg = AS_SEGNEXT(as, seg);
if (seg->s_flags & S_PURGE)
SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
seg = next_seg;
}
AS_LOCK_EXIT(as, &as->a_lock);
mutex_enter(&as->a_contents);
as->a_flags &= ~AS_NEEDSPURGE;
mutex_exit(&as->a_contents);
}
/*
* Find a hole within [*basep, *basep + *lenp), which contains a mappable
* range of addresses at least "minlen" long, where the base of the range is
* at "off" phase from an "align" boundary and there is space for a
* "redzone"-sized redzone on eithe rside of the range. Thus,
* if align was 4M and off was 16k, the user wants a hole which will start
* 16k into a 4M page.
*
* If flags specifies AH_HI, the hole will have the highest possible address
* in the range. We use the as->a_lastgap field to figure out where to
* start looking for a gap.
*
* Otherwise, the gap will have the lowest possible address.
*
* If flags specifies AH_CONTAIN, the hole will contain the address addr.
*
* If an adequate hole is found, *basep and *lenp are set to reflect the part of
* the hole that is within range, and 0 is returned. On failure, -1 is returned.
*
* NOTE: This routine is not correct when base+len overflows caddr_t.
*/
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
caddr_t lobound = *basep;
caddr_t hibound = lobound + *lenp;
struct seg *lseg, *hseg;
caddr_t lo, hi;
int forward;
caddr_t save_base;
size_t save_len;
size_t save_minlen;
size_t save_redzone;
int fast_path = 1;
save_base = *basep;
save_len = *lenp;
save_minlen = minlen;
save_redzone = redzone;
/*
* For the first pass/fast_path, just add align and redzone into
* minlen since if we get an allocation, we can guarantee that it
* will fit the alignment and redzone requested.
* This increases the chance that hibound will be adjusted to
* a_lastgap->s_base which will likely allow us to find an
* acceptable hole in the address space quicker.
* If we can't find a hole with this fast_path, then we look for
* smaller holes in which the alignment and offset may allow
* the allocation to fit.
*/
minlen += align;
minlen += 2 * redzone;
redzone = 0;
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
if (AS_SEGFIRST(as) == NULL) {
if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
align, redzone, off)) {
AS_LOCK_EXIT(as, &as->a_lock);
return (0);
} else {
AS_LOCK_EXIT(as, &as->a_lock);
*basep = save_base;
*lenp = save_len;
return (-1);
}
}
retry:
/*
* Set up to iterate over all the inter-segment holes in the given
* direction. lseg is NULL for the lowest-addressed hole and hseg is
* NULL for the highest-addressed hole. If moving backwards, we reset
* hseg to denote the highest-addressed segment.
*/
forward = (flags & AH_DIR) == AH_LO;
if (forward) {
hseg = as_findseg(as, lobound, 1);
lseg = AS_SEGPREV(as, hseg);
} else {
/*
* If allocating at least as much as the last allocation,
* use a_lastgap's base as a better estimate of hibound.
*/
if (as->a_lastgap &&
minlen >= as->a_lastgap->s_size &&
hibound >= as->a_lastgap->s_base)
hibound = as->a_lastgap->s_base;
hseg = as_findseg(as, hibound, 1);
if (hseg->s_base + hseg->s_size < hibound) {
lseg = hseg;
hseg = NULL;
} else {
lseg = AS_SEGPREV(as, hseg);
}
}
for (;;) {
/*
* Set lo and hi to the hole's boundaries. (We should really
* use MAXADDR in place of hibound in the expression below,
* but can't express it easily; using hibound in its place is
* harmless.)
*/
lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
hi = (hseg == NULL) ? hibound : hseg->s_base;
/*
* If the iteration has moved past the interval from lobound
* to hibound it's pointless to continue.
*/
if ((forward && lo > hibound) || (!forward && hi < lobound))
break;
else if (lo > hibound || hi < lobound)
goto cont;
/*
* Candidate hole lies at least partially within the allowable
* range. Restrict it to fall completely within that range,
* i.e., to [max(lo, lobound), min(hi, hibound)].
*/
if (lo < lobound)
lo = lobound;
if (hi > hibound)
hi = hibound;
/*
* Verify that the candidate hole is big enough and meets
* hardware constraints. If the hole is too small, no need
* to do the further checks since they will fail.
*/
*basep = lo;
*lenp = hi - lo;
if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
((flags & AH_CONTAIN) == 0 ||
(*basep <= addr && *basep + *lenp > addr))) {
if (!forward)
as->a_lastgap = hseg;
if (hseg != NULL)
as->a_lastgaphl = hseg;
else
as->a_lastgaphl = lseg;
AS_LOCK_EXIT(as, &as->a_lock);
return (0);
}
cont:
/*
* Move to the next hole.
*/
if (forward) {
lseg = hseg;
if (lseg == NULL)
break;
hseg = AS_SEGNEXT(as, hseg);
} else {
hseg = lseg;
if (hseg == NULL)
break;
lseg = AS_SEGPREV(as, lseg);
}
}
if (fast_path && (align != 0 || save_redzone != 0)) {
fast_path = 0;
minlen = save_minlen;
redzone = save_redzone;
goto retry;
}
*basep = save_base;
*lenp = save_len;
AS_LOCK_EXIT(as, &as->a_lock);
return (-1);
}
/*
* Find a hole of at least size minlen within [*basep, *basep + *lenp).
*
* If flags specifies AH_HI, the hole will have the highest possible address
* in the range. We use the as->a_lastgap field to figure out where to
* start looking for a gap.
*
* Otherwise, the gap will have the lowest possible address.
*
* If flags specifies AH_CONTAIN, the hole will contain the address addr.
*
* If an adequate hole is found, base and len are set to reflect the part of
* the hole that is within range, and 0 is returned, otherwise,
* -1 is returned.
*
* NOTE: This routine is not correct when base+len overflows caddr_t.
*/
int
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
caddr_t addr)
{
return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
}
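/*
 * Usage sketch (hypothetical caller): find the highest hole of at least
 * minlen bytes within [base, base + len) and allocate from its top:
 *
 *	if (as_gap(as, minlen, &base, &len, AH_HI, NULL) == 0)
 *		addr = base + len - minlen;
 */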
/*
* Return the next range within [base, base + len) that is backed
* with "real memory". Skip holes and non-seg_vn segments.
* We're lazy and only return one segment at a time.
*/
int
as_memory(struct as *as, caddr_t *basep, size_t *lenp)
{
extern struct seg_ops segspt_shmops; /* needs a header file */
struct seg *seg;
caddr_t addr, eaddr;
caddr_t segend;
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
addr = *basep;
eaddr = addr + *lenp;
seg = as_findseg(as, addr, 0);
if (seg != NULL)
addr = MAX(seg->s_base, addr);
for (;;) {
if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
AS_LOCK_EXIT(as, &as->a_lock);
return (EINVAL);
}
if (seg->s_ops == &segvn_ops) {
segend = seg->s_base + seg->s_size;
break;
}
/*
* We do ISM by looking into the private data
* to determine the real size of the segment.
*/
if (seg->s_ops == &segspt_shmops) {
segend = seg->s_base + spt_realsize(seg);
if (addr < segend)
break;
}
seg = AS_SEGNEXT(as, seg);
if (seg != NULL)
addr = seg->s_base;
}
*basep = addr;
if (segend > eaddr)
*lenp = eaddr - addr;
else
*lenp = segend - addr;
AS_LOCK_EXIT(as, &as->a_lock);
return (0);
}
/*
* Swap the pages associated with the address space as out to
* secondary storage, returning the number of bytes actually
* swapped.
*
* The value returned is intended to correlate well with the process's
* memory requirements. Its usefulness for this purpose depends on
* how well the segment-level routines do at returning accurate
* information.
*/
size_t
as_swapout(struct as *as)
{
struct seg *seg;
size_t swpcnt = 0;
/*
* Kernel-only processes have given up their address
* spaces. Of course, we shouldn't be attempting to
* swap out such processes in the first place...
*/
if (as == NULL)
return (0);
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
/* Prevent XHATs from attaching */
mutex_enter(&as->a_contents);
AS_SETBUSY(as);
mutex_exit(&as->a_contents);
/*
* Free all mapping resources associated with the address
* space. The segment-level swapout routines capitalize
* on this unmapping by scavenging pages that have become
* unmapped here.
*/
hat_swapout(as->a_hat);
if (as->a_xhat != NULL)
xhat_swapout_all(as);
mutex_enter(&as->a_contents);
AS_CLRBUSY(as);
mutex_exit(&as->a_contents);
/*
* Call the swapout routines of all segments in the address
* space to do the actual work, accumulating the amount of
* space reclaimed.
*/
for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
struct seg_ops *ov = seg->s_ops;
/*
* We have to check to see if the seg has
* an ops vector because the seg may have
* been in the middle of being set up when
* the process was picked for swapout.
*/
if ((ov != NULL) && (ov->swapout != NULL))
swpcnt += SEGOP_SWAPOUT(seg);
}
AS_LOCK_EXIT(as, &as->a_lock);
return (swpcnt);
}
/*
* Determine whether data from the mappings in interval [addr, addr + size)
* are in the primary memory (core) cache.
*/
int
as_incore(struct as *as, caddr_t addr,
size_t size, char *vec, size_t *sizep)
{
struct seg *seg;
size_t ssize;
caddr_t raddr; /* rounded down addr */
size_t rsize; /* rounded up size */
size_t isize; /* iteration size */
int error = 0; /* result, assume success */
*sizep = 0;
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
if (raddr + rsize < raddr) /* check for wraparound */
return (ENOMEM);
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
seg = as_segat(as, raddr);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (-1);
}
for (; rsize != 0; rsize -= ssize, raddr += ssize) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
error = -1;
break;
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size))
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = rsize;
*sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
if (isize != ssize) {
error = -1;
break;
}
vec += btopr(ssize);
}
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
static void
as_segunlock(struct seg *seg, caddr_t addr, int attr,
ulong_t *bitmap, size_t position, size_t npages)
{
caddr_t range_start;
size_t pos1 = position;
size_t pos2;
size_t size;
size_t end_pos = npages + position;
while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
size = ptob((pos2 - pos1));
range_start = (caddr_t)((uintptr_t)addr +
ptob(pos1 - position));
(void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
(ulong_t *)NULL, (size_t)NULL);
pos1 = pos2;
}
}
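/*
* Back out a partially completed MC_LOCK operation: walk the segments
* covering [raddr, raddr + rsize) and undo the locking recorded in
* mlock_map via as_segunlock().
*/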
static void
as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
caddr_t raddr, size_t rsize)
{
struct seg *seg = as_segat(as, raddr);
size_t ssize;
while (rsize != 0) {
if (raddr >= seg->s_base + seg->s_size)
seg = AS_SEGNEXT(as, seg);
if ((raddr + rsize) > (seg->s_base + seg->s_size))
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = rsize;
as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
rsize -= ssize;
raddr += ssize;
}
}
/*
* Cache control operations over the interval [addr, addr + size) in
* address space "as".
*/
/*ARGSUSED*/
int
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
uintptr_t arg, ulong_t *lock_map, size_t pos)
{
struct seg *seg; /* working segment */
caddr_t raddr; /* rounded down addr */
caddr_t initraddr; /* saved initial rounded down addr */
size_t rsize; /* rounded up size */
size_t initrsize; /* saved initial rounded up size */
size_t ssize; /* size of seg */
int error = 0; /* result */
size_t mlock_size; /* size of bitmap */
ulong_t *mlock_map; /* pointer to bitmap used */
/* to represent the locked */
/* pages. */
retry:
if (error == IE_RETRY)
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
else
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
/*
* If these are address space lock/unlock operations, loop over
* all segments in the address space, as appropriate.
*/
if (func == MC_LOCKAS) {
size_t npages, idx;
size_t rlen = 0; /* rounded as length */
idx = pos;
if (arg & MCL_FUTURE) {
mutex_enter(&as->a_contents);
AS_SETPGLCK(as);
mutex_exit(&as->a_contents);
}
if ((arg & MCL_CURRENT) == 0) {
AS_LOCK_EXIT(as, &as->a_lock);
return (0);
}
seg = AS_SEGFIRST(as);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (0);
}
do {
raddr = (caddr_t)((uintptr_t)seg->s_base &
(uintptr_t)PAGEMASK);
rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
} while ((seg = AS_SEGNEXT(as, seg)) != NULL);
mlock_size = BT_BITOUL(btopr(rlen));
if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (EAGAIN);
}
for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
error = SEGOP_LOCKOP(seg, seg->s_base,
seg->s_size, attr, MC_LOCK, mlock_map, pos);
if (error != 0)
break;
pos += seg_pages(seg);
}
if (error) {
for (seg = AS_SEGFIRST(as); seg != NULL;
seg = AS_SEGNEXT(as, seg)) {
raddr = (caddr_t)((uintptr_t)seg->s_base &
(uintptr_t)PAGEMASK);
npages = seg_pages(seg);
as_segunlock(seg, raddr, attr, mlock_map,
idx, npages);
idx += npages;
}
}
kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
AS_LOCK_EXIT(as, &as->a_lock);
goto lockerr;
} else if (func == MC_UNLOCKAS) {
mutex_enter(&as->a_contents);
AS_CLRPGLCK(as);
mutex_exit(&as->a_contents);
for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
error = SEGOP_LOCKOP(seg, seg->s_base,
seg->s_size, attr, MC_UNLOCK, NULL, 0);
if (error != 0)
break;
}
AS_LOCK_EXIT(as, &as->a_lock);
goto lockerr;
}
/*
* Normalize addresses and sizes.
*/
initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
if (raddr + rsize < raddr) { /* check for wraparound */
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
/*
* Get initial segment.
*/
if ((seg = as_segat(as, raddr)) == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
if (func == MC_LOCK) {
mlock_size = BT_BITOUL(btopr(rsize));
if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (EAGAIN);
}
}
/*
* Loop over all segments. If a hole in the address range is
* discovered, then fail. For each segment, perform the appropriate
* control operation.
*/
while (rsize != 0) {
/*
* Make sure there's no hole, calculate the portion
* of the next segment to be operated over.
*/
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
if (func == MC_LOCK) {
as_unlockerr(as, attr, mlock_map,
initraddr, initrsize - rsize);
kmem_free(mlock_map,
mlock_size * sizeof (ulong_t));
}
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size))
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = rsize;
/*
* Dispatch on specific function.
*/
switch (func) {
/*
* Synchronize cached data from mappings with backing
* objects.
*/
case MC_SYNC:
if (error = SEGOP_SYNC(seg, raddr, ssize,
attr, (uint_t)arg)) {
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
break;
/*
* Lock pages in memory.
*/
case MC_LOCK:
if (error = SEGOP_LOCKOP(seg, raddr, ssize,
attr, func, mlock_map, pos)) {
as_unlockerr(as, attr, mlock_map, initraddr,
initrsize - rsize + ssize);
kmem_free(mlock_map, mlock_size *
sizeof (ulong_t));
AS_LOCK_EXIT(as, &as->a_lock);
goto lockerr;
}
break;
/*
* Unlock mapped pages.
*/
case MC_UNLOCK:
(void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
(ulong_t *)NULL, (size_t)NULL);
break;
/*
* Store VM advise for mapped pages in segment layer.
*/
case MC_ADVISE:
error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
/*
* Check for regular errors and special retry error
*/
if (error) {
if (error == IE_RETRY) {
/*
* Need to acquire writers lock, so
* have to drop readers lock and start
* all over again
*/
AS_LOCK_EXIT(as, &as->a_lock);
goto retry;
} else if (error == IE_REATTACH) {
/*
* Find segment for current address
* because current segment just got
* split or concatenated
*/
seg = as_segat(as, raddr);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
} else {
/*
* Regular error
*/
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
}
break;
/*
* Can't happen.
*/
default:
panic("as_ctl: bad operation %d", func);
/*NOTREACHED*/
}
rsize -= ssize;
raddr += ssize;
}
if (func == MC_LOCK)
kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
AS_LOCK_EXIT(as, &as->a_lock);
return (0);
lockerr:
/*
* If the lower levels returned EDEADLK for a segment lockop,
* it means that we should retry the operation. Also wait
* a bit to let the deadlock-causing condition clear.
* This is part of a gross hack to work around a design flaw
* in the ufs/sds logging code and should go away when the
* logging code is re-designed to fix the problem. See bug
* 4125102 for details of the problem.
*/
if (error == EDEADLK) {
delay(deadlk_wait);
error = 0;
goto retry;
}
return (error);
}
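/*
* A minimal sketch of locking and later unlocking a range through
* as_ctl() (assumptions: the caller holds a valid "as", an attr of 0,
* and no external lock bitmap, so lock_map/pos are passed as NULL/0):
*
*	error = as_ctl(as, addr, len, MC_LOCK, 0, 0, NULL, 0);
*	...
*	(void) as_ctl(as, addr, len, MC_UNLOCK, 0, 0, NULL, 0);
*/
/*
* Convert a faultcode_t returned by as_fault() into an errno value.
*/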
int
fc_decode(faultcode_t fault_err)
{
int error = 0;
switch (FC_CODE(fault_err)) {
case FC_OBJERR:
error = FC_ERRNO(fault_err);
break;
case FC_PROT:
error = EACCES;
break;
default:
error = EFAULT;
break;
}
return (error);
}
/*
* Pagelock pages from a range that spans more than 1 segment. Obtain shadow
* lists from each segment and copy them to one contiguous shadow list (plist)
* as expected by the caller. Save pointers to per segment shadow lists at
* the tail of plist so that they can be used during as_pageunlock().
*/
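/*
* Layout of the shadow list handed back to the caller, with
* npages = btop(size) and segcnt = number of segments spanned:
*
*	plist[0 .. npages - 1]			page pointers for the
*						entire range, in order
*	plist[npages .. npages + segcnt - 1]	the per-segment shadow
*						list pointers later used
*						by as_pageunlock_segs()
*/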
static int
as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
caddr_t addr, size_t size, enum seg_rw rw)
{
caddr_t sv_addr = addr;
size_t sv_size = size;
struct seg *sv_seg = seg;
ulong_t segcnt = 1;
ulong_t cnt;
size_t ssize;
pgcnt_t npages = btop(size);
page_t **plist;
page_t **pl;
int error;
caddr_t eaddr;
faultcode_t fault_err = 0;
pgcnt_t pl_off;
extern struct seg_ops segspt_shmops;
ASSERT(AS_LOCK_HELD(as, &as->a_lock));
ASSERT(seg != NULL);
ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
ASSERT(addr + size > seg->s_base + seg->s_size);
ASSERT(IS_P2ALIGNED(size, PAGESIZE));
ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
/*
* Count the number of segments covered by the range we are about to
* lock. The segment count is used to size the shadow list we return
* to the caller.
*/
for (; size != 0; size -= ssize, addr += ssize) {
if (addr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || addr != seg->s_base) {
AS_LOCK_EXIT(as, &as->a_lock);
return (EFAULT);
}
/*
* Do a quick check whether subsequent segments
* are likely to support pagelock.
*/
if (seg->s_ops == &segvn_ops) {
vnode_t *vp;
if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
vp != NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
goto slow;
}
} else if (seg->s_ops != &segspt_shmops) {
AS_LOCK_EXIT(as, &as->a_lock);
goto slow;
}
segcnt++;
}
if (addr + size > seg->s_base + seg->s_size) {
ssize = seg->s_base + seg->s_size - addr;
} else {
ssize = size;
}
}
ASSERT(segcnt > 1);
plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
addr = sv_addr;
size = sv_size;
seg = sv_seg;
for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
if (addr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
ASSERT(seg != NULL && addr == seg->s_base);
cnt++;
ASSERT(cnt < segcnt);
}
if (addr + size > seg->s_base + seg->s_size) {
ssize = seg->s_base + seg->s_size - addr;
} else {
ssize = size;
}
pl = &plist[npages + cnt];
error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
L_PAGELOCK, rw);
if (error) {
break;
}
ASSERT(plist[npages + cnt] != NULL);
ASSERT(pl_off + btop(ssize) <= npages);
bcopy(plist[npages + cnt], &plist[pl_off],
btop(ssize) * sizeof (page_t *));
pl_off += btop(ssize);
}
if (size == 0) {
AS_LOCK_EXIT(as, &as->a_lock);
ASSERT(cnt == segcnt - 1);
*ppp = plist;
return (0);
}
/*
* One of the pagelock calls failed. The error type is in the error
* variable. Unlock what we've locked so far and retry with F_SOFTLOCK
* if the error type is either EFAULT or ENOTSUP. Otherwise just return
* the error back to the caller.
*/
eaddr = addr;
seg = sv_seg;
for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
if (addr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
ASSERT(seg != NULL && addr == seg->s_base);
cnt++;
ASSERT(cnt < segcnt);
}
if (eaddr > seg->s_base + seg->s_size) {
ssize = seg->s_base + seg->s_size - addr;
} else {
ssize = eaddr - addr;
}
pl = &plist[npages + cnt];
ASSERT(*pl != NULL);
(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
L_PAGEUNLOCK, rw);
}
AS_LOCK_EXIT(as, &as->a_lock);
kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
if (error != ENOTSUP && error != EFAULT) {
return (error);
}
slow:
/*
* If we are here because pagelock failed due to the need to cow-fault
* in the pages we want to lock, F_SOFTLOCK will do that job, and the
* next as_pagelock() call for this address range will hopefully
* succeed.
*/
fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
if (fault_err != 0) {
return (fc_decode(fault_err));
}
*ppp = NULL;
return (0);
}
/*
* lock pages in a given address space. Return shadow list. If
* the list is NULL, the MMU mapping is also locked.
*/
int
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
size_t size, enum seg_rw rw)
{
size_t rsize;
caddr_t raddr;
faultcode_t fault_err;
struct seg *seg;
int err;
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
"as_pagelock_start: addr %p size %ld", addr, size);
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
/*
* if the request crosses more than one segment, let
* as_pagelock_segs() handle it.
*/
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
seg = as_segat(as, raddr);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (EFAULT);
}
ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
if (raddr + rsize > seg->s_base + seg->s_size) {
return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
}
if (raddr + rsize <= raddr) {
AS_LOCK_EXIT(as, &as->a_lock);
return (EFAULT);
}
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
"seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
/*
* try to lock pages and pass back shadow list
*/
err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
AS_LOCK_EXIT(as, &as->a_lock);
if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
return (err);
}
/*
* Use F_SOFTLOCK to lock the pages because pagelock failed either due
* to no pagelock support for this segment or because pages need to be
* cow-faulted in. If a fault is needed, F_SOFTLOCK will do that job
* for this as_pagelock() call, and the next as_pagelock() call for the
* same address range will hopefully succeed.
*/
fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
if (fault_err != 0) {
return (fc_decode(fault_err));
}
*ppp = NULL;
TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
return (0);
}
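/*
* A minimal usage sketch (hypothetical caller, e.g. a driver doing
* physio-style I/O; "uaddr" and "len" are assumed to describe a valid
* mapping in "as"):
*
*	page_t **pplist;
*
*	error = as_pagelock(as, &pplist, uaddr, len, S_WRITE);
*	if (error == 0) {
*		... do the I/O against the locked pages ...
*		as_pageunlock(as, pplist, uaddr, len, S_WRITE);
*	}
*
* A NULL pplist means the pages were locked via F_SOFTLOCK, which
* as_pageunlock() undoes with F_SOFTUNLOCK.
*/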
/*
* unlock pages locked by as_pagelock_segs(). Retrieve the per-segment shadow
* lists from the end of plist and call the pageunlock interface for each
* segment. Drop the as lock and free plist.
*/
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
struct page **plist, enum seg_rw rw)
{
ulong_t cnt;
caddr_t eaddr = addr + size;
pgcnt_t npages = btop(size);
size_t ssize;
page_t **pl;
ASSERT(AS_LOCK_HELD(as, &as->a_lock));
ASSERT(seg != NULL);
ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
ASSERT(addr + size > seg->s_base + seg->s_size);
ASSERT(IS_P2ALIGNED(size, PAGESIZE));
ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
ASSERT(plist != NULL);
for (cnt = 0; addr < eaddr; addr += ssize) {
if (addr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
ASSERT(seg != NULL && addr == seg->s_base);
cnt++;
}
if (eaddr > seg->s_base + seg->s_size) {
ssize = seg->s_base + seg->s_size - addr;
} else {
ssize = eaddr - addr;
}
pl = &plist[npages + cnt];
ASSERT(*pl != NULL);
(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
L_PAGEUNLOCK, rw);
}
ASSERT(cnt > 0);
AS_LOCK_EXIT(as, &as->a_lock);
cnt++;
kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}
/*
* unlock pages in a given address range
*/
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
enum seg_rw rw)
{
struct seg *seg;
size_t rsize;
caddr_t raddr;
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
"as_pageunlock_start: addr %p size %ld", addr, size);
/*
* if the shadow list is NULL, as_pagelock()
* fell back to as_fault()
*/
if (pp == NULL) {
(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
return;
}
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
seg = as_segat(as, raddr);
ASSERT(seg != NULL);
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
"seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
if (raddr + rsize <= seg->s_base + seg->s_size) {
SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
} else {
as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
return;
}
AS_LOCK_EXIT(as, &as->a_lock);
TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
}
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
boolean_t wait)
{
struct seg *seg;
size_t ssize;
caddr_t raddr; /* rounded down addr */
size_t rsize; /* rounded up size */
int error = 0;
size_t pgsz = page_get_pagesize(szc);
setpgsz_top:
if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
return (EINVAL);
}
raddr = addr;
rsize = size;
if (raddr + rsize < raddr) /* check for wraparound */
return (ENOMEM);
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
as_clearwatchprot(as, raddr, rsize);
seg = as_segat(as, raddr);
if (seg == NULL) {
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
for (; rsize != 0; rsize -= ssize, raddr += ssize) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
error = ENOMEM;
break;
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
ssize = seg->s_base + seg->s_size - raddr;
} else {
ssize = rsize;
}
retry:
error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
if (error == IE_NOMEM) {
error = EAGAIN;
break;
}
if (error == IE_RETRY) {
AS_LOCK_EXIT(as, &as->a_lock);
goto setpgsz_top;
}
if (error == ENOTSUP) {
error = EINVAL;
break;
}
if (wait && (error == EAGAIN)) {
/*
* Memory is currently locked. It must be unlocked
* before this operation can succeed through a retry.
* The possible reasons for locked memory and
* corresponding strategies for unlocking are:
* (1) Normal I/O
* wait for a signal that the I/O operation
* has completed and the memory is unlocked.
* (2) Asynchronous I/O
* The aio subsystem does not unlock pages when
* the I/O is completed. Those pages are unlocked
* when the application calls aiowait/aioerror.
* So, to prevent blocking forever, cv_broadcast()
* is done to wake up aio_cleanup_thread.
* Subsequently, segvn_reclaim will be called, and
* that will do AS_CLRUNMAPWAIT() and wake us up.
* (3) Long term page locking:
* This is not relevant for as_setpagesize()
* because we cannot change the page size for
* driver memory. The attempt to do so will
* fail with a different error than EAGAIN so
* there's no need to trigger as callbacks like
* as_unmap, as_setprot or as_free would do.
*/
mutex_enter(&as->a_contents);
if (!AS_ISNOUNMAPWAIT(as)) {
if (AS_ISUNMAPWAIT(as) == 0) {
cv_broadcast(&as->a_cv);
}
AS_SETUNMAPWAIT(as);
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as)) {
cv_wait(&as->a_cv, &as->a_contents);
}
} else {
/*
* We may have raced with
* segvn_reclaim()/segspt_reclaim(). In this
* case clear the nounmapwait flag and retry since
* softlockcnt in this segment may already be
* 0. We don't drop the as writer lock so our
* number of retries without sleeping should
* be very small. See segvn_reclaim() for
* more comments.
*/
AS_CLRNOUNMAPWAIT(as);
mutex_exit(&as->a_contents);
goto retry;
}
mutex_exit(&as->a_contents);
goto setpgsz_top;
} else if (error != 0) {
break;
}
}
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
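/*
* A minimal sketch of requesting 4M pages for a range (hypothetical
* caller; assumes addr and size are already 4M aligned as required
* above, and that the platform supports a 4M page size):
*
*	uint_t szc = page_szc(4 * 1024 * 1024);
*
*	if (szc != (uint_t)-1)
*		error = as_setpagesize(as, addr, size, szc, B_TRUE);
*/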
/*
* as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
* in its chunk where s_szc is less than the szc we want to set.
*/
static int
as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
int *retry)
{
struct seg *seg;
size_t ssize;
int error;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
seg = as_segat(as, raddr);
if (seg == NULL) {
panic("as_iset3_default_lpsize: no seg");
}
for (; rsize != 0; rsize -= ssize, raddr += ssize) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
panic("as_iset3_default_lpsize: as changed");
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
ssize = seg->s_base + seg->s_size - raddr;
} else {
ssize = rsize;
}
if (szc > seg->s_szc) {
error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
/* Only retry on EINVAL segments that have no vnode. */
if (error == EINVAL) {
vnode_t *vp = NULL;
if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
(SEGOP_GETVP(seg, raddr, &vp) != 0 ||
vp == NULL)) {
*retry = 1;
} else {
*retry = 0;
}
}
if (error) {
return (error);
}
}
}
return (0);
}
/*
* as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
* pagesize on each segment in its range, but if any fails with EINVAL,
* then it reduces the pagesize to the next smaller size in the bitmap and
* retries as_iset3_default_lpsize(). The code retries smaller allowed
* sizes on EINVAL because (a) the anon offset may not
* match the bigger sizes, and (b) it's hard to get this offset (to begin
* with) to pass to map_pgszcvec().
*/
static int
as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
uint_t szcvec)
{
int error;
int retry;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
for (;;) {
error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
if (error == EINVAL && retry) {
szcvec &= ~(1 << szc);
if (szcvec <= 1) {
return (EINVAL);
}
szc = highbit(szcvec) - 1;
} else {
return (error);
}
}
}
/*
* as_iset1_default_lpsize() breaks its chunk into areas where existing
* segments have a smaller szc than we want to set. For each such area,
* it calls as_iset2_default_lpsize().
*/
static int
as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
uint_t szcvec)
{
struct seg *seg;
size_t ssize;
caddr_t setaddr = raddr;
size_t setsize = 0;
int set;
int error;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
seg = as_segat(as, raddr);
if (seg == NULL) {
panic("as_iset1_default_lpsize: no seg");
}
if (seg->s_szc < szc) {
set = 1;
} else {
set = 0;
}
for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
if (raddr >= seg->s_base + seg->s_size) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
panic("as_iset1_default_lpsize: as changed");
}
if (seg->s_szc >= szc && set) {
ASSERT(setsize != 0);
error = as_iset2_default_lpsize(as,
setaddr, setsize, szc, szcvec);
if (error) {
return (error);
}
set = 0;
} else if (seg->s_szc < szc && !set) {
setaddr = raddr;
setsize = 0;
set = 1;
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
ssize = seg->s_base + seg->s_size - raddr;
} else {
ssize = rsize;
}
}
error = 0;
if (set) {
ASSERT(setsize != 0);
error = as_iset2_default_lpsize(as, setaddr, setsize,
szc, szcvec);
}
return (error);
}
/*
* as_iset_default_lpsize() breaks its chunk according to the size code bitmap
* returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
* chunk to as_iset1_default_lpsize().
*/
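/*
* For illustration (hypothetical size codes; the real ones come from
* map_pgszcvec() and are platform dependent): if szcvec allows 64K
* (szc 1) and 4M (szc 3) pages and the range spans at least one full
* 4M chunk, the range is carved up as
*
*	[addr, addr rounded up to 64K)			left untouched
*	[64K rounded addr, next 4M boundary)		set to szc 1
*	[next 4M boundary, last 4M boundary)		set to szc 3
*	[last 4M boundary, eaddr rounded down to 64K)	set to szc 1
*	[64K rounded eaddr, eaddr)			left untouched
*
* so that each piece is naturally aligned for the size code it gets.
*/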
static int
as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
int type)
{
int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
flags, rtype, 1);
uint_t szc;
uint_t nszc;
int error;
caddr_t a;
caddr_t eaddr;
size_t segsize;
size_t pgsz;
uint_t save_szcvec;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
ASSERT(IS_P2ALIGNED(size, PAGESIZE));
szcvec &= ~1;
if (szcvec <= 1) { /* skip if base page size */
return (0);
}
/* Get the pagesize of the first larger page size. */
szc = lowbit(szcvec) - 1;
pgsz = page_get_pagesize(szc);
eaddr = addr + size;
addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
save_szcvec = szcvec;
szcvec >>= (szc + 1);
nszc = szc;
while (szcvec) {
if ((szcvec & 0x1) == 0) {
nszc++;
szcvec >>= 1;
continue;
}
nszc++;
pgsz = page_get_pagesize(nszc);
a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
if (a != addr) {
ASSERT(szc > 0);
ASSERT(a < eaddr);
segsize = a - addr;
error = as_iset1_default_lpsize(as, addr, segsize, szc,
save_szcvec);
if (error) {
return (error);
}
addr = a;
}
szc = nszc;
szcvec >>= 1;
}
ASSERT(addr < eaddr);
szcvec = save_szcvec;
while (szcvec) {
a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
ASSERT(a >= addr);
if (a != addr) {
ASSERT(szc > 0);
segsize = a - addr;
error = as_iset1_default_lpsize(as, addr, segsize, szc,
save_szcvec);
if (error) {
return (error);
}
addr = a;
}
szcvec &= ~(1 << szc);
if (szcvec) {
szc = highbit(szcvec) - 1;
pgsz = page_get_pagesize(szc);
}
}
ASSERT(addr == eaddr);
return (0);
}
/*
* Set the default large page size for the range. Called via memcntl with
* page size set to 0. as_set_default_lpsize breaks the range down into
* chunks with the same type/flags, ignores non-segvn segments, and passes
* each chunk to as_iset_default_lpsize().
*/
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
struct seg *seg;
caddr_t raddr;
size_t rsize;
size_t ssize;
int rtype, rflags;
int stype, sflags;
int error;
caddr_t setaddr;
size_t setsize;
int segvn;
if (size == 0)
return (0);
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
again:
error = 0;
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
if (raddr + rsize < raddr) { /* check for wraparound */
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
as_clearwatchprot(as, raddr, rsize);
seg = as_segat(as, raddr);
if (seg == NULL) {
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (ENOMEM);
}
if (seg->s_ops == &segvn_ops) {
rtype = SEGOP_GETTYPE(seg, addr);
rflags = rtype & (MAP_TEXT | MAP_INITDATA);
rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
segvn = 1;
} else {
segvn = 0;
}
setaddr = raddr;
setsize = 0;
for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
if (raddr >= (seg->s_base + seg->s_size)) {
seg = AS_SEGNEXT(as, seg);
if (seg == NULL || raddr != seg->s_base) {
error = ENOMEM;
break;
}
if (seg->s_ops == &segvn_ops) {
stype = SEGOP_GETTYPE(seg, raddr);
sflags = stype & (MAP_TEXT | MAP_INITDATA);
stype &= (MAP_SHARED | MAP_PRIVATE);
if (segvn && (rflags != sflags ||
rtype != stype)) {
/*
* The next segment is also segvn but
* has different flags and/or type.
*/
ASSERT(setsize != 0);
error = as_iset_default_lpsize(as,
setaddr, setsize, rflags, rtype);
if (error) {
break;
}
rflags = sflags;
rtype = stype;
setaddr = raddr;
setsize = 0;
} else if (!segvn) {
rflags = sflags;
rtype = stype;
setaddr = raddr;
setsize = 0;
segvn = 1;
}
} else if (segvn) {
/* The next segment is not segvn. */
ASSERT(setsize != 0);
error = as_iset_default_lpsize(as,
setaddr, setsize, rflags, rtype);
if (error) {
break;
}
segvn = 0;
}
}
if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
ssize = seg->s_base + seg->s_size - raddr;
} else {
ssize = rsize;
}
}
if (error == 0 && segvn) {
/* The last chunk when rsize == 0. */
ASSERT(setsize != 0);
error = as_iset_default_lpsize(as, setaddr, setsize,
rflags, rtype);
}
if (error == IE_RETRY) {
goto again;
} else if (error == IE_NOMEM) {
error = EAGAIN;
} else if (error == ENOTSUP) {
error = EINVAL;
} else if (error == EAGAIN) {
mutex_enter(&as->a_contents);
if (!AS_ISNOUNMAPWAIT(as)) {
if (AS_ISUNMAPWAIT(as) == 0) {
cv_broadcast(&as->a_cv);
}
AS_SETUNMAPWAIT(as);
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as)) {
cv_wait(&as->a_cv, &as->a_contents);
}
mutex_exit(&as->a_contents);
AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
} else {
/*
* We may have raced with
* segvn_reclaim()/segspt_reclaim(). In this case
* clear the nounmapwait flag and retry since softlockcnt
* in this segment may already be 0. We don't drop the as
* writer lock so our number of retries without
* sleeping should be very small. See segvn_reclaim()
* for more comments.
*/
AS_CLRNOUNMAPWAIT(as);
mutex_exit(&as->a_contents);
}
goto again;
}
as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
return (error);
}
/*
* Set up all of the uninitialized watched pages that we can.
*/
void
as_setwatch(struct as *as)
{
struct watched_page *pwp;
struct seg *seg;
caddr_t vaddr;
uint_t prot;
int err, retrycnt;
if (avl_numnodes(&as->a_wpage) == 0)
return;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
for (pwp = avl_first(&as->a_wpage); pwp != NULL;
pwp = AVL_NEXT(&as->a_wpage, pwp)) {
retrycnt = 0;
retry:
vaddr = pwp->wp_vaddr;
if (pwp->wp_oprot != 0 || /* already set up */
(seg = as_segat(as, vaddr)) == NULL ||
SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
continue;
pwp->wp_oprot = prot;
if (pwp->wp_read)
prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
if (pwp->wp_write)
prot &= ~PROT_WRITE;
if (pwp->wp_exec)
prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
if (err == IE_RETRY) {
pwp->wp_oprot = 0;
ASSERT(retrycnt == 0);
retrycnt++;
goto retry;
}
}
pwp->wp_prot = prot;
}
}
/*
* Clear all of the watched pages in the address space.
*/
void
as_clearwatch(struct as *as)
{
struct watched_page *pwp;
struct seg *seg;
caddr_t vaddr;
uint_t prot;
int err, retrycnt;
if (avl_numnodes(&as->a_wpage) == 0)
return;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
for (pwp = avl_first(&as->a_wpage); pwp != NULL;
pwp = AVL_NEXT(&as->a_wpage, pwp)) {
retrycnt = 0;
retry:
vaddr = pwp->wp_vaddr;
if (pwp->wp_oprot == 0 || /* not set up */
(seg = as_segat(as, vaddr)) == NULL)
continue;
if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
if (err == IE_RETRY) {
ASSERT(retrycnt == 0);
retrycnt++;
goto retry;
}
}
pwp->wp_oprot = 0;
pwp->wp_prot = 0;
}
}
/*
* Force a new setup for all the watched pages in the range.
*/
static void
as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
struct watched_page *pwp;
struct watched_page tpw;
caddr_t eaddr = addr + size;
caddr_t vaddr;
struct seg *seg;
int err, retrycnt;
uint_t wprot;
avl_index_t where;
if (avl_numnodes(&as->a_wpage) == 0)
return;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
while (pwp != NULL && pwp->wp_vaddr < eaddr) {
retrycnt = 0;
vaddr = pwp->wp_vaddr;
wprot = prot;
if (pwp->wp_read)
wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
if (pwp->wp_write)
wprot &= ~PROT_WRITE;
if (pwp->wp_exec)
wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
retry:
seg = as_segat(as, vaddr);
if (seg == NULL) {
panic("as_setwatchprot: no seg");
/*NOTREACHED*/
}
err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
if (err == IE_RETRY) {
ASSERT(retrycnt == 0);
retrycnt++;
goto retry;
}
}
pwp->wp_oprot = prot;
pwp->wp_prot = wprot;
pwp = AVL_NEXT(&as->a_wpage, pwp);
}
}
/*
* Clear all of the watched pages in the range.
*/
static void
as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
{
caddr_t eaddr = addr + size;
struct watched_page *pwp;
struct watched_page tpw;
uint_t prot;
struct seg *seg;
int err, retrycnt;
avl_index_t where;
if (avl_numnodes(&as->a_wpage) == 0)
return;
tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
while (pwp != NULL && pwp->wp_vaddr < eaddr) {
if ((prot = pwp->wp_oprot) != 0) {
retrycnt = 0;
if (prot != pwp->wp_prot) {
retry:
seg = as_segat(as, pwp->wp_vaddr);
if (seg == NULL)
continue;
err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
PAGESIZE, prot);
if (err == IE_RETRY) {
ASSERT(retrycnt == 0);
retrycnt++;
goto retry;
}
}
pwp->wp_oprot = 0;
pwp->wp_prot = 0;
}
pwp = AVL_NEXT(&as->a_wpage, pwp);
}
}
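/*
* Queue the given siginfo for every process whose address space is "as".
*/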
void
as_signal_proc(struct as *as, k_siginfo_t *siginfo)
{
struct proc *p;
mutex_enter(&pidlock);
for (p = practive; p; p = p->p_next) {
if (p->p_as == as) {
mutex_enter(&p->p_lock);
if (p->p_as == as)
sigaddq(p, NULL, siginfo, KM_NOSLEEP);
mutex_exit(&p->p_lock);
}
}
mutex_exit(&pidlock);
}
/*
* return memory object ID
*/
int
as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
{
struct seg *seg;
int sts;
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
seg = as_segat(as, addr);
if (seg == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (EFAULT);
}
/*
* catch old drivers which may not support getmemid
*/
if (seg->s_ops->getmemid == NULL) {
AS_LOCK_EXIT(as, &as->a_lock);
return (ENODEV);
}
sts = SEGOP_GETMEMID(seg, addr, memidp);
AS_LOCK_EXIT(as, &as->a_lock);
return (sts);
}