mem_config.c revision 9f1a1f17daffe08eb291b552d9394f81f3ad0f05
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <vm/seg_kmem.h>
#define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
extern void memlist_read_lock(void);
extern void memlist_read_unlock(void);
extern void memlist_write_lock(void);
extern void memlist_write_unlock(void);
extern struct memlist *phys_avail;
extern uint_t page_ctrs_adjust(int);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);
static kmutex_t memseg_lists_lock;
static struct memseg *memseg_va_avail;
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static struct kmem_cache *memseg_cache;
/*
* Add a chunk of memory to the system. page_t's for this memory
* are allocated in the first few pages of the chunk.
* base: starting PAGESIZE page of new memory.
* npgs: length in PAGESIZE pages.
*
* Adding mem this way doesn't increase the size of the hash tables;
* growing them would be too hard. This should be OK, but adding memory
* dynamically most likely means more hash misses, since the tables will
* be smaller than they otherwise would be.
*/
int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
int exhausted;
int mnode;
int reuse;
int mlret;
void *mapva;
/*
* Add this span in the delete list to prevent interactions.
*/
	if (!delspan_reserve(base, npgs))
		return (KPHYSM_ESPAN);
}
/*
* Check to see if any of the memory span has been added
* by trying an add to the installed memory list. This
* forms the interlocking process for add.
*/
if (mlret == MEML_SPANOP_OK)
if (mlret != MEML_SPANOP_OK) {
if (mlret == MEML_SPANOP_EALLOC) {
return (KPHYSM_ERESOURCE);
} else
if (mlret == MEML_SPANOP_ESPAN) {
return (KPHYSM_ESPAN);
} else {
return (KPHYSM_ERESOURCE);
}
}
/*
* We store the page_t's for this new memory in the first
* few pages of the chunk. Here, we go and get'em ...
*/
/*
* The expression after the '-' gives the number of pages
* that will fit in the new memory based on a requirement
* of (PAGESIZE + sizeof (page_t)) bytes per page.
*/
if (kpm_enable && !exhausted) {
/*
* A viable kpm large page mapping must not overlap two
 * dynamic memsegs. Therefore the total size is checked to
 * be at least kpm_pgsz, and the start and end points are
 * checked to be kpm_pgsz aligned.
*/
/*
* There is no specific error code for violating
* kpm granularity constraints.
*/
return (KPHYSM_ENOTVIABLE);
}
if (!exhausted) {
/* final nkpmpgs */
kpm_pages_off = ptsz +
}
}
/*
* Is memory area supplied too small?
*/
if (exhausted) {
/*
* There is no specific error code for 'too small'.
*/
return (KPHYSM_ERESOURCE);
}
/*
* We may re-use a previously allocated VA space for the page_ts
* eventually, but we need to initialize and lock the pages first.
*/
/*
* Get an address in the kernel address map, map
* the page_t pages and see if we can touch them.
*/
" Can't allocate VA for page_ts");
return (KPHYSM_ERESOURCE);
}
/*
* In the remapping code we map one page at a time so we must do
* the same here to match mapping sizes.
*/
pfn++;
}
" Can't access pp array at 0x%p [phys 0x%lx]",
return (KPHYSM_EFAULT);
}
/*
* Add this memory slice to its memory node translation.
*
* Note that right now, each node may have only one slice;
* this may change with COD or in larger SSM systems with
* nested latency groups, so we must not assume that the
* node does not yet exist.
*/
/*
 * Allocate or resize page counters as necessary to accommodate
* the increase in memory pages.
*/
if (page_ctrs_adjust(mnode) != 0) {
return (KPHYSM_ERESOURCE);
}
/*
* Update the phys_avail memory list.
* The phys_install list was done at the start.
*/
/* See if we can find a memseg to re-use. */
/*
* Initialize the memseg structure representing this memory
* and add it to the existing list of memsegs. Do some basic
* initialization and add the memory to the system.
* In order to prevent lock deadlocks, the add_physmem()
* code is repeated here, but split into several stages.
*/
} else {
/*EMPTY*/
}
/*
* Initialize metadata. The page_ts are set to locked state
* ready to be freed.
*/
/* Save the original pp base in case we reuse a memseg. */
pfn++;
continue;
}
if (reuse) {
/* Remap our page_ts to the re-used memseg VA space. */
pfn++;
}
}
memsegs_lock(1);
/*
* The new memseg is inserted at the beginning of the list.
* Not only does this save searching for the tail, but in the
* case of a re-used memseg, it solves the problem of what
 * happens if some process still has a pointer to the
* memseg and follows the next pointer to continue traversing
* the memsegs list.
*/
total_pages += npgs;
/*
 * Recalculate the paging parameters now that total_pages has changed.
* This will also cause the clock hands to be reset before next use.
*/
setupclock(1);
memsegs_unlock(1);
/*
* Free the pages outside the lock to avoid locking loops.
*/
}
/*
* Now that we've updated the appropriate memory lists we
* need to reset a number of globals, since we've increased memory.
* Several have already been updated for us as noted above. The
* globals we're interested in at this point are:
* physmax - highest page frame number.
* physinstalled - number of pages currently installed (done earlier)
* maxmem - max free pages in the system
* physmem - physical memory pages available
* availrmem - real memory available
*/
dump_resize();
/*
* Update lgroup generation number on single lgroup systems
*/
if (nlgrps == 1)
lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
return (KPHYSM_OK); /* Successfully added system memory */
}
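/*
 * Illustrative sketch of the metadata sizing described in the comments
 * within kphysm_add_memory_dynamic() above: each usable page costs
 * PAGESIZE bytes of data plus sizeof (page_t) of metadata, and the
 * metadata is carved out of the start of the added chunk itself.  The
 * helper name and the exact rounding below are assumptions for
 * illustration, not the elided code.
 */
static pgcnt_t
example_usable_pages(pgcnt_t npgs)
{
	/* Pages whose data and page_t metadata both fit in the chunk. */
	pgcnt_t usable = (npgs * PAGESIZE) / (PAGESIZE + sizeof (page_t));

	/* The remaining (npgs - usable) pages at the base hold page_t's. */
	return (usable);
}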
/*
* There are various error conditions in kphysm_add_memory_dynamic()
* which require a rollback of already changed global state.
*/
static void
{
int mlret;
/* Unreserve memory span. */
}
/*
* Only return an available memseg of exactly the right size.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
*/
static struct memseg *
{
segpp = &memseg_va_avail;
if (kpm_enable)
else
break;
}
}
return (seg);
}
static uint_t handle_gen;
struct memdelspan {
struct memdelspan *mds_next;
};
#define MDS_BITMAPBYTES(MDSP) \
struct transit_list {
struct transit_list *trl_next;
struct memdelspan *trl_spans;
int trl_collect;
};
struct transit_list_head {
struct transit_list *trh_head;
};
static struct transit_list_head transit_list_head;
struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);
#ifdef DEBUG
#define MEM_DEL_STATS
#endif /* DEBUG */
#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
struct mem_del_stat {
};
/*
* The stat values are only incremented in the delete thread
 * so no locking or atomics are required.
*/
static void mem_del_stat_print_func(struct mem_handle *);
#else /* MEM_DEL_STATS */
#define MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
/*
* mh_mutex must be taken to examine or change mh_exthandle and mh_state.
* The mutex may not be required for other fields, dependent on mh_state.
*/
struct mem_handle {
struct mem_handle *mh_next;
struct transit_list mh_transit;
void (*mh_delete_complete)(void *, int error);
void *mh_delete_complete_arg;
volatile uint_t mh_dr_aio_cleanup_cancel;
volatile uint_t mh_aio_cleanup_done;
#ifdef MEM_DEL_STATS
struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
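/*
 * The mem_handle state values used in the switch statements below
 * (MHND_FREE, MHND_INIT, MHND_STARTING, MHND_RUNNING, MHND_DONE and
 * MHND_RELEASE) are not visible in this fragment.  The definitions below
 * are a sketch of their likely form; the ordering and numeric values are
 * assumptions.
 */
#define	MHND_FREE	0	/* handle not in use */
#define	MHND_INIT	1	/* _gethandle done, delete not yet started */
#define	MHND_STARTING	2	/* delete thread being created */
#define	MHND_RUNNING	3	/* delete thread running */
#define	MHND_DONE	4	/* delete completed or cancelled */
#define	MHND_RELEASE	5	/* handle being released */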
static struct mem_handle *
{
struct mem_handle *mhp;
/* handle_gen is protected by list mutex. */
return (mhp);
}
static void
{
struct mem_handle **mhpp;
/*
* Exit the mutex to preserve locking order. This is OK
* here as once in the FREE state, the handle cannot
* be found by a lookup.
*/
mhpp = &mem_handle_head;
/*
* No need to lock the handle (mh_mutex) as only
 * mh_next is changing and this is the only thread that
 * can be referencing mhp.
*/
}
/*
* This function finds the internal mem_handle corresponding to an
* external handle and returns it with the mh_mutex held.
*/
static struct mem_handle *
{
struct mem_handle *mhp;
/*
* The state of the handle could have been changed
* by kphysm_del_release() while waiting for mh_mutex.
*/
continue;
}
break;
}
}
return (mhp);
}
int
kphysm_del_gethandle(memhandle_t *xmhp)
{
struct mem_handle *mhp;
/*
 * The handle is allocated using KM_SLEEP, so the allocation cannot fail.
* If the implementation is changed, the correct error to return
* here would be KPHYSM_ENOHANDLES.
*/
return (KPHYSM_OK);
}
static int
{
}
static int can_remove_pgs(pgcnt_t);
static struct memdelspan *
span_to_install(pfn_t base, pgcnt_t npgs)
{
struct memdelspan *mdsp;
struct memdelspan *mdsp_new;
while (size != 0) {
continue;
break;
}
size = 0;
thislen = 0;
} else {
}
} else {
}
}
/* TODO: phys_install could change now */
if (thislen == 0)
continue;
}
return (mdsp_new);
}
static void
{
struct memdelspan *amdsp;
}
}
/*
* Concatenate lists. No list ordering is required.
*/
static void
{
}
/*
* Given a new list of delspans, check there is no overlap with
* all existing span activity (add or delete) and then concatenate
* the new spans to the given list.
* Return 1 for OK, 0 if overlapping.
*/
static int
delspan_insert(
	struct transit_list *my_tlp,
	struct memdelspan *mdsp_new)
{
struct transit_list_head *trh;
struct transit_list *tlp;
int ret;
trh = &transit_list_head;
ret = 1;
/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
struct memdelspan *mdsp;
struct memdelspan *nmdsp;
ret = 0;
goto done;
}
}
}
}
done:
if (ret != 0) {
}
return (ret);
}
static void
delspan_remove(
	struct transit_list *my_tlp,
	pfn_t base,
	pgcnt_t npgs)
{
struct transit_list_head *trh;
struct memdelspan *mdsp;
trh = &transit_list_head;
if (npgs == 0) {
} else {
struct memdelspan **prv;
} else {
}
}
}
}
}
/*
* Reserve interface for add to stop delete before add finished.
* This list is only accessed through the delspan_insert/remove
* functions and so is fully protected by the mutex in struct transit_list.
*/
static struct transit_list reserve_transit;
static int
delspan_reserve(pfn_t base, pgcnt_t npgs)
{
struct memdelspan *mdsp;
int ret;
}
return (ret);
}
static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
}
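/*
 * Illustrative sketch (not the elided code) of how the reserve interface
 * above is used by the add path: reserve the span before touching global
 * state, and unreserve it on any failure or once the add has completed.
 * The helper below and its error handling are hypothetical; only
 * delspan_reserve()/delspan_unreserve() and the KPHYSM_* codes come from
 * this file.
 */
static int
example_add_with_reservation(pfn_t base, pgcnt_t npgs)
{
	if (!delspan_reserve(base, npgs))
		return (KPHYSM_ESPAN);	/* span already has an operation */

	/* ... perform the add against the reserved span ... */

	delspan_unreserve(base, npgs);
	return (KPHYSM_OK);
}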
/*
* Return whether memseg was created by kphysm_add_memory_dynamic().
 * If this is the case and startp is non-zero, also return the start
 * pfn of the meta data via startp.
*/
static int
memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
{
return (0);
/* Meta data is required to be at the beginning */
return (1);
}
int
kphysm_del_span(memhandle_t handle, pfn_t base, pgcnt_t npgs)
{
struct mem_handle *mhp;
struct memdelspan *mdsp;
struct memdelspan *mdsp_new;
int ret;
return (KPHYSM_EHANDLE);
}
return (KPHYSM_ESEQUENCE);
}
/*
* Intersect the span with the installed memory list (phys_install).
*/
/*
* No physical memory in this range. Is this an
* error? If an attempt to start the delete is made
* for OK returns from del_span such as this, start will
* return an error.
* Could return KPHYSM_ENOWORK.
*/
/*
* It is assumed that there are no error returns
* from span_to_install() due to kmem_alloc failure.
*/
return (KPHYSM_OK);
}
/*
* Does this span overlap an existing span?
*/
/*
* Differentiate between already on list for this handle
* (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
*/
ret = KPHYSM_EBUSY;
ret = KPHYSM_EDUP;
break;
}
}
return (ret);
}
/*
* At this point the spans in mdsp_new have been inserted into the
* list of spans for this handle and thereby to the global list of
* spans being processed. Each of these spans must now be checked
* for relocatability. As a side-effect segments in the memseg list
* may be split.
*
* Note that mdsp_new can no longer be used as it is now part of
* a larger list. Select elements of this larger list based
* on base and npgs.
*/
phys_pages = 0;
vm_pages = 0;
continue;
}
/*
* The pages_checked count is a hack. All pages should be
* checked for relocatability. Those not covered by memsegs
* should be tested with arch_kphysm_del_span_ok().
*/
pages_checked = 0;
/* Span and memseg don't overlap. */
continue;
}
/* Check that segment is suitable for delete. */
/*
* Can only delete whole added segments
* for the moment.
* Check that this is completely within the
* span.
*/
ret = KPHYSM_EBUSY;
break;
}
} else {
/*
* Set mseg_start for accounting below.
*/
/*
* If this segment is larger than the span,
* try to split it. After the split, it
* is necessary to restart.
*/
int s_ret;
/* Split required. */
else
else
anpgs);
if (s_ret == 0) {
/* Split failed. */
break;
}
goto restart;
}
}
/*
* The memseg is wholly within the delete span.
* The individual pages can now be checked.
*/
/* Cage test. */
if (PP_ISNORELOC(pp)) {
break;
}
}
break;
}
}
break;
break;
}
}
} else {
/*
* Keep holding the mh_mutex to prevent it going away.
*/
}
return (ret);
}
int
kphysm_del_span_query(pfn_t base, pgcnt_t npgs, memquery_t *mqp)
{
struct memdelspan *mdsp;
struct memdelspan *mdsp_new;
int done_first_nonreloc;
mqp->phys_pages = 0;
mqp->nonrelocatable = 0;
mqp->first_nonrelocatable = 0;
mqp->last_nonrelocatable = 0;
/*
* It is OK to proceed here if mdsp_new == NULL.
*/
done_first_nonreloc = 0;
while (snpgs != 0) {
/*
* Find the lowest addressed memseg that starts
* after sbase and account for it.
* This is to catch dynamic memsegs whose start
* is hidden.
*/
}
}
}
/*
* Now have the full extent of the memseg so
* do the range check.
*/
if (mseg_start >= p_end ||
/* Span does not overlap memseg. */
}
}
/*
* Account for gap either before the segment if
* there is one or to the end of the span.
*/
/*
* Check with arch layer for relocatability.
*/
/*
 * No non-relocatable pages in this
* area, avoid the fine-grained
* test.
*/
}
if (!arch_kphysm_del_span_ok(sbase,
1)) {
mqp->nonrelocatable++;
if (!done_first_nonreloc) {
mqp->first_nonrelocatable = sbase;
done_first_nonreloc = 1;
}
}
sbase++;
snpgs--;
}
}
/*
* Skip the page_t area of a
* dynamic memseg.
*/
snpgs = 0;
continue;
}
}
/*
* The individual pages can now be checked.
*/
if (PP_ISNORELOC(pp)) {
mqp->nonrelocatable++;
if (!done_first_nonreloc) {
mqp->first_nonrelocatable = sbase;
done_first_nonreloc = 1;
}
}
sbase++;
snpgs--;
}
}
}
}
return (KPHYSM_OK);
}
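/*
 * Illustrative sketch of how a caller might use kphysm_del_span_query()
 * to size up a span before attempting a delete.  The memquery_t layout is
 * assumed from the fields filled in above (phys_pages, nonrelocatable,
 * first_nonrelocatable, last_nonrelocatable); the helper itself is
 * hypothetical.
 */
static int
example_span_is_deletable(pfn_t base, pgcnt_t npgs)
{
	memquery_t mq;

	if (kphysm_del_span_query(base, npgs, &mq) != KPHYSM_OK)
		return (0);

	/* Refuse spans that contain pages the system cannot relocate. */
	return (mq.nonrelocatable == 0);
}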
/*
* This release function can be called at any stage as follows:
* _gethandle only called
* _span(s) only called
* _start called but failed
* delete thread exited
*/
int
kphysm_del_release(memhandle_t handle)
{
struct mem_handle *mhp;
return (KPHYSM_EHANDLE);
}
case MHND_STARTING:
case MHND_RUNNING:
return (KPHYSM_ENOTFINISHED);
case MHND_FREE:
return (KPHYSM_EHANDLE);
case MHND_INIT:
break;
case MHND_DONE:
break;
case MHND_RELEASE:
return (KPHYSM_ESEQUENCE);
default:
#ifdef DEBUG
#endif /* DEBUG */
return (KPHYSM_EHANDLE);
}
/*
 * Set state so that we can wait if necessary.  This also gives us
 * exclusive use of all fields except mh_exthandle and mh_state.
*/
/*
* The mem_handle cannot be de-allocated by any other operation
* now, so no need to hold mh_mutex.
*/
mhp->mh_phys_pages = 0;
mhp->mh_vm_pages = 0;
mhp->mh_hold_todo = 0;
return (KPHYSM_OK);
}
/*
* This cancel function can only be called with the thread running.
*/
int
kphysm_del_cancel(memhandle_t handle)
{
struct mem_handle *mhp;
return (KPHYSM_EHANDLE);
}
return (KPHYSM_ENOTRUNNING);
}
/*
* Set the cancel flag and wake the delete thread up.
* The thread may be waiting on I/O, so the effect of the cancel
* may be delayed.
*/
}
return (KPHYSM_OK);
}
int
kphysm_del_status(memhandle_t handle, memdelstat_t *mdstp)
{
struct mem_handle *mhp;
return (KPHYSM_EHANDLE);
}
/*
* Calling kphysm_del_status() is allowed before the delete
* is started to allow for status display.
*/
return (KPHYSM_ENOTRUNNING);
}
return (KPHYSM_OK);
}
static int mem_delete_additional_pages = 100;
static int
can_remove_pgs(pgcnt_t npgs)
{
/*
* If all pageable pages were paged out, freemem would
* equal availrmem. There is a minimum requirement for
* availrmem.
*/
< npgs)
return (0);
/* TODO: check swap space, etc. */
return (1);
}
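/*
 * Illustrative sketch of the style of check can_remove_pgs() describes
 * above: refuse the delete if taking npgs pages would leave availrmem
 * below a minimum floor plus some slack.  The use of tune.t_minarmem as
 * the floor and the exact terms are assumptions; the real expression is
 * only partly visible above.
 */
static int
example_can_remove_pgs(pgcnt_t npgs)
{
	pgcnt_t floor = tune.t_minarmem + mem_delete_additional_pages;

	if (availrmem < floor + npgs)
		return (0);	/* not enough reclaimable memory */
	return (1);
}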
static int
{
int ret;
if (ret != 0)
return (ret);
}
static void
{
}
#define FREEMEM_INCR 100
#define DEL_FREE_WAIT_FRAC 4
#define DEL_BUSY_WAIT_FRAC 20
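/*
 * DEL_FREE_WAIT_TICKS and DEL_BUSY_WAIT_TICKS are used below but their
 * definitions are not visible in this fragment.  Judging from the *_FRAC
 * names above they are fractions of a second in clock ticks; the forms
 * below are an assumption.
 */
#define	DEL_FREE_WAIT_TICKS	(hz / DEL_FREE_WAIT_FRAC)	/* assumed */
#define	DEL_BUSY_WAIT_TICKS	(hz / DEL_BUSY_WAIT_FRAC)	/* assumed */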
static void kphysm_del_cleanup(struct mem_handle *);
static pgcnt_t
{
int ret;
/*
* Get up to freemem_incr pages.
*/
/*
* Take free_get pages away from freemem,
* waiting if necessary.
*/
/*
* Duplicate test from page_create_throttle()
* but don't override with !PG_WAIT.
*/
ret = 0;
} else {
if (ret == 0) {
/* EMPTY */
}
}
if (ret != 0) {
return (free_get);
}
/*
* Put pressure on pageout.
*/
(lbolt + DEL_FREE_WAIT_TICKS));
}
return (0);
}
#define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100
/*
* This function is run as a helper thread for delete_memory_thread.
* It is needed in order to force kaio cleanup, so that pages used in kaio
* will be unlocked and subsequently relocated by delete_memory_thread.
 * The address of the delete_memory_thread's mem_handle is passed in to
* this thread function, and is used to set the mh_aio_cleanup_done member
* prior to calling thread_exit().
*/
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
int (*aio_cleanup_dr_delete_memory)(proc_t *);
int cleaned;
int n = 0;
struct mem_handle *mhp;
thread_exit();
}
aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
if (aio_cleanup_dr_delete_memory == NULL) {
"aio_cleanup_dr_delete_memory not found in kaio");
thread_exit();
}
do {
cleaned = 0;
/* cleanup proc's outstanding kaio */
cleaned +=
}
}
if ((*pcancel == 0) &&
(!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
/* delay a bit before retrying all procs again */
n = 0;
}
} while (*pcancel == 0);
thread_exit();
}
static void
delete_memory_thread(caddr_t amhp)
{
struct mem_handle *mhp;
struct memdelspan *mdsp;
void (*del_complete_funcp)(void *, int error);
void *del_complete_arg;
int comp_code;
int ret;
int first_scan;
#ifdef MEM_DEL_STATS
#endif /* MEM_DEL_STATS */
#ifdef MEM_DEL_STATS
start_total = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
callb_generic_cpr, "memdel");
/* Allocate the remap pages now, if necessary. */
/*
* Subtract from availrmem now if possible as availrmem
* may not be available by the end of the delete.
*/
goto early_exit;
}
if (ret != 0) {
goto refused;
}
KM_SLEEP);
}
first_scan = 1;
freemem_left = 0;
/*
* Start dr_aio_cleanup_thread, which periodically iterates
* through the process list and invokes aio cleanup. This
* is needed in order to avoid a deadly embrace between the
* delete_memory_thread (waiting on writer lock for page, with the
* exclusive-wanted bit set), kaio read request threads (waiting for a
* reader lock on the same page that is wanted by the
* delete_memory_thread), and threads waiting for kaio completion
* (blocked on spt_amp->lock).
*/
mhp->mh_dr_aio_cleanup_cancel = 0;
mhp->mh_aio_cleanup_done = 0;
collected = 0;
if (first_scan) {
}
continue;
}
if (freemem_left == 0) {
if (freemem_left == 0)
break;
}
/*
* Release mh_mutex - some of this
* stuff takes some time (eg PUTPAGE).
*/
/*
* Not covered by a page_t - will
* be dealt with elsewhere.
*/
continue;
}
SE_EXCL_WANTED)) {
if (page_isretired(pp)) {
/*
* Page has been retired.
*
* Its shared lock can and
* must be upgraded to an
* exclusive lock in order
* to hashout the page when
* the delete completes.
*/
if (!page_tryupgrade(pp)) {
continue;
}
} else {
/*
* Page in use elsewhere.
*/
continue;
}
}
/*
* See if the cage expanded into the delete.
* This can happen as we have to allow the
* cage to expand.
*/
if (PP_ISNORELOC(pp)) {
if (page_isretired(pp))
else
break;
}
if (page_isretired(pp)) {
/*
* Page has been retired and is
* not part of the cage so we
* can now do the accounting for
* it.
*/
NBPBMW] |=
mhp->mh_hold_todo--;
continue;
}
ASSERT(freemem_left != 0);
/*
* Like page_reclaim() only 'freemem'
* processing is already done.
*/
} else {
}
PP_CLRFREE(pp);
PP_CLRAGED(pp);
collected++;
freemem_left--;
continue;
}
if (first_scan) {
continue;
}
/*
* Keep stats on pages encountered that
* are toxic or failing but not retired.
*/
if (page_istoxic(pp)) {
} else if (page_isfailing(pp)) {
}
/*
* In certain cases below, special exceptions
* are made for pages that are toxic. This
* is because the current meaning of toxic
* is that an uncorrectable error has been
* previously associated with the page.
*/
if (!page_istoxic(pp)) {
/*
* Must relocate locked in
* memory pages.
*/
#ifdef MEM_DEL_STATS
start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
/*
* Lock all constituent pages
* of a large page to ensure
* that p_szc won't change.
*/
if (!group_page_trylock(pp,
SE_EXCL)) {
continue;
}
pp_targ =
#ifdef MEM_DEL_STATS
(uint64_t)
ddi_get_lbolt() -
#endif /* MEM_DEL_STATS */
goto reloc;
}
#ifdef MEM_DEL_STATS
(uint64_t)ddi_get_lbolt() -
#endif /* MEM_DEL_STATS */
continue;
} else {
/*
* Cannot do anything about
* this page because it is
* toxic.
*/
continue;
}
}
/*
* Unload the mappings and check if mod bit
* is set.
*/
#ifdef MEM_DEL_STATS
start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
/*
* Lock all constituent pages
* of a large page to ensure
* that p_szc won't change.
*/
continue;
}
NULL, 0);
#ifdef MEM_DEL_STATS
(uint64_t)ddi_get_lbolt() -
#endif /* MEM_DEL_STATS */
goto reloc;
}
}
if (!page_try_demote_pages(pp)) {
#ifdef MEM_DEL_STATS
#endif /* MEM_DEL_STATS */
continue;
}
/*
* Regular 'page-out'.
*/
if (!mod) {
/*
* page_destroy was called with
* dontfree. As long as p_lckcnt
* and p_cowcnt are both zero, the
* only additional action of
* page_destroy with !dontfree is to
* call page_free, so we can collect
* the page here.
*/
collected++;
#ifdef MEM_DEL_STATS
#endif /* MEM_DEL_STATS */
continue;
}
/*
* The page is toxic and the mod bit is
* set, we cannot do anything here to deal
* with it.
*/
if (page_istoxic(pp)) {
#ifdef MEM_DEL_STATS
#endif /* MEM_DEL_STATS */
continue;
}
#ifdef MEM_DEL_STATS
#endif /* MEM_DEL_STATS */
/*
* Try to get the page back immediately
* so that it can be collected.
*/
/*
* This should not happen as this
* thread is deleting the page.
 * If this code is ever generalized, this
 * could become a reality.
*/
#ifdef DEBUG
"delete_memory_thread(0x%p) "
"pfn 0x%lx has no page_t",
#endif /* DEBUG */
continue;
}
SE_EXCL_WANTED)) {
goto free_page_collect;
}
}
continue;
/*
* Got some freemem and a target
* page, so move the data to avoid
* I/O and lock problems.
*/
/*
* page_relocate() will return pgcnt: the
* number of consecutive pages relocated.
* If it is successful, pp will be a
* linked list of the page structs that
* were relocated. If page_relocate() is
* unsuccessful, pp will be unmodified.
*/
#ifdef MEM_DEL_STATS
start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
#ifdef MEM_DEL_STATS
#endif /* MEM_DEL_STATS */
if (result != 0) {
/*
* We did not succeed. We need
* to give the pp_targ pages back.
* page_free(pp_targ, 1) without
* the freemem accounting.
*/
continue;
}
/*
* We will then collect pgcnt pages.
*/
/*
* We need to make sure freemem_left is
* large enough.
*/
while ((freemem_left < pgcnt) &&
freemem_left +=
}
/*
* Do not proceed if mh_cancel is set.
*/
/*
* Unlink and unlock each page.
*/
}
/*
* We need to give the pp pages back.
* page_free(pp, 1) without the
* freemem accounting.
*/
break;
}
/* Now remove pgcnt from freemem_left */
freemem_left -= pgcnt;
ASSERT(freemem_left >= 0);
/*
* pp and pp_targ were passed back as
* a linked list of pages.
* Unlink and unlock each page.
*/
/*
* The original page is now free
* so remove it from the linked
* list and collect it.
*/
collected++;
}
}
}
first_scan = 0;
(collected == 0)) {
/*
* This code is needed as we cannot wait
* for a page to be locked OR the delete to
* be cancelled. Also, we must delay so
* that other threads get a chance to run
* on our cpu, otherwise page locks may be
* held indefinitely by those threads.
*/
(lbolt + DEL_BUSY_WAIT_TICKS));
}
}
/* stop the dr aio cleanup thread */
transit_list_collect(mhp, 0);
if (freemem_left != 0) {
/* Return any surplus. */
freemem_left = 0;
}
}
#ifdef MEM_DEL_STATS
#endif /* MEM_DEL_STATS */
/*
* If the memory delete was cancelled, exclusive-wanted bits must
* be cleared, and also any retired pages that
* were accounted for above must have their exclusive lock
* downgraded to a shared lock to return them to their previous
* state.
* Otherwise, if the memory delete has completed, retired pages
* must be hashed out.
*/
}
}
} else {
}
/* do we already have pp? */
}
/*
* To satisfy ASSERT below in
* cancel code.
*/
mhp->mh_hold_todo++;
} else {
}
}
}
}
/*
* Free retired page bitmap and collected page bitmap
*/
}
/* wait for our dr aio cancel thread to exit */
while (!(mhp->mh_aio_cleanup_done)) {
}
/*
* Go through list of deleted pages (mh_deleted) freeing
* them.
*/
mhp->mh_hold_todo++;
/* Restore p_next. */
"page %p is free",
(void *)pp);
}
}
goto t_exit;
}
/*
* All the pages are no longer in use and are exclusively locked.
*/
/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
thread_exit();
/*NOTREACHED*/
}
/*
* Start the delete of the memory from the system.
*/
int
kphysm_del_start(memhandle_t handle, void (*complete)(void *, int),
	void *complete_arg)
{
struct mem_handle *mhp;
return (KPHYSM_EHANDLE);
}
case MHND_FREE:
return (KPHYSM_EHANDLE);
case MHND_INIT:
break;
case MHND_STARTING:
case MHND_RUNNING:
return (KPHYSM_ESEQUENCE);
case MHND_DONE:
return (KPHYSM_ESEQUENCE);
case MHND_RELEASE:
return (KPHYSM_ESEQUENCE);
default:
#ifdef DEBUG
#endif /* DEBUG */
return (KPHYSM_EHANDLE);
}
return (KPHYSM_ENOWORK);
}
/*
* Release the mutex in case thread_create sleeps.
*/
/*
* The "obvious" process for this thread is pageout (proc_pageout)
* but this gives the thread too much power over freemem
* which results in freemem starvation.
*/
return (KPHYSM_OK);
}
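/*
 * Illustrative sketch of the delete sequence a DR client would follow
 * using the interfaces above: get a handle, describe the span, start the
 * delete with a completion callback, and release the handle once the
 * callback has reported completion.  The example_* names are
 * hypothetical; only the kphysm_del_* calls and KPHYSM_* codes come from
 * this file.
 */
static void
example_delete_done(void *arg, int error)
{
	/* error is KPHYSM_OK on success; arg is the cookie passed below. */
}

static int
example_delete_span(pfn_t base, pgcnt_t npgs)
{
	memhandle_t h;
	int ret;

	if ((ret = kphysm_del_gethandle(&h)) != KPHYSM_OK)
		return (ret);
	if ((ret = kphysm_del_span(h, base, npgs)) != KPHYSM_OK) {
		(void) kphysm_del_release(h);
		return (ret);
	}
	ret = kphysm_del_start(h, example_delete_done, NULL);
	if (ret != KPHYSM_OK)
		(void) kphysm_del_release(h);
	/* On success, kphysm_del_release() is called after completion. */
	return (ret);
}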
static pgcnt_t pp_dummy_npages;
static void
{
continue;
}
}
void
memseg_remap_init(void)
{
int i;
/*
* dpages starts off as the size of the structure and
* ends up as the minimum number of pages that will
* hold a whole number of page_t structures.
*/
while ((dpages & 1) == 0)
dpages >>= 1;
/*
* Allocate pp_dummy pages directly from static_arena,
* since these are whole page allocations and are
* referenced by physical address. This also has the
* nice fringe benefit of hiding the memory from
* ::findleaks since it doesn't deal well with allocated
* kernel heap memory that doesn't have any mappings.
*/
for (i = 0; i < pp_dummy_npages; i++) {
&pp_dummy[MMU_PAGESIZE * i]);
}
/*
* Initialize the page_t's to a known 'deleted' state
* that matches the state of deleted pages.
*/
ptob(pp_dummy_npages)));
/* Remove kmem mappings for the pages for safety. */
/* Leave pp_dummy pointer set as flag that init is done. */
}
}
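/*
 * Worked example for the dpages computation in memseg_remap_init() above
 * (values are illustrative only): sizeof (page_t) can be written as
 * 2^k * m with m odd.  The while loop strips the 2^k factor, leaving m.
 * Since PAGESIZE is a power of two no smaller than 2^k, m pages hold
 * m * PAGESIZE bytes, which is an exact multiple of 2^k * m, i.e. a
 * whole number of page_t structures.  For instance, if sizeof (page_t)
 * were 112 (2^4 * 7), the loop would leave 7, and 7 pages would hold
 * exactly (7 * PAGESIZE) / 112 page_t's.
 */
static pgcnt_t
example_odd_factor(size_t nbytes)
{
	pgcnt_t dpages = (pgcnt_t)nbytes;

	while ((dpages & 1) == 0)
		dpages >>= 1;
	return (dpages);
}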
static void
{
while (metapgs != 0) {
pgcnt_t n;
int i;
n = pp_dummy_npages;
if (n > metapgs)
n = metapgs;
for (i = 0; i < n; i++) {
}
metapgs -= n;
}
}
/*
* Transition all the deleted pages to the deleted state so that
* page_lock will not wait. The page_lock_delete call will
* also wake up any waiters.
*/
static void
{
}
}
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
struct memdelspan *mdsp;
memsegs_lock(1);
/*
* remove from main segment list.
*/
npgs = 0;
/* Span and memseg don't overlap. */
continue;
}
/* Hide the memseg from future scans. */
membar_producer(); /* TODO: Needed? */
/*
* Leave the deleted segment's next pointer intact
* in case a memsegs scanning loop is walking this
* segment concurrently.
*/
}
}
total_pages -= npgs;
/*
 * Recalculate the paging parameters now that total_pages has changed.
* This will also cause the clock hands to be reset before next use.
*/
setupclock(1);
memsegs_unlock(1);
int dynamic;
int mlret;
/*
* Put the page_t's into the deleted state to stop
* cv_wait()s on the pages. When we remap, the dummy
* page_t's will be in the same state.
*/
/*
* Collect up information based on pages_base and pages_end
* early so that we can flag early that the memseg has been
* deleted by setting pages_end == pages_base.
*/
if (dynamic) {
/* Remap the meta data to our special dummy area. */
} else {
/*
* Set for clean-up below.
*/
/*
* For memory whose page_ts were allocated
* at boot, we need to find a new use for
* the page_t memory.
* For the moment, just leak it.
* (It is held in the memseg_delete_junk list.)
*/
}
/* Must not use seg now as it could be re-used. */
&phys_avail);
&phys_install);
}
/* availrmem is adjusted during the delete. */
dump_resize();
/*
* Update lgroup generation number on single lgroup systems
*/
if (nlgrps == 1)
lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
/* Successfully deleted system memory */
}
static uint_t mdel_nullvp_waiter;
static void
struct mem_handle *mhp)
{
/* do not do PP_SETAGED(pp); */
} else {
}
}
mhp->mh_hold_todo--;
}
static void
transit_list_collect(struct mem_handle *mhp, int v)
{
struct transit_list_head *trh;
trh = &transit_list_head;
}
static void
transit_list_insert(struct transit_list *tlp)
{
struct transit_list_head *trh;
trh = &transit_list_head;
}
static void
transit_list_remove(struct transit_list *tlp)
{
struct transit_list_head *trh;
struct transit_list **tlpp;
trh = &transit_list_head;
}
static struct transit_list *
{
struct transit_list *tlp;
struct memdelspan *mdsp;
return (tlp);
}
}
}
return (NULL);
}
int
pfn_is_being_deleted(pfn_t pfnum)
{
struct transit_list_head *trh;
struct transit_list *tlp;
int ret;
trh = &transit_list_head;
return (0);
return (ret);
}
#ifdef MEM_DEL_STATS
extern int hz;
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
if (mem_del_stat_print) {
printf("memory delete loop %x/%x, statistics%s\n",
}
}
#endif /* MEM_DEL_STATS */
struct mem_callback {
	kphysm_setup_vector_t	*vec;
	void			*arg;
};
#define NMEMCALLBACKS 100
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;
int
kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
{
/*
* This test will become more complicated when the version must
* change.
*/
return (EINVAL);
return (EINVAL);
for (i = 0, found = 0; i < nmemcallbacks; i++) {
found = i + 1;
#ifdef DEBUG
/* Catch this in DEBUG kernels. */
"(0x%p, 0x%p) duplicate registration from 0x%p",
#endif /* DEBUG */
return (EEXIST);
}
}
if (found != 0) {
i = found - 1;
} else {
if (nmemcallbacks == NMEMCALLBACKS) {
return (ENOMEM);
}
i = nmemcallbacks++;
}
return (0);
}
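/*
 * Illustrative sketch of a client registration with
 * kphysm_setup_func_register().  The kphysm_setup_vector_t layout
 * (version, post_add, pre_del, post_del) and the version constant name
 * are assumed from <sys/mem_config.h>; the example_* callbacks are
 * hypothetical.
 */
static void
example_post_add(void *arg, pgcnt_t delta_pages)
{
	/* Memory was added; grow any caches sized from physmem. */
}

static int
example_pre_del(void *arg, pgcnt_t delta_pages)
{
	/* Return 0 to allow the delete of delta_pages pages to proceed. */
	return (0);
}

static void
example_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/* Undo the pre_del work if cancelled, else adjust for less memory. */
}

static kphysm_setup_vector_t example_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,	/* assumed version constant */
	example_post_add,
	example_pre_del,
	example_post_del
};

/* Registration: (void) kphysm_setup_func_register(&example_vec, NULL); */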
void
kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
{
uint_t i;
for (i = 0; i < nmemcallbacks; i++) {
if (i == (nmemcallbacks - 1))
break;
}
}
}
static void
kphysm_setup_post_add(pgcnt_t delta_pages)
{
uint_t i;
for (i = 0; i < nmemcallbacks; i++) {
}
}
}
/*
* Note the locking between pre_del and post_del: The reader lock is held
* between the two calls to stop the set of functions from changing.
*/
static int
kphysm_setup_pre_del(pgcnt_t delta_pages)
{
uint_t i;
int ret;
int aret;
ret = 0;
for (i = 0; i < nmemcallbacks; i++) {
}
}
return (ret);
}
static void
kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
{
uint_t i;
for (i = 0; i < nmemcallbacks; i++) {
}
}
}
static int
{
/*
* Lock the memsegs list against other updates now
*/
memsegs_lock(1);
/*
* Find boot time memseg that wholly covers this area.
*/
/* First find the memseg with page 'base' in it. */
break;
}
memsegs_unlock(1);
return (0);
}
memsegs_unlock(1);
return (0);
}
memsegs_unlock(1);
return (0);
}
/*
* Work out the size of the two segments that will
* surround the new segment, one for low address
* and one for high.
*/
/*
* Sanity check.
*/
memsegs_unlock(1);
return (0);
}
/*
* Allocate the new structures. The old memseg will not be freed
* as there may be a reference to it.
*/
if (size_low != 0) {
}
if (size_high != 0) {
}
/*
* All allocation done now.
*/
if (size_low != 0) {
}
if (size_high != 0) {
}
/*
* Update hat_kpm specific info of all involved memsegs and
* allow hat_kpm specific global chain updates.
*/
/*
 * At this point we have two equivalent memseg sub-chains linked into
 * the same place in the global chain. By re-writing the pointer
* in the previous element we switch atomically from using the old
* (seg) to the new.
*/
membar_enter();
memsegs_unlock(1);
/*
* We leave the old segment, 'seg', intact as there may be
* references to it. Also, as the value of total_pages has not
* changed and the memsegs list is effectively the same when
* accessed via the old or the new pointer, we do not have to
* cause pageout_scanner() to re-evaluate its hand pointers.
*
* We currently do not re-use or reclaim the page_t memory.
* If we do, then this may have to change.
*/
return (1);
}
/*
* The memsegs lock is only taken when modifying the memsegs list
* and rebuilding the pfn hash table (after boot).
 * No lock is needed for read as memseg structures are never de-allocated
* and the pointer linkage is never updated until the memseg is ready.
*/
void
memsegs_lock(int writer)
{
}
/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
}
/*
* memlist (phys_install, phys_avail) locking.
*/
static kmutex_t memlists_mutex;
void
memlist_read_lock(void)
{
}
void
memlist_read_unlock(void)
{
}
void
memlist_write_lock(void)
{
}
void
memlist_write_unlock(void)
{
}
/*
* The sfmmu hat layer (e.g.) accesses some parts of the memseg
* structure using physical addresses. Therefore a kmem_cache is
* used with KMC_NOHASH to avoid page crossings within a memseg
* structure. KMC_NOHASH requires that no external (outside of
* slab) information is allowed. This, in turn, implies that the
* cache's slabsize must be exactly a single page, since per-slab
* information (e.g. the freelist for the slab) is kept at the
 * end of the slab, where it is easy to locate. This should be
 * revisited if a more flexible kmem_cache interface becomes available.
*/
void
{
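	/*
	 * Sketch of the cache creation described by the comment above (the
	 * actual statement is not visible in this fragment): a KMC_NOHASH
	 * cache backed by static_arena so that a memseg structure never
	 * crosses a page boundary.
	 */
	memseg_cache = kmem_cache_create("memseg_cache",
	    sizeof (struct memseg), 0, NULL, NULL, NULL, NULL,
	    static_arena, KMC_NOHASH);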
}