/* htable.c - revision 6b60931c06941531e2bac4709902c411f22362f3 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/bootconf.h>
#include <vm/seg_kmem.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
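/*
 * Headers assumed to be needed by the code below (kmem_reap(), the kmem
 * cache setup, page_t handling, and the htable_t/hat_t types); the exact
 * original include list may differ.
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <vm/page.h>
#include <vm/hat_i86.h>
#include <vm/htable.h>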
/*
* The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
* is used in order to facilitate testing of the htable_steal() code.
* By resetting htable_reserve_amount to a lower value, we can force
* stealing to occur. The reserve amount is a guess to get us through boot.
*/
#define HTABLE_RESERVE_AMOUNT (200)
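/*
 * A minimal sketch of the reserve bookkeeping the comment above refers to:
 * a mutex-protected count plus a singly linked pool of free htable_t
 * structures. htable_reserve_amount and htable_reserve_cnt are named by the
 * surrounding code; the mutex and pool variable names are assumptions.
 */
uint_t		htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
static kmutex_t	htable_reserve_mutex;
static uint_t	htable_reserve_cnt;
static htable_t	*htable_reserve_pool;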
/*
* Used to hand test htable_steal().
*/
#ifdef DEBUG
ulong_t force_steal = 0;
ulong_t ptable_cnt = 0;
#endif
/*
* Any value works, but a power of two <= mmu.ptes_per_table is best.
*/
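/*
 * The tunable the comment above describes; htable_steal() below clamps it
 * to at least 1. The initial value of 8 is an assumption (any power of two
 * <= mmu.ptes_per_table works).
 */
static uint_t htable_steal_passes = 8;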
/*
* mutex stuff for access to htable hash
*/
#define NUM_HTABLE_MUTEX 128
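/*
 * A minimal sketch of how the HTABLE_ENTER()/HTABLE_EXIT() bucket locking
 * used throughout this file can be built from NUM_HTABLE_MUTEX mutexes;
 * the array name and the hashing of the bucket number are assumptions.
 */
static kmutex_t htable_mutex[NUM_HTABLE_MUTEX];

#define	HTABLE_MUTEX_HASH(h)	((h) & (NUM_HTABLE_MUTEX - 1))

#define	HTABLE_ENTER(h)	mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)])
#define	HTABLE_EXIT(h)	mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)])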
/*
* forward declarations
*/
/*
* A counter to track if we are stealing or reaping htables. When non-zero
* htable_free() will directly free htables (either to the reserve or kmem)
* instead of putting them in a hat's htable cache.
*/
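/*
 * The counter described above; the name htable_dont_cache is an assumption.
 * It is raised while stealing/reaping and checked in htable_free().
 */
static uint32_t htable_dont_cache = 0;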
/*
* Track the number of active pagetables, so we can know how many to reap
*/
static uint32_t active_ptables = 0;
/*
* Allocate a memory page for a hardware page table.
*
* A wrapper around page_get_physical(), with some extra checks.
*/
static pfn_t
{
pfn = PFN_INVALID;
/*
* The first check is to see if there is memory in the system. If we
* drop to throttlefree, then fail the ptable_alloc() and let the
* stealing code kick in. Note that we have to do this test here,
* since the test in page_create_throttle() would let the NOSLEEP
* allocation go through and deplete the page reserves.
*
* The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
*/
if (!NOMEMWAIT() && freemem <= throttlefree + 1)
	return (PFN_INVALID);
#ifdef DEBUG
/*
* This code makes htable_steal() easier to test. By setting
* force_steal we force pagetable allocations to fall
* into the stealing code. Roughly 1 in every "force_steal"
* page table allocations will fail.
*/
if (proc_pageout != NULL && force_steal > 1 &&
    ++ptable_cnt > force_steal) {
ptable_cnt = 0;
return (PFN_INVALID);
}
#endif /* DEBUG */
return (PFN_INVALID);
if (pfn == PFN_INVALID)
panic("ptable_alloc(): Invalid PFN!!");
return (pfn);
}
/*
* Free an htable's associated page table page. See the comments
* for ptable_alloc().
*/
static void
{
/*
* need to destroy the page used for the pagetable
*/
panic("ptable_free(): no page for pfn!");
/*
* Get an exclusive lock, might have to wait for a kmem reader.
*/
if (!page_tryupgrade(pp)) {
/*
* RFE: we could change this to not loop forever
* George Cameron had some idea on how to do that.
* For now looping works - it's just like sfmmu.
*/
continue;
}
page_unresv(1);
}
/*
* Put one htable on the reserve list.
*/
static void
{
}
/*
* Take one htable from the reserve.
*/
static htable_t *
htable_get_reserve(void)
{
if (htable_reserve_cnt != 0) {
}
return (ht);
}
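/*
 * A minimal sketch of the mutex-protected push/pop that the two routines
 * above perform on the reserve list, using the list variables sketched
 * earlier; the helper names reserve_push()/reserve_pop() are illustrative
 * assumptions, not the original routines.
 */
static void
reserve_push(htable_t *ht)
{
	mutex_enter(&htable_reserve_mutex);
	ht->ht_next = htable_reserve_pool;	/* link at head of free list */
	htable_reserve_pool = ht;
	++htable_reserve_cnt;
	mutex_exit(&htable_reserve_mutex);
}

static htable_t *
reserve_pop(void)
{
	htable_t *ht = NULL;

	mutex_enter(&htable_reserve_mutex);
	if (htable_reserve_cnt != 0) {		/* pool is non-empty */
		ht = htable_reserve_pool;
		htable_reserve_pool = ht->ht_next;
		--htable_reserve_cnt;
	}
	mutex_exit(&htable_reserve_mutex);
	return (ht);
}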
/*
* Allocate initial htables and put them on the reserve list
*/
void
{
while (count > 0) {
--count;
}
}
/*
* Readjust the reserves after a thread finishes using them.
*/
void
{
/*
* Free any excess htables in the reserve list
*/
while (htable_reserve_cnt > htable_reserve_amount &&
!USE_HAT_RESERVES()) {
ht = htable_get_reserve();
return;
}
}
/*
* This routine steals htables from user processes for htable_alloc() or
* for htable_reap().
*/
static htable_t *
{
uint_t h;
uint_t e;
/*
* Limit htable_steal_passes to something reasonable
*/
if (htable_steal_passes == 0)
htable_steal_passes = 1;
/*
* Loop through all user hats. The 1st pass takes cached htables that
* aren't in use. The later passes steal by removing mappings, too.
*/
for (;;) {
/*
* Clear the victim flag and move to next hat
*/
}
/*
* Skip any hat that is already being stolen from.
*
* We skip SHARED hats, as these are dummy
* hats that host ISM shared page tables.
*
* We also skip if HAT_FREEING because hat_pte_unmap()
* won't zero out the PTE's. That would lead to hitting
* stale PTEs either here or under hat_unload() when we
* steal and unload the same page table in competing
* threads.
*/
break;
}
/*
* Are we finished?
*/
/*
* Try to spread the pain of stealing,
* move victim HAT to the end of the HAT list.
*/
/* unlink victim hat */
else
else
/* relink at end of hat list */
else
}
break;
}
/*
* Mark the HAT as a stealing victim.
*/
/*
* Take any htables from the hat's cached "free" list.
*/
++stolen;
}
/*
* Don't steal on first pass.
*/
continue;
/*
* Search the active htables for one to steal.
* Start at a different hash bucket every time to
* help spread the pain of stealing.
*/
do {
HTABLE_ENTER(h);
/*
* Can we rule out reaping?
*/
ht->ht_lock_cnt != 0)
continue;
/*
* Increment busy so the htable can't
* disappear. We drop the htable mutex
* to avoid deadlocks with
* hat_pageunload() and the hment mutex
* while we call hat_pte_unmap()
*/
HTABLE_EXIT(h);
/*
* Try stealing.
* - unload and invalidate all PTEs
*/
e < HTABLE_NUM_PTES(ht) &&
ht->ht_valid_cnt > 0 &&
ht->ht_lock_cnt == 0;
++e, va += MMU_PAGESIZE) {
if (!PTE_ISVALID(pte))
continue;
hat_pte_unmap(ht, e,
}
/*
* Reacquire htable lock. If we didn't
* remove all mappings in the table,
* or another thread added a new mapping
* behind us, give up on this table.
*/
HTABLE_ENTER(h);
ht->ht_valid_cnt != 0 ||
ht->ht_lock_cnt != 0) {
continue;
}
/*
* Steal it and unlink the page table.
*/
/*
* remove from the hash list
*/
} else {
ht);
hat->hat_ht_hash[h] =
}
/*
* Break to outer loop to release the
* higher (ht_parent) pagetable. This
* spreads out the pain caused by
* pagefaults.
*/
++stolen;
break;
}
HTABLE_EXIT(h);
if (++h == hat->hat_num_hash)
h = 0;
}
}
return (list);
}
/*
* This is invoked from kmem when the system is low on memory. We try
* to free hments, htables, and ptables to improve the memory situation.
*/
/*ARGSUSED*/
static void
htable_reap(void *handle)
{
if (!can_steal_post_boot)
return;
/*
* Try to reap 5% of the page tables bounded by a maximum of
* 5% of physmem and a minimum of 10.
*/
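/*
 * A sketch of the bound just described (reap_cnt is an assumed name):
 * 5% of the active page tables, at least 10, capped at 5% of physmem.
 */
uint_t reap_cnt = MIN(MAX(active_ptables / 20, 10), physmem / 20);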
/*
* Let htable_steal() do the work, we just call htable_free()
*/
}
/*
* Free up excess reserves
*/
}
/*
* Allocate an htable, stealing one or using the reserve if necessary
*/
static htable_t *
{
is_bare = 1;
/*
* First reuse a cached htable from the hat_ht_cached field, this
* avoids unnecessary trips through the kmem/page allocators.
*/
need_to_zero = 0;
/* XX64 ASSERT() they're all zero somehow */
}
}
/*
* Allocate an htable, possibly refilling the reserves.
*/
if (USE_HAT_RESERVES()) {
ht = htable_get_reserve();
} else {
/*
* Donate successful htable allocations to the reserve.
*/
for (;;) {
break;
if (USE_HAT_RESERVES() ||
break;
}
}
/*
* allocate a page for the hardware page table if needed
*/
if (USE_HAT_RESERVES())
else
}
}
}
/*
* If allocations failed, kick off a kmem_reap() and resort to
* htable_steal(). We may spin here if the system is very low on
* memory. If the kernel itself has consumed all memory and kmem_reap()
* can't free up anything, then we'll really get stuck here.
* That should only happen in a system where the administrator has
* misconfigured VM parameters via /etc/system.
*/
kmem_reap();
/*
* If we stole for a bare htable, release the pagetable page.
*/
if (is_bare) {
}
}
}
/*
* All attempts to allocate or steal failed. This should only happen
* if we run out of memory during boot, due perhaps to a huge
* boot_archive. At this point there's no way to continue.
*/
panic("htable_alloc(): couldn't steal\n");
/*
* Shared page tables have all entries locked and entries may not
* be added or deleted.
*/
ht->ht_lock_cnt = 0;
need_to_zero = 0;
} else {
ht->ht_lock_cnt = 0;
ht->ht_valid_cnt = 0;
}
/*
* setup flags, etc. for VLP htables
*/
if (is_vlp) {
need_to_zero = 0;
}
/*
* fill in the htable
*/
/*
* Zero out any freshly allocated page table
*/
if (need_to_zero)
return (ht);
}
/*
* Free up an htable, either to a hat's cached list, the reserves or
* back to kmem.
*/
static void
{
/*
* If the process isn't exiting, cache the free htable in the hat
* structure. We always do this for the boot reserve. We don't
* do this if the hat is exiting or we are stealing/reaping htables.
*/
(use_boot_reserve ||
return;
}
/*
* If we have a hardware page table, free it.
* We don't free page tables that are accessed by sharing.
*/
}
/*
* Free htables or put into reserves.
*/
} else {
}
}
/*
* This is called when a hat is being destroyed or swapped out. We reap all
* the remaining htables in the hat cache. If destroying, all left over
* htables are also destroyed.
*
* We also don't need to invalidate any of the PTPs nor do any demapping.
*/
void
{
int h;
/*
* Purge the htable cache if just reaping.
*/
for (;;) {
break;
}
}
return;
}
/*
* if freeing, no locking is needed
*/
}
/*
* walk thru the htable hash table and free all the htables in it.
*/
for (h = 0; h < hat->hat_num_hash; ++h) {
} else {
}
}
}
}
/*
* Unlink an entry for a table at vaddr and level out of the existing table
* one level higher. The hash bucket lock (HTABLE_ENTER()) is always held
* when doing this.
*/
static void
{
/*
* When any top level VLP page table entry changes, we must issue
* a reload of cr3 on all processors. Also some CPU types require
* invalidating when inner table entries are invalidated.
*/
else if (mmu.inval_nonleaf)
}
}
/*
* Link an entry for a new table at vaddr and level into the existing table
* one level higher. The hash bucket lock (HTABLE_ENTER()) is always held
* when doing this.
*/
static void
{
/*
* When any top level VLP page table entry changes, we must issue
* a reload of cr3 on all processors using it.
* We also need to do this for the kernel hat on PAE 32 bit kernel.
*/
if (
#ifdef __i386
#endif
}
/*
* Release of hold on an htable. If this is the last use and the pagetable
* is empty we may want to free it, then recursively look at the pagetable
* above it. The recursion is handled by the outer while() loop.
*/
void
{
for (;;) {
/*
* The common case is that this isn't the last use of
* an htable so we don't want to free the htable.
*/
if (ht->ht_valid_cnt > 0)
break;
break;
/*
* we always release empty shared htables
*/
/*
* don't release if in address space tear down
*/
break;
/*
* At and above max_page_level, free if it's for
* a boot-time kernel mapping below kernelbase.
*/
break;
}
/*
* Remember if we destroy an htable that shares its PFN
* from elsewhere.
*/
}
/*
* Handle release of a table and freeing the htable_t.
* Unlink it from the table one level higher (i.e. ht_parent).
*/
/*
* Unlink the pagetable.
*/
/*
* remove this htable from its hash list
*/
} else {
}
}
/*
* If we released a shared htable, do a release on the htable
* from which it shared
*/
}
}
/*
* Find the htable for the pagetable at the given level for the given address.
* If found, acquires a hold that eventually needs to be htable_release()d.
*/
htable_t *
{
base = 0;
else
break;
}
if (ht)
return (ht);
}
/*
* Acquires a hold on a known htable (from a locked hment entry).
*/
void
{
#ifdef DEBUG
/*
* make sure the htable is there
*/
{
htable_t *h;
h && h != ht;
h = h->ht_next)
;
}
#endif /* DEBUG */
}
/*
* Find the htable for the pagetable at the given level for the given address.
* If found, acquires a hold that eventually needs to be htable_release()d.
* If not found the table is created.
*
* Since we can't hold a hash table mutex during allocation, we have to
* drop it and redo the search on a create. Then we may have to free the newly
* allocated htable if another thread raced in and created it ahead of us.
*/
htable_t *
{
uint_t h;
level_t l;
/*
* Create the page tables in top down order.
*/
base = 0;
else
/*
* look up the htable at this level
*/
HTABLE_ENTER(h);
} else {
break;
}
}
/*
* if we found the htable, increment its busy cnt
* and if we had allocated a new htable, free it.
*/
/*
* If we find a pre-existing shared table, it must
* share from the same place.
*/
panic("htable shared from wrong place "
}
HTABLE_EXIT(h);
if (new)
/*
* if we didn't find it on the first search
* allocate a new one and search again
*/
HTABLE_EXIT(h);
goto try_again;
/*
* 2nd search and still not there, use "new" table
* Link new table into higher, when not at top level.
*/
} else {
}
if (hat->hat_ht_hash[h])
HTABLE_EXIT(h);
/*
* Note we don't do htable_release(higher).
* That happens recursively when "new" is removed by
* htable_release() or htable_steal().
*/
/*
* If we just created a new shared page table we
* increment the shared htable's busy count, so that
* it can't be the victim of a steal even if it's empty.
*/
}
}
}
return (ht);
}
/*
* Inherit initial pagetables from the boot program.
*/
void
{
uint_t h;
uint_t i;
ht = htable_get_reserve();
ht->ht_lock_cnt = 0;
ht->ht_valid_cnt = 0;
HTABLE_ENTER(h);
if (hat->hat_ht_hash[h])
HTABLE_EXIT(h);
/*
* make sure the page table physical page is not FREE
*/
panic("page_resv() failed in ptable alloc");
/*
* Record in the page_t that this is a pagetable, for segkpm setup.
*/
if (kpm_vbase)
/*
* Count valid mappings and recursively attach lower level pagetables.
*/
for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
else
++ht->ht_valid_cnt;
}
}
}
/*
* As long as all the mappings we had were below kernel base
* we can release the htable.
*/
if (base < kernelbase)
}
/*
* Walk through a given htable looking for the first valid entry. This
* routine takes both a starting and ending address. The starting address
* is required to be within the htable provided by the caller, but there is
* no such restriction on the ending address.
*
* If the routine finds a valid entry in the htable (at or beyond the
* starting address), the PTE (and its address) will be returned.
* This PTE may correspond to either a page or a pagetable - it is the
* caller's responsibility to determine which. If no valid entry is
* found, 0 (an invalid PTE) and the next unexamined address will be
* returned.
*
* The loop has been carefully coded for optimization.
*/
static x86pte_t
{
uint_t e;
/*
* Compute the starting index and ending virtual address
*/
/*
* The following page table scan code knows that the valid
* bit of a PTE is in the lowest byte AND that x86 is little endian!!
*/
while (!PTE_ISVALID(*pte_ptr)) {
break;
if (pte_ptr == end_pte_ptr)
break;
}
/*
* if we found a valid PTE, load the entire PTE
*/
#if defined(__amd64)
/*
* deal with VA hole on amd64
*/
#endif /* __amd64 */
return (found_pte);
}
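/*
 * A sketch of the little-endian scan trick mentioned in the comment above:
 * the P (valid) bit is bit 0 of a PTE and x86 is little endian, so the low
 * byte of each entry can be tested without loading the full (possibly
 * 64 bit) PTE. The helper and its parameters are illustrative assumptions.
 */
static int
first_valid_index(uint8_t *table, size_t pte_size, uint_t nptes)
{
	uint_t i;

	for (i = 0; i < nptes; ++i) {
		/* the low byte of entry i holds the valid bit */
		if (table[i * pte_size] & 0x1)
			return (i);
	}
	return (-1);
}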
/*
* Find the address and htable for the first populated translation at or
* above the given virtual address. The caller may also specify an upper
* limit to the address range to search. Uses level information to quickly
* skip unpopulated sections of virtual address spaces.
*
* If not found returns NULL. When found, returns the htable and virt addr
* and has a hold on the htable.
*/
{
level_t l;
/*
* If this is a user address, then we know we need not look beyond
* kernelbase.
*/
eaddr == HTABLE_WALK_TO_END);
eaddr = kernelbase;
/*
* If we're coming in with a previous page table, search it first
* without doing an htable_lookup(), this should be frequent.
*/
if (prev) {
if (PTE_ISPAGE(pte, l)) {
return (pte);
}
}
/*
* We found nothing in the htable provided by the caller,
* so fall through and do the full search
*/
}
/*
* Find the level of the largest pagesize used by this HAT.
*/
max_mapped_level = 0;
if (hat->hat_pages_mapped[l] != 0)
max_mapped_level = l;
/*
* Find lowest table with any entry for given address.
*/
if (PTE_ISPAGE(pte, l)) {
return (pte);
}
break;
}
/*
* The ht is never NULL at the top level since
* the top level htable is created in hat_alloc().
*/
/*
* No htable covers the address. If there is no
* larger page size that could cover it, we
* skip to the start of the next page table.
*/
if (l >= max_mapped_level) {
break;
}
}
}
*vaddr = 0;
return (0);
}
/*
* Find the htable and page table entry index of the given virtual address
* with pagesize at or below given level.
* If not found returns NULL. When found, returns the htable, sets
* entry, and has a hold on the htable.
*/
htable_t *
{
level_t l;
uint_t e;
for (l = 0; l <= level; ++l) {
continue;
*entry = e;
return (ht);
}
return (NULL);
}
/*
* Find the htable and page table entry index of the given virtual address.
* There must be a valid page mapped at the given address.
* If not found returns NULL. When found, returns the htable, sets
* entry, and has a hold on the htable.
*/
htable_t *
{
uint_t e;
return (NULL);
if (entry)
*entry = e;
return (ht);
return (NULL);
}
void
{
/*
* To save on kernel VA usage, we avoid debug information in 32 bit
* kernels.
*/
#if defined(__amd64)
int kmem_flags = KMC_NOHASH;
#endif
/*
* initialize kmem caches
*/
}
/*
* get the pte index for the virtual address in the given htable's pagetable
*/
{
}
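/*
 * A sketch of the entry index computation described by the comment above:
 * shift out the address bits mapped below this level and mask to the table
 * size. It assumes the LEVEL_SHIFT() per-level shift macro and the ht_level
 * field from the hat headers; the helper name is an assumption.
 */
static uint_t
va2entry_sketch(htable_t *ht, uintptr_t va)
{
	return ((va >> LEVEL_SHIFT(ht->ht_level)) &
	    (HTABLE_NUM_PTES(ht) - 1));
}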
/*
* Given an htable and the index of a pte in it, return the virtual address
* of the page.
*/
{
/*
* Need to skip over any VA hole in top level table
*/
#if defined(__amd64)
#endif
return (va);
}
/*
* The code below uses compare and swap operations to read/write PTE's to
* avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems.
* On 64 bit systems no special handling is needed, as aligned loads/stores
* will naturally be atomic.
*
* The combination of using kpreempt_disable()/_enable() and the hci_mutex
* is used to ensure that an interrupt won't overwrite a temporary mapping
* while it's in use. If an interrupt thread tries to access a PTE, it will
* yield briefly back to the pinned thread which holds the cpu's hci_mutex.
*/
void
{
struct hat_cpu_info *hci;
}
void
{
}
#ifdef __i386
/*
* On 32 bit kernels, loading a 64 bit PTE is a little tricky
*/
static x86pte_t
get_pte64(x86pte_t *ptr)
{
	volatile uint32_t *p = (uint32_t *)ptr;
	x86pte_t t;

	for (;;) {
		t = p[0];
		t |= (uint64_t)p[1] << 32;
		if ((t & 0xffffffff) == p[0])
			return (t);
	}
}
#endif /* __i386 */
/*
* Disable preemption and establish a mapping to the pagetable with the
* given pfn. This is optimized for the case where it's the same
* pfn we last referenced from this CPU.
*/
static x86pte_t *
{
/*
* VLP pagetables are contained in the hat_t
*/
}
/*
* map the given pfn into the page table window.
*/
/*ARGSUSED*/
x86pte_t *
{
int x;
if (!khat_running) {
}
/*
* If kpm is available, use it.
*/
if (kpm_vbase)
/*
* Disable preemption and grab the CPU's hci_mutex
*/
else
newpte |= PT_WRITABLE;
else
}
}
/*
* Release access to a page table.
*/
static void
{
/*
* nothing to do for VLP htables
*/
return;
}
void
x86pte_mapout(void)
{
return;
/*
* Drop the CPU's hci_mutex and restore preemption.
*/
}
/*
* Atomic retrieval of a pagetable entry
*/
{
/*
* Be careful that loading PAE entries in 32 bit kernel is atomic.
*/
return (pte);
}
/*
* Atomic unconditional set of a page table entry, it returns the previous
* value. For pre-existing mappings if the PFN changes, then we don't care
* about the old pte's REF / MOD bits. If the PFN remains the same, we leave
* them intact.
*
* If asked to overwrite a link to a lower page table with a large page
* mapping, this routine returns the special value of LPAGE_ERROR. This
* allows the upper HAT layers to retry with a smaller mapping size.
*/
{
x86pte_t n;
else
/*
* Install the new PTE. If remapping the same PFN, then
* copy the existing REF/MOD bits to the new mapping.
*/
do {
n = new;
/*
* Another thread may have installed this mapping already,
* flush the local TLB and be done.
*/
if (prev == n) {
goto done;
}
/*
* Detect if we have a collision of installing a large
* page mapping where there already is a lower page table.
*/
old = LPAGE_ERROR;
goto done;
}
/*
* Do a TLB demap if needed, ie. the old pte was valid.
*
* Note that a stale TLB writeback to the PTE here either can't happen
* or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
* mappings, but they were created with REF and MOD already set, so
* no stale writeback will happen.
*
* Segmap is the only place where remaps happen on the same pfn and for
* that we want to preserve the PTE's REF and MOD bits.
*/
done:
return (old);
}
/*
* Atomic compare and swap of a page table entry. No TLB invalidates are done.
* This is used for links between pagetables of different levels.
* Note we always create these links with dirty/access set, so they should
* never change.
*/
{
return (pte);
}
/*
* Invalidate a page table entry as long as it currently maps something that
* matches the value determined by expect.
*
* Also invalidates any TLB entries and returns the previous value of the PTE.
*/
{
else
/*
* Note that the loop is needed to handle changes due to h/w updating
* of the REF/MOD bits.
*/
do {
goto done;
done:
return (oldpte);
}
/*
* Change a page table entry if it currently matches the value in expect.
*/
{
/*
* When removing write permission *and* clearing the
* MOD bit, check if a write happened via a stale
* TLB entry before the TLB shootdown finished.
*
* If it did happen, simply re-enable write permission and
* act like the original CAS failed.
*/
do {
found =
} while ((found & PT_WRITABLE) == 0);
}
}
return (found);
}
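/*
 * A sketch of the stale-TLB recovery described in the comment above: if a
 * write slipped in through a stale TLB entry while we were removing write
 * permission and clearing MOD, put write permission back so the caller
 * treats the update as failed and retries. GET_PTE()/CAS_PTE() are assumed
 * to be the atomic PTE read and compare-and-swap helpers.
 */
static void
reenable_write_sketch(x86pte_t *ptep)
{
	x86pte_t found;

	do {
		found = GET_PTE(ptep);
		/* retry the CAS if the PTE changed underneath us */
	} while (CAS_PTE(ptep, found, found | PT_WRITABLE) != found);
}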
/*
* Copy page tables - this is just a little more complicated than the
* previous routines. Note that it's also not atomic! It is also never
* used for VLP pagetables.
*/
void
{
/*
* Acquire access to the CPU pagetable windows for the dest and source.
*/
if (kpm_vbase) {
} else {
/*
* Finish defining the src pagetable mapping
*/
else
}
/*
* now do the copy
*/
}
/*
* Zero page table entries - Note this doesn't use atomic stores!
*/
static void
{
/*
* Map in the page table to be zeroed.
*/
#ifdef __i386
if ((x86_feature & X86_SSE2) == 0)
else
#endif
}
/*
* Called to ensure that all pagetables are in the system dump
*/
void
hat_dump(void)
{
uint_t h;
/*
* Dump all page tables
*/
for (h = 0; h < hat->hat_num_hash; ++h) {
}
}
}
}