htable.c revision aa2ed9e57a76431fbe2ba230496f4fcd22b2d41d
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/x86_archext.h>
#include <sys/bootconf.h>
#include <vm/seg_kmem.h>
extern cpuset_t khat_cpuset;
/*
* The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
* is used in order to facilitate testing of the htable_steal() code.
* By resetting htable_reserve_amount to a lower value, we can force
* stealing to occur. The reserve amount is a guess to get us through boot.
*/
#define HTABLE_RESERVE_AMOUNT (200)
/*
*/
/*
* mutex stuff for access to htable hash
*/
#define NUM_HTABLE_MUTEX 128
/*
* forward declarations
*/
/*
* Address used for kernel page tables. See ptable_alloc() below.
*/
/*
* A counter to track if we are stealing or reaping htables. When non-zero
* htable_free() will directly free htables (either to the reserve or kmem)
* instead of putting them in a hat's htable cache.
*/
/*
* Track the number of active pagetables, so we can know how many to reap
*/
static uint32_t active_ptables = 0;
/*
* Allocate a memory page for a hardware page table.
*
* The pages allocated for page tables are currently gotten in a hacked up
* way. It works for now, but really needs to be fixed up a bit.
*
* During boot: The boot loader controls physical memory allocation via
* boot_alloc(). To avoid conflict with vmem, we just do boot_alloc()s with
* addresses less than kernelbase. These addresses are ignored when we take
* over mappings from the boot loader.
*
* Post-boot: we currently use page_create_va() on the kvp with fake offsets,
* segments and virt address. This is pretty bogus, but was copied from the
* old hat_i86.c code. A better approach would be to have a custom
* page_get_physical() interface that can specify either mnode random or
* mnode local and takes a page from whatever color has the MOST available -
* this would have a minimal impact on page coloring.
*
* For now the htable pointer in ht is only used to compute a unique vnode
* offset for the page.
*/
static void
{
static int first_time = 1;
/*
* Allocating the associated hardware page table is very different
* before boot has finished. We get a physical page from boot
* w/o eating up any kernel address space.
*/
if (use_boot_reserve) {
/*
* Allocate, then demap the ptable_va, so that we're
* sure there exist page table entries for the addresses
*/
if (first_time) {
first_time = 0;
panic("BOP_ALLOC failed");
}
panic("page_resv() failed in ptable alloc");
} else {
/*
* Post boot get a page for the table.
*
* The first check is to see if there is memory in
* the system. If we drop to throttlefree, then fail
* the ptable_alloc() and let the stealing code kick in.
* Note that we have to do this test here, since the test in
* page_create_throttle() would let the NOSLEEP allocation
* go through and deplete the page reserves.
*/
return;
/*
* This code is temporary, so don't review too critically.
* I'm awaiting a new phys page allocator from Kit -- Joe
*
* We need to assign an offset for the page to call
* page_create_va. To avoid conflicts with other pages,
* we get creative with the offset.
* for 32 bits, we pick an offset > 4Gig
* for 64 bits, pick an offset somewhere in the VA hole.
*/
offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
#else
#endif
return;
#ifdef DEBUG
#endif
return;
}
if (pfn == PFN_INVALID)
panic("ptable_alloc(): Invalid PFN!!");
}
/*
* Free an htable's associated page table page. See the comments
* for ptable_alloc().
*/
static void
{
/*
* need to destroy the page used for the pagetable
*/
panic("ptable_free(): no page for pfn!");
/*
* Get an exclusive lock, might have to wait for a kmem reader.
*/
if (!page_tryupgrade(pp)) {
/*
* RFE: we could change this to not loop forever
* George Cameron had some idea on how to do that.
* For now looping works - it's just like sfmmu.
*/
continue;
}
page_unresv(1);
}
/*
* Put one htable on the reserve list.
*/
static void
{
}
/*
* Take one htable from the reserve.
*
* Takes no arguments. The reserve is only consulted when
* htable_reserve_cnt is non-zero; the value of ht is returned to the
* caller in either case.
*
* NOTE(review): presumably ht is NULL when the reserve is empty --
* confirm against the declaration of ht and the callers.
*/
static htable_t *
htable_get_reserve(void)
{
/* Only touch the reserve list when it holds at least one htable. */
if (htable_reserve_cnt != 0) {
}
return (ht);
}
/*
* Allocate initial htables with page tables and put them on the kernel hat's
* cache list.
*/
void
{
while (count > 0) {
panic("ptable_alloc() failed");
--count;
}
}
/*
* Readjust the reserves after a thread finishes using them.
*
* The first time this is called post boot, we'll also clear out the
* extra boot htables that were put in the kernel hat's cache list.
*/
void
{
static int first_time = 1;
/*
* The first time this is called after we can steal, we free up the
* tables that were allocated for boot up.
*/
if (first_time) {
first_time = 0;
}
return;
}
/*
* Free any excess htables in the reserve list
*/
while (htable_reserve_cnt > htable_reserve_amount) {
ht = htable_get_reserve();
return;
}
}
/*
* This routine steals htables from user processes for htable_alloc() or
* for htable_reap().
*/
static htable_t *
{
uint_t h;
uint_t e;
/*
* Limit htable_steal_passes to something reasonable
*/
if (htable_steal_passes == 0)
htable_steal_passes = 1;
/*
* Loop through all hats. The 1st pass takes cached htables that
* aren't in use. The later passes steal by removing mappings, too.
*/
for (;;) {
/*
* move to next hat
*/
do {
break;
}
/*
* Take any htables from the hat's cached "free" list.
*/
++stolen;
}
/*
* Don't steal on first pass.
*/
continue;
/*
* search the active htables for one to steal
*/
++h) {
HTABLE_ENTER(h);
/*
* Can we rule out reaping?
*/
ht->ht_valid_cnt > 0) ||
ht->ht_lock_cnt != 0)
continue;
/*
* Increment busy so the htable can't
* disappear. We drop the htable mutex
* to avoid deadlocks with
* hat_pageunload() and the hment mutex
* while we call hat_pte_unmap()
*/
HTABLE_EXIT(h);
/*
* Try stealing.
* - unload and invalidate all PTEs
*/
e < ht->ht_num_ptes &&
ht->ht_valid_cnt > 0 &&
ht->ht_lock_cnt == 0;
++e, va += MMU_PAGESIZE) {
if (!PTE_ISVALID(pte))
continue;
hat_pte_unmap(ht, e,
}
/*
* Reacquire htable lock. If we didn't
* remove all mappings in the table,
* or another thread added a new mapping
* behind us, give up on this table.
*/
HTABLE_ENTER(h);
ht->ht_valid_cnt != 0 ||
ht->ht_lock_cnt != 0) {
continue;
}
/*
* Steal it and unlink the page table.
*/
/*
* remove from the hash list
*/
} else {
ht);
hat->hat_ht_hash[h] =
}
/*
* Break to outer loop to release the
* higher (ht_parent) pagetable. This
* spreads out the pain caused by
* pagefaults.
*/
++stolen;
/*
* If this is the last steal, then move
* the hat list head, so that we start
* here next time.
*/
}
break;
}
HTABLE_EXIT(h);
}
}
}
return (list);
}
/*
* This is invoked from kmem when the system is low on memory. We try
* to free hments, htables, and ptables to improve the memory situation.
*
* The handle argument is not referenced in the body (hence ARGSUSED).
* Nothing is returned.
*/
/*ARGSUSED*/
static void
htable_reap(void *handle)
{
/*
* Reaping is done by stealing htables, so there is nothing to do
* until can_steal_post_boot is set.
*/
if (!can_steal_post_boot)
return;
/*
* Try to reap 5% of the page tables bounded by a maximum of
* 5% of physmem and a minimum of 10.
*/
/*
* Let htable_steal() do the work, we just call htable_free()
*/
}
/*
* Free up excess reserves
*/
}
/*
* allocate an htable, stealing one or using the reserve if necessary
*/
static htable_t *
{
is_bare = 1;
/*
* First reuse a cached htable from the hat_ht_cached field, this is
* what happens during use_boot_reserve.
*/
need_to_zero = 0;
/* XX64 ASSERT() they're all zero somehow */
}
}
/*
* When allocating for hat_memload_arena, we use the reserve.
* Also use reserves if we are in a panic().
*/
ht = htable_get_reserve();
} else {
/*
* Donate successful htable allocations to the reserve.
*/
for (;;) {
break;
if (curthread == hat_reserves_thread ||
break;
}
}
/*
* allocate a page for the hardware page table if needed
*/
}
}
}
/*
* if allocations failed resort to stealing
*/
/*
* if we had to steal for a bare htable, release the
* page for the pagetable
*/
}
/*
* All attempts to allocate or steal failed...
*/
panic("htable_alloc(): couldn't steal\n");
/*
* Shared page tables have all entries locked and entries may not
* be added or deleted.
*/
ht->ht_lock_cnt = 0;
need_to_zero = 0;
} else {
ht->ht_lock_cnt = 0;
ht->ht_valid_cnt = 0;
}
/*
* setup flags, etc. for VLP htables
*/
if (is_vlp) {
need_to_zero = 0;
} else {
}
/*
* fill in the htable
*/
/*
* Zero out any freshly allocated page table
*/
if (need_to_zero)
return (ht);
}
/*
* Free up an htable, either to a hat's cached list, the reserves or
* back to kmem.
*/
static void
{
/*
* If the process isn't exiting, cache the free htable in the hat
* structure. We always do this for the boot reserve. We don't
* do this while htables are being stolen or reaped.
*/
(use_boot_reserve ||
return;
}
/*
* If we have a hardware page table, free it.
* We don't free page tables that are shared from someone else.
*/
}
/*
* If we are the thread using the reserves, put free htables
* into reserves.
*/
if (curthread == hat_reserves_thread ||
else
}
/*
* This is called when a hat is being destroyed or swapped out. We reap all
* the remaining htables in the hat cache. If destroying, all left over
* htables are also destroyed.
*
* We also don't need to invalidate any of the PTPs nor do any demapping.
*/
void
{
int h;
/*
* Purge the htable cache if just reaping.
*/
for (;;) {
break;
}
}
return;
}
/*
* if freeing, no locking is needed
*/
}
/*
* walk thru the htable hash table and free all the htables in it.
*/
for (h = 0; h < hat->hat_num_hash; ++h) {
} else {
}
}
}
}
/*
* Unlink an entry for a table at vaddr and level out of the existing table
* one level higher. We always hold the HTABLE_ENTER() mutex when doing this.
*/
static void
{
}
/*
* Link an entry for a new table at vaddr and level into the existing table
* one level higher. We always hold the HTABLE_ENTER() mutex when doing this.
*/
static void
{
if (found != 0)
}
/*
* Release of an htable.
*
* During process exit, some empty page tables are not unlinked - hat_free_end()
* cleans them up. Upper level pagetables (mmu.max_page_level and higher) are
* only released during hat_free_end() or by htable_steal(). We always
* release SHARED page tables.
*/
void
{
for (;;) {
/*
* The common case is that this isn't the last use of
* an htable so we don't want to free the htable.
*/
if (ht->ht_valid_cnt > 0)
break;
break;
/*
* we always release empty shared htables
*/
/*
* don't release if in address space tear down
*/
break;
/*
* At and above max_page_level, free if it's for
* a boot-time kernel mapping below kernelbase.
*/
break;
}
/*
* remember if we destroy an htable that shares its PFN
* from elsewhere
*/
}
/*
* Handle release of a table and freeing the htable_t.
* Unlink it from the table higher (ie. ht_parent).
*/
/*
* Unlink the pagetable.
*/
/*
* When any top level VLP page table entry changes, we
* must issue a reload of cr3 on all processors.
*/
/*
* remove this htable from its hash list
*/
} else {
}
}
/*
* If we released a shared htable, do a release on the htable
* from which it shared
*/
}
}
/*
* Find the htable for the pagetable at the given level for the given address.
* If found acquires a hold that eventually needs to be htable_release()d
*/
htable_t *
{
base = 0;
else
break;
}
if (ht)
return (ht);
}
/*
* Acquires a hold on a known htable (from a locked hment entry).
*/
void
{
#ifdef DEBUG
/*
* make sure the htable is there
*/
{
htable_t *h;
h && h != ht;
h = h->ht_next)
;
}
#endif /* DEBUG */
}
/*
* Find the htable for the pagetable at the given level for the given address.
* If found acquires a hold that eventually needs to be htable_release()d
* If not found the table is created.
*
* Since we can't hold a hash table mutex during allocation, we have to
* drop it and redo the search on a create. Then we may have to free the newly
* allocated htable if another thread raced in and created it ahead of us.
*/
htable_t *
{
uint_t h;
level_t l;
/*
* Create the page tables in top down order.
*/
base = 0;
else
/*
* look up the htable at this level
*/
HTABLE_ENTER(h);
} else {
break;
}
}
/*
* if we found the htable, increment its busy cnt
* and if we had allocated a new htable, free it.
*/
/*
* If we find a pre-existing shared table, it must
* share from the same place.
*/
panic("htable shared from wrong place "
}
HTABLE_EXIT(h);
if (new)
/*
* if we didn't find it on the first search
* allocate a new one and search again
*/
HTABLE_EXIT(h);
goto try_again;
/*
* 2nd search and still not there, use "new" table
* Link new table into higher, when not at top level.
*/
} else {
/*
* When any top level VLP page table changes,
* we must reload cr3 on all processors.
*/
#ifdef __i386
#else /* !__i386 */
#endif /* __i386 */
l == VLP_LEVEL - 1)
}
if (hat->hat_ht_hash[h])
HTABLE_EXIT(h);
/*
* Note we don't do htable_release(higher).
* That happens recursively when "new" is removed by
* htable_release() or htable_steal().
*/
/*
* If we just created a new shared page table we
* increment the shared htable's busy count, so that
* it can't be the victim of a steal even if it's empty.
*/
}
}
}
return (ht);
}
/*
* Walk through a given htable looking for the first valid entry. This
* routine takes both a starting and ending address. The starting address
* is required to be within the htable provided by the caller, but there is
* no such restriction on the ending address.
*
* If the routine finds a valid entry in the htable (at or beyond the
* starting address), the PTE (and its address) will be returned.
* This PTE may correspond to either a page or a pagetable - it is the
* caller's responsibility to determine which. If no valid entry is
* found, 0 (an invalid PTE) and the next unexamined address will be
* returned.
*
* The loop has been carefully coded for optimization.
*/
static x86pte_t
{
uint_t e;
char *pte_ptr;
char *end_pte_ptr;
/*
* Compute the starting index and ending virtual address
*/
/*
* The following page table scan code knows that the valid
* bit of a PTE is in the lowest byte AND that x86 is little endian!!
*/
while (*pte_ptr == 0) {
break;
if (pte_ptr == end_pte_ptr)
break;
}
/*
* if we found a valid PTE, load the entire PTE
*/
} else {
}
}
#if defined(__amd64)
/*
* deal with VA hole on amd64
*/
#endif /* __amd64 */
return (found_pte);
}
/*
* Find the address and htable for the first populated translation at or
* above the given virtual address. The caller may also specify an upper
* limit to the address range to search. Uses level information to quickly
* skip unpopulated sections of virtual address spaces.
*
* If not found returns NULL. When found, returns the htable and virt addr
* and has a hold on the htable.
*/
{
level_t l;
/*
* If this is a user address, then we know we need not look beyond
* kernelbase.
*/
eaddr == HTABLE_WALK_TO_END);
eaddr = kernelbase;
/*
* If we're coming in with a previous page table, search it first
* without doing an htable_lookup(), this should be frequent.
*/
if (prev) {
if (PTE_ISPAGE(pte, l)) {
return (pte);
}
}
/*
* We found nothing in the htable provided by the caller,
* so fall through and do the full search
*/
}
/*
* Find the level of the largest pagesize used by this HAT.
*/
max_mapped_level = 0;
if (hat->hat_pages_mapped[l] != 0)
max_mapped_level = l;
/*
* Find lowest table with any entry for given address.
*/
if (PTE_ISPAGE(pte, l)) {
return (pte);
}
break;
}
/*
* The ht is never NULL at the top level since
* the top level htable is created in hat_alloc().
*/
/*
* No htable covers the address. If there is no
* larger page size that could cover it, we
* skip to the start of the next page table.
*/
if (l >= max_mapped_level) {
break;
}
}
}
*vaddr = 0;
return (0);
}
/*
* Find the htable and page table entry index of the given virtual address
* with pagesize at or below given level.
* If not found returns NULL. When found, returns the htable, sets
* entry, and has a hold on the htable.
*/
htable_t *
{
level_t l;
uint_t e;
for (l = 0; l <= level; ++l) {
continue;
*entry = e;
return (ht);
}
return (NULL);
}
/*
* Find the htable and page table entry index of the given virtual address.
* There must be a valid page mapped at the given address.
* If not found returns NULL. When found, returns the htable, sets
* entry, and has a hold on the htable.
*/
htable_t *
{
uint_t e;
return (NULL);
if (entry)
*entry = e;
return (ht);
return (NULL);
}
void
{
/*
* To save on kernel VA usage, we avoid debug information in 32 bit
* kernels.
*/
#if defined(__amd64)
int kmem_flags = KMC_NOHASH;
#endif
/*
* initialize kmem caches
*/
}
/*
* get the pte index for the virtual address in the given htable's pagetable
*/
{
}
/*
* Given an htable and the index of a pte in it, return the virtual address
* of the page.
*/
{
/*
* Need to skip over any VA hole in top level table
*/
#if defined(__amd64)
#endif
return (va);
}
/*
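* PTE accesses in the x86pte_XXX routines below are written to
* avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems.
* Aligned loads/stores of the native word size
* will naturally be atomic.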
*
* The combination of using kpreempt_disable()/_enable() and the hci_mutex
* are used to ensure that an interrupt won't overwrite a temporary mapping
* while it's in use. If an interrupt thread tries to access a PTE, it will
* yield briefly back to the pinned thread which holds the cpu's hci_mutex.
*/
/*
* Initialize a CPU private window for mapping page tables.
* There will be 3 total pages of addressing needed:
*
* 1 for r/w access to pagetables
* 1 for r access when copying pagetables (hat_alloc)
* 1 that will map the PTEs for the 1st 2, so we can access them quickly
*
* We use vmem_xalloc() to get a correct alignment so that only one
* hat_mempte_setup() is needed.
*/
void
{
struct hat_cpu_info *hci;
/*
* We can't use kmem_alloc/vmem_alloc for the 1st CPU, as this is
* called before we've activated our own HAT
*/
} else {
}
/*
* If we are using segkpm, then there is no need for any of the
* mempte support. We can access the desired memory through a kpm
* mapping rather than setting up a temporary mempte mapping.
*/
if (kpm_enable == 0) {
}
}
/*
* Macro to establish temporary mappings for x86pte_XXX routines.
*/
x86pte_t t; \
\
else \
}
/*
* Disable preemption and establish a mapping to the pagetable with the
* given pfn. This is optimized for the case where it's the same
* pfn as we last referenced from this CPU.
*/
static x86pte_t *
{
struct hat_cpu_info *hci;
/*
* VLP pagetables are contained in the hat_t
*/
/*
* During early boot, use hat_boot_remap() of a page table address.
*/
if (kpm_enable)
if (!khat_running) {
}
/*
* Normally, disable preemption and grab the CPU's hci_mutex
*/
/*
* The current mapping doesn't already point to this page.
* Update the CPU specific pagetable mapping to map the pfn.
*/
PT_WRITABLE, pfn);
}
return (hci->hci_pagetable_va);
}
/*
* Release access to a page table.
*/
static void
{
struct hat_cpu_info *hci;
if (kpm_enable)
return;
/*
* nothing to do for VLP htables
*/
return;
/*
* During boot-up hat_kern_setup(), erase the boot loader remapping.
*/
if (!khat_running) {
return;
}
/*
* Normal Operation: drop the CPU's hci_mutex and restore preemption
*/
}
/*
* Atomic retrieval of a pagetable entry
*/
{
/*
* Be careful that loading PAE entries in 32 bit kernel is atomic.
*/
} else {
}
return (pte);
}
/*
* Atomic unconditional set of a page table entry, it returns the previous
* value.
*/
{
} else {
}
for (;;) {
break;
}
break;
}
} else {
for (;;) {
break;
}
break;
}
}
return (old);
}
/*
* Atomic compare and swap of a page table entry.
*/
static x86pte_t
{
} else {
}
return (pte);
}
/*
* data structure for cross call information
*/
typedef struct xcall_info {
} xcall_info_t;
/*
* Cross call service function to atomically invalidate a PTE and flush TLBs
*/
/*ARGSUSED*/
static int
{
/*
* Only the initiating cpu invalidates the page table entry.
* It returns the previous PTE value to the caller.
*/
for (;;) {
break;
break;
}
} else {
for (;;) {
break;
break;
}
}
}
/*
* For a normal address, we just flush one page mapping
* Otherwise reload cr3 to effect a complete TLB flush.
*
* Note we don't reload VLP pte's -- this assumes we never have a
* large page size at VLP_LEVEL for VLP processes.
*/
} else {
reload_cr3();
}
return (0);
}
/*
* Cross call service function to atomically change a PTE and flush TLBs
*/
/*ARGSUSED*/
static int
{
/*
* Only the initiating cpu changes the page table entry.
* It returns the previous PTE value to the caller.
*/
} else {
}
}
/*
* Flush the TLB entry
*/
else
reload_cr3();
return (0);
}
/*
* Use cross calls to change a page table entry and invalidate TLBs.
*/
void
{
/*
* Given the current implementation of hat_share(), doing a
* hat_pageunload() on a shared page table requires invalidating
* all user TLB entries on all CPUs.
*/
}
/*
* Use a cross call to do the invalidations.
* Note the current CPU always has to be in the cross call CPU set.
*/
} else {
}
/*
* Use a cross call to modify the page table entry and invalidate TLBs.
* If we're panic'ing, don't bother with the cross call.
* Note the panicstr check isn't bullet proof and the panic system
* ought to be made tighter.
*/
else
}
/*
* Invalidate a page table entry if it currently maps the given pfn.
* This returns the previous value of the PTE.
*/
{
} else {
}
/*
* Fill in the structure used by the cross call function to do the
* invalidation.
*/
}
/*
* update a PTE and invalidate any stale TLB entries.
*/
{
/*
* Fill in the structure used by the cross call function to do the
* invalidation.
*/
}
/*
* Copy page tables - this is just a little more complicated than the
* previous routines. Note that it's also not atomic! It also is never
* used for VLP pagetables.
*/
void
{
struct hat_cpu_info *hci;
/*
* Acquire access to the CPU pagetable window for the destination.
*/
if (kpm_enable) {
} else {
/*
* Finish defining the src pagetable mapping
*/
}
/*
* now do the copy
*/
}
/*
* Zero page table entries - Note this doesn't use atomic stores!
*/
void
{
x86pte_t *p;
extern void hat_pte_zero(void *, size_t);
/*
* Map in the page table to be zeroed.
*/
if (x86_feature & X86_SSE2) {
} else if (khat_running) {
} else {
/*
* Can't just use bzero during boot because it checks the
* address against kernelbase. Instead just use a zero loop.
*/
while (count-- > 0)
*p++ = 0;
} else {
while (count-- > 0)
*p32++ = 0;
}
}
}
/*
* Called to ensure that all pagetables are in the system dump
*
* Takes no arguments and returns nothing.
*/
void
hat_dump(void)
{
uint_t h;	/* index into a hat's htable hash buckets */
/*
* NOTE(review): the use of count is not visible here; presumably it
* tracks how many times kas.a_hat->hat_next has been passed, per the
* comment below -- confirm.
*/
int count;
/*
* kas.a_hat is the head of the circular list, but not an element of
* the list. Once we pass kas.a_hat->hat_next a second time, we
* know we've iterated through every hat structure.
*/
/* Visit every bucket of this hat's htable hash. */
for (h = 0; h < hat->hat_num_hash; ++h) {
}
}
}
}
}