i86pc/vm/htable.h

	htable.h revision a6a74e0e62d62ff750cd4b790be5eacc99c3bb8c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2014 by Delphix. All rights reserved.
 */

#ifndef _VM_HTABLE_H
#define _VM_HTABLE_H

#ifdef  __cplusplus
extern "C" {
#endif

#if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL)
#include <asm/htable.h>
#endif

extern void atomic_andb(uint8_t *addr, uint8_t value);
extern void atomic_orb(uint8_t *addr, uint8_t value);
extern void atomic_inc16(uint16_t *addr);
extern void atomic_dec16(uint16_t *addr);
extern void mmu_tlbflush_entry(caddr_t addr);

/*
 * Each hardware page table has an htable_t describing it.
 *
 * We use a reference counter mechanism to detect when we can free an htable.
 * In the implmentation the reference count is split into 2 separate counters:
 *
 *  ht_busy is a traditional reference count of uses of the htable pointer
 *
 *  ht_valid_cnt is a count of how references are implied by valid PTE/PTP
 *           entries in the pagetable
 *
 * ht_busy is only incremented by htable_lookup() or htable_create()
 * while holding the appropriate hash_table mutex. While installing a new
 * valid PTE or PTP, in order to increment ht_valid_cnt a thread must have
 * done an htable_lookup() or htable_create() but not the htable_release yet.
 *
 * htable_release(), while holding the mutex, can know that if
 * busy == 1 and valid_cnt == 0, the htable can be free'd.
 *
 * The fields have been ordered to make htable_lookup() fast. Hence,
 * ht_hat, ht_vaddr, ht_level and ht_next need to be clustered together.
 */
struct htable {
    struct htable   *ht_next;   /* forward link for hash table */
    struct hat  *ht_hat;    /* hat this mapping comes from */
    uintptr_t   ht_vaddr;   /* virt addr at start of this table */
    int8_t      ht_level;   /* page table level: 0=4K, 1=2M, ... */
    uint8_t     ht_flags;   /* see below */
    int16_t     ht_busy;    /* implements locking protocol */
    int16_t     ht_valid_cnt;   /* # of valid entries in this table */
    uint32_t    ht_lock_cnt;    /* # of locked entries in this table */
                    /* never used for kernel hat */
    pfn_t       ht_pfn;     /* pfn of page of the pagetable */
    struct htable   *ht_prev;   /* backward link for hash table */
    struct htable   *ht_parent; /* htable that points to this htable */
    struct htable   *ht_shares; /* for HTABLE_SHARED_PFN only */
};
typedef struct htable htable_t;

/*
 * Flags values for htable ht_flags field:
 *
 * HTABLE_VLP - this is the top level htable of a VLP HAT.
 *
 * HTABLE_SHARED_PFN - this htable had its PFN assigned from sharing another
 *  htable. Used by hat_share() for ISM.
 */
#define HTABLE_VLP      (0x01)
#define HTABLE_SHARED_PFN   (0x02)

/*
 * The htable hash table hashing function.  The 28 is so that high
 * order bits are include in the hash index to skew the wrap
 * around of addresses. Even though the hash buckets are stored per
 * hat we include the value of hat pointer in the hash function so
 * that the secondary hash for the htable mutex winds up begin different in
 * every address space.
 */
#define HTABLE_HASH(hat, va, lvl)                   \
    ((((va) >> LEVEL_SHIFT(1)) + ((va) >> 28) + (lvl) +     \
    ((uintptr_t)(hat) >> 4)) & ((hat)->hat_num_hash - 1))

/*
 * Each CPU gets a unique hat_cpu_info structure in cpu_hat_info.
 */
struct hat_cpu_info {
    kmutex_t hci_mutex;     /* mutex to ensure sequential usage */
#if defined(__amd64)
    pfn_t   hci_vlp_pfn;        /* pfn of hci_vlp_l3ptes */
    x86pte_t *hci_vlp_l3ptes;   /* VLP Level==3 pagetable (top) */
    x86pte_t *hci_vlp_l2ptes;   /* VLP Level==2 pagetable */
#endif  /* __amd64 */
};


/*
 * Compute the last page aligned VA mapped by an htable.
 *
 * Given a va and a level, compute the virtual address of the start of the
 * next page at that level.
 *
 * XX64 - The check for the VA hole needs to be better generalized.
 */
#if defined(__amd64)
#define HTABLE_NUM_PTES(ht) (((ht)->ht_flags & HTABLE_VLP) ? 4 : 512)

#define HTABLE_LAST_PAGE(ht)                        \
    ((ht)->ht_level == mmu.max_level ? ((uintptr_t)0UL - MMU_PAGESIZE) :\
    ((ht)->ht_vaddr - MMU_PAGESIZE +                \
    ((uintptr_t)HTABLE_NUM_PTES(ht) << LEVEL_SHIFT((ht)->ht_level))))

#define NEXT_ENTRY_VA(va, l)    \
    ((va & LEVEL_MASK(l)) + LEVEL_SIZE(l) == mmu.hole_start ?   \
    mmu.hole_end : (va & LEVEL_MASK(l)) + LEVEL_SIZE(l))

#elif defined(__i386)

#define HTABLE_NUM_PTES(ht) \
    (!mmu.pae_hat ? 1024 : ((ht)->ht_level == 2 ? 4 : 512))

#define HTABLE_LAST_PAGE(ht)    ((ht)->ht_vaddr - MMU_PAGESIZE + \
    ((uintptr_t)HTABLE_NUM_PTES(ht) << LEVEL_SHIFT((ht)->ht_level)))

#define NEXT_ENTRY_VA(va, l) ((va & LEVEL_MASK(l)) + LEVEL_SIZE(l))

#endif

#if defined(_KERNEL)

/*
 * initialization function called from hat_init()
 */
extern void htable_init(void);

/*
 * Functions to lookup, or "lookup and create", the htable corresponding
 * to the virtual address "vaddr"  in the "hat" at the given "level" of
 * page tables. htable_lookup() may return NULL if no such entry exists.
 *
 * On return the given htable is marked busy (a shared lock) - this prevents
 * the htable from being stolen or freed) until htable_release() is called.
 *
 * If kalloc_flag is set on an htable_create() we can't call kmem allocation
 * routines for this htable, since it's for the kernel hat itself.
 *
 * htable_acquire() is used when an htable pointer has been extracted from
 * an hment and we need to get a reference to the htable.
 */
extern htable_t *htable_lookup(struct hat *hat, uintptr_t vaddr, level_t level);
extern htable_t *htable_create(struct hat *hat, uintptr_t vaddr, level_t level,
    htable_t *shared);
extern void htable_acquire(htable_t *);

extern void htable_release(htable_t *ht);
extern void htable_destroy(htable_t *ht);

/*
 * Code to free all remaining htables for a hat. Called after the hat is no
 * longer in use by any thread.
 */
extern void htable_purge_hat(struct hat *hat);

/*
 * Find the htable, page table entry index, and PTE of the given virtual
 * address.  If not found returns NULL. When found, returns the htable_t *,
 * sets entry, and has a hold on the htable.
 */
extern htable_t *htable_getpte(struct hat *, uintptr_t, uint_t *, x86pte_t *,
    level_t);

/*
 * Similar to hat_getpte(), except that this only succeeds if a valid
 * page mapping is present.
 */
extern htable_t *htable_getpage(struct hat *hat, uintptr_t va, uint_t *entry);

/*
 * Called to allocate initial/additional htables for reserve.
 */
extern void htable_initial_reserve(uint_t);
extern void htable_reserve(uint_t);

/*
 * Used to readjust the htable reserve after the reserve list has been used.
 * Also called after boot to release left over boot reserves.
 */
extern void htable_adjust_reserve(void);

/*
 * return number of bytes mapped by all the htables in a given hat
 */
extern size_t htable_mapped(struct hat *);


/*
 * Attach initial pagetables as htables
 */
extern void htable_attach(struct hat *, uintptr_t, level_t, struct htable *,
    pfn_t);

/*
 * Routine to find the next populated htable at or above a given virtual
 * address. Can specify an upper limit, or HTABLE_WALK_TO_END to indicate
 * that it should search the entire address space.  Similar to
 * hat_getpte(), but used for walking through address ranges. It can be
 * used like this:
 *
 *  va = ...
 *  ht = NULL;
 *  while (va < end_va) {
 *      pte = htable_walk(hat, &ht, &va, end_va);
 *      if (!pte)
 *          break;
 *
 *      ... code to operate on page at va ...
 *
 *      va += LEVEL_SIZE(ht->ht_level);
 *  }
 *  if (ht)
 *      htable_release(ht);
 *
 */
extern x86pte_t htable_walk(struct hat *hat, htable_t **ht, uintptr_t *va,
    uintptr_t eaddr);

#define HTABLE_WALK_TO_END ((uintptr_t)-1)

/*
 * Utilities convert between virtual addresses and page table entry indeces.
 */
extern uint_t htable_va2entry(uintptr_t va, htable_t *ht);
extern uintptr_t htable_e2va(htable_t *ht, uint_t entry);

/*
 * Interfaces that provide access to page table entries via the htable.
 *
 * Note that all accesses except x86pte_copy() and x86pte_zero() are atomic.
 */
extern void x86pte_cpu_init(cpu_t *);
extern void x86pte_cpu_fini(cpu_t *);

extern x86pte_t x86pte_get(htable_t *, uint_t entry);

/*
 * x86pte_set returns LPAGE_ERROR if it's asked to overwrite a page table
 * link with a large page mapping.
 */
#define LPAGE_ERROR (-(x86pte_t)1)
extern x86pte_t x86pte_set(htable_t *, uint_t entry, x86pte_t new, void *);

extern x86pte_t x86pte_inval(htable_t *ht, uint_t entry,
    x86pte_t old, x86pte_t *ptr, boolean_t tlb);

extern x86pte_t x86pte_update(htable_t *ht, uint_t entry,
    x86pte_t old, x86pte_t new);

extern void x86pte_copy(htable_t *src, htable_t *dest, uint_t entry,
    uint_t cnt);

/*
 * access to a pagetable knowing only the pfn
 */
extern x86pte_t *x86pte_mapin(pfn_t, uint_t, htable_t *);
extern void x86pte_mapout(void);

/*
 * these are actually inlines for "lock; incw", "lock; decw", etc. instructions.
 */
#define HTABLE_INC(x)   atomic_inc16((uint16_t *)&x)
#define HTABLE_DEC(x)   atomic_dec16((uint16_t *)&x)
#define HTABLE_LOCK_INC(ht) atomic_inc_32(&(ht)->ht_lock_cnt)
#define HTABLE_LOCK_DEC(ht) atomic_dec_32(&(ht)->ht_lock_cnt)

#ifdef __xpv
extern void xen_flush_va(caddr_t va);
extern void xen_gflush_va(caddr_t va, cpuset_t);
extern void xen_flush_tlb(void);
extern void xen_gflush_tlb(cpuset_t);
extern void xen_pin(pfn_t, level_t);
extern void xen_unpin(pfn_t);
extern int xen_kpm_page(pfn_t, uint_t);

/*
 * The hypervisor maps all page tables into our address space read-only.
 * Under normal circumstances, the hypervisor then handles all updates to
 * the page tables underneath the covers for us.  However, when we are
 * trying to dump core after a hypervisor panic, the hypervisor is no
 * longer available to do these updates.  To work around the protection
 * problem, we simply disable write-protect checking for the duration of a
 * pagetable update operation.
 */
#define XPV_ALLOW_PAGETABLE_UPDATES()                   \
    {                               \
        if (IN_XPV_PANIC())                 \
            setcr0((getcr0() & ~CR0_WP) & 0xffffffff);  \
    }
#define XPV_DISALLOW_PAGETABLE_UPDATES()                \
    {                               \
        if (IN_XPV_PANIC() > 0)                 \
            setcr0((getcr0() | CR0_WP) & 0xffffffff);   \
    }

#else /* __xpv */

#define XPV_ALLOW_PAGETABLE_UPDATES()
#define XPV_DISALLOW_PAGETABLE_UPDATES()

#endif

#endif  /* _KERNEL */


#ifdef  __cplusplus
}
#endif

#endif  /* _VM_HTABLE_H */