/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/cpu_module.h>
#include <vm/hat_sfmmu.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>
#include <sys/machsystm.h>
#include <sys/machasi.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/archsystm.h>
#include <sys/trapstat.h>
#ifdef sun4v
#include <sys/hypervisor_api.h>
#endif
/* BEGIN CSTYLED */
/*
* trapstat: Trap Statistics through Dynamic Trap Table Interposition
* -------------------------------------------------------------------
*
* Motivation and Overview
*
* Despite being a fundamental indicator of system behavior, there has
* historically been very little insight provided into the frequency and cost
* of machine-specific traps. The lack of insight has been especially acute
* on UltraSPARC microprocessors: because these microprocessors handle TLB
* misses as software traps, the frequency and duration of traps play a
* decisive role in the performance of the memory system. As applications
* have outstripped TLB reach, this has become increasingly true.
*
* Part of the difficulty of observing trap behavior is that the trap handlers
* are so frequently called (e.g. millions of times per second) that any
* permanently enabled instrumentation would induce an unacceptable performance
* degradation. Thus, it is a constraint on any trap observability
* infrastructure that it have no probe effect when not explicitly enabled.
*
* The basic idea, then, is to create an interposing trap table in which each
* entry increments a per-trap, in-memory counter and then jumps to the actual,
* underlying trap table entry. To enable trapstat, we atomically write to the
* trap base address (%tba) register to point to our interposing trap table.
* (Note that per-CPU statistics fall out by creating a different trap table
* for each CPU.)
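*
* To make this concrete, each enabled entry in the interposing trap table
* is assembled along these lines (the actual encodings are constructed from
* whole cloth in trapstat_make_traptab(), below):
*
*	sethi	%hi(counter), %g1	! locate this trap's counter
*	ldx	[%g1 + %lo(counter)], %g2
*	add	%g2, 1, %g2		! increment it
*	stx	%g2, [%g1 + %lo(counter)]
*	ba,a	actual_entry		! on to the underlying handler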
*
* Implementation Details
*
* While the idea is straight-forward, a nuance of SPARC V9 slightly
* complicates the implementation. Unlike its predecessors, SPARC V9 supports
* the notion of nested traps. The trap level is kept in the TL register:
* during normal operation it is 0; when a trap is taken, the TL register is
* incremented by 1. To aid system software, SPARC V9 breaks the trap table
* into two halves: the lower half contains the trap handlers for traps taken
* when TL is 0; the upper half contains the trap handlers for traps taken
* when TL is greater than 0. Each half is further subdivided into two
* subsequent halves: the lower half contains the trap handlers for traps
* other than those induced by the trap instruction (Tcc variants); the upper
* half contains the trap handlers for traps induced by the trap instruction.
* This gives a total of four ranges, with each range containing 256 traps:
*
* +--------------------------------+- 3ff
* | | .
* | Trap instruction, TL>0 | .
* | | .
* |- - - - - - - - - - - - - - - - +- 300
* |- - - - - - - - - - - - - - - - +- 2ff
* | | .
* | Non-trap instruction, TL>0 | .
* | | .
* |- - - - - - - - - - - - - - - - +- 200
* |- - - - - - - - - - - - - - - - +- 1ff
* | | .
* | Trap instruction, TL=0 | .
* | | .
* |- - - - - - - - - - - - - - - - +- 100
* |- - - - - - - - - - - - - - - - +- 0ff
* | | .
* | Non-trap instruction, TL=0 | .
* | | .
* +--------------------------------+- 000
*
*
* Solaris, however, doesn't have reason to support trap instructions when
* TL>0 (only privileged code may execute at TL>0; not supporting this only
* constrains our own implementation). The trap table actually looks like:
*
* +--------------------------------+- 2ff
* | | .
* | Non-trap instruction, TL>0 | .
* | | .
* |- - - - - - - - - - - - - - - - +- 200
* |- - - - - - - - - - - - - - - - +- 1ff
* | | .
* | Trap instruction, TL=0 | .
* | | .
* |- - - - - - - - - - - - - - - - +- 100
* |- - - - - - - - - - - - - - - - +- 0ff
* | | .
* | Non-trap instruction, TL=0 | .
* | | .
* +--------------------------------+- 000
*
* Putatively to aid system software, SPARC V9 has the notion of multiple
* sets of global registers. UltraSPARC defines four sets of global
* registers:
*
* Normal Globals
* Alternate Globals (AGs)
* MMU Globals (MGs)
* Interrupt Globals (IGs)
*
* The set of globals in use is controlled by bits in PSTATE; when TL is 0
* (and PSTATE has not been otherwise explicitly modified), the Normal Globals
* are in use. When a trap is issued, PSTATE is modified to point to a set of
* globals corresponding to the trap type. Most traps correspond to the
* Alternate Globals, with a minority corresponding to the MMU Globals, and
* only the interrupt-vector trap (vector 0x60) corresponding to the Interrupt
* Globals. (The complete mapping can be found in the UltraSPARC I&II User's
* Manual.)
*
* Note that the sets of globals are per trap _type_, not per trap _level_.
* Thus, when executing a TL>0 trap handler, one may not have registers
* available (for example, both trap-instruction traps and spill traps execute
* on the alternate globals; if a trap-instruction trap induces a window spill,
* the window spill handler has no available globals). For trapstat, this is
* problematic: a register is required to transfer control from one arbitrary
* location (in the interposing trap table) to another (in the actual trap
* table).
*
* We solve this problem by exploiting the trap table's location at the bottom
* of valid kernel memory (i.e. at KERNELBASE). We locate the interposing trap
* tables just below KERNELBASE -- thereby allowing us to use a branch-always
* instruction (ba) instead of a jump instruction (jmp) to transfer control
* from the TL>0 entries in the interposing trap table to the TL>0 entries in
* the actual trap table. (N.B. while this allows trap table interposition to
* work, it necessarily limits trapstat to only recording information about
* TL=0 traps -- there is no way to increment a counter without using a
* register.) Diagrammatically:
*
* Actual trap table:
*
* +--------------------------------+- 2ff
* | | .
* | Non-trap instruction, TL>0 | . <-----------------------+
* | | . <-----------------------|-+
* |- - - - - - - - - - - - - - - - +- 200 <-----------------------|-|-+
* |- - - - - - - - - - - - - - - - +- 1ff | | |
* | | . | | |
* | Trap instruction, TL=0 | . <-----------------+ | | |
* | | . <-----------------|-+ | | |
* |- - - - - - - - - - - - - - - - +- 100 <-----------------|-|-+ | | |
* |- - - - - - - - - - - - - - - - +- 0ff | | | | | |
* | | . | | | | | |
* | Non-trap instruction, TL=0 | . <-----------+ | | | | | |
* | | . <-----------|-+ | | | | | |
* +--------------------------------+- 000 <-----------|-|-+ | | | | | |
* KERNELBASE | | | | | | | | |
* | | | | | | | | |
* | | | | | | | | |
* Interposing trap table: | | | | | | | | |
* | | | | | | | | |
* +--------------------------------+- 2ff | | | | | | | | |
* | ... | . | | | | | | | | |
* | ... | . | | | | | | | | |
* | ... | . | | | | | | | | |
* |- - - - - - - - - - - - - - - - +- 203 | | | | | | | | |
* | ba,a | -------------|-|-|-|-|-|-+ | |
* |- - - - - - - - - - - - - - - - +- 202 | | | | | | | |
* | ba,a | -------------|-|-|-|-|-|---+ |
* |- - - - - - - - - - - - - - - - +- 201 | | | | | | |
* | ba,a | -------------|-|-|-|-|-|-----+
* |- - - - - - - - - - - - - - - - +- 200 | | | | | |
* | ... | . | | | | | |
* | ... | . | | | | | |
* | ... | . | | | | | |
* |- - - - - - - - - - - - - - - - +- 103 | | | | | |
* | (Increment counter) | | | | | | |
* | ba,a | -------------------+ | |
* |- - - - - - - - - - - - - - - - +- 102 | | | | |
* | (Increment counter) | | | | | |
* | ba,a | ---------------------+ |
* |- - - - - - - - - - - - - - - - +- 101 | | | |
* | (Increment counter) | | | | |
* | ba,a | -----------------------+
* |- - - - - - - - - - - - - - - - +- 100 | | |
* | ... | . | | |
* | ... | . | | |
* | ... | . | | |
* |- - - - - - - - - - - - - - - - +- 003 | | |
* | (Increment counter) | | | |
* | ba,a | -------------+ | |
* |- - - - - - - - - - - - - - - - +- 002 | |
* | (Increment counter) | | |
* | ba,a | ---------------+ |
* |- - - - - - - - - - - - - - - - +- 001 |
* | (Increment counter) | |
* | ba,a | -----------------+
* +--------------------------------+- 000
* KERNELBASE - tstat_total_size
*
* tstat_total_size is the total size of each trap table, in bytes (the sum
* of its instruction and data pages). Because an interposing entry at
* KERNELBASE - tstat_total_size must be able to branch to the actual trap
* table at KERNELBASE, tstat_total_size must be less than the maximum
* branch displacement; if each CPU were to consume a disjoint virtual range
* below KERNELBASE for its trap table, we could support at most
* (maximum_branch_displacement / tstat_total_size) CPUs. The maximum branch
* displacement for Bicc variants is just under eight megabytes, and (because
* the %tba must be 32K aligned) tstat_total_size must be at least 32K; if
* each CPU were to consume a disjoint virtual range, we would have an
* unacceptably low upper bound of 256 CPUs.
*
* While there are tricks that one could use to address this constraint (e.g.,
* creating trampolines every maximum_branch_displacement bytes), we instead
* solve this by not permitting each CPU to consume a disjoint virtual range.
* Rather, we have each CPU's interposing trap table use the _same_ virtual
* range, but we back the trap tables with disjoint physical memory. Normally,
* such one-to-many virtual-to-physical mappings are illegal; this is
* permissible here only because the pages for the interposing trap table are
* necessarily locked in the TLB. (The CPUs thus never have the opportunity to
* discover that they have conflicting translations.)
*
* On CMT architectures in which CPUs can share MMUs, the above trick will not
* work: two CPUs that share an MMU cannot have the same virtual address map
* to disjoint physical pages. On these architectures, any CPUs sharing the
* same MMU must consume a disjoint 32K virtual address range -- limiting the
* number of CPUs sharing an MMU on these architectures to 256 due to the
* branch displacement limitation described above. On the sun4v architecture,
* there is a further limitation: a guest may not have more than eight locked
* TLB entries per MMU. To allow operation under this restriction, the
* interposing trap table and the trap statistics are each accessed through
* a single 4M TLB entry. This limits the footprint to two locked entries
* (one for the I-TLB and one for the D-TLB), but further restricts the number
* of CPUs to 128 per MMU. However, support for more than 128 CPUs can easily
* be added via a hybrid scheme, where the same 4M virtual address is used
* on different MMUs.
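*
* Concretely (see the sun4v code in trapstat_setup(), below), the CPUs
* share a single locked 4M page, and each CPU carves a 32K slice out of
* it for its own trap table and data:
*
*	vabase = KERNELBASE - MMU_PAGESIZE4M
*	ibase  = vabase + cpu * (1 + ~TSTAT_TBA_MASK)	(32K stride)
*	dbase  = ibase + TSTAT_INSTR_SIZE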
*
*
* TLB Statistics
*
* Because TLB misses are an important component of system performance, we wish
* to know much more about these traps than simply the number received.
* Specifically, we wish to know:
*
* (a) The amount of time spent executing the TLB miss handler
* (b) TLB misses versus TSB misses
* (c) Kernel-level misses versus user-level misses
* (d) Misses per pagesize
*
* TLB Statistics: Time Spent Executing
*
* To accurately determine the amount of time spent executing the TLB miss
* handler, one must get a timestamp on trap entry and trap exit, subtract the
* latter from the former, and add the result to an accumulating count.
* Consider the flow of control during normal TLB miss processing (where "ldx
* [%g2], %g2" is an arbitrary TLB-missing instruction):
*
* + - - - - - - - -+
* : :
* : ldx [%g2], %g2 :<-------------------------------------------------------+
* : : Return from trap: |
* + - - - - - - - -+ TL <- TL - 1 (0) |
* | %pc <- TSTATE[TL].TPC (address of load) |
* | TLB miss: |
* | TL <- TL + 1 (1) |
* | %pc <- TLB-miss-trap-handler |
* | |
* v |
* + - - - - - - - - - - - - - - - + |
* : : |
* : Lookup VA in TSB : |
* : If (hit) : |
* : Fill TLB : |
* : Else : |
* : Lookup VA (hme hash table : |
* : or segkpm) : |
* : Fill TLB : |
* : Endif : |
* : Issue "retry" ---------------------------------------------------------+
* : :
* + - - - - - - - - - - - - - - - +
* TLB-miss-trap-handler
*
*
* As the above diagram indicates, interposing on the trap table allows one
* only to determine a timestamp on trap _entry_: when the TLB miss handler
* has completed filling the TLB, a "retry" will be issued, and control will
* transfer immediately back to the missing %pc.
*
* To obtain a timestamp on trap exit, we must then somehow interpose between
* the "retry" and the subsequent control transfer to the TLB-missing
* instruction. To do this, we _push_ a trap level. The basic idea is to
* spoof a TLB miss by raising TL, setting the %tpc to be within text
* controlled by trapstat (the "TLB return entry") and branching to the
* underlying TLB miss handler. When the TLB miss handler issues its "retry",
* control will transfer not to the TLB-missing instruction, but rather to the
* TLB return entry. This code can then obtain a timestamp, and issue its own
* "retry" -- thereby correctly returning to the TLB-missing instruction.
* Here is the above TLB miss flow control diagram modified to reflect
* trapstat's operation:
*
* + - - - - - - - -+
* : :
* : ldx [%g2], %g2 :<-------------------------------------------------------+
* : : Return from trap: |
* + - - - - - - - -+ TL <- TL - 1 (0) |
* | %pc <- TSTATE[TL].TPC (address of load) |
* | TLB miss: |
* | TL <- TL + 1 (1) |
* | %pc <- TLB-miss-trap-handler (trapstat) |
* | |
* v TLB-return-entry (trapstat) |
* + - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - + |
* : : : : |
* : Record timestamp : : Record timestamp : |
* : TL <- 2 : : Take timestamp difference : |
* : TSTATE[1].TPC <- TLB-return-entry : : Add to running total : |
* : ba,a TLB-miss-trap-handler -----------+ : Issue "retry" --------------+
* : : | : :
* + - - - - - - - - - - - - - - - - - - + | + - - - - - - - - - - - - - +
* TLB-miss-trap-handler | ^
* (trapstat) | |
* | |
* | |
* +-----------------------+ |
* | |
* | |
* v |
* + - - - - - - - - - - - - - - - + |
* : : |
* : Lookup VA in TSB : |
* : If (hit) : |
* : Fill TLB : |
* : Else : |
* : Lookup VA (hme hash table : |
* : or segkpm) : |
* : Fill TLB : |
* : Endif : |
* : Issue "retry" ------------------------------------------+
* : : Return from trap:
* + - - - - - - - - - - - - - - - + TL <- TL - 1 (1)
* TLB-miss-trap-handler %pc <- TSTATE[TL].TPC (TLB-return-entry)
*
*
* A final subterfuge is required to complete our artifice: if we miss in
* the TLB, the TSB _and_ the subsequent hash or segkpm lookup (that is, if
* there is no valid translation for the TLB-missing address), common system
* software will need to accurately determine the %tpc as part of its page
* fault handling. We therefore modify the kernel to check the %tpc in this
* case: if the %tpc falls within the VA range controlled by trapstat and
* the TL is 2, TL is simply lowered back to 1 (this check is implemented
* by the TSTAT_CHECK_TL1 macro). Lowering TL to 1 has the effect of
* discarding the state pushed by trapstat.
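*
* In pseudocode, the TSTAT_CHECK_TL1 check amounts to:
*
*	if (TL == 2 && %tpc lies within trapstat's VA range)
*		TL <- 1		(discard the state pushed by trapstat)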
*
* TLB Statistics: TLB Misses versus TSB Misses
*
* Distinguishing TLB misses from TSB misses requires further interposition
* on the TLB miss handler: we cannot know a priori or a posteriori if a
* given VA will or has hit in the TSB.
*
* We achieve this distinction by adding a second TLB return entry almost
* identical to the first -- differing only in the address to which it
* stores its results. We then modify the TLB miss handlers of the kernel
* such that they check the %tpc when they determine that a TLB miss has
* subsequently missed in the TSB: if the %tpc lies within trapstat's VA
* range and TL is 2 (that is, if trapstat is running), the TLB miss handler
* _increments_ the %tpc by the size of the TLB return entry. The ensuing
* "retry" will thus transfer control to the second TLB return entry, and
* the time spent in the handler will be accumulated in a memory location
* specific to TSB misses.
*
* N.B.: To minimize the amount of knowledge the kernel must have of trapstat,
* we do not allow the kernel to hard-code the size of the TLB return entry.
* Rather, the actual tsbmiss handler executes a known instruction at the
* corresponding tsbmiss patch points (see the tstat_tsbmiss_patch_table) with
* the %tpc in %g7: when trapstat is not running, these points contain the
* harmless TSTAT_TSBMISS_INSTR instruction ("add %g7, 0, %g7"). Before
* running, trapstat modifies the instructions at these patch points such
* that the simm13 equals the size of the TLB return entry.
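*
* Schematically, trapstat_hotpatch() (below) rewrites each patch point from
*
*	add	%g7, 0, %g7		! TSTAT_TSBMISS_INSTR (a no-op)
*
* to
*
*	add	%g7, simm13, %g7	! simm13 = size of a TLB return entry
*
* and restores the original instruction when trapstat is stopped.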
*
* TLB Statistics: Kernel-level Misses versus User-level Misses
*
* Differentiating user-level misses from kernel-level misses employs a
* similar technique, but is simplified by the ability to distinguish a
* user-level miss from a kernel-level miss a priori by reading the context
* register: we implement kernel-/user-level differentiation by again doubling
* the number of TLB return entries, and setting the %tpc to the appropriate
* TLB return entry in trapstat's TLB miss handler. Together with the doubling
* of entries required for TLB-miss/TSB-miss differentiation, this yields a
* total of four TLB return entries:
*
* Level TSB hit? Structure member
* ------------------------------------------------------------
* Kernel Yes tstat_tlbret_t.ttlbr_ktlb
* Kernel No tstat_tlbret_t.ttlbr_ktsb
* User Yes tstat_tlbret_t.ttlbr_utlb
* User No tstat_tlbret_t.ttlbr_utsb
*
* TLB Statistics: Misses per Pagesize
*
* As with the TLB-/TSB-miss differentiation, we have no way of determining
* pagesize a priori. This is therefore implemented by mandating a new rule:
* whenever the kernel fills the TLB in its TLB miss handler, the TTE
* corresponding to the TLB-missing VA must be in %g5 when the handler
* executes its "retry". This allows the TLB return entry to determine
* pagesize by simply looking at the pagesize field in the TTE stored in
* %g5.
*
* TLB Statistics: Probe Effect
*
* As one might imagine, gathering TLB statistics by pushing a trap level
* induces significant probe effect. To account for this probe effect,
* trapstat attempts to observe it by executing a code sequence with a known
* number of TLB misses both before and after interposing on the trap table.
* This allows trapstat to determine a per-trap probe effect which can then be
* factored into the "%tim" fields of the trapstat command.
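*
* In outline (see trapstat_probe(), below), the measurement is:
*
*	before = best of NLAPS timings of touching each probe page
*	%tba <- interposing trap table
*	after = best of NLAPS timings of touching each probe page
*	%tba <- KERNELBASE
*	tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES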
*
* Note that on sun4v platforms, TLB misses are normally handled by the
* hypervisor or the hardware TSB walker. Thus no fast MMU miss information
* is reported for normal operation. However, when trapstat is invoked with
* the -t or -T option to collect detailed TLB statistics, the kernel takes
* over TLB miss handling. This results in significantly more overhead, and
* TLB statistics may not be as accurate as on sun4u platforms.
*
* Locking
*
* The implementation uses two locks: tstat_lock (a local lock) and the global
* cpu_lock. tstat_lock is used to assure trapstat's consistency in the
* presence of multithreaded /dev/trapstat consumers (while as of this writing
* the only consumer of /dev/trapstat is single threaded, it is obviously
* necessary to correctly support multithreaded access). cpu_lock is held
* whenever CPUs are being manipulated directly, to prevent them from
* disappearing in the process. Because trapstat's DR callback
* (trapstat_cpu_setup()) must grab tstat_lock and is called with cpu_lock
* held, the lock ordering is necessarily cpu_lock before tstat_lock.
*
*/
/* END CSTYLED */
static dev_info_t *tstat_devi; /* saved in xxattach() for xxinfo() */
static int tstat_open; /* set if driver is open */
static kmutex_t tstat_lock; /* serialize access */
static vmem_t *tstat_arena; /* arena for TLB-locked pages */
static tstat_percpu_t *tstat_percpu; /* per-CPU data */
static int tstat_running; /* set if trapstat is running */
static tstat_data_t *tstat_buffer; /* staging buffer for outgoing data */
static int tstat_options; /* bit-wise indication of options */
static int *tstat_enabled; /* map of enabled trap entries */
static int tstat_tsbmiss_patched; /* tsbmiss patch flag */
static callb_id_t tstat_cprcb; /* CPR callback */
static char *tstat_probe_area; /* VA range used for probe effect */
static caddr_t tstat_probe_phys; /* physical to back above VA */
static hrtime_t tstat_probe_time; /* time spent on probe effect */
static hrtime_t tstat_probe_before[TSTAT_PROBE_NLAPS];
static hrtime_t tstat_probe_after[TSTAT_PROBE_NLAPS];
static uint_t tstat_pgszs; /* # of kernel page sizes */
static uint_t tstat_user_pgszs; /* # of user page sizes */
/*
* sizeof tstat_data_t + pgsz data for the kernel. For simplicity's sake, when
* we collect data, we do it based upon szc, but when we report data back to
* userland, we have to do it based upon the userszc which may not match.
* So, these two variables are for internal use and exported use respectively.
*/
static size_t tstat_data_t_size;
static size_t tstat_data_t_exported_size;
static size_t tstat_data_pages; /* number of pages of tstat data */
static size_t tstat_data_size; /* tstat data size in bytes */
static size_t tstat_total_pages; /* #data pages + #instr pages */
static size_t tstat_total_size; /* tstat data size + instr size */
#ifdef sun4v
static caddr_t tstat_va; /* VA of memory reserved for TBA */
static pfn_t tstat_pfn; /* PFN of memory reserved for TBA */
#endif
/*
* In the above block comment, see "TLB Statistics: TLB Misses versus
* TSB Misses" for an explanation of the tsbmiss patch points.
*/
extern uint32_t tsbmiss_trapstat_patch_point;
extern uint32_t tsbmiss_trapstat_patch_point_kpm;
extern uint32_t tsbmiss_trapstat_patch_point_kpm_small;
/*
* Trapstat tsbmiss patch table
*/
tstat_tsbmiss_patch_entry_t tstat_tsbmiss_patch_table[] = {
{(uint32_t *)&tsbmiss_trapstat_patch_point, 0},
{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm, 0},
{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm_small, 0},
{(uint32_t *)NULL, 0}
};
/*
* We define some general SPARC-specific constants to allow more readable
* relocations.
*/
#define NOP 0x01000000
#define HI22(v) ((uint32_t)(v) >> 10)
#define LO10(v) ((uint32_t)(v) & 0x3ff)
#define LO12(v) ((uint32_t)(v) & 0xfff)
#define DISP22(from, to) \
((((uintptr_t)(to) - (uintptr_t)(from)) >> 2) & 0x3fffff)
#define ASI(asi) ((asi) << 5)
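/*
* As an example of their use: "0x30800000 | DISP22(va, orig)" encodes
* "ba,a orig" as placed at va, with DISP22 computing the signed 22-bit
* word displacement from va to orig.
*/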
/*
* The interposing trap table must be locked in the I-TLB, and any data
* referred to in the interposing trap handler must be locked in the D-TLB.
* This function locks these pages in the appropriate TLBs by creating TTEs
* from whole cloth, and manually loading them into the TLB. This function is
* called from cross call context.
*
* On sun4v platforms, we use 4M page size mappings to minimize the number
* of locked down entries (i.e. permanent mappings). Each CPU uses a
* reserved portion of that 4M page for its TBA and data.
*/
static void
trapstat_load_tlb(void)
{
#ifndef sun4v
int i;
#else
uint64_t ret;
#endif
tte_t tte;
tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
caddr_t va = tcpu->tcpu_vabase;
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
#ifndef sun4v
for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
tte.tte_inthi = TTE_VALID_INT | TTE_SZ_INT(TTE8K) |
TTE_PFN_INTHI(tcpu->tcpu_pfn[i]);
if (i < TSTAT_INSTR_PAGES) {
tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
TTE_LCK_INT | TTE_CP_INT | TTE_PRIV_INT;
sfmmu_itlb_ld(va, KCONTEXT, &tte);
} else {
tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
TTE_LCK_INT | TTE_CP_INT | TTE_CV_INT |
TTE_PRIV_INT | TTE_HWWR_INT;
sfmmu_dtlb_ld(va, KCONTEXT, &tte);
}
}
#else /* sun4v */
tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(tstat_pfn);
tte.tte_intlo = TTE_PFN_INTLO(tstat_pfn) | TTE_CP_INT |
TTE_CV_INT | TTE_PRIV_INT | TTE_HWWR_INT |
TTE_SZ_INTLO(TTE4M);
ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte,
MAP_ITLB | MAP_DTLB);
if (ret != H_EOK)
cmn_err(CE_PANIC, "trapstat: cannot map new TBA "
"for cpu %d (error: 0x%lx)", CPU->cpu_id, ret);
#endif /* sun4v */
}
/*
* As mentioned in the "TLB Statistics: TLB Misses versus TSB Misses" section
* of the block comment, TLB misses are differentiated from TSB misses in
* part by hot-patching the instructions at the tsbmiss patch points (see
* tstat_tsbmiss_patch_table). This routine is used both to initially patch
* the instructions, and to patch them back to their original values upon
* restoring the original trap table.
*/
static void
trapstat_hotpatch()
{
uint32_t instr;
uint32_t simm13;
tstat_tsbmiss_patch_entry_t *ep;
ASSERT(MUTEX_HELD(&tstat_lock));
if (!(tstat_options & TSTAT_OPT_TLBDATA))
return;
if (!tstat_tsbmiss_patched) {
/*
* We haven't patched the TSB paths; do so now.
*/
/*CONSTCOND*/
ASSERT(offsetof(tstat_tlbret_t, ttlbr_ktsb) -
offsetof(tstat_tlbret_t, ttlbr_ktlb) ==
offsetof(tstat_tlbret_t, ttlbr_utsb) -
offsetof(tstat_tlbret_t, ttlbr_utlb));
simm13 = offsetof(tstat_tlbret_t, ttlbr_ktsb) -
offsetof(tstat_tlbret_t, ttlbr_ktlb);
for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
ASSERT(ep->tpe_instr == 0);
instr = ep->tpe_instr = *ep->tpe_addr;
/*
* Assert that the instruction we're about to patch is
* "add %g7, 0, %g7" (0x8e01e000).
*/
ASSERT(instr == TSTAT_TSBMISS_INSTR);
instr |= simm13;
hot_patch_kernel_text((caddr_t)ep->tpe_addr,
instr, sizeof (instr));
}
tstat_tsbmiss_patched = 1;
} else {
/*
* Remove patches from the TSB paths.
*/
for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
ASSERT(ep->tpe_instr == TSTAT_TSBMISS_INSTR);
hot_patch_kernel_text((caddr_t)ep->tpe_addr,
ep->tpe_instr, sizeof (instr));
ep->tpe_instr = 0;
}
tstat_tsbmiss_patched = 0;
}
}
/*
* This is the routine executed to clock the performance of the trap table,
* executed both before and after interposing on the trap table to attempt to
* determine probe effect. The probe effect is used to adjust the "%tim"
* fields of trapstat's -t and -T output; we only use TLB misses to clock the
* trap table. We execute the inner loop (which is designed to exceed the
* TLB's reach) nlaps times, taking the best time as our time (thereby
* factoring out the effects of interrupts, cache misses or other perturbing
* events.
*/
static hrtime_t
trapstat_probe_laps(int nlaps, hrtime_t *buf)
{
int i, j = 0;
hrtime_t ts, best = INT64_MAX;
while (nlaps--) {
ts = rdtick();
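/*
* Touch one byte per page; the probe area exceeds the TLB's reach,
* so each load should induce a TLB miss.
*/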
for (i = 0; i < TSTAT_PROBE_SIZE; i += MMU_PAGESIZE)
*((volatile char *)&tstat_probe_area[i]);
if ((ts = rdtick() - ts) < best)
best = ts;
buf[j++] = ts;
}
return (best);
}
/*
* This routine determines the probe effect by calling trapstat_probe_laps()
* both without and with the interposing trap table. Note that this is
* called from a cross call on the desired CPU, and that it is called on
* every CPU (this is necessary because the probe effect may differ from
* one CPU to another).
*/
static void
trapstat_probe()
{
tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
hrtime_t before, after;
if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
return;
if (tstat_probe_area == NULL || (tstat_options & TSTAT_OPT_NOGO))
return;
/*
* We very much expect the %tba to be KERNELBASE; this is a
* precautionary measure to assure that trapstat doesn't melt the
* machine should the %tba point unexpectedly elsewhere.
*/
if (get_tba() != (caddr_t)KERNELBASE)
return;
/*
* Preserve this CPU's data before destroying it by enabling the
* interposing trap table. We can safely use tstat_buffer because
* the caller of the trapstat_probe() cross call is holding tstat_lock.
*/
bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
tstat_probe_time = gethrtime();
before = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_before);
(void) set_tba(tcpu->tcpu_ibase);
after = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_after);
(void) set_tba((caddr_t)KERNELBASE);
tstat_probe_time = gethrtime() - tstat_probe_time;
bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
tcpu->tcpu_data->tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
}
static void
trapstat_probe_alloc()
{
pfn_t pfn;
caddr_t va;
int i;
ASSERT(MUTEX_HELD(&tstat_lock));
ASSERT(tstat_probe_area == NULL);
ASSERT(tstat_probe_phys == NULL);
if (!(tstat_options & TSTAT_OPT_TLBDATA))
return;
/*
* Grab some virtual from the heap arena.
*/
tstat_probe_area = vmem_alloc(heap_arena, TSTAT_PROBE_SIZE, VM_SLEEP);
va = tstat_probe_area;
/*
* Grab a single physical page.
*/
tstat_probe_phys = vmem_alloc(tstat_arena, MMU_PAGESIZE, VM_SLEEP);
pfn = hat_getpfnum(kas.a_hat, tstat_probe_phys);
/*
* Now set the translation for every page in our virtual range
* to be our allocated physical page.
*/
for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, PROT_READ,
HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
va += MMU_PAGESIZE;
}
}
static void
trapstat_probe_free()
{
caddr_t va;
int i;
ASSERT(MUTEX_HELD(&tstat_lock));
if ((va = tstat_probe_area) == NULL)
return;
for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD_UNLOCK);
va += MMU_PAGESIZE;
}
vmem_free(tstat_arena, tstat_probe_phys, MMU_PAGESIZE);
vmem_free(heap_arena, tstat_probe_area, TSTAT_PROBE_SIZE);
tstat_probe_phys = NULL;
tstat_probe_area = NULL;
}
/*
* This routine actually enables a CPU by setting its %tba to be the
* CPU's interposing trap table. It is called out of cross call context.
*/
static void
trapstat_enable()
{
tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
return;
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
if (get_tba() != (caddr_t)KERNELBASE)
return;
if (!(tstat_options & TSTAT_OPT_NOGO))
(void) set_tba(tcpu->tcpu_ibase);
tcpu->tcpu_flags |= TSTAT_CPU_ENABLED;
#ifdef sun4v
if ((tstat_options & TSTAT_OPT_TLBDATA) &&
!(tstat_options & TSTAT_OPT_NOGO)) {
/*
* On sun4v platforms, TLB misses are normally handled by the
* hypervisor or the hardware -- provided one or more TSBs
* have been set up and communicated via the hv_set_ctx0 and
* hv_set_ctxnon0 APIs. To collect TLB statistics, we must
* disable this miss processing by telling the hypervisor
* that there is no TSB, forcing the guest kernel to handle
* all TLB misses itself.
*
* While we communicate the fake (NULL) kernel TSB information
* immediately, to avoid any locking dependency we do not
* touch the user TSB information here. Rather, we simply set
* the TSTAT_TLB_STATS flag so that fake user TSB information
* is communicated on the next context switch.
*/
cpu_t *cp = CPU;
cp->cpu_m.cpu_tstat_flags |= TSTAT_TLB_STATS;
(void) hv_set_ctx0(NULL, NULL);
(void) hv_set_ctxnon0(NULL, NULL);
}
#endif
}
/*
* This routine disables a CPU (vis a vis trapstat) by setting its %tba to be
* the actual, underlying trap table. It is called out of cross call context.
*/
static void
trapstat_disable()
{
tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
return;
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
if (!(tstat_options & TSTAT_OPT_NOGO))
(void) set_tba((caddr_t)KERNELBASE);
tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
#ifdef sun4v
if ((tstat_options & TSTAT_OPT_TLBDATA) &&
!(tstat_options & TSTAT_OPT_NOGO)) {
/*
* On sun4v platforms, TLB misses are normally handled by the
* hypervisor or the hardware, provided one or more TSBs have
* been set up and communicated via the hv_set_ctx0 and
* hv_set_ctxnon0 APIs. As part of collecting TLB statistics,
* we disabled that processing by faking NO TSB; we now need
* to communicate the proper kernel/user TSB information so
* that TLB misses can again be handled by the hypervisor or
* the hardware.
*
* We restore the kernel TSB information right away. However,
* to minimize any locking dependency, we don't restore the
* user TSB information here. Instead, we simply clear the
* TSTAT_TLB_STATS flag so that the user TSB information is
* automatically restored on the next context switch.
*
* Note that the call to restore the kernel TSB information
* will normally not fail, unless incorrect information is
* passed here. In that scenario, the system will still
* continue to function properly, with the kernel handling
* all of its TLB misses.
*/
struct hv_tsb_block *hvbp = &ksfmmup->sfmmu_hvblock;
cpu_t *cp = CPU;
cp->cpu_m.cpu_tstat_flags &= ~TSTAT_TLB_STATS;
(void) hv_set_ctx0(hvbp->hv_tsb_info_cnt, hvbp->hv_tsb_info_pa);
}
#endif
}
/*
* We use %tick as the time base when recording the time spent executing
* the trap handler. %tick, however, is not necessarily kept in sync
* across CPUs (indeed, different CPUs may have different %tick frequencies).
* We therefore cross call onto a CPU to get a snapshot of its data to
* copy out; this is the routine executed out of that cross call.
*/
static void
trapstat_snapshot()
{
tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
tstat_data_t *data = tcpu->tcpu_data;
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ENABLED);
data->tdata_snapts = gethrtime();
data->tdata_snaptick = rdtick();
bcopy(data, tstat_buffer, tstat_data_t_size);
}
/*
* The TSTAT_RETENT_* constants define offsets in the TLB return entry.
* They are used only in trapstat_tlbretent() (below) and #undef'd
* immediately afterwards. Any change to "retent" in trapstat_tlbretent()
* will likely require changes to these constants.
*/
#ifndef sun4v
#define TSTAT_RETENT_STATHI 1
#define TSTAT_RETENT_STATLO 2
#define TSTAT_RETENT_SHIFT 8
#define TSTAT_RETENT_COUNT_LD 10
#define TSTAT_RETENT_COUNT_ST 12
#define TSTAT_RETENT_TMPTSHI 13
#define TSTAT_RETENT_TMPTSLO 14
#define TSTAT_RETENT_TIME_LD 16
#define TSTAT_RETENT_TIME_ST 18
#else /* sun4v */
#define TSTAT_RETENT_STATHI 1
#define TSTAT_RETENT_STATLO 2
#define TSTAT_RETENT_SHIFT 5
#define TSTAT_RETENT_COUNT_LD 7
#define TSTAT_RETENT_COUNT_ST 9
#define TSTAT_RETENT_TMPTSHI 10
#define TSTAT_RETENT_TMPTSLO 11
#define TSTAT_RETENT_TIME_LD 13
#define TSTAT_RETENT_TIME_ST 15
#endif /* sun4v */
static void
trapstat_tlbretent(tstat_percpu_t *tcpu, tstat_tlbretent_t *ret,
tstat_missdata_t *data)
{
uint32_t *ent = ret->ttlbrent_instr, shift;
uintptr_t base, tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
/*
* This is the entry executed upon return from the TLB/TSB miss
* handler (i.e. the code interpositioned between the "retry" and
* the actual return to the TLB-missing instruction). Detail on its
* theory of operation can be found in the "TLB Statistics" section
* of the block comment. Note that we expect the TTE just loaded
* into the TLB to be in %g5; all other globals are available as
* scratch. Finally, note that the page size information in sun4v is
* located in the lower bits of the TTE -- requiring us to have a
* different return entry on sun4v.
*/
static const uint32_t retent[TSTAT_TLBRET_NINSTR] = {
#ifndef sun4v
0x87410000, /* rd %tick, %g3 */
0x03000000, /* sethi %hi(stat), %g1 */
0x82106000, /* or %g1, %lo(stat), %g1 */
0x89297001, /* sllx %g5, 1, %g4 */
0x8931303e, /* srlx %g4, 62, %g4 */
0x8531702e, /* srlx %g5, 46, %g2 */
0x8408a004, /* and %g2, 4, %g2 */
0x88110002, /* or %g4, %g2, %g4 */
0x89292000, /* sll %g4, shift, %g4 */
0x82004004, /* add %g1, %g4, %g1 */
0xc4586000, /* ldx [%g1 + tmiss_count], %g2 */
0x8400a001, /* add %g2, 1, %g2 */
0xc4706000, /* stx %g2, [%g1 + tmiss_count] */
0x0d000000, /* sethi %hi(tdata_tmptick), %g6 */
0xc459a000, /* ldx [%g6 + %lo(tdata_tmptick)], %g2 */
0x8620c002, /* sub %g3, %g2, %g3 */
0xc4586000, /* ldx [%g1 + tmiss_time], %g2 */
0x84008003, /* add %g2, %g3, %g2 */
0xc4706000, /* stx %g2, [%g1 + tmiss_time] */
0x83f00000 /* retry */
#else /* sun4v */
0x87410000, /* rd %tick, %g3 */
0x03000000, /* sethi %hi(stat), %g1 */
0x82106000, /* or %g1, %lo(stat), %g1 */
0x8929703d, /* sllx %g5, 61, %g4 */
0x8931303d, /* srlx %g4, 61, %g4 */
0x89292000, /* sll %g4, shift, %g4 */
0x82004004, /* add %g1, %g4, %g1 */
0xc4586000, /* ldx [%g1 + tmiss_count], %g2 */
0x8400a001, /* add %g2, 1, %g2 */
0xc4706000, /* stx %g2, [%g1 + tmiss_count] */
0x0d000000, /* sethi %hi(tdata_tmptick), %g6 */
0xc459a000, /* ldx [%g6 + %lo(tdata_tmptick)], %g2 */
0x8620c002, /* sub %g3, %g2, %g3 */
0xc4586000, /* ldx [%g1 + tmiss_time], %g2 */
0x84008003, /* add %g2, %g3, %g2 */
0xc4706000, /* stx %g2, [%g1 + tmiss_time] */
0x83f00000 /* retry */
#endif /* sun4v */
};
ASSERT(MUTEX_HELD(&tstat_lock));
/*CONSTCOND*/
ASSERT(offsetof(tstat_missdata_t, tmiss_count) <= LO10(-1));
/*CONSTCOND*/
ASSERT(offsetof(tstat_missdata_t, tmiss_time) <= LO10(-1));
/*CONSTCOND*/
ASSERT(!((sizeof (tstat_pgszdata_t) - 1) & sizeof (tstat_pgszdata_t)));
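/*
* sizeof (tstat_pgszdata_t) is asserted (above) to be a power of two;
* compute its log2 for use as the page-size scaling shift.
*/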
for (shift = 1; (1 << shift) != sizeof (tstat_pgszdata_t); shift++)
continue;
base = (uintptr_t)tcpu->tcpu_dbase +
((uintptr_t)data - (uintptr_t)tcpu->tcpu_data);
bcopy(retent, ent, sizeof (retent));
ent[TSTAT_RETENT_STATHI] |= HI22(base);
ent[TSTAT_RETENT_STATLO] |= LO10(base);
ent[TSTAT_RETENT_SHIFT] |= shift;
/* LINTED E_EXPR_NULL_EFFECT */
ent[TSTAT_RETENT_COUNT_LD] |= offsetof(tstat_missdata_t, tmiss_count);
/* LINTED E_EXPR_NULL_EFFECT */
ent[TSTAT_RETENT_COUNT_ST] |= offsetof(tstat_missdata_t, tmiss_count);
ent[TSTAT_RETENT_TMPTSHI] |= HI22(tmptick);
ent[TSTAT_RETENT_TMPTSLO] |= LO10(tmptick);
ent[TSTAT_RETENT_TIME_LD] |= offsetof(tstat_missdata_t, tmiss_time);
ent[TSTAT_RETENT_TIME_ST] |= offsetof(tstat_missdata_t, tmiss_time);
}
#undef TSTAT_RETENT_STATHI
#undef TSTAT_RETENT_STATLO
#undef TSTAT_RETENT_SHIFT
#undef TSTAT_RETENT_COUNT_LD
#undef TSTAT_RETENT_COUNT_ST
#undef TSTAT_RETENT_TMPTSHI
#undef TSTAT_RETENT_TMPTSLO
#undef TSTAT_RETENT_TIME_LD
#undef TSTAT_RETENT_TIME_ST
/*
* The TSTAT_TLBENT_* constants define offsets in the TLB entry. They are
* used only in trapstat_tlbent() (below) and #undef'd immediately afterwards.
* Any change to "tlbent" in trapstat_tlbent() will likely require changes
* to these constants.
*/
#ifndef sun4v
#define TSTAT_TLBENT_STATHI 0
#define TSTAT_TLBENT_STATLO_LD 1
#define TSTAT_TLBENT_STATLO_ST 3
#define TSTAT_TLBENT_MMUASI 15
#define TSTAT_TLBENT_TPCHI 18
#define TSTAT_TLBENT_TPCLO_USER 19
#define TSTAT_TLBENT_TPCLO_KERN 21
#define TSTAT_TLBENT_TSHI 25
#define TSTAT_TLBENT_TSLO 27
#define TSTAT_TLBENT_BA 28
#else /* sun4v */
#define TSTAT_TLBENT_STATHI 0
#define TSTAT_TLBENT_STATLO_LD 1
#define TSTAT_TLBENT_STATLO_ST 3
#define TSTAT_TLBENT_TAGTARGET 19
#define TSTAT_TLBENT_TPCHI 21
#define TSTAT_TLBENT_TPCLO_USER 22
#define TSTAT_TLBENT_TPCLO_KERN 24
#define TSTAT_TLBENT_TSHI 28
#define TSTAT_TLBENT_TSLO 30
#define TSTAT_TLBENT_BA 31
#endif /* sun4v */
static void
trapstat_tlbent(tstat_percpu_t *tcpu, int entno)
{
uint32_t *ent;
uintptr_t orig, va, baoffs;
int itlb = entno == TSTAT_ENT_ITLBMISS;
int entoffs = entno << TSTAT_ENT_SHIFT;
uintptr_t tmptick, stat, tpc, utpc;
tstat_pgszdata_t *data = &tcpu->tcpu_data->tdata_pgsz[0];
tstat_tlbdata_t *udata, *kdata;
tstat_tlbret_t *ret;
#ifndef sun4v
uint32_t asi = itlb ? ASI(ASI_IMMU) : ASI(ASI_DMMU);
#else
uint32_t tagtarget_off = itlb ? MMFSA_I_CTX : MMFSA_D_CTX;
#endif
/*
* When trapstat is run with TLB statistics, this is the entry for
* both I- and D-TLB misses; this code performs trap level pushing,
* as described in the "TLB Statistics" section of the block comment.
* This code is executing at TL 1; %tstate[0] contains the saved
* state at the time of the TLB miss. Pushing trap level 1 (and thus
* raising TL to 2) requires us to fill in %tstate[1] with our %pstate,
* %cwp and %asi. We leave %tt unchanged, and we set %tpc and %tnpc to
* the appropriate TLB return entry (based on the context of the miss).
* Finally, we sample %tick, and stash it in the tdata_tmptick member
* of the per-CPU tstat_data structure. tdata_tmptick will be used in
* the TLB return entry to determine the amount of time spent in the
* TLB miss handler.
*
* Note that on sun4v platforms, we must also force the %gl value to 1
* in %tstate and we must obtain the context information from the MMU
* fault status area. (The base address of this MMU fault status area
* is kept in the scratchpad register 0.)
*/
static const uint32_t tlbent[] = {
#ifndef sun4v
0x03000000, /* sethi %hi(stat), %g1 */
0xc4586000, /* ldx [%g1 + %lo(stat)], %g2 */
0x8400a001, /* add %g2, 1, %g2 */
0xc4706000, /* stx %g2, [%g1 + %lo(stat)] */
0x85524000, /* rdpr %cwp, %g2 */
0x87518000, /* rdpr %pstate, %g3 */
0x8728f008, /* sllx %g3, 8, %g3 */
0x84108003, /* or %g2, %g3, %g2 */
0x8740c000, /* rd %asi, %g3 */
0x8728f018, /* sllx %g3, 24, %g3 */
0x84108003, /* or %g2, %g3, %g2 */
0x8350c000, /* rdpr %tt, %g1 */
0x8f902002, /* wrpr %g0, 2, %tl */
0x85908000, /* wrpr %g2, %g0, %tstate */
0x87904000, /* wrpr %g1, %g0, %tt */
0xc2d80000, /* ldxa [%g0]ASI_MMU, %g1 */
0x83307030, /* srlx %g1, CTXSHIFT, %g1 */
0x02c04004, /* brz,pn %g1, .+0x10 */
0x03000000, /* sethi %hi(new_tpc), %g1 */
0x82106000, /* or %g1, %lo(new_tpc), %g1 */
0x30800002, /* ba,a .+0x8 */
0x82106000, /* or %g1, %lo(new_tpc), %g1 */
0x81904000, /* wrpr %g1, %g0, %tpc */
0x82006004, /* add %g1, 4, %g1 */
0x83904000, /* wrpr %g1, %g0, %tnpc */
0x03000000, /* sethi %hi(tmptick), %g1 */
0x85410000, /* rd %tick, %g2 */
0xc4706000, /* stx %g2, [%g1 + %lo(tmptick)] */
0x30800000, /* ba,a addr */
NOP, NOP, NOP
#else /* sun4v */
0x03000000, /* sethi %hi(stat), %g1 */
0xc4586000, /* ldx [%g1 + %lo(stat)], %g2 */
0x8400a001, /* add %g2, 1, %g2 */
0xc4706000, /* stx %g2, [%g1 + %lo(stat)] */
0x85524000, /* rdpr %cwp, %g2 */
0x87518000, /* rdpr %pstate, %g3 */
0x8728f008, /* sllx %g3, 8, %g3 */
0x84108003, /* or %g2, %g3, %g2 */
0x8740c000, /* rd %asi, %g3 */
0x03000040, /* sethi %hi(0x10000), %g1 */
0x86104003, /* or %g1, %g3, %g3 */
0x8728f018, /* sllx %g3, 24, %g3 */
0x84108003, /* or %g2, %g3, %g2 */
0x8350c000, /* rdpr %tt, %g1 */
0x8f902002, /* wrpr %g0, 2, %tl */
0x85908000, /* wrpr %g2, %g0, %tstate */
0x87904000, /* wrpr %g1, %g0, %tt */
0xa1902001, /* wrpr %g0, 1, %gl */
0xc2d80400, /* ldxa [%g0]ASI_SCRATCHPAD, %g1 */
0xc2586000, /* ldx [%g1 + MMFSA_?_CTX], %g1 */
0x02c04004, /* brz,pn %g1, .+0x10 */
0x03000000, /* sethi %hi(new_tpc), %g1 */
0x82106000, /* or %g1, %lo(new_tpc), %g1 */
0x30800002, /* ba,a .+0x8 */
0x82106000, /* or %g1, %lo(new_tpc), %g1 */
0x81904000, /* wrpr %g1, %g0, %tpc */
0x82006004, /* add %g1, 4, %g1 */
0x83904000, /* wrpr %g1, %g0, %tnpc */
0x03000000, /* sethi %hi(tmptick), %g1 */
0x85410000, /* rd %tick, %g2 */
0xc4706000, /* stx %g2, [%g1 + %lo(tmptick)] */
0x30800000 /* ba,a addr */
#endif /* sun4v */
};
ASSERT(MUTEX_HELD(&tstat_lock));
ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS);
stat = TSTAT_DATA_OFFS(tcpu, tdata_traps) + entoffs;
tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
if (itlb) {
ret = &tcpu->tcpu_instr->tinst_itlbret;
udata = &data->tpgsz_user.tmode_itlb;
kdata = &data->tpgsz_kernel.tmode_itlb;
tpc = TSTAT_INSTR_OFFS(tcpu, tinst_itlbret.ttlbr_ktlb);
} else {
ret = &tcpu->tcpu_instr->tinst_dtlbret;
udata = &data->tpgsz_user.tmode_dtlb;
kdata = &data->tpgsz_kernel.tmode_dtlb;
tpc = TSTAT_INSTR_OFFS(tcpu, tinst_dtlbret.ttlbr_ktlb);
}
utpc = tpc + offsetof(tstat_tlbret_t, ttlbr_utlb) -
offsetof(tstat_tlbret_t, ttlbr_ktlb);
ASSERT(HI22(tpc) == HI22(utpc));
ent = (uint32_t *)((uintptr_t)tcpu->tcpu_instr + entoffs);
orig = KERNELBASE + entoffs;
va = (uintptr_t)tcpu->tcpu_ibase + entoffs;
baoffs = TSTAT_TLBENT_BA * sizeof (uint32_t);
bcopy(tlbent, ent, sizeof (tlbent));
ent[TSTAT_TLBENT_STATHI] |= HI22(stat);
ent[TSTAT_TLBENT_STATLO_LD] |= LO10(stat);
ent[TSTAT_TLBENT_STATLO_ST] |= LO10(stat);
#ifndef sun4v
ent[TSTAT_TLBENT_MMUASI] |= asi;
#else
ent[TSTAT_TLBENT_TAGTARGET] |= tagtarget_off;
#endif
ent[TSTAT_TLBENT_TPCHI] |= HI22(tpc);
ent[TSTAT_TLBENT_TPCLO_USER] |= LO10(utpc);
ent[TSTAT_TLBENT_TPCLO_KERN] |= LO10(tpc);
ent[TSTAT_TLBENT_TSHI] |= HI22(tmptick);
ent[TSTAT_TLBENT_TSLO] |= LO10(tmptick);
ent[TSTAT_TLBENT_BA] |= DISP22(va + baoffs, orig);
/*
* And now set up the TLB return entries.
*/
trapstat_tlbretent(tcpu, &ret->ttlbr_ktlb, &kdata->ttlb_tlb);
trapstat_tlbretent(tcpu, &ret->ttlbr_ktsb, &kdata->ttlb_tsb);
trapstat_tlbretent(tcpu, &ret->ttlbr_utlb, &udata->ttlb_tlb);
trapstat_tlbretent(tcpu, &ret->ttlbr_utsb, &udata->ttlb_tsb);
}
#undef TSTAT_TLBENT_STATHI
#undef TSTAT_TLBENT_STATLO_LD
#undef TSTAT_TLBENT_STATLO_ST
#ifndef sun4v
#undef TSTAT_TLBENT_MMUASI
#else
#undef TSTAT_TLBENT_TAGTARGET
#endif
#undef TSTAT_TLBENT_TPCHI
#undef TSTAT_TLBENT_TPCLO_USER
#undef TSTAT_TLBENT_TPCLO_KERN
#undef TSTAT_TLBENT_TSHI
#undef TSTAT_TLBENT_TSLO
#undef TSTAT_TLBENT_BA
/*
* The TSTAT_ENABLED_* constants define offsets in the enabled entry; the
* TSTAT_DISABLED_BA constant defines an offset in the disabled entry. Both
* sets of constants are used only in trapstat_make_traptab() (below) and
* #undef'd immediately afterwards. Any change to "enabled" or "disabled"
* in trapstat_make_traptab() will likely require changes to these constants.
*/
#define TSTAT_ENABLED_STATHI 0
#define TSTAT_ENABLED_STATLO_LD 1
#define TSTAT_ENABLED_STATLO_ST 3
#define TSTAT_ENABLED_BA 4
#define TSTAT_DISABLED_BA 0
static void
trapstat_make_traptab(tstat_percpu_t *tcpu)
{
uint32_t *ent;
uint64_t *stat;
uintptr_t orig, va, en_baoffs, dis_baoffs;
int nent;
/*
* This is the entry in the interposing trap table for enabled trap
* table entries. It loads a counter, increments it and stores it
* back before branching to the actual trap table entry.
*/
static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
0x03000000, /* sethi %hi(stat), %g1 */
0xc4586000, /* ldx [%g1 + %lo(stat)], %g2 */
0x8400a001, /* add %g2, 1, %g2 */
0xc4706000, /* stx %g2, [%g1 + %lo(stat)] */
0x30800000, /* ba,a addr */
NOP, NOP, NOP
};
/*
* This is the entry in the interposing trap table for disabled trap
* table entries. It simply branches to the actual, underlying trap
* table entry. As explained in the "Implementation Details" section
* of the block comment, all TL>0 traps _must_ use the disabled entry;
* additional entries may be explicitly disabled through the use
* of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
*/
static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
0x30800000, /* ba,a addr */
NOP, NOP, NOP, NOP, NOP, NOP, NOP,
};
ASSERT(MUTEX_HELD(&tstat_lock));
ent = tcpu->tcpu_instr->tinst_traptab;
stat = (uint64_t *)TSTAT_DATA_OFFS(tcpu, tdata_traps);
orig = KERNELBASE;
va = (uintptr_t)tcpu->tcpu_ibase;
en_baoffs = TSTAT_ENABLED_BA * sizeof (uint32_t);
dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t);
for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
if (tstat_enabled[nent]) {
bcopy(enabled, ent, sizeof (enabled));
ent[TSTAT_ENABLED_STATHI] |= HI22(stat);
ent[TSTAT_ENABLED_STATLO_LD] |= LO10(stat);
ent[TSTAT_ENABLED_STATLO_ST] |= LO10(stat);
ent[TSTAT_ENABLED_BA] |= DISP22(va + en_baoffs, orig);
} else {
bcopy(disabled, ent, sizeof (disabled));
ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig);
}
stat++;
orig += sizeof (enabled);
ent += sizeof (enabled) / sizeof (*ent);
va += sizeof (enabled);
}
}
#undef TSTAT_ENABLED_STATHI
#undef TSTAT_ENABLED_STATLO_LD
#undef TSTAT_ENABLED_STATLO_ST
#undef TSTAT_ENABLED_BA
#undef TSTAT_DISABLED_BA
static void
trapstat_setup(processorid_t cpu)
{
tstat_percpu_t *tcpu = &tstat_percpu[cpu];
#ifndef sun4v
int i;
caddr_t va;
pfn_t *pfn;
#endif
ASSERT(tcpu->tcpu_pfn == NULL);
ASSERT(tcpu->tcpu_instr == NULL);
ASSERT(tcpu->tcpu_data == NULL);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&tstat_lock));
/*
* The lower fifteen bits of the %tba are always read as zero; we must
* align our instruction base address appropriately.
*/
#ifndef sun4v
tcpu->tcpu_ibase = (caddr_t)((KERNELBASE - tstat_total_size)
& TSTAT_TBA_MASK);
tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
tcpu->tcpu_vabase = tcpu->tcpu_ibase;
tcpu->tcpu_pfn = vmem_alloc(tstat_arena, tstat_total_pages, VM_SLEEP);
bzero(tcpu->tcpu_pfn, tstat_total_pages);
pfn = tcpu->tcpu_pfn;
tcpu->tcpu_instr = vmem_alloc(tstat_arena, TSTAT_INSTR_SIZE, VM_SLEEP);
va = (caddr_t)tcpu->tcpu_instr;
for (i = 0; i < TSTAT_INSTR_PAGES; i++, va += MMU_PAGESIZE)
*pfn++ = hat_getpfnum(kas.a_hat, va);
/*
* We must be sure that the pages that we will use to examine the data
* have the same virtual color as the pages to which the data is being
* recorded, hence the alignment and phase constraints on the
* allocation.
*/
tcpu->tcpu_data = vmem_xalloc(tstat_arena, tstat_data_size,
shm_alignment, (uintptr_t)tcpu->tcpu_dbase & (shm_alignment - 1),
0, 0, NULL, VM_SLEEP);
bzero(tcpu->tcpu_data, tstat_data_size);
tcpu->tcpu_data->tdata_cpuid = cpu;
va = (caddr_t)tcpu->tcpu_data;
for (i = 0; i < tstat_data_pages; i++, va += MMU_PAGESIZE)
*pfn++ = hat_getpfnum(kas.a_hat, va);
#else /* sun4v */
ASSERT(!(tstat_total_size > (1 + ~TSTAT_TBA_MASK)));
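/*
* All CPUs share a single locked 4M page; this CPU's 32K slice of it
* holds both its interposing trap table and its data.
*/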
tcpu->tcpu_vabase = (caddr_t)(KERNELBASE - MMU_PAGESIZE4M);
tcpu->tcpu_ibase = tcpu->tcpu_vabase + (cpu * (1 + ~TSTAT_TBA_MASK));
tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
tcpu->tcpu_pfn = &tstat_pfn;
tcpu->tcpu_instr = (tstat_instr_t *)(tstat_va + (cpu *
(1 + ~TSTAT_TBA_MASK)));
tcpu->tcpu_data = (tstat_data_t *)(tstat_va + (cpu *
(1 + ~TSTAT_TBA_MASK)) + TSTAT_INSTR_SIZE);
bzero(tcpu->tcpu_data, tstat_data_size);
tcpu->tcpu_data->tdata_cpuid = cpu;
#endif /* sun4v */
/*
* Now that we have all of the instruction and data pages allocated,
* make the trap table from scratch.
*/
trapstat_make_traptab(tcpu);
if (tstat_options & TSTAT_OPT_TLBDATA) {
/*
* TLB Statistics have been specified; set up the I- and D-TLB
* entries and corresponding TLB return entries.
*/
trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
}
tcpu->tcpu_flags |= TSTAT_CPU_ALLOCATED;
/*
* Finally, get the target CPU to load the locked pages into its TLBs.
*/
xc_one(cpu, (xcfunc_t *)trapstat_load_tlb, 0, 0);
}
static void
trapstat_teardown(processorid_t cpu)
{
tstat_percpu_t *tcpu = &tstat_percpu[cpu];
#ifndef sun4v
int i;
#endif
caddr_t va = tcpu->tcpu_vabase;
ASSERT(tcpu->tcpu_pfn != NULL);
ASSERT(tcpu->tcpu_instr != NULL);
ASSERT(tcpu->tcpu_data != NULL);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&tstat_lock));
#ifndef sun4v
vmem_free(tstat_arena, tcpu->tcpu_pfn, tstat_total_pages);
vmem_free(tstat_arena, tcpu->tcpu_instr, TSTAT_INSTR_SIZE);
vmem_free(tstat_arena, tcpu->tcpu_data, tstat_data_size);
for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
xt_one(cpu, vtag_flushpage_tl1, (uint64_t)va, KCONTEXT);
}
#else
xt_one(cpu, vtag_unmap_perm_tl1, (uint64_t)va, KCONTEXT);
#endif
tcpu->tcpu_pfn = NULL;
tcpu->tcpu_instr = NULL;
tcpu->tcpu_data = NULL;
tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
}
static int
trapstat_go()
{
cpu_t *cp;
mutex_enter(&cpu_lock);
mutex_enter(&tstat_lock);
if (tstat_running) {
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (EBUSY);
}
#ifdef sun4v
/*
* Allocate large page to hold interposing tables
*/
tstat_va = contig_mem_alloc(MMU_PAGESIZE4M);
tstat_pfn = va_to_pfn(tstat_va);
if (tstat_pfn == PFN_INVALID) {
if (tstat_va != NULL)
contig_mem_free(tstat_va, MMU_PAGESIZE4M);
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (EAGAIN);
}
#endif
/*
* First, perform any necessary hot patching.
*/
trapstat_hotpatch();
/*
* Allocate the resources we'll need to measure probe effect.
*/
trapstat_probe_alloc();
cp = cpu_list;
do {
if (!(tstat_percpu[cp->cpu_id].tcpu_flags & TSTAT_CPU_SELECTED))
continue;
trapstat_setup(cp->cpu_id);
/*
* Note that due to trapstat_probe()'s use of global data,
* we determine the probe effect on each CPU serially instead
* of in parallel with an xc_all().
*/
xc_one(cp->cpu_id, (xcfunc_t *)trapstat_probe, 0, 0);
} while ((cp = cp->cpu_next) != cpu_list);
xc_all((xcfunc_t *)trapstat_enable, 0, 0);
trapstat_probe_free();
tstat_running = 1;
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (0);
}
static int
trapstat_stop()
{
int i;
mutex_enter(&cpu_lock);
mutex_enter(&tstat_lock);
if (!tstat_running) {
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (ENXIO);
}
xc_all((xcfunc_t *)trapstat_disable, 0, 0);
for (i = 0; i <= max_cpuid; i++) {
if (tstat_percpu[i].tcpu_flags & TSTAT_CPU_ALLOCATED)
trapstat_teardown(i);
}
#ifdef sun4v
contig_mem_free(tstat_va, MMU_PAGESIZE4M);
#endif
trapstat_hotpatch();
tstat_running = 0;
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (0);
}
/*
* This is trapstat's DR CPU configuration callback. It's called (with
* cpu_lock held) to unconfigure a newly powered-off CPU, or to configure a
* powered-off CPU that is to be brought into the system. We need only take
* action in the unconfigure case: because a powered-off CPU will have its
* trap table restored to KERNELBASE if it is ever powered back on, we must
* update the flags to reflect that trapstat is no longer enabled on the
* powered-off CPU. Note that this means that a TSTAT_CPU_ENABLED CPU that
* is unconfigured/powered off and later powered back on/reconfigured will
* _not_ be re-TSTAT_CPU_ENABLED.
*/
static int
trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu)
{
tstat_percpu_t *tcpu = &tstat_percpu[cpu];
ASSERT(MUTEX_HELD(&cpu_lock));
mutex_enter(&tstat_lock);
if (!tstat_running) {
mutex_exit(&tstat_lock);
return (0);
}
switch (what) {
case CPU_CONFIG:
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
break;
case CPU_UNCONFIG:
if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED)
tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
break;
default:
break;
}
mutex_exit(&tstat_lock);
return (0);
}
/*
* This is called before a CPR suspend and after a CPR resume. We don't have
* anything to do before a suspend, but after a restart we must restore the
* trap table to be our interposing trap table. However, we don't actually
* know whether or not the CPUs have been powered off -- this routine may be
* called while restoring from a failed CPR suspend. We thus run through each
* TSTAT_CPU_ENABLED CPU, and explicitly destroy and reestablish its
* interposing trap table. This assures that our state is correct regardless
* of whether or not the CPU has been newly powered on.
*/
/*ARGSUSED*/
static boolean_t
trapstat_cpr(void *arg, int code)
{
cpu_t *cp;
if (code == CB_CODE_CPR_CHKPT)
return (B_TRUE);
ASSERT(code == CB_CODE_CPR_RESUME);
mutex_enter(&cpu_lock);
mutex_enter(&tstat_lock);
if (!tstat_running) {
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (B_TRUE);
}
cp = cpu_list;
do {
tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
continue;
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
xc_one(cp->cpu_id, (xcfunc_t *)trapstat_disable, 0, 0);
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
/*
* Preserve this CPU's data in tstat_buffer and rip down its
* interposing trap table.
*/
bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
trapstat_teardown(cp->cpu_id);
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
/*
* Reestablish the interposing trap table and restore the old
* data.
*/
trapstat_setup(cp->cpu_id);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
xc_one(cp->cpu_id, (xcfunc_t *)trapstat_enable, 0, 0);
} while ((cp = cp->cpu_next) != cpu_list);
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (B_TRUE);
}
/*ARGSUSED*/
static int
trapstat_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
int i;
mutex_enter(&cpu_lock);
mutex_enter(&tstat_lock);
if (tstat_open != 0) {
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (EBUSY);
}
/*
* Register the CPU setup callback in open() rather than in attach() to
* prevent deadlock with DR code. During attach, the I/O device tree
* locks are grabbed before trapstat_attach() is invoked; registering
* in attach would thus establish the lock order (device tree lock,
* then cpu_lock). DR code, however, requires that cpu_lock be
* acquired before the device tree locks.
*/
ASSERT(!tstat_running);
register_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
/*
* Clear all options and, until specific CPUs are specified via
* TSTATIOC_CPU, mark every CPU as selected.
*/
tstat_options = 0;
for (i = 0; i <= max_cpuid; i++)
tstat_percpu[i].tcpu_flags |= TSTAT_CPU_SELECTED;
/*
* By default, all traps at TL=0 (entries 0 through TSTAT_NENT - 1) are
* enabled, and all traps at TL>0 (the remaining entries, up through
* TSTAT_TOTAL_NENT - 1) are disabled.
*/
for (i = 0; i < TSTAT_TOTAL_NENT; i++)
tstat_enabled[i] = i < TSTAT_NENT ? 1 : 0;
tstat_open = 1;
mutex_exit(&tstat_lock);
mutex_exit(&cpu_lock);
return (0);
}
/*ARGSUSED*/
static int
trapstat_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
(void) trapstat_stop();
ASSERT(!tstat_running);
mutex_enter(&cpu_lock);
unregister_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
mutex_exit(&cpu_lock);
tstat_open = 0;
return (DDI_SUCCESS);
}
static int
trapstat_option(int option)
{
mutex_enter(&tstat_lock);
if (tstat_running) {
mutex_exit(&tstat_lock);
return (EBUSY);
}
tstat_options |= option;
mutex_exit(&tstat_lock);
return (0);
}
/*ARGSUSED*/
static int
trapstat_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *crd, int *rval)
{
int i, j, out;
size_t dsize;
switch (cmd) {
case TSTATIOC_GO:
return (trapstat_go());
case TSTATIOC_NOGO:
return (trapstat_option(TSTAT_OPT_NOGO));
case TSTATIOC_STOP:
return (trapstat_stop());
case TSTATIOC_CPU:
if (arg < 0 || arg > max_cpuid)
return (EINVAL);
/*FALLTHROUGH*/
case TSTATIOC_NOCPU:
mutex_enter(&tstat_lock);
if (tstat_running) {
mutex_exit(&tstat_lock);
return (EBUSY);
}
/*
* If this is the first CPU to be specified (or if we are being
* asked to explicitly de-select CPUs), de-select every CPU;
* subsequent TSTATIOC_CPU commands then select exactly the named
* CPUs.
*/
if (!(tstat_options & TSTAT_OPT_CPU) || cmd == TSTATIOC_NOCPU) {
tstat_options |= TSTAT_OPT_CPU;
for (i = 0; i <= max_cpuid; i++) {
tstat_percpu_t *tcpu = &tstat_percpu[i];
ASSERT(cmd == TSTATIOC_NOCPU ||
(tcpu->tcpu_flags & TSTAT_CPU_SELECTED));
tcpu->tcpu_flags &= ~TSTAT_CPU_SELECTED;
}
}
if (cmd == TSTATIOC_CPU)
tstat_percpu[arg].tcpu_flags |= TSTAT_CPU_SELECTED;
mutex_exit(&tstat_lock);
return (0);
case TSTATIOC_ENTRY:
mutex_enter(&tstat_lock);
if (tstat_running) {
mutex_exit(&tstat_lock);
return (EBUSY);
}
if (arg >= TSTAT_NENT || arg < 0) {
mutex_exit(&tstat_lock);
return (EINVAL);
}
if (!(tstat_options & TSTAT_OPT_ENTRY)) {
/*
* If this is the first entry to be explicitly enabled, first
* disable every TL=0 entry so that only explicitly named
* entries remain enabled.
*/
for (i = 0; i < TSTAT_NENT; i++)
tstat_enabled[i] = 0;
tstat_options |= TSTAT_OPT_ENTRY;
}
tstat_enabled[arg] = 1;
mutex_exit(&tstat_lock);
return (0);
case TSTATIOC_NOENTRY:
mutex_enter(&tstat_lock);
if (tstat_running) {
mutex_exit(&tstat_lock);
return (EBUSY);
}
for (i = 0; i < TSTAT_NENT; i++)
tstat_enabled[i] = 0;
mutex_exit(&tstat_lock);
return (0);
case TSTATIOC_READ:
mutex_enter(&tstat_lock);
if (tstat_options & TSTAT_OPT_TLBDATA) {
dsize = tstat_data_t_exported_size;
} else {
dsize = sizeof (tstat_data_t);
}
for (i = 0, out = 0; i <= max_cpuid; i++) {
tstat_percpu_t *tcpu = &tstat_percpu[i];
if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
continue;
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
tstat_buffer->tdata_cpuid = -1;
xc_one(i, (xcfunc_t *)trapstat_snapshot, 0, 0);
if (tstat_buffer->tdata_cpuid == -1) {
/*
* This CPU is not currently responding to
* cross calls; we have caught it while it is
* being unconfigured. We'll drop tstat_lock
* and pick up and drop cpu_lock. By the
* time we acquire cpu_lock, the DR operation
* will appear consistent and we can assert
* that trapstat_cpu_setup() has cleared
* TSTAT_CPU_ENABLED.
*/
mutex_exit(&tstat_lock);
mutex_enter(&cpu_lock);
mutex_exit(&cpu_lock);
mutex_enter(&tstat_lock);
ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
continue;
}
/*
* Need to compensate for the difference between page
* sizes exported to users and page sizes available
* within the kernel.
*/
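/*
* For example (an illustrative sketch with hypothetical sizes, not a
* claim about any particular processor): if the kernel supports the
* four page sizes { 8K, 64K, 512K, 4M } but exports only
* { 8K, 64K, 4M } to users, then tstat_pgszs is 4, tstat_user_pgszs
* is 3, and USERSZC_2_SZC(2) is 3; the loop below therefore copies
* the 4M data down over the unexported 512K slot before the copyout.
*/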
if ((tstat_options & TSTAT_OPT_TLBDATA) &&
(tstat_pgszs != tstat_user_pgszs)) {
tstat_pgszdata_t *tp;
uint_t szc;
tp = &tstat_buffer->tdata_pgsz[0];
for (j = 0; j < tstat_user_pgszs; j++) {
if ((szc = USERSZC_2_SZC(j)) != j) {
bcopy(&tp[szc], &tp[j],
sizeof (tstat_pgszdata_t));
}
}
}
if (copyout(tstat_buffer, (void *)arg, dsize) != 0) {
mutex_exit(&tstat_lock);
return (EFAULT);
}
out++;
arg += dsize;
}
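/*
* If we wrote fewer than max_cpuid + 1 records, terminate the
* user's buffer by storing -1 in the tdata_cpuid field of the slot
* that would have held the next record.
*/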
if (out != max_cpuid + 1) {
processorid_t cpuid = -1;
arg += offsetof(tstat_data_t, tdata_cpuid);
if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) {
mutex_exit(&tstat_lock);
return (EFAULT);
}
}
mutex_exit(&tstat_lock);
return (0);
case TSTATIOC_TLBDATA:
return (trapstat_option(TSTAT_OPT_TLBDATA));
default:
break;
}
return (ENOTTY);
}
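/*
* To illustrate the control interface above: a userland consumer might
* drive trapstat roughly as follows. This is only a hedged sketch, not
* the actual trapstat(1M) implementation; the device path assumes a
* /dev link to the "trapstat" minor node created in trapstat_attach(),
* ncpu and record_size are placeholders (ncpu is max_cpuid + 1, and
* record_size is the dsize computed in TSTATIOC_READ: either
* sizeof (tstat_data_t) or, after TSTATIOC_TLBDATA, the exported
* size), and all error handling is elided.
*
*	int fd = open("/dev/trapstat", O_RDONLY);
*	void *buf = malloc(ncpu * record_size);
*
*	(void) ioctl(fd, TSTATIOC_GO);
*	sleep(1);
*	(void) ioctl(fd, TSTATIOC_READ, buf);
*	(void) ioctl(fd, TSTATIOC_STOP);
*
* A record with tdata_cpuid of -1 terminates the valid data whenever
* fewer than ncpu records were snapshotted.
*/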
/*ARGSUSED*/
static int
trapstat_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
int error;
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
*result = (void *)tstat_devi;
error = DDI_SUCCESS;
break;
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
error = DDI_SUCCESS;
break;
default:
error = DDI_FAILURE;
}
return (error);
}
static int
trapstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
switch (cmd) {
case DDI_ATTACH:
break;
case DDI_RESUME:
return (DDI_SUCCESS);
default:
return (DDI_FAILURE);
}
if (ddi_create_minor_node(devi, "trapstat", S_IFCHR,
0, DDI_PSEUDO, 0) == DDI_FAILURE) {
ddi_remove_minor_node(devi, NULL);
return (DDI_FAILURE);
}
ddi_report_dev(devi);
tstat_devi = devi;
tstat_pgszs = page_num_pagesizes();
tstat_user_pgszs = page_num_user_pagesizes();
tstat_data_t_size = sizeof (tstat_data_t) +
(tstat_pgszs - 1) * sizeof (tstat_pgszdata_t);
tstat_data_t_exported_size = sizeof (tstat_data_t) +
(tstat_user_pgszs - 1) * sizeof (tstat_pgszdata_t);
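/*
* Size the per-CPU interposing trap table and data area. On sun4u, the
* data occupies its own MMU pages (the shift-and-add below rounds up,
* over-allocating by a page when the size is an exact multiple); on
* sun4v, the data is appended directly to the instruction area and
* only the total is rounded up to whole pages.
*/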
#ifndef sun4v
tstat_data_pages = (tstat_data_t_size >> MMU_PAGESHIFT) + 1;
tstat_total_pages = TSTAT_INSTR_PAGES + tstat_data_pages;
tstat_data_size = tstat_data_pages * MMU_PAGESIZE;
tstat_total_size = TSTAT_INSTR_SIZE + tstat_data_size;
#else
tstat_data_pages = 0;
tstat_data_size = tstat_data_t_size;
tstat_total_pages = ((TSTAT_INSTR_SIZE + tstat_data_size) >>
MMU_PAGESHIFT) + 1;
tstat_total_size = tstat_total_pages * MMU_PAGESIZE;
#endif
tstat_percpu = kmem_zalloc((max_cpuid + 1) *
sizeof (tstat_percpu_t), KM_SLEEP);
/*
* Create our own arena backed by segkmem to assure a source of
* MMU_PAGESIZE-aligned allocations. We allocate out of the
* heap32_arena to assure that we can address the allocated memory with
* a single sethi/simm13 pair in the interposing trap table entries.
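* (A sethi can supply bits 31:10 of a constant and a simm13 the low
* bits, so only 32-bit addresses are reachable this way; hence the use
* of heap32_arena.)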
*/
tstat_arena = vmem_create("trapstat", NULL, 0, MMU_PAGESIZE,
segkmem_alloc_permanent, segkmem_free, heap32_arena, 0, VM_SLEEP);
tstat_enabled = kmem_alloc(TSTAT_TOTAL_NENT * sizeof (int), KM_SLEEP);
tstat_buffer = kmem_alloc(tstat_data_t_size, KM_SLEEP);
/*
* CB_CL_CPR_POST_USER is the class that executes from cpr_resume()
* after user threads can be restarted. By executing in this class,
* we are assured of the availability of system services needed to
* resume trapstat (specifically, we are assured that all CPUs are
* restarted and responding to cross calls).
*/
tstat_cprcb =
callb_add(trapstat_cpr, NULL, CB_CL_CPR_POST_USER, "trapstat");
return (DDI_SUCCESS);
}
static int
trapstat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
int rval;
ASSERT(devi == tstat_devi);
switch (cmd) {
case DDI_DETACH:
break;
case DDI_SUSPEND:
return (DDI_SUCCESS);
default:
return (DDI_FAILURE);
}
ASSERT(!tstat_running);
rval = callb_delete(tstat_cprcb);
ASSERT(rval == 0);
kmem_free(tstat_buffer, tstat_data_t_size);
kmem_free(tstat_enabled, TSTAT_TOTAL_NENT * sizeof (int));
vmem_destroy(tstat_arena);
kmem_free(tstat_percpu, (max_cpuid + 1) * sizeof (tstat_percpu_t));
ddi_remove_minor_node(devi, NULL);
return (DDI_SUCCESS);
}
/*
* Configuration data structures
*/
static struct cb_ops trapstat_cb_ops = {
trapstat_open, /* open */
trapstat_close, /* close */
nulldev, /* strategy */
nulldev, /* print */
nodev, /* dump */
nodev, /* read */
nodev, /* write */
trapstat_ioctl, /* ioctl */
nodev, /* devmap */
nodev, /* mmap */
nodev, /* segmap */
nochpoll, /* poll */
ddi_prop_op, /* cb_prop_op */
0, /* streamtab */
D_MP | D_NEW /* Driver compatibility flag */
};
static struct dev_ops trapstat_ops = {
DEVO_REV, /* devo_rev, */
0, /* refcnt */
trapstat_info, /* getinfo */
nulldev, /* identify */
nulldev, /* probe */
trapstat_attach, /* attach */
trapstat_detach, /* detach */
nulldev, /* reset */
&trapstat_cb_ops, /* cb_ops */
(struct bus_ops *)0, /* bus_ops */
};
static struct modldrv modldrv = {
&mod_driverops, /* Type of module. This one is a driver */
"Trap Statistics", /* name of module */
&trapstat_ops, /* driver ops */
};
static struct modlinkage modlinkage = {
MODREV_1, (void *)&modldrv, NULL
};
int
_init(void)
{
return (mod_install(&modlinkage));
}
int
_fini(void)
{
return (mod_remove(&modlinkage));
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}