vm_usage.c revision dc32d872cbeb56532bcea030255db9cd79bac7da
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* vm_usage
*
* This file implements the getvmusage() private system call.
* getvmusage() counts the amount of resident memory pages and swap
* reserved by the specified process collective. A "process collective" is
* the set of processes owned by a particular zone, project, task, or user.
*
* rss and swap are counted so that for a given process collective, a page is
* only counted once. For example, this means that if multiple processes in
* the same project map the same page, then the project will only be charged
* once for that page. On the other hand, if two processes in different
* projects map the same page, then both projects will be charged
* for the page.
*
* The vm_getusage() calculation is implemented so that the first thread
* performs the calculation, and other requesting threads wait for it to
* finish, copying the results. This enables multiple rcapds and prstats to
* consume data from the same calculation. The results are also cached so that
* a caller interested in recent results can just copy them instead of starting
* a new calculation. The caller passes the maximum age (in seconds) of the
* data. If the cached data is young enough, the cache is copied, otherwise,
* a new calculation is executed and the cache is replaced with the new
* data.
*
* The rss calculation for each process collective is as follows:
*
* - Inspect flags, determine if counting rss for zones, projects, tasks,
* and/or users.
* - For each proc:
* - For each seg in proc's address space:
* - If seg is private:
* - Lookup anons in the amp.
* - For incore pages not previously visited for each of the
* proc's collectives, add incore pagesize to each
* collective.
* Anons with a refcnt of 1 can be assumed to be not
* previously visited.
* - For address ranges without anons in the amp:
* - Lookup pages in underlying vnode.
* - For incore pages not previously visited for
* each of the proc's collectives, add incore
* pagesize to each collective.
* - If seg is shared:
* - Lookup pages in the shared amp or vnode.
* - For incore pages not previously visited for each of
* the proc's collectives, add incore pagesize to each
* collective.
*
* Swap is reserved by private segments, and shared anonymous segments.
* The only shared anon segments which do not reserve swap are ISM segments
* and schedctl segments, both of which can be identified by having
* amp->swresv == 0.
*
* The swap calculation for each collective is as follows:
*
* - Inspect flags, determine if counting swap for zones, projects, tasks,
* and/or users.
* - For each proc:
* - For each seg in proc's address space:
* - If seg is private:
* - Add svd->swresv pages to swap count for each of the
* proc's collectives.
* - If seg is anon, shared, and amp->swresv != 0
* - For address ranges in amp not previously visited for
* each of the proc's collectives, add size of address
* range to the swap count for each collective.
*
* These two calculations are done simultaneously, with most of the work
* being done in vmu_calculate_seg(). The results of the calculation are
* copied into "vmu_data.vmu_cache_results".
*
* To perform the calculation, various things are tracked and cached:
*
* - incore/not-incore page ranges for all vnodes.
* (vmu_data.vmu_all_vnodes_hash)
* This eliminates looking up the same page more than once.
*
* - incore/not-incore page ranges for all shared amps.
* (vmu_data.vmu_all_amps_hash)
* This eliminates looking up the same page more than once.
*
* - visited page ranges for each collective.
* - per vnode (entity->vme_vnode_hash)
* - per shared amp (entity->vme_amp_hash)
* For accurate counting of map-shared and COW-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
* (entity->vme_anon_hash)
* For accurate counting of COW-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
* collectives:
*
* - A zone.
* - A project, task, or user within a zone.
* - The entire system (vmu_data.vmu_system).
* - Each collapsed (col) project and user. This means a given projid or
* uid, regardless of which zone the process is in. For instance,
* project 0 in the global zone and project 0 in a non-global zone are
* the same collapsed project.
*
* Each entity structure tracks which pages have been already visited for
* that entity (via previously inspected processes) so that these pages are
* not double counted.
*/
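/*
 * Illustrative sketch (not part of the original file): how a consumer such as
 * rcapd or prstat might call the getvmusage(2) libc wrapper over this private
 * syscall. The ENOSPC retry mirrors the nres contract documented at
 * vm_getusage() below; fetch_zone_usage() is a hypothetical name.
 */
#if 0	/* user-level example, not kernel code */
#include <sys/vm_usage.h>
#include <stdlib.h>
#include <errno.h>

static vmusage_t *
fetch_zone_usage(size_t *nresp)
{
    size_t nres = 8;
    vmusage_t *buf = malloc(nres * sizeof (vmusage_t));

    if (buf == NULL)
        return (NULL);
    /* Accept cached results up to 5 seconds old. */
    while (getvmusage(VMUSAGE_ZONE, 5, buf, &nres) != 0) {
        if (errno != ENOSPC) {
            free(buf);
            return (NULL);
        }
        /* nres now holds the required count; grow and retry. */
        free(buf);
        if ((buf = malloc(nres * sizeof (vmusage_t))) == NULL)
            return (NULL);
    }
    *nresp = nres;
    return (buf);
}
#endif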
#include <sys/types.h>
#include <sys/param.h>
#include <sys/avl.h>
#include <sys/kmem.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysmacros.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>
#define VMUSAGE_HASH_SIZE 512
#define VMUSAGE_TYPE_VNODE 1
#define VMUSAGE_TYPE_AMP 2
#define VMUSAGE_TYPE_ANON 3
#define VMUSAGE_BOUND_UNKNOWN 0
#define VMUSAGE_BOUND_INCORE 1
#define VMUSAGE_BOUND_NOT_INCORE 2
/*
* bounds for vnodes and shared amps
* Each bound is either entirely incore, entirely not in core, or
* entirely unknown. bounds are stored in an avl tree sorted by start member
* when in use, otherwise (free or temporary lists) they're strung
* together off of vmb_next.
*/
typedef struct vmu_bound {
    avl_node_t vmb_node;
    struct vmu_bound *vmb_next; /* NULL in tree else on free/temp list */
    pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
    pgcnt_t vmb_end;    /* page offset in vnode/amp on which bound ends */
    char vmb_type;      /* One of VMUSAGE_BOUND_* */
} vmu_bound_t;
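/*
 * Illustrative sketch (assumption; vmu_bounds_incore_pages() is a
 * hypothetical helper, not part of this file): summing the resident pages
 * recorded in a list of bounds strung off vmb_next. Bounds are inclusive of
 * both endpoints, so a single-page bound has vmb_start == vmb_end.
 */
static pgcnt_t
vmu_bounds_incore_pages(vmu_bound_t *first)
{
    pgcnt_t pages = 0;
    vmu_bound_t *b;

    for (b = first; b != NULL; b = b->vmb_next) {
        if (b->vmb_type == VMUSAGE_BOUND_INCORE)
            pages += b->vmb_end - b->vmb_start + 1;
    }
    return (pages);
}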
/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp. The bounds tree tracks the page ranges
 * known to be incore or not incore.
 */
typedef struct vmu_object {
    struct vmu_object *vmo_next; /* free list */
    caddr_t vmo_key;             /* address of vnode or amp */
    short vmo_type;              /* One of VMUSAGE_TYPE_* */
    avl_tree_t vmo_bounds;       /* incore/not-incore page ranges */
} vmu_object_t;
/*
* Entity by which to count results.
*
* Each entity holds the result counters for one accounting id
* (zone, project, etc), and hashes of vm structures that have already
* been visited for the entity.
*
* vme_next: links the list of all entities currently being counted by
* vmu_calculate().
*
* vme_next_calc: links the list of entities related to the current process
* being counted by vmu_calculate_proc().
*
* vmu_calculate() walks all processes. For each process, it makes a
* list of the entities related to that process using vme_next_calc. This
* list changes each time vmu_calculate_proc() is called.
*
*/
typedef struct vmu_entity {
    struct vmu_entity *vme_next;
    struct vmu_entity *vme_next_calc;
    mod_hash_t *vme_vnode_hash;  /* vnodes visited for entity */
    mod_hash_t *vme_amp_hash;    /* shared amps visited for entity */
    mod_hash_t *vme_anon_hash;   /* COW anons visited for entity */
    vmusage_t vme_result;        /* identifies entity and results */
} vmu_entity_t;
/*
* Hash of entities visited within a zone, and an entity for the zone
* itself.
*/
typedef struct vmu_zone {
    struct vmu_zone *vmz_next;   /* free list */
    id_t vmz_id;
    vmu_entity_t *vmz_zone;      /* entity for the zone itself */
    mod_hash_t *vmz_projects_hash;
    mod_hash_t *vmz_tasks_hash;
    mod_hash_t *vmz_rusers_hash;
    mod_hash_t *vmz_eusers_hash;
} vmu_zone_t;
/*
* Cache of results from last calculation
*/
typedef struct vmu_cache {
    vmusage_t *vmc_results;      /* Results from last calculation */
    uint64_t vmc_nresults;       /* Count of cached results */
    uint64_t vmc_refcnt;         /* Reference count on cached results */
    uint_t vmc_flags;            /* Flags used by the calculation for */
                                 /* vm_getusage(). */
    hrtime_t vmc_timestamp;      /* When the calculation was done */
} vmu_cache_t;
/*
* top level rss info for the system
*/
typedef struct vmu_data {
    kmutex_t vmu_lock;                   /* Protects vmu_data */
    kcondvar_t vmu_cv;                   /* Used to signal threads */
                                         /* Waiting for */
                                         /* Rss_calc_thread to finish */
    vmu_entity_t *vmu_system;            /* Entity for tracking */
                                         /* rss/swap for all processes */
                                         /* in all zones */
    mod_hash_t *vmu_zones_hash;          /* Zones visited */
    mod_hash_t *vmu_projects_col_hash;   /* These *_col_hash hashes */
    mod_hash_t *vmu_rusers_col_hash;     /* keep track of entities, */
    mod_hash_t *vmu_eusers_col_hash;     /* ignoring zoneid, in order */
                                         /* to implement VMUSAGE_COL_* */
                                         /* flags, which aggregate by */
                                         /* project or user regardless */
                                         /* of zoneid. */
    mod_hash_t *vmu_all_vnodes_hash;     /* System-wide visited vnodes */
                                         /* to track incore/not-incore */
    mod_hash_t *vmu_all_amps_hash;       /* System-wide visited shared */
                                         /* amps to track incore/not- */
                                         /* incore */
    vmu_entity_t *vmu_entities;          /* Linked list of entities */
                                         /* counted by */
                                         /* vmu_calculate() */
    size_t vmu_nentities;                /* Count of entities in list */
    vmu_cache_t *vmu_cache;              /* Cached results */
    kthread_t *vmu_calc_thread;          /* NULL, or thread running */
                                         /* currently running calc */
                                         /* thread */
    uint_t vmu_calc_flags;               /* Flags being used by calc */
    uint_t vmu_pending_flags;            /* Flags of vm_getusage() */
                                         /* threads waiting for */
                                         /* calc thread to finish */
    uint_t vmu_pending_waiters;          /* Number of threads waiting */
                                         /* for calc thread */
    vmu_bound_t *vmu_free_bounds;
    vmu_object_t *vmu_free_objects;
    vmu_entity_t *vmu_free_entities;
    vmu_zone_t *vmu_free_zones;
} vmu_data_t;
extern zone_t *global_zone;
extern struct seg_ops segspt_shmops;
static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;
/*
* Comparison routine for AVL tree. We base our comparison on vmb_start.
*/
static int
bounds_cmp(const void *bnd1, const void *bnd2)
{
    const vmu_bound_t *bound1 = bnd1;
    const vmu_bound_t *bound2 = bnd2;

    if (bound1->vmb_start == bound2->vmb_start) {
        return (0);
    }
    if (bound1->vmb_start < bound2->vmb_start) {
        return (-1);
    }
    return (1);
}
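/*
 * Illustrative sketch (assumption; vmu_object_tree_init() is a hypothetical
 * helper): a bounds tree is set up with this comparator via avl_create(),
 * keyed on the avl node embedded in each vmu_bound_t.
 */
static void
vmu_object_tree_init(vmu_object_t *object)
{
    avl_create(&object->vmo_bounds, bounds_cmp,
        sizeof (vmu_bound_t), offsetof(vmu_bound_t, vmb_node));
}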
/*
* Save a bound on the free list.
*/
static void
vmu_free_bound(vmu_bound_t *bound)
{
    bound->vmb_next = vmu_data.vmu_free_bounds;
    vmu_data.vmu_free_bounds = bound;
}
/*
* Free an object, and all visited bound info.
*/
static void
vmu_free_object(mod_hash_val_t val)
{
    vmu_object_t *obj = (vmu_object_t *)val;
    vmu_bound_t *bound;
    void *cookie = NULL;

    while ((bound = avl_destroy_nodes(&obj->vmo_bounds, &cookie)) != NULL)
        vmu_free_bound(bound);
    avl_destroy(&obj->vmo_bounds);
    obj->vmo_next = vmu_data.vmu_free_objects;
    vmu_data.vmu_free_objects = obj;
}
/*
* Free an entity, and hashes of visited objects for that entity.
*/
static void
vmu_free_entity(mod_hash_val_t val)
{
    vmu_entity_t *entity = (vmu_entity_t *)val;

    if (entity->vme_vnode_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_vnode_hash);
    if (entity->vme_amp_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_amp_hash);
    if (entity->vme_anon_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_anon_hash);
    entity->vme_next = vmu_data.vmu_free_entities;
    vmu_data.vmu_free_entities = entity;
}
/*
* Free zone entity, and all hashes of entities inside that zone,
* which are projects, tasks, and users.
*/
static void
vmu_free_zone(mod_hash_val_t val)
{
    vmu_zone_t *zone = (vmu_zone_t *)val;

    if (zone->vmz_zone != NULL) {
        vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
        zone->vmz_zone = NULL;
    }
    if (zone->vmz_projects_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_projects_hash);
    if (zone->vmz_tasks_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
    if (zone->vmz_rusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
    if (zone->vmz_eusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
    zone->vmz_next = vmu_data.vmu_free_zones;
    vmu_data.vmu_free_zones = zone;
}
/*
* Initialize synchronization primitives and hashes for system-wide tracking
* of visited vnodes and shared amps. Initialize results cache.
*/
void
vm_usage_init()
{
    mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);

    vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
        "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (vnode_t));
    vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
        "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (struct anon_map));
    vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
        "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_zones_hash = mod_hash_create_idhash(
        "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);

    vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
        sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    vmu_object_cache = kmem_cache_create("vmu_object_cache",
        sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

    vmu_data.vmu_entities = NULL;
    vmu_data.vmu_nentities = 0;
    vmu_data.vmu_calc_flags = 0;
}
/*
* Allocate hashes for tracking vm objects visited for an entity.
* Update list of entities.
*/
static vmu_entity_t *
{
} else {
}
sizeof (vnode_t));
sizeof (struct anon_map));
"vmusage anon hash", VMUSAGE_HASH_SIZE,
mod_hash_null_valdtor, sizeof (struct anon));
return (entity);
}
/*
* Allocate a zone entity, and hashes for tracking visited vm objects
* for projects, tasks, and users within that zone.
*/
static vmu_zone_t *
{
} else {
}
return (zone);
}
/*
* Allocate a structure for tracking visited bounds for a vm object.
*/
static vmu_object_t *
{
} else {
}
return (object);
}
/*
* Allocate and return a bound structure.
*/
static vmu_bound_t *
{
} else {
}
return (bound);
}
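/*
 * Illustrative sketch (assumption, shown under a hypothetical name): the
 * truncated allocator above plausibly follows the same pattern as the other
 * vmu_alloc_* routines, taking a recycled bound from the free list before
 * falling back to the kmem cache.
 */
static vmu_bound_t *
vmu_alloc_bound_sketch(void)
{
    vmu_bound_t *bound;

    if (vmu_data.vmu_free_bounds != NULL) {
        bound = vmu_data.vmu_free_bounds;
        vmu_data.vmu_free_bounds = bound->vmb_next;
        bound->vmb_next = NULL;
    } else {
        bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
    }
    return (bound);
}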
/*
* vmu_find_insert_* functions implement hash lookup or allocate and
* insert operations.
*/
static vmu_object_t *
{
int ret;
(mod_hash_val_t *)&object);
if (ret != 0) {
}
return (object);
}
static int
{
int ret;
(mod_hash_val_t *)&val);
if (ret == 0)
return (0);
return (1);
}
static vmu_entity_t *
{
int ret;
(mod_hash_val_t *)&entity);
if (ret != 0) {
(mod_hash_hndl_t)0);
}
return (entity);
}
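/*
 * Illustrative sketch (assumption; the allocator name is hypothetical): the
 * find-or-insert pattern the vmu_find_insert_* routines implement, shown for
 * an object hash. A failed i_mod_hash_find_nosync() means the key is new, so
 * a fresh object is allocated and inserted under the same key.
 */
static vmu_object_t *
vmu_find_insert_object_sketch(mod_hash_t *hash, caddr_t key, uint_t type)
{
    int ret;
    vmu_object_t *object;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t *)&object);
    if (ret != 0) {
        object = vmu_alloc_object(key, type); /* hypothetical allocator */
        ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
            (mod_hash_val_t)object, (mod_hash_hndl_t)0);
        ASSERT(ret == 0);
    }
    return (object);
}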
/*
* Returns list of object bounds between start and end. New bounds inserted
* by this call are given type.
*
* Returns the number of pages covered if new bounds are created. Returns 0
* if the range between start and end was already covered by existing bounds.
*/
static pgcnt_t
{
tmp = vmu_alloc_bound();
/* Hopelessly optimistic case. */
/* We got lucky. */
}
/* Is start in the previous node? */
/* We found start. */
}
}
}
/*
* At this point, if *first is still NULL, then we
* didn't get a direct hit and start isn't covered
* by the previous node. We know that the next node
* must have a greater start value than we require
* because avl_find tells us where the AVL routines would
* insert our new node. We have some gap between the
* start we want and the next node.
*/
/* Fill the gap. */
} else {
/* We have a gap over [start, end]. */
}
}
/* We're done. */
return (ret);
}
/*
* If we are here we still need to set *last and
* that may involve filling in some gaps.
*/
for (;;) {
/* We're done. */
break;
}
/* Bottom or mid tree with gap. */
tmp = vmu_alloc_bound();
break;
} else {
/* Non-contiguous. */
tmp = vmu_alloc_bound();
} else {
}
}
}
return (ret);
}
/*
* vmu_update_bounds()
*
* tree: avl_tree in which first and last hang.
*
* first, last: list of continuous bounds, of which zero or more are of
* type VMUSAGE_BOUND_UNKNOWN.
*
* new_tree: avl_tree in which new_first and new_last hang.
*
* new_first, new_last: list of continuous bounds, of which none are of
* type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
* update the types of bounds in (first,last) with
* type VMUSAGE_BOUND_UNKNOWN.
*
* For the list of bounds (first,last), this function updates any bounds
* with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
* the list (new_first, new_last).
*
* If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
* (new_first, new_last), it will be split into multiple bounds.
*
* Return value:
* The number of pages in the list of bounds (first,last) that were of
* type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
* VMUSAGE_BOUND_INCORE.
*
*/
static pgcnt_t
{
/*
* Verify first and last bound are covered by new bounds if they
* have unknown type.
*/
for (;;) {
/* If bound already has type, proceed to next bound. */
break;
continue;
}
/* need to split bound */
tmp = vmu_alloc_bound();
} else {
break;
}
}
return (rss);
}
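/*
 * Illustrative sketch (simplified assumption): the core of the update walk on
 * a plain list representation, ignoring the case where an unknown bound spans
 * several new bounds and must be split. The real routine also works against
 * the AVL trees passed in.
 */
static pgcnt_t
vmu_update_bounds_sketch(vmu_bound_t *first, vmu_bound_t *new_first)
{
    pgcnt_t rss = 0;
    vmu_bound_t *cur = first;
    vmu_bound_t *new_cur = new_first;

    while (cur != NULL) {
        if (cur->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            cur = cur->vmb_next;    /* type already known */
            continue;
        }
        /* Advance the new list until it overlaps cur. */
        while (new_cur != NULL && new_cur->vmb_end < cur->vmb_start)
            new_cur = new_cur->vmb_next;
        if (new_cur == NULL)
            break;    /* unknown bounds must be covered; see above */
        /* Assume (for this sketch) new_cur covers all of cur. */
        cur->vmb_type = new_cur->vmb_type;
        if (cur->vmb_type == VMUSAGE_BOUND_INCORE)
            rss += cur->vmb_end - cur->vmb_start + 1;
        cur = cur->vmb_next;
    }
    return (rss);
}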
/*
* Merges adjacent bounds with same type between first and last bound.
* After merge, last pointer may point to a different bound, as (incoming)
* last bound may have been merged away.
*/
static void
{
}
} else {
}
}
}
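/*
 * Illustrative sketch (simplified assumption): merging adjacent same-type
 * bounds on a plain vmb_next list; the real routine merges AVL-tree
 * neighbors and may free the incoming last bound, as noted above.
 */
static void
vmu_merge_bounds_sketch(vmu_bound_t *first)
{
    vmu_bound_t *cur = first;
    vmu_bound_t *next;

    while (cur != NULL && (next = cur->vmb_next) != NULL) {
        if (cur->vmb_type == next->vmb_type &&
            cur->vmb_end + 1 == next->vmb_start) {
            /* Absorb next into cur and recycle it. */
            cur->vmb_end = next->vmb_end;
            cur->vmb_next = next->vmb_next;
            vmu_free_bound(next);
        } else {
            cur = next;
        }
    }
}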
/*
* Given an amp and a list of bounds, updates each bound's type with
* VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
*
* If a bound is partially incore, it will be split into two bounds.
* first and last may be modified, as bounds may be split into multiple
* bounds if they are partially incore/not-incore.
*
* Set incore to non-zero if bounds are already known to be incore.
*
*/
static void
{
short bound_type;
short page_type;
/* Shared anon slots don't change once set. */
for (;;) {
break;
continue;
}
/*
* These are used to determine how much to increment
* index when a large page is found.
*/
- 1;
}
} else {
}
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
/*
* If current bound type does not match page
* type, need to split off new bound.
*/
tmp = vmu_alloc_bound();
}
if (pgcnt > 1) {
/*
* If inside large page, jump to next large
* page
*/
} else {
index++;
}
}
break;
} else
}
}
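/*
 * Illustrative sketch (assumption; the helper name is hypothetical): how a
 * single shared-amp slot can be classified, using swap_xlate() to find the
 * backing vnode/offset for the anon slot and page_exists() to test residency.
 * The real routine also widens its step when it detects a large page.
 */
static short
vmu_anon_slot_type(struct anon_map *amp, ulong_t index)
{
    struct anon *ap;
    struct vnode *vn;
    anoff_t off;

    ap = anon_get_ptr(amp->ahp, index);
    if (ap == NULL)
        return (VMUSAGE_BOUND_NOT_INCORE);
    swap_xlate(ap, &vn, &off);
    if (vn != NULL && page_exists(vn, (u_offset_t)off) != NULL)
        return (VMUSAGE_BOUND_INCORE);
    return (VMUSAGE_BOUND_NOT_INCORE);
}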
/*
* Same as vmu_amp_update_incore_bounds(), except for tracking
* incore-/not-incore for vnodes.
*/
static void
{
short bound_type;
short page_type;
for (;;) {
break;
continue;
}
/*
* These are used to determine how much to increment
* index when a large page is found.
*/
- 1;
}
} else {
}
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
/*
* If current bound type does not match page
* type, need to split off new bound.
*/
tmp = vmu_alloc_bound();
}
if (pgcnt > 1) {
/*
* If inside large page, jump to next large
* page
*/
} else {
index++;
}
}
break;
} else
}
}
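/*
 * Illustrative sketch (assumption; hypothetical helper name): the vnode case
 * is simpler, since the page index within the object maps directly to a byte
 * offset in the vnode.
 */
static short
vmu_vnode_page_type(vnode_t *vnode, pgcnt_t index)
{
    if (vnode->v_pages != NULL &&
        page_exists(vnode, (u_offset_t)ptob(index)) != NULL)
        return (VMUSAGE_BOUND_INCORE);
    return (VMUSAGE_BOUND_NOT_INCORE);
}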
/*
* Calculate the rss and swap consumed by a segment. vmu_entities is the
* list of entities to visit. For shared segments, the vnode or amp
* is looked up in each entity to see if it has been already counted. Private
* anon pages are checked per entity to ensure that COW pages are not
* double counted.
*
* For private mapped files, first the amp is checked for private pages.
* Bounds not backed by the amp are looked up in the vnode for each entity
* to avoid double counting of private COW vnode pages.
*/
static void
{
struct segvn_data *svd;
int file = 0;
/* Can zero-length segments exist? Not sure, so paranoia. */
return;
/*
* Figure out if there is a shared object (such as a named vnode or
* a shared amp), then figure out if there is a private amp, which
* identifies private pages.
*/
} else {
RW_READER) != 0) {
/*
* Text replication anon maps can be shared
* across all zones. Space used for text
* replication is typically capped as a small %
* of memory. To keep it simple for now we
* don't account for swap and memory space used
* for text replication.
*/
}
}
}
file = 1;
}
/* schedctl mappings are always in core */
}
s_start = 0;
/* ism segments are always incore and do not reserve swap */
} else {
return;
}
/*
* If there is a private amp, count anon pages that exist. If an
* anon has a refcnt > 1 (COW sharing), then save the anon in a
* hash so that it is not double counted.
*
* If there is also a shared object, then figure out the bounds
* which are not mapped by the private amp.
*/
if (private_amp != NULL) {
/* Enter as writer to prevent COW anons from being freed */
int cnt;
&p_index_next);
/*
* If next anon is past end of mapping, simulate
* end of anon so loop terminates.
*/
if (p_index_next > p_end) {
}
/*
* For COW segments, keep track of bounds not
* backed by private amp so they can be looked
* up in the backing vnode
*/
if (p_index_next != p_index) {
/*
* Compute index difference between anon and
* previous anon.
*/
if (shared_object != NULL) {
cur = vmu_alloc_bound();
} else {
}
}
}
/* Detect end of anons in amp */
break;
p_index++;
s_index++;
continue;
}
/*
* If large page is found, compute portion of large
* page in mapping, and increment indices to the next
* large page.
*/
/* First page in large page */
/* Last page in large page */
/*
* Artificially end page if page extends past
* end of mapping.
*/
/*
* Compute number of pages from large page
* which are mapped.
*/
/*
* Point indices at page after large page,
* or at page after end of mapping.
*/
} else {
p_index++;
s_index++;
}
/*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
*/
if (cnt == 1) {
continue;
}
/*
* Track COW anons per entity so
* they are not double counted.
*/
continue;
}
}
}
/* Add up resident anon and swap reserved for private mappings */
}
}
/* Compute resident pages backing shared amp or named vnode */
if (shared_object != NULL) {
/*
* No private amp, or private amp has no anon
* structs. This means entire segment is backed by
* the shared object.
*/
first = vmu_alloc_bound();
}
/*
* Iterate bounds not backed by private amp, and compute
* resident pages.
*/
/* new bounds, find incore/not-incore */
if (shared_object->vmo_type ==
tree,
(vnode_t *)
&last);
} else {
tree,
(struct anon_map *)
}
}
if (virt == 0)
continue;
/*
* Range visited for this entity
*/
/* shared anon mapping */
result->vmu_swap_all +=
result->vmu_rss_shared +=
/* shared file mapping */
result->vmu_rss_shared +=
/* private file mapping */
}
}
}
}
}
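/*
 * Illustrative sketch (assumption; hypothetical helper name): once a range of
 * resident pages has been attributed to an entity, the counters in its
 * vmusage_t result are advanced along these lines, matching the
 * vmu_rss_all/vmu_rss_shared updates visible in the fragments above.
 */
static void
vmu_charge_rss(vmu_entity_t *entity, pgcnt_t pages, boolean_t shared)
{
    vmusage_t *result = &entity->vme_result;
    uint64_t bytes = (uint64_t)pages * PAGESIZE;

    result->vmu_rss_all += bytes;
    if (shared)
        result->vmu_rss_shared += bytes;
    else
        result->vmu_rss_private += bytes;
}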
/*
* Based on the current calculation flags, find the entities that are
* relevant to the current process. Then calculate rss and swap for each
* segment in the process's address space for each relevant entity.
*/
static void
{
int ret;
/* Figure out which entities are being computed */
}
if (vmu_data.vmu_calc_flags &
(mod_hash_val_t *)&zone);
if (ret != 0) {
}
}
if (vmu_data.vmu_calc_flags &
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
}
}
/* Entities which collapse projects and users for all zones */
}
}
}
/* process all segs in process's address space */
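    /*
     * Sketch of the elided walk (assumption, not the original body): visit
     * each segment with the address space held as reader, using the
     * iteration macros from <vm/as.h>. "as", "seg", and "entities" are
     * locals assumed to be declared in the elided portion of this function,
     * and the lock macro signature varies across illumos revisions.
     */
    as = p->p_as;
    AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
    for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
        vmu_calculate_seg(entities, seg);
    AS_LOCK_EXIT(as, &as->a_lock);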
}
}
/*
* Free data created by previous call to vmu_calculate().
*/
static void
{
}
/*
* Free unused data structures. These can result if the system workload
* decreases between calculations.
*/
static void
{
vmu_zone_t *tz;
}
}
}
}
}
extern kcondvar_t *pr_pid_cv;
/*
* Determine which entity types are relevant and allocate the hashes to
* track them. Then walk the process table and count rss and swap
* for each process's address space. Address space objects such as
* vnodes, amps and anons are tracked per entity, so that they are
* not double counted in the results.
*
*/
static void
{
int i = 0;
int ret;
proc_t *p;
/*
* Walk process table and calculate rss of each proc.
*
* Pidlock and p_lock cannot be held while doing the rss calculation.
* This is because:
* 1. The calculation allocates using KM_SLEEP.
* 2. The calculation grabs a_lock, which cannot be grabbed
* after p_lock.
*
* Since pidlock must be dropped, we cannot simply walk the
* practive list. Instead, we walk the process table, and sprlock
* each process to ensure that it does not exit during the
* calculation.
*/
    for (i = 0; i < v.v_proc; i++) {
again:
        mutex_enter(&pidlock);
        p = pid_entry(i);
        if (p == NULL) {
            mutex_exit(&pidlock);
            continue;
        }
        mutex_enter(&p->p_lock);
        mutex_exit(&pidlock);

        if (panicstr) {
            mutex_exit(&p->p_lock);
            return;
        }

        /* Try to set P_PR_LOCK */
        ret = sprtrylock_proc(p);
        if (ret == -1) {
            /* Process in invalid state */
            mutex_exit(&p->p_lock);
            continue;
        } else if (ret == 1) {
            /*
             * P_PR_LOCK is already set. Wait and try again.
             * This also drops p_lock.
             */
            sprwaitlock_proc(p);
            goto again;
        }
        mutex_exit(&p->p_lock);

        vmu_calculate_proc(p);

        mutex_enter(&p->p_lock);
        sprunlock(p);
    }
}
/*
* allocate a new cache for N results satisfying flags
*/
static vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
    vmu_cache_t *cache;

    cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
    cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
    cache->vmc_nresults = nres;
    cache->vmc_flags = flags;
    cache->vmc_refcnt = 1;
    return (cache);
}
/*
* Make sure cached results are not freed
*/
static void
vmu_cache_hold(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    cache->vmc_refcnt++;
}
/*
* free cache data
*/
static void
vmu_cache_rele(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    ASSERT(cache->vmc_refcnt > 0);
    cache->vmc_refcnt--;
    if (cache->vmc_refcnt == 0) {
        kmem_free(cache->vmc_results, sizeof (vmusage_t) *
            cache->vmc_nresults);
        kmem_free(cache, sizeof (vmu_cache_t));
    }
}
/*
* Copy out the cached results to a caller. Inspect the caller's flags
* and zone to determine which cached results should be copied.
*/
static int
{
int ret = 0;
} else {
bufsize = 0;
}
/* figure out what results the caller is interested in. */
types |= VMUSAGE_SYSTEM;
types |= VMUSAGE_ZONE;
types |= VMUSAGE_TASKS;
types |= VMUSAGE_RUSERS;
types |= VMUSAGE_EUSERS;
/* count results for current zone */
out_result = buf;
/* Do not return "other-zone" results to non-global zones */
continue;
/*
* If non-global zone requests VMUSAGE_SYSTEM, fake
* up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
*/
(flags & VMUSAGE_SYSTEM) != 0 &&
count++;
if (out_result != NULL) {
} else {
out_result++;
}
}
}
/* Skip results that do not match requested type */
continue;
/* Skip collated results if not requested */
(flags & VMUSAGE_COL_PROJECTS) == 0)
continue;
(flags & VMUSAGE_COL_EUSERS) == 0)
continue;
(flags & VMUSAGE_COL_RUSERS) == 0)
continue;
}
/* Skip "other zone" results if not requested */
(flags & VMUSAGE_ALL_ZONES) == 0)
continue;
(flags & (VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS)) == 0)
continue;
(flags & VMUSAGE_ALL_TASKS) == 0)
continue;
(flags & (VMUSAGE_ALL_RUSERS |
VMUSAGE_COL_RUSERS)) == 0)
continue;
(flags & (VMUSAGE_ALL_EUSERS |
VMUSAGE_COL_EUSERS)) == 0)
continue;
}
count++;
if (out_result != NULL) {
} else {
out_result++;
}
}
}
return (ret);
}
/*
* vm_getusage()
*
* Counts rss and swap by zone, project, task, and/or user. The flags argument
* determines the type of results structures returned. Flags requesting
* results from more than one zone are "flattened" to the local zone if the
* caller is not the global zone.
*
* args:
* flags: bitmap consisting of one or more of VMUSAGE_*.
* age: maximum allowable age (time since counting was done) in
* seconds of the results. Results from previous callers are
* cached in kernel.
* buf: pointer to buffer array of vmusage_t. If NULL, then only nres
* is set on success.
* nres: Set to number of vmusage_t structures pointed to by buf
* before calling vm_getusage().
* On return 0 (success) or ENOSPC, nres is set to the number of result
* structures returned or attempted to return.
*
* returns 0 on success, -1 on failure:
* EINTR (interrupted)
* ENOSPC (nres too small for results, nres set to needed value for success)
* EINVAL (flags invalid)
* EFAULT (bad address for buf or nres)
*/
int
{
int ret = 0;
int cacherecent = 0;
/*
* Non-global zones cannot request system-wide and/or collated
* results, or the system result, so munge the flags accordingly.
*/
flags_orig = flags;
}
flags |= VMUSAGE_RUSERS;
}
flags |= VMUSAGE_EUSERS;
}
if (flags & VMUSAGE_SYSTEM) {
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
}
/* Check for unknown flags */
if ((flags & (~VMUSAGE_MASK)) != 0)
/* Check for no flags */
if ((flags & VMUSAGE_MASK) == 0)
cacherecent = 1;
cacherecent == 1) {
cpflg);
if (vmu_data.vmu_pending_waiters > 0)
return (ret);
}
/*
* If the cache is recent, it is likely that there are other
* consumers of vm_getusage running, so add their flags to the
* desired flags for the calculation.
*/
if (cacherecent == 1)
}
vmu_data.vmu_nentities = 0;
if (vmu_data.vmu_pending_waiters > 0)
/* copy results to cache */
result++;
}
vmu_data.vmu_calc_flags = 0;
if (vmu_data.vmu_pending_waiters > 0)
/* copy cache */
return (ret);
}
}
}
goto start;
}
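/*
 * Illustrative sketch (assumption, using the cache fields defined above): the
 * freshness test near the elided "start:" label that the "goto start" above
 * loops back to. Cached results are reused when they are younger than the
 * caller's maximum age and were computed with a superset of the requested
 * flags; otherwise a new calculation replaces the cache.
 */
#if 0
    mutex_enter(&vmu_data.vmu_lock);
    if (vmu_data.vmu_cache != NULL &&
        (vmu_data.vmu_cache->vmc_timestamp +
        ((hrtime_t)age * NANOSEC)) > gethrtime() &&
        (vmu_data.vmu_cache->vmc_flags & flags) == flags)
        cacherecent = 1;
#endif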