vm_usage.c revision 2cb27123907a098a777e39eebc349d73e99a518f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* vm_usage
*
* This file implements the getvmusage() private system call.
* getvmusage() counts the resident memory pages and the swap
* reserved by the specified process collective. A "process collective" is
* the set of processes owned by a particular zone, project, task, or user.
*
* rss and swap are counted so that for a given process collective, a page is
* only counted once. For example, this means that if multiple processes in
* the same project map the same page, then the project will only be charged
* once for that page. On the other hand, if two processes in different
* projects map the same page, then both projects will be charged
* for the page.
*
* The vm_getusage() calculation is implemented so that the first thread
* performs the rss/swap counting. Other callers will wait for that thread to
* finish and then copy the results. This enables multiple rcapds and prstats
* to consume data from the same calculation. The results are also cached so
* that a caller interested in recent results can just copy them instead of
* starting a new calculation. The caller passes the maximum age (in seconds)
* of the data. If the cached data is young enough, the cache is copied;
* otherwise, a new calculation is executed and the cache is replaced with the
* new data.
*
* The rss calculation for each process collective is as follows:
*
* - Inspect flags, determine if counting rss for zones, projects, tasks,
* and/or users.
* - For each proc:
* - Figure out proc's collectives (zone, project, task, and/or user).
* - For each seg in proc's address space:
* - If seg is private:
* - Lookup anons in the amp.
* - For incore pages not previously visited for each of the
* proc's collectives, add incore pagesize to each
* collective.
* Anons with a refcnt of 1 can be assumed to be not
* previously visited.
* - For address ranges without anons in the amp:
* - Lookup pages in underlying vnode.
* - For incore pages not previously visited for
* each of the proc's collectives, add incore
* pagesize to each collective.
* - If seg is shared:
* - Lookup pages in the shared amp or vnode.
* - For incore pages not previously visited for each of
* the proc's collectives, add incore pagesize to each
* collective.
*
* Swap is reserved by private segments and shared anonymous segments.
* The only shared anon segments which do not reserve swap are ISM segments
* and schedctl segments, both of which can be identified by having
* amp->swresv == 0.
*
* The swap calculation for each collective is as follows:
*
* - Inspect flags, determine if counting swap for zones, projects, tasks,
* and/or users.
* - For each proc:
* - Figure out proc's collectives (zone, project, task, and/or user).
* - For each seg in proc's address space:
* - If seg is private:
* - Add svd->swresv pages to swap count for each of the
* proc's collectives.
* - If seg is anon, shared, and amp->swresv != 0:
* - For address ranges in amp not previously visited for
* each of the proc's collectives, add size of address
* range to the swap count for each collective.
*
* These two calculations are done simultaneously, with most of the work
* being done in vmu_calculate_seg(). The results of the calculation are
* copied into "vmu_data.vmu_cache_results".
*
* To perform the calculation, various things are tracked and cached:
*
* - incore/not-incore page ranges for all vnodes.
* (vmu_data.vmu_all_vnodes_hash)
* This eliminates looking up the same page more than once.
*
* - incore/not-incore page ranges for all shared amps.
* (vmu_data.vmu_all_amps_hash)
* This eliminates looking up the same page more than once.
*
* - visited page ranges for each collective.
* - per vnode (entity->vme_vnode_hash)
* - per shared amp (entity->vme_amp_hash)
* For accurate counting of map-shared and cow-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
* (entity->vme_anon_hash)
* For accurate counting of cow-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
* collectives:
*
* - A zone.
* - A project, task, or user within a zone.
* - The entire system (vmu_data.vmu_system).
* - Each collapsed (col) project and user. This means a given projid or
* uid, regardless of which zone the process is in. For instance,
* project 0 in the global zone and project 0 in a non global zone are
* the same collapsed project.
*
* Each entity structure tracks which pages have been already visited for
* that entity (via previously inspected processes) so that these pages are
* not double counted.
*/
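/*
* Illustrative only: a minimal sketch of how a userland consumer might use
* this facility, assuming the getvmusage() wrapper declared in
* <sys/vm_usage.h> mirrors the vm_getusage() entry point at the bottom of
* this file. The two-pass probe-then-fetch pattern (first call with
* buf == NULL to learn the result count, second call to fetch results no
* older than 30 seconds) is an assumption about typical usage, not
* something this file requires.
*
*	size_t nres = 0;
*	vmusage_t *buf;
*
*	(void) getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, NULL, &nres);
*	buf = malloc(nres * sizeof (vmusage_t));
*	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, buf, &nres) != 0) {
*		... handle EOVERFLOW (buffer too small) or other errors ...
*	}
*/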
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/thread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/shm.h>
#include <sys/swap.h>
#include <sys/synch.h>
#include <sys/systm.h>
#include <sys/var.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>
#define VMUSAGE_HASH_SIZE 512
#define VMUSAGE_TYPE_VNODE 1
#define VMUSAGE_TYPE_AMP 2
#define VMUSAGE_TYPE_ANON 3
#define VMUSAGE_BOUND_UNKNOWN 0
#define VMUSAGE_BOUND_INCORE 1
#define VMUSAGE_BOUND_NOT_INCORE 2
/*
* bounds for vnodes and shared amps
* Each bound is either entirely incore, entirely not in core, or
* entirely unknown. Bounds are stored in order by offset.
*/
typedef struct vmu_bound {
struct vmu_bound *vmb_next;
pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
char vmb_type; /* One of VMUSAGE_BOUND_* */
} vmu_bound_t;
/*
* hash of visited objects (vnodes or shared amps)
* key is address of vnode or amp. Bounds lists known incore/non-incore
* bounds for vnode/amp.
*/
typedef struct vmu_object {
struct vmu_object *vmo_next; /* free list */
caddr_t vmo_key;
short vmo_type;
vmu_bound_t *vmo_bounds;
} vmu_object_t;
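/*
* Illustrative only: the bound list for an object partitions its page
* offsets into runs that share one state. For example, a ten page object
* might be described by
*
*	[0 .. 3] INCORE -> [4 .. 6] NOT_INCORE -> [7 .. 9] UNKNOWN
*
* and a hypothetical walk totalling the known-resident pages (obj is an
* assumed vmu_object_t pointer) would be:
*
*	vmu_bound_t *b;
*	pgcnt_t rss = 0;
*
*	for (b = obj->vmo_bounds; b != NULL; b = b->vmb_next)
*		if (b->vmb_type == VMUSAGE_BOUND_INCORE)
*			rss += b->vmb_end - b->vmb_start + 1;
*/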
/*
* Entity by which to count results.
*
* The entity structure keeps the current rss/swap counts for each entity
* (zone, project, etc), and hashes of vm structures that have already
* been visited for the entity.
*
* vme_next: links the list of all entities currently being counted by
* vmu_calculate().
*
* vme_next_calc: links the list of entities related to the current process
* being counted by vmu_calculate_proc().
*
* vmu_calculate_proc() walks all processes. For each process, it makes a
* list of the entities related to that process using vme_next_calc. This
* list changes each time vmu_calculate_proc() is called.
*
*/
typedef struct vmu_entity {
struct vmu_entity *vme_next;
struct vmu_entity *vme_next_calc;
mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
mod_hash_t *vme_anon_hash; /* cow anons visited for entity */
vmusage_t vme_result; /* identifies entity and results */
} vmu_entity_t;
/*
* Hash of entities visited within a zone, and an entity for the zone
* itself.
*/
typedef struct vmu_zone {
struct vmu_zone *vmz_next; /* free list */
id_t vmz_id;
vmu_entity_t *vmz_zone;
mod_hash_t *vmz_projects_hash;
mod_hash_t *vmz_tasks_hash;
mod_hash_t *vmz_rusers_hash;
mod_hash_t *vmz_eusers_hash;
} vmu_zone_t;
/*
* Cache of results from last calculation
*/
typedef struct vmu_cache {
vmusage_t *vmc_results; /* Results from last call to */
/* vm_getusage(). */
uint64_t vmc_nresults; /* Count of cached results */
uint64_t vmc_refcnt; /* refcnt for free */
uint_t vmc_flags; /* Flags for vm_getusage() */
hrtime_t vmc_timestamp; /* when cache was created */
} vmu_cache_t;
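/*
* Illustrative only: a cached result set satisfies a caller asking for data
* at most "age" seconds old when
*
*	vmc_timestamp + (hrtime_t)age * NANOSEC > gethrtime()
*
* which is the test vm_getusage() applies below before copying the cache
* instead of starting a new calculation.
*/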
/*
* top level rss info for the system
*/
typedef struct vmu_data {
kmutex_t vmu_lock; /* Protects vmu_data */
kcondvar_t vmu_cv; /* Used to signal threads */
/* waiting for the calc */
/* thread to finish */
vmu_entity_t *vmu_system; /* Entity for tracking */
/* rss/swap for all processes */
/* in all zones */
mod_hash_t *vmu_zones_hash; /* Zones visited */
mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
/* to implement VMUSAGE_COL_* */
/* flags, which aggregate by */
/* project or user regardless */
/* of zoneid. */
mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
/* to track incore/not-incore */
mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
/* amps to track incore/not- */
/* incore */
vmu_entity_t *vmu_entities; /* Linked list of entities */
size_t vmu_nentities; /* Count of entities in list */
vmu_cache_t *vmu_cache; /* Cached results */
kthread_t *vmu_calc_thread; /* NULL, or thread running */
/* vmu_calculate() */
uint_t vmu_calc_flags; /* Flags being used by */
/* currently running calc */
/* thread */
uint_t vmu_pending_flags; /* Flags of vm_getusage() */
/* threads waiting for */
/* calc thread to finish */
uint_t vmu_pending_waiters; /* Number of threads waiting */
/* for calc thread */
vmu_bound_t *vmu_free_bounds;
vmu_object_t *vmu_free_objects;
vmu_entity_t *vmu_free_entities;
vmu_zone_t *vmu_free_zones;
} vmu_data_t;
extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;
static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;
/*
* Save a bound on the free list
*/
static void
vmu_free_bound(vmu_bound_t *bound)
{
bound->vmb_next = vmu_data.vmu_free_bounds;
vmu_data.vmu_free_bounds = bound;
}
/*
* Free an object, and all visited bound info.
*/
static void
vmu_free_object(mod_hash_val_t val)
{
vmu_object_t *obj = (vmu_object_t *)val;
vmu_bound_t *bound = obj->vmo_bounds;
vmu_bound_t *tmp;
while (bound != NULL) {
tmp = bound;
bound = bound->vmb_next;
vmu_free_bound(tmp);
}
obj->vmo_next = vmu_data.vmu_free_objects;
vmu_data.vmu_free_objects = obj;
}
/*
* Free an entity, and hashes of visited objects for that entity.
*/
static void
vmu_free_entity(mod_hash_val_t val)
{
vmu_entity_t *entity = (vmu_entity_t *)val;
if (entity->vme_vnode_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_vnode_hash);
if (entity->vme_amp_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_amp_hash);
if (entity->vme_anon_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_anon_hash);
entity->vme_next = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities = entity;
}
/*
* Free zone entity, and all hashes of entities inside that zone,
* which are projects, tasks, and users.
*/
static void
vmu_free_zone(mod_hash_val_t val)
{
vmu_zone_t *zone = (vmu_zone_t *)val;
if (zone->vmz_zone != NULL) {
vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
zone->vmz_zone = NULL;
}
if (zone->vmz_projects_hash != NULL)
i_mod_hash_clear_nosync(zone->vmz_projects_hash);
if (zone->vmz_tasks_hash != NULL)
i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
if (zone->vmz_rusers_hash != NULL)
i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
if (zone->vmz_eusers_hash != NULL)
i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
zone->vmz_next = vmu_data.vmu_free_zones;
vmu_data.vmu_free_zones = zone;
}
/*
* Initialize synchronization primitives and hashes for system-wide tracking
* of visited vnodes and shared amps. Initialize results cache.
*/
void
vm_usage_init()
{
mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
vmu_data.vmu_system = NULL;
vmu_data.vmu_zones_hash = NULL;
vmu_data.vmu_projects_col_hash = NULL;
vmu_data.vmu_rusers_col_hash = NULL;
vmu_data.vmu_eusers_col_hash = NULL;
vmu_data.vmu_free_bounds = NULL;
vmu_data.vmu_free_objects = NULL;
vmu_data.vmu_free_entities = NULL;
vmu_data.vmu_free_zones = NULL;
vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
"vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (vnode_t));
vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
"vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (struct anon_map));
vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
"vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
vmu_free_entity);
vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
"vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
vmu_free_entity);
vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
"vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
vmu_free_entity);
vmu_data.vmu_zones_hash = mod_hash_create_idhash(
"vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
vmu_object_cache = kmem_cache_create("vmu_object_cache",
sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
vmu_data.vmu_entities = NULL;
vmu_data.vmu_nentities = 0;
vmu_data.vmu_cache = NULL;
vmu_data.vmu_calc_thread = NULL;
vmu_data.vmu_calc_flags = 0;
vmu_data.vmu_pending_flags = 0;
vmu_data.vmu_pending_waiters = 0;
}
/*
* Allocate hashes for tracking vm objects visited for an entity.
* Update list of entities.
*/
static vmu_entity_t *
vmu_alloc_entity(id_t id, int type, id_t zoneid)
{
vmu_entity_t *entity;
if (vmu_data.vmu_free_entities != NULL) {
entity = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities =
vmu_data.vmu_free_entities->vme_next;
bzero(&entity->vme_result, sizeof (vmusage_t));
} else {
entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
}
entity->vme_result.vmu_id = id;
entity->vme_result.vmu_zoneid = zoneid;
entity->vme_result.vmu_type = type;
if (entity->vme_vnode_hash == NULL)
entity->vme_vnode_hash = mod_hash_create_ptrhash(
"vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (vnode_t));
if (entity->vme_amp_hash == NULL)
entity->vme_amp_hash = mod_hash_create_ptrhash(
"vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (struct anon_map));
if (entity->vme_anon_hash == NULL)
entity->vme_anon_hash = mod_hash_create_ptrhash(
"vmusage anon hash", VMUSAGE_HASH_SIZE,
mod_hash_null_valdtor, sizeof (struct anon));
entity->vme_next = vmu_data.vmu_entities;
vmu_data.vmu_entities = entity;
vmu_data.vmu_nentities++;
return (entity);
}
/*
* Allocate a zone entity, and hashes for tracking visited vm objects
* for projects, tasks, and users within that zone.
*/
static vmu_zone_t *
vmu_alloc_zone(id_t id)
{
vmu_zone_t *zone;
if (vmu_data.vmu_free_zones != NULL) {
zone = vmu_data.vmu_free_zones;
vmu_data.vmu_free_zones =
vmu_data.vmu_free_zones->vmz_next;
zone->vmz_next = NULL;
zone->vmz_zone = NULL;
} else {
zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
}
zone->vmz_id = id;
if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
zone->vmz_projects_hash = mod_hash_create_idhash(
"vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
!= 0 && zone->vmz_tasks_hash == NULL)
zone->vmz_tasks_hash = mod_hash_create_idhash(
"vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
!= 0 && zone->vmz_rusers_hash == NULL)
zone->vmz_rusers_hash = mod_hash_create_idhash(
"vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
!= 0 && zone->vmz_eusers_hash == NULL)
zone->vmz_eusers_hash = mod_hash_create_idhash(
"vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
return (zone);
}
/*
* Allocate a structure for tracking visited bounds for a vm object.
*/
static vmu_object_t *
vmu_alloc_object(caddr_t key, int type)
{
vmu_object_t *object;
if (vmu_data.vmu_free_objects != NULL) {
object = vmu_data.vmu_free_objects;
vmu_data.vmu_free_objects =
vmu_data.vmu_free_objects->vmo_next;
} else {
object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
}
object->vmo_key = key;
object->vmo_type = type;
object->vmo_bounds = NULL;
return (object);
}
/*
* Allocate and return a bound structure.
*/
static vmu_bound_t *
vmu_alloc_bound()
{
vmu_bound_t *bound;
if (vmu_data.vmu_free_bounds != NULL) {
bound = vmu_data.vmu_free_bounds;
vmu_data.vmu_free_bounds =
vmu_data.vmu_free_bounds->vmb_next;
bzero(bound, sizeof (vmu_bound_t));
} else {
bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
bzero(bound, sizeof (vmu_bound_t));
}
return (bound);
}
/*
* The vmu_find_insert_* functions look up a key in a hash; if the key is
* not found, a new item is allocated and inserted.
*/
static vmu_object_t *
vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
{
int ret;
vmu_object_t *object;
ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
(mod_hash_val_t *)&object);
if (ret != 0) {
object = vmu_alloc_object(key, type);
ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
(mod_hash_val_t)object, (mod_hash_hndl_t)0);
ASSERT(ret == 0);
}
return (object);
}
static int
vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
{
int ret;
caddr_t val;
ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
(mod_hash_val_t *)&val);
if (ret == 0)
return (0);
ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
(mod_hash_val_t)key, (mod_hash_hndl_t)0);
ASSERT(ret == 0);
return (1);
}
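/*
* Illustrative only: vmu_find_insert_anon() returns 1 only the first time a
* given anon is seen for an entity, which is how cow-shared anon pages end
* up charged once per collective. A hypothetical caller:
*
*	if (vmu_find_insert_anon(entity->vme_anon_hash, (caddr_t)ap) == 1) {
*		... first visit for this entity; charge the page ...
*	}
*/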
static vmu_entity_t *
vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
{
int ret;
vmu_entity_t *entity;
ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
(mod_hash_val_t *)&entity);
if (ret != 0) {
entity = vmu_alloc_entity(id, type, zoneid);
ret = i_mod_hash_insert_nosync(hash,
(mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
(mod_hash_hndl_t)0);
ASSERT(ret == 0);
}
return (entity);
}
/*
* Returns the list of object bounds between start and end. New bounds
* inserted by this call are given the specified type.
*
* Returns the number of pages covered by newly created bounds. Returns 0
* if the region between start and end was already covered by existing bounds.
*/
static pgcnt_t
vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
end, char type, vmu_bound_t **first, vmu_bound_t **last)
{
vmu_bound_t *next;
vmu_bound_t *prev = NULL;
vmu_bound_t *tmp = NULL;
pgcnt_t ret = 0;
*first = *last = NULL;
for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) {
/*
* Find bounds overlapping or overlapped by range [start,end].
*/
if (start > next->vmb_end) {
/* bound is before new bound */
prev = next;
continue;
}
if (next->vmb_start > end) {
/* bound is after new bound */
break;
}
if (*first == NULL)
*first = next;
*last = next;
}
if (*first == NULL) {
ASSERT(*last == NULL);
/*
* No bounds overlapping range [start,end], so create new
* bound
*/
tmp = vmu_alloc_bound();
tmp->vmb_start = start;
tmp->vmb_end = end;
tmp->vmb_type = type;
if (prev == NULL) {
tmp->vmb_next = ro->vmo_bounds;
ro->vmo_bounds = tmp;
} else {
tmp->vmb_next = prev->vmb_next;
prev->vmb_next = tmp;
}
*first = tmp;
*last = tmp;
ASSERT(tmp->vmb_end >= tmp->vmb_start);
ret = tmp->vmb_end - tmp->vmb_start + 1;
return (ret);
}
/* Check to see if start is before first known bound */
ASSERT(*first != NULL && *last != NULL);
next = (*first);
if (start < (*first)->vmb_start) {
/* Create new bound before first bound */
tmp = vmu_alloc_bound();
tmp->vmb_start = start;
tmp->vmb_end = (*first)->vmb_start - 1;
tmp->vmb_type = type;
tmp->vmb_next = *first;
if (*first == ro->vmo_bounds)
ro->vmo_bounds = tmp;
if (prev != NULL)
prev->vmb_next = tmp;
ASSERT(tmp->vmb_end >= tmp->vmb_start);
ret += tmp->vmb_end - tmp->vmb_start + 1;
*first = tmp;
}
/*
* Between start and end, search for gaps between and after existing
* bounds. Create new bounds to fill gaps if they exist.
*/
while (end > next->vmb_end) {
/*
* Check for gap between bound and next bound. if no gap,
* continue.
*/
if ((next != *last) &&
((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
next = next->vmb_next;
continue;
}
/*
* Insert new bound in gap after bound, and before next
* bound if next bound exists.
*/
tmp = vmu_alloc_bound();
tmp->vmb_type = type;
tmp->vmb_next = next->vmb_next;
tmp->vmb_start = next->vmb_end + 1;
if (next != *last) {
tmp->vmb_end = next->vmb_next->vmb_start - 1;
ASSERT(tmp->vmb_end >= tmp->vmb_start);
ret += tmp->vmb_end - tmp->vmb_start + 1;
next->vmb_next = tmp;
next = tmp->vmb_next;
} else {
tmp->vmb_end = end;
ASSERT(tmp->vmb_end >= tmp->vmb_start);
ret += tmp->vmb_end - tmp->vmb_start + 1;
next->vmb_next = tmp;
*last = tmp;
break;
}
}
return (ret);
}
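/*
* Illustrative only: a worked example of the gap filling above. If the
* object already holds bounds [5 .. 7] and [12 .. 14], looking up the range
* [0 .. 20] with type VMUSAGE_BOUND_UNKNOWN creates three new bounds so the
* range is fully covered:
*
*	[0 .. 4] -> [5 .. 7] -> [8 .. 11] -> [12 .. 14] -> [15 .. 20]
*
* and the return value is 5 + 4 + 6 = 15, counting only the pages in the
* newly created bounds.
*/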
/*
* vmu_update_bounds()
*
* first, last: list of continuous bounds, of which zero or more are of
* type VMUSAGE_BOUND_UNKNOWN.
*
* new_first, new_last: list of continuous bounds, of which none are of
* type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
* update the types of bounds in (first,last) with
* type VMUSAGE_BOUND_UNKNOWN.
*
* For the list of bounds (first,last), this function updates any bounds
* with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
* the list (new_first, new_last).
*
* If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
* (new_first, new_last), it will be split into multiple bounds.
*
* Return value:
* The number of pages in the list of bounds (first,last) that were of
* type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
* VMUSAGE_BOUND_INCORE.
*
*/
static pgcnt_t
vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
vmu_bound_t *new_first, vmu_bound_t *new_last)
{
vmu_bound_t *next, *new_next, *tmp;
pgcnt_t rss = 0;
next = *first;
new_next = new_first;
/*
* Verify first and last bound are covered by new bounds if they
* have unknown type.
*/
ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
(*first)->vmb_start >= new_next->vmb_start);
ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
(*last)->vmb_end <= new_last->vmb_end);
for (;;) {
/* If bound already has type, proceed to next bound */
if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
if (next == *last)
break;
next = next->vmb_next;
continue;
}
while (new_next->vmb_end < next->vmb_start)
new_next = new_next->vmb_next;
ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
next->vmb_type = new_next->vmb_type;
if (new_next->vmb_end < next->vmb_end) {
/* need to split bound */
tmp = vmu_alloc_bound();
tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
tmp->vmb_start = new_next->vmb_end + 1;
tmp->vmb_end = next->vmb_end;
tmp->vmb_next = next->vmb_next;
next->vmb_end = new_next->vmb_end;
next->vmb_next = tmp;
if (*last == next)
*last = tmp;
if (next->vmb_type == VMUSAGE_BOUND_INCORE)
rss += next->vmb_end - next->vmb_start + 1;
next = tmp;
} else {
if (next->vmb_type == VMUSAGE_BOUND_INCORE)
rss += next->vmb_end - next->vmb_start + 1;
if (next == *last)
break;
next = next->vmb_next;
}
}
return (rss);
}
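/*
* Illustrative only: a worked example of the update above. If an entity's
* list is a single unknown bound and the object's known bounds are
*
*	entity:  [0 .. 9] UNKNOWN
*	object:  [0 .. 3] INCORE -> [4 .. 9] NOT_INCORE
*
* then the entity bound is split and retyped to
*
*	[0 .. 3] INCORE -> [4 .. 9] NOT_INCORE
*
* and the function returns 4, the pages newly discovered to be incore.
*/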
/*
* Merges adjacent bounds with the same type between the first and last bound.
* If the last bound is merged away, *last is updated to point at the bound
* it was merged into.
*/
static void
vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
{
vmu_bound_t *next;
vmu_bound_t *tmp;
ASSERT(*first != NULL);
ASSERT(*last != NULL);
next = *first;
while (next != *last) {
/* If bounds are adjacent and have same type, merge them */
if (((next->vmb_end + 1) == next->vmb_next->vmb_start) &&
(next->vmb_type == next->vmb_next->vmb_type)) {
tmp = next->vmb_next;
next->vmb_end = tmp->vmb_end;
next->vmb_next = tmp->vmb_next;
vmu_free_bound(tmp);
if (tmp == *last)
*last = next;
} else {
next = next->vmb_next;
}
}
}
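/*
* Illustrative only: after an update, a list such as
*
*	[0 .. 3] INCORE -> [4 .. 5] INCORE -> [6 .. 9] NOT_INCORE
*
* is collapsed by the loop above to
*
*	[0 .. 5] INCORE -> [6 .. 9] NOT_INCORE
*
* with *last updated if the original last bound was merged away.
*/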
/*
* Given an amp and a list of bounds, updates each bound's type with
* VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
*
* If a bound is partially incore, it will be split into two bounds.
* first and last may be modified, as bounds may be split into multiple
* bounds if they are partially incore/not-incore.
*
* Pass incore as B_TRUE if the bounds are already known to be incore.
*
*/
static void
vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
vmu_bound_t **last, boolean_t incore)
{
vmu_bound_t *next;
vmu_bound_t *tmp;
pgcnt_t index;
short bound_type;
short page_type;
vnode_t *vn;
anoff_t off;
struct anon *ap;
next = *first;
/* Shared anon slots don't change once set */
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
for (;;) {
if (incore == B_TRUE)
next->vmb_type = VMUSAGE_BOUND_INCORE;
if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
if (next == *last)
break;
next = next->vmb_next;
continue;
}
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
/*
* These are used to determine how much to increment
* index when a large page is found.
*/
page_t *page;
pgcnt_t pgcnt = 1;
uint_t pgshft;
pgcnt_t pgmsk;
ap = anon_get_ptr(amp->ahp, index);
if (ap != NULL)
swap_xlate(ap, &vn, &off);
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
pgmsk = (0x1 << (pgshft - PAGESHIFT))
- 1;
}
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* if current bound type does not match page
* type, need to split off new bound.
*/
tmp = vmu_alloc_bound();
tmp->vmb_type = page_type;
tmp->vmb_start = index;
tmp->vmb_end = next->vmb_end;
tmp->vmb_next = next->vmb_next;
next->vmb_end = index - 1;
next->vmb_next = tmp;
if (*last == next)
*last = tmp;
next = tmp;
}
if (pgcnt > 1) {
/*
* If inside large page, jump to next large
* page
*/
index = (index & ~pgmsk) + pgcnt;
} else {
index++;
}
}
if (next == *last) {
ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
break;
} else
next = next->vmb_next;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
}
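/*
* Illustrative only: a worked example of the large page stride above,
* assuming a 4K base page and a 2M large page (so page_get_pagecnt()
* returns 512 and page_get_shift() returns 21):
*
*	pgcnt = 512
*	pgmsk = (1 << (21 - PAGESHIFT)) - 1 = 511
*	index = (index & ~pgmsk) + pgcnt
*
* i.e. index is rounded down to the large page boundary and then advanced
* by 512 pages, landing on the first page of the next large page.
*/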
/*
* Same as vmu_amp_update_incore_bounds(), except for tracking
* incore-/not-incore for vnodes.
*/
static void
vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
vmu_bound_t **last)
{
vmu_bound_t *next;
vmu_bound_t *tmp;
pgcnt_t index;
short bound_type;
short page_type;
next = *first;
for (;;) {
if (vnode->v_pages == NULL)
next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
if (next == *last)
break;
next = next->vmb_next;
continue;
}
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
/*
* These are used to determine how much to increment
* index when a large page is found.
*/
page_t *page;
pgcnt_t pgcnt = 1;
uint_t pgshft;
pgcnt_t pgmsk;
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
pgmsk = (0x1 << (pgshft - PAGESHIFT))
- 1;
}
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* if current bound type does not match page
* type, need to split off new bound.
*/
tmp = vmu_alloc_bound();
tmp->vmb_type = page_type;
tmp->vmb_start = index;
tmp->vmb_end = next->vmb_end;
tmp->vmb_next = next->vmb_next;
next->vmb_end = index - 1;
next->vmb_next = tmp;
if (*last == next)
*last = tmp;
next = tmp;
}
if (pgcnt > 1) {
/*
* If inside large page, jump to next large
* page
*/
index = (index & ~pgmsk) + pgcnt;
} else {
index++;
}
}
if (next == *last) {
ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
break;
} else
next = next->vmb_next;
}
}
/*
* Calculate the rss and swap consumed by a segment. vmu_entities is the
* list of entities to visit. For shared segments, the vnode or amp
* is looked up in each entity to see if it has already been counted. Private
* anon pages are checked per entity to ensure that cow pages are not
* double counted.
*
* For private mapped files, first the amp is checked for private pages.
* Bounds not backed by the amp are looked up in the vnode for each entity
* to avoid double counting of private COW vnode pages.
*/
static void
vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
{
struct segvn_data *svd;
struct shm_data *shmd;
struct spt_data *sptd;
vmu_object_t *shared_object = NULL;
vmu_object_t *entity_object = NULL;
vmu_entity_t *entity;
vmusage_t *result;
vmu_bound_t *first = NULL;
vmu_bound_t *last = NULL;
vmu_bound_t *cur = NULL;
vmu_bound_t *e_first = NULL;
vmu_bound_t *e_last = NULL;
vmu_bound_t *tmp;
pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
struct anon_map *private_amp = NULL;
boolean_t incore = B_FALSE;
boolean_t shared = B_FALSE;
int file = 0;
pgcnt_t swresv = 0;
pgcnt_t panon = 0;
/* Can zero-length segments exist? Not sure, so paranoia. */
if (seg->s_size <= 0)
return;
/*
* Figure out if there is a shared object (such as a named vnode or
* a shared amp), then figure out if there is a private amp, which
* identifies private pages.
*/
if (seg->s_ops == &segvn_ops) {
svd = (struct segvn_data *)seg->s_data;
if (svd->type == MAP_SHARED)
shared = B_TRUE;
else
swresv = svd->swresv;
if (svd->vp != NULL) {
file = 1;
shared_object = vmu_find_insert_object(
vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
VMUSAGE_TYPE_VNODE);
s_start = btop(svd->offset);
s_end = btop(svd->offset + seg->s_size) - 1;
}
if (svd->amp != NULL && svd->type == MAP_SHARED) {
ASSERT(shared_object == NULL);
shared_object = vmu_find_insert_object(
vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
VMUSAGE_TYPE_AMP);
s_start = svd->anon_index;
s_end = svd->anon_index + btop(seg->s_size) - 1;
/* schedctl mappings are always in core */
if (svd->amp->swresv == 0)
incore = B_TRUE;
}
SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
/*
* Text replication anon maps can be shared across all zones.
* Space used for text replication is typically capped as a
* small % of memory. To keep it simple for now we don't
* account for swap and memory space used for text replication.
*/
if (svd->tr_state == SEGVN_TR_OFF && svd->amp != NULL &&
svd->type == MAP_PRIVATE) {
private_amp = svd->amp;
p_start = svd->anon_index;
p_end = svd->anon_index + btop(seg->s_size) - 1;
}
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
} else if (seg->s_ops == &segspt_shmops) {
shared = B_TRUE;
shmd = (struct shm_data *)seg->s_data;
shared_object = vmu_find_insert_object(
vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
VMUSAGE_TYPE_AMP);
s_start = 0;
s_end = btop(seg->s_size) - 1;
sptd = shmd->shm_sptseg->s_data;
/* ism segments are always incore and do not reserve swap */
if (sptd->spt_flags & SHM_SHARE_MMU)
incore = B_TRUE;
} else {
return;
}
/*
* If there is a private amp, count anon pages that exist. If an
* anon has a refcnt > 1 (cow sharing), then save the anon in a
* hash so that it is not double counted.
*
* If there is also a shared object, then figure out the bounds
* which are not mapped by the private amp.
*/
if (private_amp != NULL) {
/* Enter as writer to prevent cow anons from being freed */
ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
p_index = p_start;
s_index = s_start;
while (p_index <= p_end) {
pgcnt_t p_index_next;
pgcnt_t p_bound_size;
int cnt;
anoff_t off;
struct vnode *vn;
struct anon *ap;
page_t *page; /* For handling of large */
pgcnt_t pgcnt = 1; /* pages */
pgcnt_t pgstart;
pgcnt_t pgend;
uint_t pgshft;
pgcnt_t pgmsk;
p_index_next = p_index;
ap = anon_get_next_ptr(private_amp->ahp,
&p_index_next);
/*
* If next anon is past end of mapping, simulate
* end of anon so loop terminates.
*/
if (p_index_next > p_end) {
p_index_next = p_end + 1;
ap = NULL;
}
/*
* For cow segments, keep track of bounds not
* backed by private amp so they can be looked
* up in the backing vnode
*/
if (p_index_next != p_index) {
/*
* Compute index difference between anon and
* previous anon.
*/
p_bound_size = p_index_next - p_index - 1;
if (shared_object != NULL) {
cur = vmu_alloc_bound();
cur->vmb_next = NULL;
cur->vmb_start = s_index;
cur->vmb_end = s_index + p_bound_size;
cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
if (first == NULL) {
first = cur;
last = cur;
} else {
last->vmb_next = cur;
last = cur;
}
}
p_index = p_index + p_bound_size + 1;
s_index = s_index + p_bound_size + 1;
}
/* Detect end of anons in amp */
if (ap == NULL)
break;
cnt = ap->an_refcnt;
swap_xlate(ap, &vn, &off);
if (vn == NULL || vn->v_pages == NULL ||
(page = page_exists(vn, off)) == NULL) {
p_index++;
s_index++;
continue;
}
/*
* If large page is found, compute portion of large
* page in mapping, and increment indices to the next
* large page.
*/
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
/* First page in large page */
pgstart = p_index & ~pgmsk;
/* Last page in large page */
pgend = pgstart + pgcnt - 1;
/*
* Artificially end page if page extends past
* end of mapping.
*/
if (pgend > p_end)
pgend = p_end;
/*
* Compute number of pages from large page
* which are mapped.
*/
pgcnt = pgend - p_index + 1;
/*
* Point indices at page after large page,
* or at page after end of mapping.
*/
p_index += pgcnt;
s_index += pgcnt;
} else {
p_index++;
s_index++;
}
/*
* Assume anon structs with a refcnt
* of 1 are not cow shared, so there
* is no reason to track them per entity.
*/
if (cnt == 1) {
panon += pgcnt;
continue;
}
for (entity = vmu_entities; entity != NULL;
entity = entity->vme_next_calc) {
result = &entity->vme_result;
/*
* Track cow anons per entity so
* they are not double counted.
*/
if (vmu_find_insert_anon(entity->vme_anon_hash,
(caddr_t)ap) == 0)
continue;
result->vmu_rss_all += (pgcnt << PAGESHIFT);
result->vmu_rss_private +=
(pgcnt << PAGESHIFT);
}
}
ANON_LOCK_EXIT(&private_amp->a_rwlock);
}
/* Add up resident anon and swap reserved for private mappings */
if (swresv > 0 || panon > 0) {
for (entity = vmu_entities; entity != NULL;
entity = entity->vme_next_calc) {
result = &entity->vme_result;
result->vmu_swap_all += swresv;
result->vmu_swap_private += swresv;
result->vmu_rss_all += (panon << PAGESHIFT);
result->vmu_rss_private += (panon << PAGESHIFT);
}
}
/* Compute resident pages backing shared amp or named vnode */
if (shared_object != NULL) {
if (first == NULL) {
/*
* No private amp, or private amp has no anon
* structs. This means entire segment is backed by
* the shared object.
*/
first = vmu_alloc_bound();
first->vmb_next = NULL;
first->vmb_start = s_start;
first->vmb_end = s_end;
first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
}
/*
* Iterate bounds not backed by private amp, and compute
* resident pages.
*/
cur = first;
while (cur != NULL) {
if (vmu_insert_lookup_object_bounds(shared_object,
cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
&first, &last) > 0) {
/* new bounds, find incore/not-incore */
if (shared_object->vmo_type ==
VMUSAGE_TYPE_VNODE)
vmu_vnode_update_incore_bounds(
(vnode_t *)
shared_object->vmo_key, &first,
&last);
else
vmu_amp_update_incore_bounds(
(struct anon_map *)
shared_object->vmo_key, &first,
&last, incore);
vmu_merge_bounds(&first, &last);
}
for (entity = vmu_entities; entity != NULL;
entity = entity->vme_next_calc) {
result = &entity->vme_result;
entity_object = vmu_find_insert_object(
shared_object->vmo_type ==
VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
entity->vme_amp_hash,
shared_object->vmo_key,
shared_object->vmo_type);
virt = vmu_insert_lookup_object_bounds(
entity_object, cur->vmb_start, cur->vmb_end,
VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
if (virt == 0)
continue;
/*
* Range visited for this entity
*/
rss = vmu_update_bounds(&e_first,
&e_last, first, last);
result->vmu_rss_all += (rss << PAGESHIFT);
if (shared == B_TRUE && file == B_FALSE) {
/* shared anon mapping */
result->vmu_swap_all +=
(virt << PAGESHIFT);
result->vmu_swap_shared +=
(virt << PAGESHIFT);
result->vmu_rss_shared +=
(rss << PAGESHIFT);
} else if (shared == B_TRUE && file == B_TRUE) {
/* shared file mapping */
result->vmu_rss_shared +=
(rss << PAGESHIFT);
} else if (shared == B_FALSE &&
file == B_TRUE) {
/* private file mapping */
result->vmu_rss_private +=
(rss << PAGESHIFT);
}
vmu_merge_bounds(&e_first, &e_last);
}
tmp = cur;
cur = cur->vmb_next;
vmu_free_bound(tmp);
}
}
}
/*
* Based on the current calculation flags, find the entities which are
* relevant to the process. Then calculate rss and swap for each segment
* in the process's address space, charging each relevant entity.
*/
static void
vmu_calculate_proc(proc_t *p)
{
vmu_entity_t *entities = NULL;
vmu_zone_t *zone;
vmu_entity_t *tmp;
struct as *as;
struct seg *seg;
int ret;
/* Figure out which entities are being computed */
if ((vmu_data.vmu_system) != NULL) {
tmp = vmu_data.vmu_system;
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
(mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
(mod_hash_val_t *)&zone);
if (ret != 0) {
zone = vmu_alloc_zone(p->p_zone->zone_id);
ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
(mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
(mod_hash_val_t)zone, (mod_hash_hndl_t)0);
ASSERT(ret == 0);
}
if (zone->vmz_zone != NULL) {
tmp = zone->vmz_zone;
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
zone->vmz_id);
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
tmp->vme_next_calc = entities;
entities = tmp;
}
}
/* Entities which collapse projects and users for all zones */
if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
tmp->vme_next_calc = entities;
entities = tmp;
}
ASSERT(entities != NULL);
/* process all segs in process's address space */
as = p->p_as;
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
for (seg = AS_SEGFIRST(as); seg != NULL;
seg = AS_SEGNEXT(as, seg)) {
vmu_calculate_seg(entities, seg);
}
AS_LOCK_EXIT(as, &as->a_lock);
}
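/*
* Illustrative only: for a hypothetical process in zone 1 and project 100,
* with a calculation requested using
* VMUSAGE_ALL_ZONES | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS, the
* vme_next_calc list assembled above holds three entities: the zone 1
* entity, the (zone 1, project 100) entity, and the collapsed project 100
* entity. Every segment of the process is then charged to all three by
* vmu_calculate_seg().
*/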
/*
* Free data created by previous call to vmu_calculate().
*/
static void
vmu_clear_calc()
{
if (vmu_data.vmu_system != NULL)
vmu_free_entity(vmu_data.vmu_system);
vmu_data.vmu_system = NULL;
if (vmu_data.vmu_zones_hash != NULL)
i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
if (vmu_data.vmu_projects_col_hash != NULL)
i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
if (vmu_data.vmu_rusers_col_hash != NULL)
i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
if (vmu_data.vmu_eusers_col_hash != NULL)
i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
}
/*
* Free unused data structures. These can result if the system workload
* decreases between calculations.
*/
static void
vmu_free_extra()
{
vmu_bound_t *tb;
vmu_object_t *to;
vmu_entity_t *te;
vmu_zone_t *tz;
while (vmu_data.vmu_free_bounds != NULL) {
tb = vmu_data.vmu_free_bounds;
vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
kmem_cache_free(vmu_bound_cache, tb);
}
while (vmu_data.vmu_free_objects != NULL) {
to = vmu_data.vmu_free_objects;
vmu_data.vmu_free_objects =
vmu_data.vmu_free_objects->vmo_next;
kmem_cache_free(vmu_object_cache, to);
}
while (vmu_data.vmu_free_entities != NULL) {
te = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities =
vmu_data.vmu_free_entities->vme_next;
if (te->vme_vnode_hash != NULL)
mod_hash_destroy_hash(te->vme_vnode_hash);
if (te->vme_amp_hash != NULL)
mod_hash_destroy_hash(te->vme_amp_hash);
if (te->vme_anon_hash != NULL)
mod_hash_destroy_hash(te->vme_anon_hash);
kmem_free(te, sizeof (vmu_entity_t));
}
while (vmu_data.vmu_free_zones != NULL) {
tz = vmu_data.vmu_free_zones;
vmu_data.vmu_free_zones =
vmu_data.vmu_free_zones->vmz_next;
if (tz->vmz_projects_hash != NULL)
mod_hash_destroy_hash(tz->vmz_projects_hash);
if (tz->vmz_tasks_hash != NULL)
mod_hash_destroy_hash(tz->vmz_tasks_hash);
if (tz->vmz_rusers_hash != NULL)
mod_hash_destroy_hash(tz->vmz_rusers_hash);
if (tz->vmz_eusers_hash != NULL)
mod_hash_destroy_hash(tz->vmz_eusers_hash);
kmem_free(tz, sizeof (vmu_zone_t));
}
}
extern kcondvar_t *pr_pid_cv;
/*
* Determine which entity types are relevant and allocate the hashes to
* track them. Then walk the process table and count rss and swap
* for each process's address space. Address space objects such as
* vnodes, amps and anons are tracked per entity, so that they are
* not double counted in the results.
*
*/
static void
vmu_calculate()
{
int i = 0;
int ret;
proc_t *p;
vmu_clear_calc();
if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
ALL_ZONES);
/*
* Walk process table and calculate rss of each proc.
*
* Pidlock and p_lock cannot be held while doing the rss calculation.
* This is because:
* 1. The calculation allocates using KM_SLEEP.
* 2. The calculation grabs a_lock, which cannot be grabbed
* after p_lock.
*
* Since pidlock must be dropped, we cannot simply walk the
* practive list. Instead, we walk the process table, and sprlock
* each process to ensure that it does not exit during the
* calculation.
*/
mutex_enter(&pidlock);
for (i = 0; i < v.v_proc; i++) {
again:
p = pid_entry(i);
if (p == NULL)
continue;
mutex_enter(&p->p_lock);
mutex_exit(&pidlock);
if (panicstr) {
mutex_exit(&p->p_lock);
return;
}
/* Try to set P_PR_LOCK */
ret = sprtrylock_proc(p);
if (ret == -1) {
/* Process in invalid state */
mutex_exit(&p->p_lock);
mutex_enter(&pidlock);
continue;
} else if (ret == 1) {
/*
* P_PR_LOCK is already set. Wait and try again.
* This also drops p_lock.
*/
sprwaitlock_proc(p);
mutex_enter(&pidlock);
goto again;
}
mutex_exit(&p->p_lock);
vmu_calculate_proc(p);
mutex_enter(&p->p_lock);
sprunlock(p);
mutex_enter(&pidlock);
}
mutex_exit(&pidlock);
vmu_free_extra();
}
/*
* allocate a new cache for N results satisfying flags
*/
vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
vmu_cache_t *cache;
cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
cache->vmc_nresults = nres;
cache->vmc_flags = flags;
cache->vmc_refcnt = 1;
return (cache);
}
/*
* Make sure cached results are not freed
*/
static void
vmu_cache_hold(vmu_cache_t *cache)
{
ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
cache->vmc_refcnt++;
}
/*
* free cache data
*/
static void
vmu_cache_rele(vmu_cache_t *cache)
{
ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
ASSERT(cache->vmc_refcnt > 0);
cache->vmc_refcnt--;
if (cache->vmc_refcnt == 0) {
kmem_free(cache->vmc_results, sizeof (vmusage_t) *
cache->vmc_nresults);
kmem_free(cache, sizeof (vmu_cache_t));
}
}
/*
* Copy out the cached results to a caller. Inspect the caller's flags
* and zone to determine which cached results should be copied.
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
uint_t flags)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
size_t i, count = 0;
size_t bufsize;
int ret = 0;
uint_t types = 0;
if (nres != NULL) {
if (copyin((caddr_t)nres, &bufsize, sizeof (size_t)))
return (set_errno(EFAULT));
} else {
bufsize = 0;
}
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
types |= VMUSAGE_PROJECTS;
if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
types |= VMUSAGE_TASKS;
if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
types |= VMUSAGE_RUSERS;
if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
types |= VMUSAGE_EUSERS;
/* count and copy out results for the caller's zone */
out_result = buf;
for (result = cache->vmc_results, i = 0;
i < cache->vmc_nresults; result++, i++) {
/* Do not return "other-zone" results to non-global zones */
if (curproc->p_zone != global_zone &&
curproc->p_zone->zone_id != result->vmu_zoneid)
continue;
/*
* If non-global zone requests VMUSAGE_SYSTEM, fake
* up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
*/
if (curproc->p_zone != global_zone &&
(flags & VMUSAGE_SYSTEM) != 0 &&
result->vmu_type == VMUSAGE_ZONE) {
count++;
if (out_result != NULL) {
if (bufsize < count) {
ret = set_errno(EOVERFLOW);
} else {
dummy = *result;
dummy.vmu_zoneid = ALL_ZONES;
dummy.vmu_id = 0;
dummy.vmu_type = VMUSAGE_SYSTEM;
if (copyout(&dummy, out_result,
sizeof (vmusage_t)))
return (set_errno(
EFAULT));
out_result++;
}
}
}
/* Skip results that do not match requested type */
if ((result->vmu_type & types) == 0)
continue;
/* Skip collapsed results if not requested */
if (result->vmu_zoneid == ALL_ZONES) {
if (result->vmu_type == VMUSAGE_PROJECTS &&
(flags & VMUSAGE_COL_PROJECTS) == 0)
continue;
if (result->vmu_type == VMUSAGE_EUSERS &&
(flags & VMUSAGE_COL_EUSERS) == 0)
continue;
if (result->vmu_type == VMUSAGE_RUSERS &&
(flags & VMUSAGE_COL_RUSERS) == 0)
continue;
}
/* Skip "other zone" results if not requested */
if (result->vmu_zoneid != curproc->p_zone->zone_id) {
if (result->vmu_type == VMUSAGE_ZONE &&
(flags & VMUSAGE_ALL_ZONES) == 0)
continue;
if (result->vmu_type == VMUSAGE_PROJECTS &&
(flags & (VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS)) == 0)
continue;
if (result->vmu_type == VMUSAGE_TASKS &&
(flags & VMUSAGE_ALL_TASKS) == 0)
continue;
if (result->vmu_type == VMUSAGE_RUSERS &&
(flags & (VMUSAGE_ALL_RUSERS |
VMUSAGE_COL_RUSERS)) == 0)
continue;
if (result->vmu_type == VMUSAGE_EUSERS &&
(flags & (VMUSAGE_ALL_EUSERS |
VMUSAGE_COL_EUSERS)) == 0)
continue;
}
count++;
if (out_result != NULL) {
if (bufsize < count) {
ret = set_errno(EOVERFLOW);
} else {
if (copyout(result, out_result,
sizeof (vmusage_t)))
return (set_errno(EFAULT));
out_result++;
}
}
}
if (nres != NULL)
if (copyout(&count, (void *)nres, sizeof (size_t)))
return (set_errno(EFAULT));
return (ret);
}
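/*
* Illustrative only: if the caller supplied room for 4 results but 10
* match, the first 4 are copied out, the remaining matches are still
* counted, the caller's nres is set to 10, and set_errno(EOVERFLOW) is
* returned, so the caller can reallocate and retry (see the sketch near
* the top of this file).
*/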
/*
* vm_getusage()
*
* Counts rss and swap by zone, project, task, and/or user. The flags argument
* determines the type of results structures returned. Flags requesting
* results from more than one zone are "flattened" to the local zone if the
* caller is not the global zone.
*
* args:
* flags: bitmap consisting of one or more of VMUSAGE_*.
* age: maximum allowable age (time since counting was done) in
* seconds of the results. Results from previous callers are
* cached in kernel.
* buf: pointer to buffer array of vmusage_t. If NULL, then only nres
* is set on success.
* nres: Set to number of vmusage_t structures pointed to by buf
* before calling vm_getusage().
* On return of 0 (success) or EOVERFLOW, set to the number of result
* structures returned or that would have been returned.
*
* returns 0 on success, -1 on failure:
* EINTR (interrupted)
* EOVERFLOW (nres too small for results; nres set to the needed value)
* EINVAL (flags invalid)
* EFAULT (bad address for buf or nres)
*/
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
{
vmu_entity_t *entity;
vmusage_t *result;
int ret = 0;
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
/*
* Non-global zones cannot request system wide and/or collapsed
* results, or the system result, so munge the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
flags |= VMUSAGE_PROJECTS;
}
if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
flags |= VMUSAGE_RUSERS;
}
if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
flags |= VMUSAGE_EUSERS;
}
if (flags & VMUSAGE_SYSTEM) {
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
}
/* Check for unknown flags */
if ((flags & (~VMUSAGE_MASK)) != 0)
return (set_errno(EINVAL));
/* Check for no flags */
if ((flags & VMUSAGE_MASK) == 0)
return (set_errno(EINVAL));
mutex_enter(&vmu_data.vmu_lock);
now = gethrtime();
start:
if (vmu_data.vmu_cache != NULL) {
vmu_cache_t *cache;
if ((vmu_data.vmu_cache->vmc_timestamp +
((hrtime_t)age * NANOSEC)) > now)
cacherecent = 1;
if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
cacherecent == 1) {
cache = vmu_data.vmu_cache;
vmu_cache_hold(cache);
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
cv_broadcast(&vmu_data.vmu_cv);
mutex_exit(&vmu_data.vmu_lock);
return (ret);
}
/*
* If the cache is recent, it is likely that there are other
* consumers of vm_getusage running, so add their flags to the
* desired flags for the calculation.
*/
if (cacherecent == 1)
flags = vmu_data.vmu_cache->vmc_flags | flags;
}
if (vmu_data.vmu_calc_thread == NULL) {
vmu_cache_t *cache;
vmu_data.vmu_calc_thread = curthread;
vmu_data.vmu_calc_flags = flags;
vmu_data.vmu_entities = NULL;
vmu_data.vmu_nentities = 0;
if (vmu_data.vmu_pending_waiters > 0)
vmu_data.vmu_calc_flags |=
vmu_data.vmu_pending_flags;
vmu_data.vmu_pending_flags = 0;
mutex_exit(&vmu_data.vmu_lock);
vmu_calculate();
mutex_enter(&vmu_data.vmu_lock);
/* copy results to cache */
if (vmu_data.vmu_cache != NULL)
vmu_cache_rele(vmu_data.vmu_cache);
cache = vmu_data.vmu_cache =
vmu_cache_alloc(vmu_data.vmu_nentities,
vmu_data.vmu_calc_flags);
result = cache->vmc_results;
for (entity = vmu_data.vmu_entities; entity != NULL;
entity = entity->vme_next) {
*result = entity->vme_result;
result++;
}
cache->vmc_timestamp = gethrtime();
vmu_cache_hold(cache);
vmu_data.vmu_calc_flags = 0;
vmu_data.vmu_calc_thread = NULL;
if (vmu_data.vmu_pending_waiters > 0)
cv_broadcast(&vmu_data.vmu_cv);
mutex_exit(&vmu_data.vmu_lock);
/* copy cache */
ret = vmu_copyout_results(cache, buf, nres, flags_orig);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);
return (ret);
}
vmu_data.vmu_pending_flags |= flags;
vmu_data.vmu_pending_waiters++;
while (vmu_data.vmu_calc_thread != NULL) {
if (cv_wait_sig(&vmu_data.vmu_cv,
&vmu_data.vmu_lock) == 0) {
vmu_data.vmu_pending_waiters--;
mutex_exit(&vmu_data.vmu_lock);
return (set_errno(EINTR));
}
}
vmu_data.vmu_pending_waiters--;
goto start;
}