vm_usage.c revision dc32d872cbeb56532bcea030255db9cd79bac7da
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* vm_usage
*
* This file implements the getvmusage() private system call.
* getvmusage() counts the amount of resident memory pages and swap
* reserved by the specified process collective. A "process collective" is
* the set of processes owned by a particular zone, project, task, or user.
*
* rss and swap are counted so that for a given process collective, a page is
* only counted once. For example, this means that if multiple processes in
* the same project map the same page, then the project will only be charged
* once for that page. On the other hand, if two processes in different
* projects map the same page, then both projects will be charged
* for the page.
*
* The vm_getusage() calculation is implemented so that the first thread
* performs the calculation, and other requesting threads wait for it to
* finish, copying the results. This enables multiple rcapds and prstats to
* consume data from the same calculation. The results are also cached so that
* a caller interested in recent results can just copy them instead of starting
* a new calculation. The caller passes the maximum age (in seconds) of the
* data. If the cached data is young enough, the cache is copied, otherwise,
* a new calculation is executed and the cache is replaced with the new
* data.
*
* The rss calculation for each process collective is as follows:
*
* - Inspect flags, determine if counting rss for zones, projects, tasks,
* and/or users.
* - For each proc:
* - For each seg in proc's address space:
* - If seg is private:
* - Lookup anons in the amp.
* - For incore pages not previously visited for each of the
* proc's collectives, add incore pagesize to each
* collective.
* Anons with a refcnt of 1 can be assumed to be not
* previously visited.
* - For address ranges without anons in the amp:
* - Lookup pages in underlying vnode.
* - For incore pages not previously visited for
* each of the proc's collectives, add incore
* pagesize to each collective.
* - If seg is shared:
* - Lookup pages in the shared amp or vnode.
* - For incore pages not previously visited for each of
* the proc's collectives, add incore pagesize to each
* collective.
*
* Swap is reserved by private segments, and shared anonymous segments.
* The only shared anon segments which do not reserve swap are ISM segments
* and schedctl segments, both of which can be identified by having
* amp->swresv == 0.
*
* The swap calculation for each collective is as follows:
*
* - Inspect flags, determine if counting swap for zones, projects, tasks,
* and/or users.
* - For each proc:
* - For each seg in proc's address space:
* - If seg is private:
* - Add svd->swresv pages to swap count for each of the
* proc's collectives.
* - If seg is anon, shared, and amp->swresv != 0
* - For address ranges in amp not previously visited for
* each of the proc's collectives, add size of address
* range to the swap count for each collective.
*
* These two calculations are done simultaneously, with most of the work
* being done in vmu_calculate_seg(). The results of the calculation are
* copied into "vmu_data.vmu_cache_results".
*
* To perform the calculation, various things are tracked and cached:
*
* - incore/not-incore page ranges for all vnodes.
* (vmu_data.vmu_all_vnodes_hash)
* This eliminates looking up the same page more than once.
*
* - incore/not-incore page ranges for all shared amps.
* (vmu_data.vmu_all_amps_hash)
* This eliminates looking up the same page more than once.
*
* - visited page ranges for each collective.
* - per vnode (entity->vme_vnode_hash)
* - per shared amp (entity->vme_amp_hash)
* For accurate counting of map-shared and COW-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
* (entity->vme_anon_hash)
* For accurate counting of COW-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
* collectives:
*
* - A zone.
* - A project, task, or user within a zone.
* - The entire system (vmu_data.vmu_system).
* - Each collapsed (col) project and user. This means a given projid or
* uid, regardless of which zone the process is in. For instance,
* project 0 in the global zone and project 0 in a non-global zone are
* the same collapsed project.
*
* Each entity structure tracks which pages have been already visited for
* that entity (via previously inspected processes) so that these pages are
* not double counted.
*/
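/*
 * Illustrative sketch (not part of the original file): how a consumer such as
 * rcapd or prstat might call the getvmusage(2) libc wrapper over this private
 * syscall. The ENOSPC retry mirrors the nres contract documented at
 * vm_getusage() below; fetch_zone_usage() is a hypothetical name.
 */
#if 0	/* user-level example, not kernel code */
#include <sys/vm_usage.h>
#include <stdlib.h>
#include <errno.h>

static vmusage_t *
fetch_zone_usage(size_t *nresp)
{
    size_t nres = 8;
    vmusage_t *buf = malloc(nres * sizeof (vmusage_t));

    if (buf == NULL)
        return (NULL);
    /* Accept cached results up to 5 seconds old. */
    while (getvmusage(VMUSAGE_ZONE, 5, buf, &nres) != 0) {
        if (errno != ENOSPC) {
            free(buf);
            return (NULL);
        }
        /* nres now holds the required count; grow and retry. */
        free(buf);
        if ((buf = malloc(nres * sizeof (vmusage_t))) == NULL)
            return (NULL);
    }
    *nresp = nres;
    return (buf);
}
#endif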
#include <sys/types.h>
#include <sys/param.h>
#include <sys/avl.h>
#include <sys/kmem.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysmacros.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>
#define VMUSAGE_HASH_SIZE 512
#define VMUSAGE_TYPE_VNODE 1
#define VMUSAGE_TYPE_AMP 2
#define VMUSAGE_TYPE_ANON 3
#define VMUSAGE_BOUND_UNKNOWN 0
#define VMUSAGE_BOUND_INCORE 1
#define VMUSAGE_BOUND_NOT_INCORE 2
/*
* bounds for vnodes and shared amps
* Each bound is either entirely incore, entirely not in core, or
* entirely unknown. bounds are stored in an avl tree sorted by start member
* when in use, otherwise (free or temporary lists) they're strung
* together off of vmb_next.
*/
typedef struct vmu_bound {
    avl_node_t vmb_node;
    struct vmu_bound *vmb_next; /* NULL in tree else on free/temp list */
    pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
    pgcnt_t vmb_end;    /* page offset in vnode/amp on which bound ends */
    char vmb_type;      /* One of VMUSAGE_BOUND_* */
} vmu_bound_t;
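/*
 * Illustrative sketch (assumption; vmu_bounds_incore_pages() is a
 * hypothetical helper, not part of this file): summing the resident pages
 * recorded in a list of bounds strung off vmb_next. Bounds are inclusive of
 * both endpoints, so a single-page bound has vmb_start == vmb_end.
 */
static pgcnt_t
vmu_bounds_incore_pages(vmu_bound_t *first)
{
    pgcnt_t pages = 0;
    vmu_bound_t *b;

    for (b = first; b != NULL; b = b->vmb_next) {
        if (b->vmb_type == VMUSAGE_BOUND_INCORE)
            pages += b->vmb_end - b->vmb_start + 1;
    }
    return (pages);
}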
/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp. The bounds tree tracks the page ranges
 * known to be incore or not incore.
 */
typedef struct vmu_object {
    struct vmu_object *vmo_next; /* free list */
    caddr_t vmo_key;             /* address of vnode or amp */
    short vmo_type;              /* One of VMUSAGE_TYPE_* */
    avl_tree_t vmo_bounds;       /* incore/not-incore page ranges */
} vmu_object_t;
/*
* Entity by which to count results.
*
* Each entity holds the result counters for one accounting id
* (zone, project, etc), and hashes of vm structures that have already
* been visited for the entity.
*
* vme_next: links the list of all entities currently being counted by
* vmu_calculate().
*
* vme_next_calc: links the list of entities related to the current process
* being counted by vmu_calculate_proc().
*
* vmu_calculate() walks all processes. For each process, it makes a
* list of the entities related to that process using vme_next_calc. This
* list changes each time vmu_calculate_proc() is called.
*
*/
typedef struct vmu_entity {
    struct vmu_entity *vme_next;
    struct vmu_entity *vme_next_calc;
    mod_hash_t *vme_vnode_hash;  /* vnodes visited for entity */
    mod_hash_t *vme_amp_hash;    /* shared amps visited for entity */
    mod_hash_t *vme_anon_hash;   /* COW anons visited for entity */
    vmusage_t vme_result;        /* identifies entity and results */
} vmu_entity_t;
/*
* Hash of entities visited within a zone, and an entity for the zone
* itself.
*/
typedef struct vmu_zone {
    struct vmu_zone *vmz_next;   /* free list */
    id_t vmz_id;
    vmu_entity_t *vmz_zone;      /* entity for the zone itself */
    mod_hash_t *vmz_projects_hash;
    mod_hash_t *vmz_tasks_hash;
    mod_hash_t *vmz_rusers_hash;
    mod_hash_t *vmz_eusers_hash;
} vmu_zone_t;
/*
* Cache of results from last calculation
*/
typedef struct vmu_cache {
    vmusage_t *vmc_results;      /* Results from last calculation */
    uint64_t vmc_nresults;       /* Count of cached results */
    uint64_t vmc_refcnt;         /* Reference count on cached results */
    uint_t vmc_flags;            /* Flags used by the calculation for */
                                 /* vm_getusage(). */
    hrtime_t vmc_timestamp;      /* When the calculation was done */
} vmu_cache_t;
/*
* top level rss info for the system
*/
typedef struct vmu_data {
    kmutex_t vmu_lock;                   /* Protects vmu_data */
    kcondvar_t vmu_cv;                   /* Used to signal threads */
                                         /* Waiting for */
                                         /* Rss_calc_thread to finish */
    vmu_entity_t *vmu_system;            /* Entity for tracking */
                                         /* rss/swap for all processes */
                                         /* in all zones */
    mod_hash_t *vmu_zones_hash;          /* Zones visited */
    mod_hash_t *vmu_projects_col_hash;   /* These *_col_hash hashes */
    mod_hash_t *vmu_rusers_col_hash;     /* keep track of entities, */
    mod_hash_t *vmu_eusers_col_hash;     /* ignoring zoneid, in order */
                                         /* to implement VMUSAGE_COL_* */
                                         /* flags, which aggregate by */
                                         /* project or user regardless */
                                         /* of zoneid. */
    mod_hash_t *vmu_all_vnodes_hash;     /* System-wide visited vnodes */
                                         /* to track incore/not-incore */
    mod_hash_t *vmu_all_amps_hash;       /* System-wide visited shared */
                                         /* amps to track incore/not- */
                                         /* incore */
    vmu_entity_t *vmu_entities;          /* Linked list of entities */
                                         /* counted by */
                                         /* vmu_calculate() */
    size_t vmu_nentities;                /* Count of entities in list */
    vmu_cache_t *vmu_cache;              /* Cached results */
    kthread_t *vmu_calc_thread;          /* NULL, or thread running */
                                         /* currently running calc */
                                         /* thread */
    uint_t vmu_calc_flags;               /* Flags being used by calc */
    uint_t vmu_pending_flags;            /* Flags of vm_getusage() */
                                         /* threads waiting for */
                                         /* calc thread to finish */
    uint_t vmu_pending_waiters;          /* Number of threads waiting */
                                         /* for calc thread */
    vmu_bound_t *vmu_free_bounds;
    vmu_object_t *vmu_free_objects;
    vmu_entity_t *vmu_free_entities;
    vmu_zone_t *vmu_free_zones;
} vmu_data_t;
extern zone_t *global_zone;
extern struct seg_ops segspt_shmops;
static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;
/*
* Comparison routine for AVL tree. We base our comparison on vmb_start.
*/
static int
bounds_cmp(const void *bnd1, const void *bnd2)
{
    const vmu_bound_t *bound1 = bnd1;
    const vmu_bound_t *bound2 = bnd2;

    if (bound1->vmb_start == bound2->vmb_start) {
        return (0);
    }
    if (bound1->vmb_start < bound2->vmb_start) {
        return (-1);
    }
    return (1);
}
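/*
 * Illustrative sketch (assumption; vmu_object_tree_init() is a hypothetical
 * helper): a bounds tree is set up with this comparator via avl_create(),
 * keyed on the avl node embedded in each vmu_bound_t.
 */
static void
vmu_object_tree_init(vmu_object_t *object)
{
    avl_create(&object->vmo_bounds, bounds_cmp,
        sizeof (vmu_bound_t), offsetof(vmu_bound_t, vmb_node));
}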
/*
* Save a bound on the free list.
*/
static void
vmu_free_bound(vmu_bound_t *bound)
{
    bound->vmb_next = vmu_data.vmu_free_bounds;
    vmu_data.vmu_free_bounds = bound;
}
/*
* Free an object, and all visited bound info.
*/
static void
vmu_free_object(mod_hash_val_t val)
{
    vmu_object_t *obj = (vmu_object_t *)val;
    vmu_bound_t *bound;
    void *cookie = NULL;

    while ((bound = avl_destroy_nodes(&obj->vmo_bounds, &cookie)) != NULL)
        vmu_free_bound(bound);
    avl_destroy(&obj->vmo_bounds);
    obj->vmo_next = vmu_data.vmu_free_objects;
    vmu_data.vmu_free_objects = obj;
}
/*
* Free an entity, and hashes of visited objects for that entity.
*/
static void
vmu_free_entity(mod_hash_val_t val)
{
    vmu_entity_t *entity = (vmu_entity_t *)val;

    if (entity->vme_vnode_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_vnode_hash);
    if (entity->vme_amp_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_amp_hash);
    if (entity->vme_anon_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_anon_hash);
    entity->vme_next = vmu_data.vmu_free_entities;
    vmu_data.vmu_free_entities = entity;
}
/*
* Free zone entity, and all hashes of entities inside that zone,
* which are projects, tasks, and users.
*/
static void
vmu_free_zone(mod_hash_val_t val)
{
    vmu_zone_t *zone = (vmu_zone_t *)val;

    if (zone->vmz_zone != NULL) {
        vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
        zone->vmz_zone = NULL;
    }
    if (zone->vmz_projects_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_projects_hash);
    if (zone->vmz_tasks_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
    if (zone->vmz_rusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
    if (zone->vmz_eusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
    zone->vmz_next = vmu_data.vmu_free_zones;
    vmu_data.vmu_free_zones = zone;
}
/*
* Initialize synchronization primitives and hashes for system-wide tracking
* of visited vnodes and shared amps. Initialize results cache.
*/
void
vm_usage_init()
{
    mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);

    vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
        "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (vnode_t));
    vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
        "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (struct anon_map));
    vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
        "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_zones_hash = mod_hash_create_idhash(
        "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);

    vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
        sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    vmu_object_cache = kmem_cache_create("vmu_object_cache",
        sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

    vmu_data.vmu_entities = NULL;
    vmu_data.vmu_nentities = 0;
    vmu_data.vmu_calc_flags = 0;
}
/*
* Allocate hashes for tracking vm objects visited for an entity.
* Update list of entities.
*/
static vmu_entity_t *
{
} else {
}
sizeof (vnode_t));
sizeof (struct anon_map));
"vmusage anon hash", VMUSAGE_HASH_SIZE,
mod_hash_null_valdtor, sizeof (struct anon));
return (entity);
}
/*
* Allocate a zone entity, and hashes for tracking visited vm objects
* for projects, tasks, and users within that zone.
*/
static vmu_zone_t *
{
} else {
}
return (zone);
}
/*
* Allocate a structure for tracking visited bounds for a vm object.
*/
static vmu_object_t *
{
} else {
}
return (object);
}
/*
* Allocate and return a bound structure.
*/
static vmu_bound_t *
{
} else {
}
return (bound);
}
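/*
 * Illustrative sketch (assumption, shown under a hypothetical name): the
 * truncated allocator above plausibly follows the same pattern as the other
 * vmu_alloc_* routines, taking a recycled bound from the free list before
 * falling back to the kmem cache.
 */
static vmu_bound_t *
vmu_alloc_bound_sketch(void)
{
    vmu_bound_t *bound;

    if (vmu_data.vmu_free_bounds != NULL) {
        bound = vmu_data.vmu_free_bounds;
        vmu_data.vmu_free_bounds = bound->vmb_next;
        bound->vmb_next = NULL;
    } else {
        bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
    }
    return (bound);
}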
/*
* vmu_find_insert_* functions implement hash lookup or allocate and
* insert operations.
*/
static vmu_object_t *
{
int ret;
(mod_hash_val_t *)&object);
if (ret != 0) {
}
return (object);
}
static int
{
int ret;
(mod_hash_val_t *)&val);
if (ret == 0)
return (0);
return (1);
}
static vmu_entity_t *
{
int ret;
(mod_hash_val_t *)&entity);
if (ret != 0) {
(mod_hash_hndl_t)0);
}
return (entity);
}
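/*
 * Illustrative sketch (assumption; the allocator name is hypothetical): the
 * find-or-insert pattern the vmu_find_insert_* routines implement, shown for
 * an object hash. A failed i_mod_hash_find_nosync() means the key is new, so
 * a fresh object is allocated and inserted under the same key.
 */
static vmu_object_t *
vmu_find_insert_object_sketch(mod_hash_t *hash, caddr_t key, uint_t type)
{
    int ret;
    vmu_object_t *object;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t *)&object);
    if (ret != 0) {
        object = vmu_alloc_object(key, type); /* hypothetical allocator */
        ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
            (mod_hash_val_t)object, (mod_hash_hndl_t)0);
        ASSERT(ret == 0);
    }
    return (object);
}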
/*
* Returns list of object bounds between start and end. New bounds inserted
* by this call are given type.
*
* Returns the number of pages covered if new bounds are created. Returns 0
* if the range between start and end was already covered by existing bounds.
*/
static pgcnt_t
{
tmp = vmu_alloc_bound();
/* Hopelessly optimistic case. */
/* We got lucky. */
}
/* Is start in the previous node? */
/* We found start. */
}
}
}
/*
* At this point, if *first is still NULL, then we
* didn't get a direct hit and start isn't covered
* by the previous node. We know that the next node
* must have a greater start value than we require
* because avl_find tells us where the AVL routines would
* insert our new node. We have some gap between the
* start we want and the next node.
*/
/* Fill the gap. */
} else {
/* We have a gap over [start, end]. */
}
}
/* We're done. */
return (ret);
}
/*
* If we are here we still need to set *last and
* that may involve filling in some gaps.
*/
for (;;) {
/* We're done. */
break;
}
/* Bottom or mid tree with gap. */
tmp = vmu_alloc_bound();
break;
} else {
/* Non-contiguous. */
tmp = vmu_alloc_bound();
} else {
}
}
}
return (ret);
}
/*
* vmu_update_bounds()
*
* tree: avl_tree in which first and last hang.
*
* first, last: list of continuous bounds, of which zero or more are of
* type VMUSAGE_BOUND_UNKNOWN.
*
* new_tree: avl_tree in which new_first and new_last hang.
*
* new_first, new_last: list of continuous bounds, of which none are of
* type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
* update the types of bounds in (first,last) with
* type VMUSAGE_BOUND_UNKNOWN.
*
* For the list of bounds (first,last), this function updates any bounds
* with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
* the list (new_first, new_last).
*
* If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
* (new_first, new_last), it will be split into multiple bounds.
*
* Return value:
* The number of pages in the list of bounds (first,last) that were of
* type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
* VMUSAGE_BOUND_INCORE.
*
*/
static pgcnt_t
{
/*
* Verify first and last bound are covered by new bounds if they
* have unknown type.
*/
for (;;) {
/* If bound already has type, proceed to next bound. */
break;
continue;
}
/* need to split bound */
tmp = vmu_alloc_bound();
} else {
break;
}
}
return (rss);
}
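/*
 * Illustrative sketch (simplified assumption): the core of the update walk on
 * a plain list representation, ignoring the case where an unknown bound spans
 * several new bounds and must be split. The real routine also works against
 * the AVL trees passed in.
 */
static pgcnt_t
vmu_update_bounds_sketch(vmu_bound_t *first, vmu_bound_t *new_first)
{
    pgcnt_t rss = 0;
    vmu_bound_t *cur = first;
    vmu_bound_t *new_cur = new_first;

    while (cur != NULL) {
        if (cur->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            cur = cur->vmb_next;    /* type already known */
            continue;
        }
        /* Advance the new list until it overlaps cur. */
        while (new_cur != NULL && new_cur->vmb_end < cur->vmb_start)
            new_cur = new_cur->vmb_next;
        if (new_cur == NULL)
            break;    /* unknown bounds must be covered; see above */
        /* Assume (for this sketch) new_cur covers all of cur. */
        cur->vmb_type = new_cur->vmb_type;
        if (cur->vmb_type == VMUSAGE_BOUND_INCORE)
            rss += cur->vmb_end - cur->vmb_start + 1;
        cur = cur->vmb_next;
    }
    return (rss);
}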
/*
* Merges adjacent bounds with same type between first and last bound.
* After merge, last pointer may point to a different bound, as (incoming)
* last bound may have been merged away.
*/
static void
{
}
} else {
}
}
}
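/*
 * Illustrative sketch (simplified assumption): merging adjacent same-type
 * bounds on a plain vmb_next list; the real routine merges AVL-tree
 * neighbors and may free the incoming last bound, as noted above.
 */
static void
vmu_merge_bounds_sketch(vmu_bound_t *first)
{
    vmu_bound_t *cur = first;
    vmu_bound_t *next;

    while (cur != NULL && (next = cur->vmb_next) != NULL) {
        if (cur->vmb_type == next->vmb_type &&
            cur->vmb_end + 1 == next->vmb_start) {
            /* Absorb next into cur and recycle it. */
            cur->vmb_end = next->vmb_end;
            cur->vmb_next = next->vmb_next;
            vmu_free_bound(next);
        } else {
            cur = next;
        }
    }
}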
/*
* Given an amp and a list of bounds, updates each bound's type with
* VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
*
* If a bound is partially incore, it will be split into two bounds.
* first and last may be modified, as bounds may be split into multiple
* bounds if they are partially incore/not-incore.
*
* Set incore to non-zero if bounds are already known to be incore.
*
*/
static void
{
short bound_type;
short page_type;
/* Shared anon slots don't change once set. */
for (;;) {
break;
continue;
}
/*
* These are used to determine how much to increment
* index when a large page is found.
*/
- 1;
}
} else {
}
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
/*
* If current bound type does not match page
* type, need to split off new bound.
*/
tmp = vmu_alloc_bound();
}
if (pgcnt > 1) {
/*
* If inside large page, jump to next large
* page
*/
} else {
index++;
}
}
break;
} else
}
}
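/*
 * Illustrative sketch (assumption; the helper name is hypothetical): how a
 * single shared-amp slot can be classified, using swap_xlate() to find the
 * backing vnode/offset for the anon slot and page_exists() to test residency.
 * The real routine also widens its step when it detects a large page.
 */
static short
vmu_anon_slot_type(struct anon_map *amp, ulong_t index)
{
    struct anon *ap;
    struct vnode *vn;
    anoff_t off;

    ap = anon_get_ptr(amp->ahp, index);
    if (ap == NULL)
        return (VMUSAGE_BOUND_NOT_INCORE);
    swap_xlate(ap, &vn, &off);
    if (vn != NULL && page_exists(vn, (u_offset_t)off) != NULL)
        return (VMUSAGE_BOUND_INCORE);
    return (VMUSAGE_BOUND_NOT_INCORE);
}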
/*
* Same as vmu_amp_update_incore_bounds(), except for tracking
* incore-/not-incore for vnodes.
*/
static void
{
short bound_type;
short page_type;
for (;;) {
break;
continue;
}
/*
* These are used to determine how much to increment
* index when a large page is found.
*/
- 1;
}
} else {
}
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
/*
* If current bound type does not match page
* type, need to split off new bound.
*/
tmp = vmu_alloc_bound();
}
if (pgcnt > 1) {
/*
* If inside large page, jump to next large
* page
*/
} else {
index++;
}
}
break;
} else
}
}
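/*
 * Illustrative sketch (assumption; hypothetical helper name): the vnode case
 * is simpler, since the page index within the object maps directly to a byte
 * offset in the vnode.
 */
static short
vmu_vnode_page_type(vnode_t *vnode, pgcnt_t index)
{
    if (vnode->v_pages != NULL &&
        page_exists(vnode, (u_offset_t)ptob(index)) != NULL)
        return (VMUSAGE_BOUND_INCORE);
    return (VMUSAGE_BOUND_NOT_INCORE);
}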
/*
* Calculate the rss and swap consumed by a segment. vmu_entities is the
* list of entities to visit. For shared segments, the vnode or amp
* is looked up in each entity to see if it has been already counted. Private
* anon pages are checked per entity to ensure that COW pages are not
* double counted.
*
* For private mapped files, first the amp is checked for private pages.
* Bounds not backed by the amp are looked up in the vnode for each entity
* to avoid double counting of private COW vnode pages.
*/
static void
{
struct segvn_data *svd;
int file = 0;
/* Can zero-length segments exist? Not sure, so paranoia. */
return;
/*
* Figure out if there is a shared object (such as a named vnode or
* a shared amp), then figure out if there is a private amp, which
* identifies private pages.
*/
} else {
RW_READER) != 0) {
/*
* Text replication anon maps can be shared
* across all zones. Space used for text
* replication is typically capped as a small %
* of memory. To keep it simple for now we
* don't account for swap and memory space used
* for text replication.
*/
}
}
}
file = 1;
}
/* schedctl mappings are always in core */
}
s_start = 0;
/* ism segments are always incore and do not reserve swap */
} else {
return;
}
/*
* If there is a private amp, count anon pages that exist. If an
* anon has a refcnt > 1 (COW sharing), then save the anon in a
* hash so that it is not double counted.
*
* If there is also a shared object, then figure out the bounds
* which are not mapped by the private amp.
*/
if (private_amp != NULL) {
/* Enter as writer to prevent COW anons from being freed */
int cnt;
&p_index_next);
/*
* If next anon is past end of mapping, simulate
* end of anon so loop terminates.
*/
if (p_index_next > p_end) {
}
/*
* For COW segments, keep track of bounds not
* backed by private amp so they can be looked
* up in the backing vnode
*/
if (p_index_next != p_index) {
/*
* Compute index difference between anon and
* previous anon.
*/
if (shared_object != NULL) {
cur = vmu_alloc_bound();
} else {
}
}
}
/* Detect end of anons in amp */
break;
p_index++;
s_index++;
continue;
}
/*
* If large page is found, compute portion of large
* page in mapping, and increment indices to the next
* large page.
*/
/* First page in large page */
/* Last page in large page */
/*
* Artificially end page if page extends past
* end of mapping.
*/
/*
* Compute number of pages from large page
* which are mapped.
*/
/*
* Point indices at page after large page,
* or at page after end of mapping.
*/
} else {
p_index++;
s_index++;
}
/*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
*/
if (cnt == 1) {
continue;
}
/*
* Track COW anons per entity so
* they are not double counted.
*/
continue;
}
}
}
/* Add up resident anon and swap reserved for private mappings */
}
}
/* Compute resident pages backing shared amp or named vnode */
if (shared_object != NULL) {
/*
* No private amp, or private amp has no anon
* structs. This means entire segment is backed by
* the shared object.
*/
first = vmu_alloc_bound();
}
/*
* Iterate bounds not backed by private amp, and compute
* resident pages.
*/
/* new bounds, find incore/not-incore */
if (shared_object->vmo_type ==
tree,
(vnode_t *)
&last);
} else {
tree,
(struct anon_map *)
}
}
if (virt == 0)
continue;
/*
* Range visited for this entity
*/
/* shared anon mapping */
result->vmu_swap_all +=
result->vmu_rss_shared +=
/* shared file mapping */
result->vmu_rss_shared +=
/* private file mapping */
}
}
}
}
}
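/*
 * Illustrative sketch (assumption; hypothetical helper name): once a range of
 * resident pages has been attributed to an entity, the counters in its
 * vmusage_t result are advanced along these lines, matching the
 * vmu_rss_all/vmu_rss_shared updates visible in the fragments above.
 */
static void
vmu_charge_rss(vmu_entity_t *entity, pgcnt_t pages, boolean_t shared)
{
    vmusage_t *result = &entity->vme_result;
    uint64_t bytes = (uint64_t)pages * PAGESIZE;

    result->vmu_rss_all += bytes;
    if (shared)
        result->vmu_rss_shared += bytes;
    else
        result->vmu_rss_private += bytes;
}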
/*
* Based on the current calculation flags, find the entities that are
* relevant to the current process. Then calculate rss and swap for each
* segment in the process's address space for each relevant entity.
*/
static void
{
int ret;
/* Figure out which entities are being computed */
}
if (vmu_data.vmu_calc_flags &
(mod_hash_val_t *)&zone);
if (ret != 0) {
}
}
if (vmu_data.vmu_calc_flags &
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
}
if (vmu_data.vmu_calc_flags &
(VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
}
}
/* Entities which collapse projects and users for all zones */
}
}
}
/* process all segs in process's address space */
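    /*
     * Sketch of the elided walk (assumption, not the original body): visit
     * each segment with the address space held as reader, using the
     * iteration macros from <vm/as.h>. "as", "seg", and "entities" are
     * locals assumed to be declared in the elided portion of this function,
     * and the lock macro signature varies across illumos revisions.
     */
    as = p->p_as;
    AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
    for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
        vmu_calculate_seg(entities, seg);
    AS_LOCK_EXIT(as, &as->a_lock);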
}
}
/*
* Free data created by previous call to vmu_calculate().
*/
static void
{
}
/*
* Free unused data structures. These can result if the system workload
* decreases between calculations.
*/
static void
{
vmu_zone_t *tz;
}
}
}
}
}
extern kcondvar_t *pr_pid_cv;
/*
* Determine which entity types are relevant and allocate the hashes to
* track them. Then walk the process table and count rss and swap
* for each process's address space. Address space objects such as
* vnodes, amps and anons are tracked per entity, so that they are
* not double counted in the results.
*
*/
static void
{
int i = 0;
int ret;
proc_t *p;
/*
* Walk process table and calculate rss of each proc.
*
* Pidlock and p_lock cannot be held while doing the rss calculation.
* This is because:
* 1. The calculation allocates using KM_SLEEP.
* 2. The calculation grabs a_lock, which cannot be grabbed
* after p_lock.
*
* Since pidlock must be dropped, we cannot simply walk the
* practive list. Instead, we walk the process table, and sprlock
* each process to ensure that it does not exit during the
* calculation.
*/
    for (i = 0; i < v.v_proc; i++) {
again:
        mutex_enter(&pidlock);
        p = pid_entry(i);
        if (p == NULL) {
            mutex_exit(&pidlock);
            continue;
        }
        mutex_enter(&p->p_lock);
        mutex_exit(&pidlock);

        if (panicstr) {
            mutex_exit(&p->p_lock);
            return;
        }

        /* Try to set P_PR_LOCK */
        ret = sprtrylock_proc(p);
        if (ret == -1) {
            /* Process in invalid state */
            mutex_exit(&p->p_lock);
            continue;
        } else if (ret == 1) {
            /*
             * P_PR_LOCK is already set. Wait and try again.
             * This also drops p_lock.
             */
            sprwaitlock_proc(p);
            goto again;
        }
        mutex_exit(&p->p_lock);

        vmu_calculate_proc(p);

        mutex_enter(&p->p_lock);
        sprunlock(p);
    }
}
/*
* allocate a new cache for N results satisfying flags
*/
static vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
    vmu_cache_t *cache;

    cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
    cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
    cache->vmc_nresults = nres;
    cache->vmc_flags = flags;
    cache->vmc_refcnt = 1;
    return (cache);
}
/*
* Make sure cached results are not freed
*/
static void
vmu_cache_hold(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    cache->vmc_refcnt++;
}
/*
* free cache data
*/
static void
vmu_cache_rele(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    ASSERT(cache->vmc_refcnt > 0);
    cache->vmc_refcnt--;
    if (cache->vmc_refcnt == 0) {
        kmem_free(cache->vmc_results, sizeof (vmusage_t) *
            cache->vmc_nresults);
        kmem_free(cache, sizeof (vmu_cache_t));
    }
}
/*
* Copy out the cached results to a caller. Inspect the caller's flags
* and zone to determine which cached results should be copied.
*/
static int
{
int ret = 0;
} else {
bufsize = 0;
}
/* figure out what results the caller is interested in. */
types |= VMUSAGE_SYSTEM;
types |= VMUSAGE_ZONE;
types |= VMUSAGE_TASKS;
types |= VMUSAGE_RUSERS;
types |= VMUSAGE_EUSERS;
/* count results for current zone */
out_result = buf;
/* Do not return "other-zone" results to non-global zones */
continue;
/*
* If non-global zone requests VMUSAGE_SYSTEM, fake
* up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
*/
(flags & VMUSAGE_SYSTEM) != 0 &&
count++;
if (out_result != NULL) {
} else {
out_result++;
}
}
}
/* Skip results that do not match requested type */
continue;
/* Skip collated results if not requested */
(flags & VMUSAGE_COL_PROJECTS) == 0)
continue;
(flags & VMUSAGE_COL_EUSERS) == 0)
continue;
(flags & VMUSAGE_COL_RUSERS) == 0)
continue;
}
/* Skip "other zone" results if not requested */
(flags & VMUSAGE_ALL_ZONES) == 0)
continue;
(flags & (VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS)) == 0)
continue;
(flags & VMUSAGE_ALL_TASKS) == 0)
continue;
(flags & (VMUSAGE_ALL_RUSERS |
VMUSAGE_COL_RUSERS)) == 0)
continue;
(flags & (VMUSAGE_ALL_EUSERS |
VMUSAGE_COL_EUSERS)) == 0)
continue;
}
count++;
if (out_result != NULL) {
} else {
out_result++;
}
}
}
return (ret);
}
/*
* vm_getusage()
*
* Counts rss and swap by zone, project, task, and/or user. The flags argument
* determines the type of results structures returned. Flags requesting
* results from more than one zone are "flattened" to the local zone if the
* caller is not the global zone.
*
* args:
* flags: bitmap consisting of one or more of VMUSAGE_*.
* age: maximum allowable age (time since counting was done) in
* seconds of the results. Results from previous callers are
* cached in kernel.
* buf: pointer to buffer array of vmusage_t. If NULL, then only nres
* is set on success.
* nres: Set to number of vmusage_t structures pointed to by buf
* before calling vm_getusage().
* On return 0 (success) or ENOSPC, nres is set to the number of result
* structures returned or attempted to return.
*
* returns 0 on success, -1 on failure:
* EINTR (interrupted)
* ENOSPC (nres too small for results, nres set to needed value for success)
* EINVAL (flags invalid)
* EFAULT (bad address for buf or nres)
*/
int
{
int ret = 0;
int cacherecent = 0;
/*
* Non-global zones cannot request system-wide and/or collated
* results, or the system result, so munge the flags accordingly.
*/
flags_orig = flags;
}
flags |= VMUSAGE_RUSERS;
}
flags |= VMUSAGE_EUSERS;
}
if (flags & VMUSAGE_SYSTEM) {
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
}
/* Check for unknown flags */
if ((flags & (~VMUSAGE_MASK)) != 0)
/* Check for no flags */
if ((flags & VMUSAGE_MASK) == 0)
cacherecent = 1;
cacherecent == 1) {
cpflg);
if (vmu_data.vmu_pending_waiters > 0)
return (ret);
}
/*
* If the cache is recent, it is likely that there are other
* consumers of vm_getusage running, so add their flags to the
* desired flags for the calculation.
*/
if (cacherecent == 1)
}
vmu_data.vmu_nentities = 0;
if (vmu_data.vmu_pending_waiters > 0)
/* copy results to cache */
result++;
}
vmu_data.vmu_calc_flags = 0;
if (vmu_data.vmu_pending_waiters > 0)
/* copy cache */
return (ret);
}
}
}
goto start;
}
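/*
 * Illustrative sketch (assumption, using the cache fields defined above): the
 * freshness test near the elided "start:" label that the "goto start" above
 * loops back to. Cached results are reused when they are younger than the
 * caller's maximum age and were computed with a superset of the requested
 * flags; otherwise a new calculation replaces the cache.
 */
#if 0
    mutex_enter(&vmu_data.vmu_lock);
    if (vmu_data.vmu_cache != NULL &&
        (vmu_data.vmu_cache->vmc_timestamp +
        ((hrtime_t)age * NANOSEC)) > gethrtime() &&
        (vmu_data.vmu_cache->vmc_flags & flags) == flags)
        cacherecent = 1;
#endif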