cap_util.c revision b885580b43755ee4ea1e280b85428893d2ba9291
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Support for determining capacity and utilization of performance relevant
* hardware components in a computer
*
* THEORY
* ------
* The capacity and utilization of the performance relevant hardware components
* are needed to optimize performance while minimizing the amount of power used
* on a system. The idea is to use hardware performance counters and
* potentially other means to determine the capacity and utilization of
* performance relevant hardware components (eg. execution pipeline, cache,
* memory, etc.) and attribute the utilization to the responsible CPU and the
* thread running there.
*
* This will help characterize the utilization of performance relevant
* components and how much is used by each CPU and each thread. With
* that data, the utilization can be aggregated across all the CPUs sharing each
* performance relevant hardware component to calculate the total utilization
* of each component and compare that with the component's capacity to
* essentially determine the actual hardware load of the component. The
* hardware utilization attributed to each running thread can also be
* aggregated to determine the total hardware utilization of each component
* attributable to a workload.
*
* Once that is done, one can determine how much of each performance relevant
* hardware component is needed by a given thread or set of threads (eg. a
* workload) and size up exactly what hardware is needed by the threads and how
* much. With this info, we can better place threads among CPUs to match their
* exact hardware resource needs and potentially lower or raise the power based
* on their utilization or pack threads onto the fewest hardware components
* needed and power off any remaining unused components to minimize power
* without sacrificing performance.
*
* IMPLEMENTATION
* --------------
* The code has been designed and implemented to make (un)programming and
* reading the counters for a given CPU as lightweight and fast as possible.
* This is very important because we need to read and potentially (un)program
* the counters very often and in performance sensitive code. Specifically,
* the counters may need to be (un)programmed during context switch and/or a
* cyclic handler when there are more counter events to count than existing
* counters.
*
* Consequently, the code has been split up to allow allocating and
* initializing everything needed to program and read the counters on a given
* CPU once and make (un)programming and reading the counters for a given CPU
* not have to allocate/free memory or grab any locks. To do this, all the
* state needed to (un)program and read the counters on a CPU is kept per CPU
* and is made lock free by forcing any code that reads or manipulates the
* counters or the state needed to (un)program or read the counters to run on
* the target CPU and disable preemption while running on the target CPU to
* protect any critical sections. All counter manipulation on the target CPU
* happens either from a cross-call to the target CPU or at the same PIL as
* used by the cross-call subsystem. This guarantees that counter manipulation
* is not interrupted by cross-calls from other CPUs.
*
* The synchronization has been made lock free or as simple as possible for
* performance and to avoid getting the locking all tangled up when we interpose
* on the CPC routines that (un)program the counters to manage the counters
* between the kernel and user on each CPU. When the user starts using the
* counters on a given CPU, the kernel will unprogram the counters that it is
* using on that CPU just before they are programmed for the user. Then the
* kernel will program the counters on a given CPU for its own use when the user
* stops using them.
*
* There is a special interaction with the DTrace cpc provider (dcpc). Before
* dcpc enables any probe, it requests that all counters used for capacity and
* utilization be disabled and unprogrammed. These counters are not
* re-programmed until dcpc completes. When all DTrace cpc probes are removed,
* dcpc notifies the CU framework, which re-programs the counters.
*
* When a CPU is going offline, its CU counters are unprogrammed and disabled,
* so that they will not be re-programmed by some other activity on the
* CPU that is going offline.
*
* The counters are programmed during boot. However, a flag is available to
* disable this if necessary (see cu_flags below). A handler is provided to
* (un)program the counters during CPU on/offline. Basic routines are provided
* to initialize and tear down this module, initialize and tear down any state
* needed for a given CPU, and (un)program the counters for a given CPU.
* Lastly, a handler is provided to read the counters and attribute the
* utilization to the responsible CPU.
*/
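/*
 * A rough sketch of the main call paths implemented below:
 *
 * boot: cu_init() -> cu_cpu_init() for each active CPU, then a cross-call
 * to cu_cpc_program_xcall() -> cu_cpc_program() on each CPU
 *
 * CPU online/offline: cu_cpu_callback() -> cu_cpu_init()/cu_cpu_fini() and
 * cu_cpc_program_xcall()/cu_cpu_disable()
 *
 * kstat read: cu_cpu_kstat_update() -> cu_cpu_update() -> cross-call to
 * kcpc_read() with cu_cpu_update_stats() as the update function
 *
 * disable/enable: cu_disable()/cu_enable() -> cu_cpu_disable()/
 * cu_cpu_enable() -> cpu_call() to cu_cpc_trigger() on the target CPU
 */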
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/pghw.h>
#include <sys/cmt.h>
#include <sys/x_call.h>
#include <sys/cap_util.h>
#include <sys/archsystm.h>
#include <sys/promif.h>
#if defined(__x86)
#include <sys/xc_levels.h>
#endif
/*
* Default CPU hardware performance counter flags to use for measuring capacity
* and utilization
*/
#define CU_CPC_FLAGS_DEFAULT \
(CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)
/*
* Possible Flags for controlling this module.
*/
#define CU_FLAG_ENABLE 1 /* Enable module */
#define CU_FLAG_READY 2 /* Ready to setup module */
#define CU_FLAG_ON 4 /* Module is on */
/*
* pg_cpu kstats calculate utilization rate and maximum utilization rate for
* some CPUs. The rate is calculated based on data from two subsequent
* snapshots. When the time between such two snapshots is too small, the
* resulting rate may have low accuracy, so we only consider snapshots which
* are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not
* update the rate if the interval is smaller than that.
*
* Use one tenth of a second as the minimum interval for utilization rate
* calculation.
*
* NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in
* the CU_RATE() macro below to guarantee that we never divide by zero.
*
* Rate is the number of events per second. The rate is the number of events
* divided by time and multiplied by the number of nanoseconds in a second. We
* do not want time to be too small since it will cause large errors in
* division.
*
* We do not want to multiply two large numbers (the instruction count and
* NANOSEC) either since it may cause integer overflow. So we divide both the
* numerator and the denominator by the same value.
*
* NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
* above to guarantee that time divided by this value is always non-zero.
*/
#define CU_RATE(val, time) \
(((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))
#define CU_SAMPLE_INTERVAL_MIN (NANOSEC / 10)
#define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000)
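/*
 * For example, with the values above (CU_SAMPLE_INTERVAL_MIN = 10^8 ns,
 * CU_SCALE = 10^4), counting 2,000,000 events over a half second
 * (5 * 10^8 ns) yields
 *
 *   CU_RATE(2000000, 500000000) = (2000000 * 10^5) / (5 * 10^4)
 *                               = 4,000,000 events per second
 *
 * which matches 2,000,000 events / 0.5 s while keeping both the multiplier
 * and the divisor small enough to avoid 64-bit overflow and division by zero.
 */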
/*
* When the time between two kstat reads for the same CPU is less than
* CU_UPDATE_THRESHOLD, use the old counter data and skip updating counter values
* for the CPU. This helps reduce cross-calls when kstat consumers read data
* very often or when they read PG utilization data and then CPU utilization
* data quickly after that.
*/
#define CU_UPDATE_THRESHOLD (NANOSEC / 10)
/*
* The IS_HIPIL() macro verifies that the code is executed either from a
* cross-call or from a high-PIL interrupt
*/
#ifdef DEBUG
#define IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define IS_HIPIL()
#endif /* DEBUG */
typedef void (*cu_cpu_func_t)(uintptr_t, int *);
/*
* Flags to use for programming CPU hardware performance counters to measure
* capacity and utilization
*/
int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;
/*
* Initial value used for programming hardware counters
*/
uint64_t cu_cpc_preset_value = 0;
/*
* List of CPC event requests for capacity and utilization.
*/
static kcpc_request_list_t *cu_cpc_reqs = NULL;
/*
* When a CPU is a member of a PG with a sharing relationship that is supported
* by the capacity/utilization framework, a kstat is created for that CPU and
* sharing relationship.
*
* These kstats are updated one at a time, so we can have a single scratch
* space to fill the data.
*
* CPU counter kstats fields:
*
* cu_cpu_id CPU ID for this kstat
*
* cu_generation Generation value that increases whenever any CPU goes
* offline or online. Two kstat snapshots for the same
* CPU may only be compared if they have the same
* generation.
*
* cu_pg_id PG ID for the relationship described by this kstat
*
* cu_cpu_util Running value of CPU utilization for the sharing
* relationship
*
* cu_cpu_time_running Total time spent collecting CU data. The time may be
* less than wall time if CU counters were stopped for
* some time.
*
* cu_cpu_time_stopped Total time the CU counters were stopped.
*
* cu_cpu_rate Utilization rate, expressed in operations per second.
*
* cu_cpu_rate_max Maximum observed value of utilization rate.
*/
struct cu_cpu_kstat {
kstat_named_t cu_cpu_id;
kstat_named_t cu_generation;
kstat_named_t cu_pg_id;
kstat_named_t cu_cpu_util;
kstat_named_t cu_cpu_time_running;
kstat_named_t cu_cpu_time_stopped;
kstat_named_t cu_cpu_rate;
kstat_named_t cu_cpu_rate_max;
} cu_cpu_kstat = {
{ "id", KSTAT_DATA_UINT32 },
{ "generation", KSTAT_DATA_UINT32 },
{ "pg_id", KSTAT_DATA_LONG },
{ "hw_util", KSTAT_DATA_UINT64 },
{ "hw_util_time_running", KSTAT_DATA_UINT64 },
{ "hw_util_time_stopped", KSTAT_DATA_UINT64 },
{ "hw_util_rate", KSTAT_DATA_UINT64 },
{ "hw_util_rate_max", KSTAT_DATA_UINT64 },
};
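/*
 * As an example, a consumer can read these counters from userland with
 * something like "kstat -m pg_cpu" (the kstats are created in
 * cu_cpu_kstat_create() below), which reports the hw_util, hw_util_rate,
 * hw_util_rate_max and running/stopped time values for each CPU and
 * supported sharing relationship.
 */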
/*
* Flags for controlling this module
*/
uint_t cu_flags = CU_FLAG_ENABLE;
/*
* Error return value for cu_init() since it is called from mp_init_tbl[] and
* can't return anything (:-(
*/
static int cu_init_error = 0;
hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;
hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD;
static kmutex_t pg_cpu_kstat_lock;
/*
* Forward declaration of interface routines
*/
void cu_disable(void);
void cu_enable(void);
void cu_init(void);
void cu_cpc_program(cpu_t *cp, int *err);
void cu_cpc_unprogram(cpu_t *cp, int *err);
int cu_cpu_update(struct cpu *cp, boolean_t move_to);
void cu_pg_update(pghw_t *pg);
/*
* Forward declaration of private routines
*/
static int cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void cu_cpc_program_xcall(uintptr_t arg, int *err);
static int cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
static int cu_cpu_callback(cpu_setup_t what, int id, void *arg);
static void cu_cpu_disable(cpu_t *cp);
static void cu_cpu_enable(cpu_t *cp);
static int cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int cu_cpu_fini(cpu_t *cp);
static void cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int cu_cpu_update_stats(cu_cntr_stats_t *stats,
uint64_t cntr_value);
static void cu_cpu_info_detach_xcall(void);
/*
* Disable or enable Capacity Utilization counters on all CPUs.
*/
void
cu_disable(void)
{
cpu_t *cp;
ASSERT(MUTEX_HELD(&cpu_lock));
cp = cpu_active;
do {
if (!(cp->cpu_flags & CPU_OFFLINE))
cu_cpu_disable(cp);
} while ((cp = cp->cpu_next_onln) != cpu_active);
}
void
cu_enable(void)
{
cpu_t *cp;
ASSERT(MUTEX_HELD(&cpu_lock));
cp = cpu_active;
do {
if (!(cp->cpu_flags & CPU_OFFLINE))
cu_cpu_enable(cp);
} while ((cp = cp->cpu_next_onln) != cpu_active);
}
/*
* Setup capacity and utilization support
*/
void
cu_init(void)
{
cpu_t *cp;
cu_init_error = 0;
if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
cu_init_error = -1;
return;
}
if (kcpc_init() != 0) {
cu_init_error = -2;
return;
}
/*
* Can't measure hardware capacity and utilization without CPU
* hardware performance counters
*/
if (cpc_ncounters <= 0) {
cu_init_error = -3;
return;
}
/*
* Setup CPC event request queue
*/
cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);
mutex_enter(&cpu_lock);
/*
* Mark flags to say that module is ready to be setup
*/
cu_flags |= CU_FLAG_READY;
cp = cpu_active;
do {
/*
* Allocate and setup state needed to measure capacity and
* utilization
*/
if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
cu_init_error = -5;
/*
* Reset list of counter event requests so its space can be
* reused for a different set of requests for next CPU
*/
(void) kcpc_reqs_reset(cu_cpc_reqs);
cp = cp->cpu_next_onln;
} while (cp != cpu_active);
/*
* Mark flags to say that module is on now and counters are ready to be
* programmed on all active CPUs
*/
cu_flags |= CU_FLAG_ON;
/*
* Program counters on currently active CPUs
*/
cp = cpu_active;
do {
if (cu_cpu_run(cp, cu_cpc_program_xcall,
(uintptr_t)B_FALSE) != 0)
cu_init_error = -6;
cp = cp->cpu_next_onln;
} while (cp != cpu_active);
/*
* Register callback for CPU state changes to enable and disable
* CPC counters as CPUs come on and offline
*/
register_cpu_setup_func(cu_cpu_callback, NULL);
mutex_exit(&cpu_lock);
}
/*
* Return number of counter events needed to measure capacity and utilization
* for the specified CPU and, if a list to add CPC requests to is given, fill
* in that list with each counter event needed
*
* NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
* everything that has been successfully allocated if any memory
* allocation fails
*/
static int
cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
{
group_t *cmt_pgs;
cu_cntr_info_t **cntr_info_array;
cpu_pg_t *cpu_pgs;
cu_cpu_info_t *cu_cpu_info;
pg_cmt_t *pg_cmt;
pghw_t *pg_hw;
cu_cntr_stats_t *stats;
int nevents;
pghw_type_t pg_hw_type;
group_iter_t iter;
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* There has to be a target CPU for this
*/
if (cp == NULL)
return (-1);
/*
* Return 0 when CPU doesn't belong to any group
*/
cpu_pgs = cp->cpu_pg;
if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
return (0);
cmt_pgs = &cpu_pgs->cmt_pgs;
cu_cpu_info = cp->cpu_cu_info;
/*
* Grab counter statistics and info
*/
if (reqs == NULL) {
stats = NULL;
cntr_info_array = NULL;
} else {
if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
return (-2);
stats = cu_cpu_info->cu_cntr_stats;
cntr_info_array = cu_cpu_info->cu_cntr_info;
}
/*
* See whether platform (or processor) specific code knows which CPC events
* to request, etc. in order to measure hardware capacity and utilization on
* this machine
*/
nevents = cu_plat_cpc_init(cp, reqs, nreqs);
if (nevents >= 0)
return (nevents);
/*
* Let common code decide which CPC events to request, etc. to measure
* capacity and utilization since platform (or processor) specific code does
* not know....
*
* Walk CPU's PG lineage and do the following:
*
* - Setup CPC request, counter info, and stats needed for each counter
* event to measure capacity and utilization for each of CPU's PG
* hardware sharing relationships
*
* - Create PG CPU kstats to export capacity and utilization for each PG
*/
nevents = 0;
group_iter_init(&iter);
while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
cu_cntr_info_t *cntr_info;
int nevents_save;
int nstats;
pg_hw = (pghw_t *)pg_cmt;
pg_hw_type = pg_hw->pghw_hw;
nevents_save = nevents;
nstats = 0;
switch (pg_hw_type) {
case PGHW_IPIPE:
if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
KM_NOSLEEP, &nevents) != 0)
continue;
nstats = 1;
break;
case PGHW_FPU:
if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
KM_NOSLEEP, &nevents) != 0)
continue;
nstats = 1;
break;
default:
/*
* Don't measure capacity and utilization for this kind
* of PG hardware relationship so skip to next PG in
* CPU's PG lineage
*/
continue;
}
cntr_info = cntr_info_array[pg_hw_type];
/*
* Nothing to measure for this hardware sharing relationship
*/
if (nevents - nevents_save == 0) {
if (cntr_info != NULL)
kmem_free(cntr_info, sizeof (cu_cntr_info_t));
cntr_info_array[pg_hw_type] = NULL;
continue;
}
/*
* Fill in counter info for this PG hardware relationship
*/
if (cntr_info == NULL) {
cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
KM_NOSLEEP);
if (cntr_info == NULL)
continue;
cntr_info_array[pg_hw_type] = cntr_info;
}
cntr_info->ci_cpu = cp;
cntr_info->ci_pg = pg_hw;
cntr_info->ci_stats = &stats[nevents_save];
cntr_info->ci_nstats = nstats;
/*
* Create PG CPU kstats for this hardware relationship
*/
cu_cpu_kstat_create(pg_hw, cntr_info);
}
return (nevents);
}
/*
* Program counters for capacity and utilization on given CPU
*
* If any of the following conditions is true, the counters are not programmed:
*
* - CU framework is disabled
* - The cpu_cu_info field of the cpu structure is NULL
* - The DTrace CPC provider is active (dtrace_cpc_in_use is set)
* - Counters are programmed already
* - Counters are disabled (by calls to cu_cpu_disable())
*/
void
cu_cpc_program(cpu_t *cp, int *err)
{
cu_cpc_ctx_t *cpu_ctx;
kcpc_ctx_t *ctx;
cu_cpu_info_t *cu_cpu_info;
ASSERT(IS_HIPIL());
/*
* Should be running on given CPU. We disable preemption to keep CPU
* from disappearing and make sure flags and CPC context don't change
* from underneath us
*/
kpreempt_disable();
ASSERT(cp == CPU);
/*
* Module not ready to program counters
*/
if (!(cu_flags & CU_FLAG_ON)) {
*err = -1;
kpreempt_enable();
return;
}
if (cp == NULL) {
*err = -2;
kpreempt_enable();
return;
}
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL) {
*err = -3;
kpreempt_enable();
return;
}
/*
* If DTrace CPC is active or counters turned on already or are
* disabled, just return.
*/
if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
cu_cpu_info->cu_disabled) {
*err = 1;
kpreempt_enable();
return;
}
if ((CPU->cpu_cpc_ctx != NULL) &&
!(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
*err = -4;
kpreempt_enable();
return;
}
/*
* Get CPU's CPC context needed for capacity and utilization
*/
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
ASSERT(cpu_ctx != NULL);
ASSERT(cpu_ctx->nctx >= 0);
ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0);
ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
cpu_ctx->ctx_ptr_array_sz <= 0) {
*err = -5;
kpreempt_enable();
return;
}
/*
* Increment index in CPU's CPC context info to point at next context
* to program
*
* NOTE: Do this now instead of after programming counters to ensure
* that index will always point at *current* context so we will
* always be able to unprogram *current* context if necessary
*/
cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;
ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
/*
* Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
* context before programming counters
*
* Context is marked with KCPC_CTX_INVALID_STOPPED when context is
* unprogrammed and may be marked with KCPC_CTX_INVALID when
* kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to
* invalidate all CPC contexts before they take over all the counters.
*
* Strictly speaking, clearing these flags isn't necessary since they are only
* used for thread bound CPC contexts, not CPU bound CPC contexts like the ones
* used for capacity and utilization.
*
* There is no need to protect the flag update since no one is using
* this context now.
*/
ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
/*
* Program counters on this CPU
*/
kcpc_program(ctx, B_FALSE, B_FALSE);
cp->cpu_cpc_ctx = ctx;
/*
* Set state in CPU structure to say that CPU's counters are programmed
* for capacity and utilization now and that they are transitioning from
* off to on state. This will cause cu_cpu_update to update stop times
* for all programmed counters.
*/
cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;
/*
* Update counter statistics
*/
(void) cu_cpu_update(cp, B_FALSE);
cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;
*err = 0;
kpreempt_enable();
}
/*
* Cross call wrapper routine for cu_cpc_program()
*
* Checks to make sure that counters on CPU aren't being used by someone else
* before calling cu_cpc_program() since cu_cpc_program() needs to assert that
* nobody else is using the counters to catch and prevent any broken code.
* Also, this check needs to happen on the target CPU since the CPU's CPC
* context can only be changed while running on the CPU.
*
* If the first argument is TRUE, cu_cpc_program_xcall also checks that there is
* no valid thread bound cpc context. This is important to check to prevent
* re-programming thread counters with CU counters when CPU is coming on-line.
*/
static void
cu_cpc_program_xcall(uintptr_t arg, int *err)
{
boolean_t avoid_thread_context = (boolean_t)arg;
kpreempt_disable();
if (CPU->cpu_cpc_ctx != NULL &&
!(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
*err = -100;
kpreempt_enable();
return;
}
if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) &&
!(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
*err = -200;
kpreempt_enable();
return;
}
cu_cpc_program(CPU, err);
kpreempt_enable();
}
/*
* Unprogram counters for capacity and utilization on given CPU.
* This function should always be executed on the target CPU at high PIL.
*/
void
cu_cpc_unprogram(cpu_t *cp, int *err)
{
cu_cpc_ctx_t *cpu_ctx;
kcpc_ctx_t *ctx;
cu_cpu_info_t *cu_cpu_info;
ASSERT(IS_HIPIL());
/*
* Should be running on given CPU with preemption disabled to keep CPU
* from disappearing and make sure flags and CPC context don't change
* from underneath us
*/
kpreempt_disable();
ASSERT(cp == CPU);
/*
* Module not on
*/
if (!(cu_flags & CU_FLAG_ON)) {
*err = -1;
kpreempt_enable();
return;
}
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL) {
*err = -3;
kpreempt_enable();
return;
}
/*
* Counters turned off already
*/
if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) {
*err = 1;
kpreempt_enable();
return;
}
/*
* Update counter statistics
*/
(void) cu_cpu_update(cp, B_FALSE);
/*
* Get CPU's CPC context needed for capacity and utilization
*/
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
cpu_ctx->ctx_ptr_array_sz <= 0) {
*err = -5;
kpreempt_enable();
return;
}
ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
/*
* CPU's CPC context should be current capacity and utilization CPC
* context
*/
ASSERT(cp->cpu_cpc_ctx == ctx);
if (cp->cpu_cpc_ctx != ctx) {
*err = -6;
kpreempt_enable();
return;
}
/*
* Unprogram counters on CPU.
*/
kcpc_unprogram(ctx, B_FALSE);
ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
/*
* Unset state in CPU structure saying that CPU's counters are
* programmed
*/
cp->cpu_cpc_ctx = NULL;
cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON;
*err = 0;
kpreempt_enable();
}
/*
* Add given counter event to list of CPC requests
*/
static int
cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
{
int n;
int retval;
uint_t flags;
/*
* Return error when no counter event specified, counter event not
* supported by CPC's PCBE, or number of events not given
*/
if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
nevents == NULL)
return (-1);
n = *nevents;
/*
* Only count the number of counter events needed if no list to add CPC
* requests to is given
*/
if (reqs == NULL) {
n++;
*nevents = n;
return (-3);
}
/*
* Return error when stats not given or not enough room on list of CPC
* requests for more counter events
*/
if (stats == NULL || nreqs <= 0 || n >= nreqs)
return (-4);
/*
* Use flags in cu_cpc_flags to program counters and enable overflow
* interrupts/traps (unless PCBE can't handle overflow interrupts) so
* PCBE can catch counters before they wrap to hopefully give us an
* accurate (64-bit) virtualized counter
*/
flags = cu_cpc_flags;
if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
flags &= ~CPC_OVF_NOTIFY_EMT;
/*
* Add CPC request to list
*/
retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
flags, 0, NULL, &stats[n], kmem_flags);
if (retval != 0)
return (-5);
n++;
*nevents = n;
return (0);
}
static void
cu_cpu_info_detach_xcall(void)
{
ASSERT(IS_HIPIL());
CPU->cpu_cu_info = NULL;
}
/*
* Enable or disable collection of capacity/utilization data for the current
* CPU. Counters are enabled if the 'on' argument is True and disabled if it
* is False. This function should always be executed at high PIL.
*/
static void
cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
{
cpu_t *cp = (cpu_t *)arg1;
boolean_t on = (boolean_t)arg2;
int error;
cu_cpu_info_t *cu_cpu_info;
ASSERT(IS_HIPIL());
kpreempt_disable();
ASSERT(cp == CPU);
if (!(cu_flags & CU_FLAG_ON)) {
kpreempt_enable();
return;
}
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL) {
kpreempt_enable();
return;
}
ASSERT(!cu_cpu_info->cu_disabled ||
!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
if (on) {
/*
* Decrement the cu_disabled counter.
* Once it drops to zero, call cu_cpc_program.
*/
if (cu_cpu_info->cu_disabled > 0)
cu_cpu_info->cu_disabled--;
if (cu_cpu_info->cu_disabled == 0)
cu_cpc_program(CPU, &error);
} else if (cu_cpu_info->cu_disabled++ == 0) {
/*
* This is the first attempt to disable CU, so turn it off
*/
cu_cpc_unprogram(cp, &error);
ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
}
kpreempt_enable();
}
/*
* Callback for changes in CPU states
* Used to enable or disable hardware performance counters on CPUs that are
* turned on or off
*
* NOTE: cpc should be programmed/unprogrammed while running on the target CPU.
* We have to use thread_affinity_set to hop to the right CPU because these
* routines expect cpu_lock held, so we can't cross-call other CPUs while
* holding CPU lock.
*/
static int
/* LINTED E_FUNC_ARG_UNUSED */
cu_cpu_callback(cpu_setup_t what, int id, void *arg)
{
cpu_t *cp;
int retval = 0;
ASSERT(MUTEX_HELD(&cpu_lock));
if (!(cu_flags & CU_FLAG_ON))
return (-1);
cp = cpu_get(id);
if (cp == NULL)
return (-2);
switch (what) {
case CPU_ON:
/*
* Setup counters on CPU being turned on
*/
retval = cu_cpu_init(cp, cu_cpc_reqs);
/*
* Reset list of counter event requests so its space can be
* reused for a different set of requests for next CPU
*/
(void) kcpc_reqs_reset(cu_cpc_reqs);
break;
case CPU_INTR_ON:
/*
* Setup counters on CPU being turned on.
*/
retval = cu_cpu_run(cp, cu_cpc_program_xcall,
(uintptr_t)B_TRUE);
break;
case CPU_OFF:
/*
* Disable counters on CPU being turned off. Counters will not
* be re-enabled on this CPU until it comes back online.
*/
cu_cpu_disable(cp);
ASSERT(!CU_CPC_ON(cp));
retval = cu_cpu_fini(cp);
break;
default:
break;
}
return (retval);
}
/*
* Disable or enable Capacity Utilization counters on a given CPU. This function
* can be called from any CPU to disable counters on the given CPU.
*/
static void
cu_cpu_disable(cpu_t *cp)
{
cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
}
static void
cu_cpu_enable(cpu_t *cp)
{
cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
}
/*
* Setup capacity and utilization support for given CPU
*
* NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
* everything that has been successfully allocated including cpu_cu_info
* if any memory allocation fails
*/
static int
cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
{
kcpc_ctx_t **ctx_ptr_array;
size_t ctx_ptr_array_sz;
cu_cpc_ctx_t *cpu_ctx;
cu_cpu_info_t *cu_cpu_info;
int n;
/*
* cpu_lock should be held and protect against CPU going away and races
* with cu_{init,fini,cpu_fini}()
*/
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* Return if not ready to setup counters yet
*/
if (!(cu_flags & CU_FLAG_READY))
return (-1);
if (cp->cpu_cu_info == NULL) {
cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
KM_NOSLEEP);
if (cp->cpu_cu_info == NULL)
return (-2);
}
/*
* Get capacity and utilization CPC context for CPU and check to see
* whether it has been setup already
*/
cu_cpu_info = cp->cpu_cu_info;
cu_cpu_info->cu_cpu = cp;
cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
cpu_ctx->ctx_ptr_array_sz > 0) {
return (1);
}
/*
* Should have no contexts since it hasn't been setup already
*/
ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
cpu_ctx->ctx_ptr_array_sz == 0);
/*
* Determine how many CPC events needed to measure capacity and
* utilization for this CPU, allocate space for counter statistics for
* each event, and fill in list of CPC event requests with corresponding
* counter stats for each request to make attributing counter data
* easier later....
*/
n = cu_cpc_init(cp, NULL, 0);
if (n <= 0) {
(void) cu_cpu_fini(cp);
return (-3);
}
cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
KM_NOSLEEP);
if (cu_cpu_info->cu_cntr_stats == NULL) {
(void) cu_cpu_fini(cp);
return (-4);
}
cu_cpu_info->cu_ncntr_stats = n;
n = cu_cpc_init(cp, reqs, n);
if (n <= 0) {
(void) cu_cpu_fini(cp);
return (-5);
}
/*
* Create CPC context with given requests
*/
ctx_ptr_array = NULL;
ctx_ptr_array_sz = 0;
n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
&ctx_ptr_array_sz);
if (n <= 0) {
(void) cu_cpu_fini(cp);
return (-6);
}
/*
* Should have contexts
*/
ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
(void) cu_cpu_fini(cp);
return (-7);
}
/*
* Fill in CPC context info for CPU needed for capacity and utilization
*/
cpu_ctx->cur_index = 0;
cpu_ctx->nctx = n;
cpu_ctx->ctx_ptr_array = ctx_ptr_array;
cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
return (0);
}
/*
* Tear down capacity and utilization support for given CPU
*/
static int
cu_cpu_fini(cpu_t *cp)
{
kcpc_ctx_t *ctx;
cu_cpc_ctx_t *cpu_ctx;
cu_cpu_info_t *cu_cpu_info;
int i;
pghw_type_t pg_hw_type;
/*
* cpu_lock should be held and protect against CPU going away and races
* with cu_{init,fini,cpu_init}()
*/
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* Have to at least be ready to setup counters to have allocated
* anything that needs to be deallocated now
*/
if (!(cu_flags & CU_FLAG_READY))
return (-1);
/*
* Nothing to do if CPU's capacity and utilization info doesn't exist
*/
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL)
return (1);
/*
* Tear down any existing kstats and counter info for each hardware
* sharing relationship
*/
for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS;
pg_hw_type++) {
cu_cntr_info_t *cntr_info;
cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type];
if (cntr_info == NULL)
continue;
if (cntr_info->ci_kstat != NULL) {
kstat_delete(cntr_info->ci_kstat);
cntr_info->ci_kstat = NULL;
}
kmem_free(cntr_info, sizeof (cu_cntr_info_t));
}
/*
* Free counter statistics for CPU
*/
ASSERT(cu_cpu_info->cu_cntr_stats == NULL ||
cu_cpu_info->cu_ncntr_stats > 0);
if (cu_cpu_info->cu_cntr_stats != NULL &&
cu_cpu_info->cu_ncntr_stats > 0) {
kmem_free(cu_cpu_info->cu_cntr_stats,
cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t));
cu_cpu_info->cu_cntr_stats = NULL;
cu_cpu_info->cu_ncntr_stats = 0;
}
/*
* Get capacity and utilization CPC contexts for given CPU and check to
* see whether they have been freed already
*/
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL &&
cpu_ctx->ctx_ptr_array_sz > 0) {
/*
* Free CPC contexts for given CPU
*/
for (i = 0; i < cpu_ctx->nctx; i++) {
ctx = cpu_ctx->ctx_ptr_array[i];
if (ctx == NULL)
continue;
kcpc_free(ctx, 0);
}
/*
* Free CPC context pointer array
*/
kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz);
/*
* Zero CPC info for CPU
*/
bzero(cpu_ctx, sizeof (cu_cpc_ctx_t));
}
/*
* Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure
* that no one is going to access the cpu_cu_info which we are going to
* free.
*/
if (cpu_is_online(cp))
cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);
else
cp->cpu_cu_info = NULL;
/*
* Free CPU's capacity and utilization info
*/
kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t));
return (0);
}
/*
* Create capacity & utilization kstats for given PG CPU hardware sharing
* relationship
*/
static void
cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
{
char *class, *sh_name;
kstat_t *ks;
/*
* Just return when no counter info or CPU
*/
if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
return;
/*
* Get the class name from the leaf PG that this CPU belongs to.
* If there are no PGs, just use the default class "cpu".
*/
class = pg ? pghw_type_string(pg->pghw_hw) : "cpu";
sh_name = pg ? pghw_type_shortstring(pg->pghw_hw) : "cpu";
if ((ks = kstat_create_zone("pg_cpu", cntr_info->ci_cpu->cpu_id,
sh_name, class, KSTAT_TYPE_NAMED,
sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
return;
ks->ks_lock = &pg_cpu_kstat_lock;
ks->ks_data = &cu_cpu_kstat;
ks->ks_update = cu_cpu_kstat_update;
ks->ks_private = cntr_info;
cntr_info->ci_kstat = ks;
kstat_install(cntr_info->ci_kstat);
}
/*
* Propagate values from CPU capacity & utilization stats to kstats
*/
static int
cu_cpu_kstat_update(kstat_t *ksp, int rw)
{
cpu_t *cp;
cu_cntr_info_t *cntr_info = ksp->ks_private;
struct cu_cpu_kstat *kstat = &cu_cpu_kstat;
pghw_t *pg;
cu_cntr_stats_t *stats;
if (rw == KSTAT_WRITE)
return (EACCES);
kpreempt_disable();
/*
* Update capacity and utilization statistics needed for CPU's PG (CPU)
* kstats
*/
cp = cntr_info->ci_cpu;
(void) cu_cpu_update(cp, B_TRUE);
pg = cntr_info->ci_pg;
stats = cntr_info->ci_stats;
kstat->cu_cpu_id.value.ui32 = cp->cpu_id;
kstat->cu_generation.value.ui32 = cp->cpu_generation;
if (pg == NULL)
kstat->cu_pg_id.value.l = -1;
else
kstat->cu_pg_id.value.l = pg->pghw_pg.pg_id;
kstat->cu_cpu_util.value.ui64 = stats->cs_value_total;
kstat->cu_cpu_rate.value.ui64 = stats->cs_rate;
kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max;
kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running;
kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped;
/*
* If the counters are stopped now, cs_time_stopped was last
* updated at cs_time_start time. Add the time passed since then
* to the stopped time.
*/
if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
kstat->cu_cpu_time_stopped.value.ui64 +=
gethrtime() - stats->cs_time_start;
kpreempt_enable();
return (0);
}
/*
* Run specified function with specified argument on a given CPU and return
* whatever the function returns
*/
static int
cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
{
int error = 0;
/*
* cpu_call() will call func on the CPU specified with given argument
* and return func's return value in last argument
*/
cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error);
return (error);
}
/*
* Update counter statistics on a given CPU.
*
* If the move_to argument is True, execute the function on the specified CPU.
* Otherwise, assume that it is already running on the right CPU.
*
* If move_to is specified, the caller should hold cpu_lock or have preemption
* disabled. Otherwise it is up to the caller to guarantee that things do not
* change in the process.
*/
int
cu_cpu_update(struct cpu *cp, boolean_t move_to)
{
int retval;
cu_cpu_info_t *cu_cpu_info = cp->cpu_cu_info;
hrtime_t time_snap;
ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);
/*
* Nothing to do if counters are not programmed
*/
if (!(cu_flags & CU_FLAG_ON) ||
(cu_cpu_info == NULL) ||
!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
return (0);
/*
* Don't update CPU statistics if it was updated recently
* and provide old results instead
*/
time_snap = gethrtime();
if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
return (0);
}
cu_cpu_info->cu_sample_time = time_snap;
/*
* CPC counter should be read on the CPU that is running the counter. We
* either have to move ourselves to the target CPU or ensure that we
* already run there.
*
* We use a cross-call to the target CPU to execute kcpc_read() and
* cu_cpu_update_stats() there.
*/
retval = 0;
if (move_to)
(void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read,
(uintptr_t)cu_cpu_update_stats);
else {
retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
/*
* Offset negative return value by -10 so we can distinguish it
* from error return values of this routine vs kcpc_read()
*/
if (retval < 0)
retval -= 10;
}
return (retval);
}
/*
* Update CPU counter statistics for current CPU.
* This function may be called from a cross-call
*/
static int
cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
{
cu_cpu_info_t *cu_cpu_info = CPU->cpu_cu_info;
uint_t flags;
uint64_t delta;
hrtime_t time_delta;
hrtime_t time_snap;
if (stats == NULL)
return (-1);
/*
* Nothing to do if counters are not programmed. This should not happen,
* but we check just in case.
*/
ASSERT(cu_flags & CU_FLAG_ON);
ASSERT(cu_cpu_info != NULL);
if (!(cu_flags & CU_FLAG_ON) ||
(cu_cpu_info == NULL))
return (-2);
flags = cu_cpu_info->cu_flag;
ASSERT(flags & CU_CPU_CNTRS_ON);
if (!(flags & CU_CPU_CNTRS_ON))
return (-2);
/*
* Take snapshot of high resolution timer
*/
time_snap = gethrtime();
/*
* CU counters have just been programmed. We cannot assume that the new
* cntr_value continues from where we left off, so use the cntr_value as
* the new initial value.
*/
if (flags & CU_CPU_CNTRS_OFF_ON)
stats->cs_value_start = cntr_value;
/*
* Calculate delta in counter values between start of sampling period
* and now
*/
delta = cntr_value - stats->cs_value_start;
/*
* Calculate time between start of sampling period and now
*/
time_delta = stats->cs_time_start ?
time_snap - stats->cs_time_start :
0;
stats->cs_time_start = time_snap;
stats->cs_value_start = cntr_value;
if (time_delta > 0) { /* wrap shouldn't happen */
/*
* Update either running or stopped time based on the transition
* state
*/
if (flags & CU_CPU_CNTRS_OFF_ON)
stats->cs_time_stopped += time_delta;
else
stats->cs_time_running += time_delta;
}
/*
* Update rest of counter statistics if counter value didn't wrap
*/
if (delta > 0) {
/*
* Update utilization rate if the interval between samples is
* sufficient.
*/
ASSERT(cu_sample_interval_min > CU_SCALE);
if (time_delta > cu_sample_interval_min)
stats->cs_rate = CU_RATE(delta, time_delta);
if (stats->cs_rate_max < stats->cs_rate)
stats->cs_rate_max = stats->cs_rate;
stats->cs_value_last = delta;
stats->cs_value_total += delta;
}
return (0);
}
/*
* Update CMT PG utilization data.
*
* This routine computes the running total utilization and times for the
* specified PG by adding up the total utilization and counter running and
* stopped times of all CPUs in the PG and calculates the utilization rate and
* maximum rate for all CPUs in the PG.
*/
void
cu_pg_update(pghw_t *pg)
{
pg_cpu_itr_t cpu_iter;
pghw_type_t pg_hwtype;
cpu_t *cpu;
pghw_util_t *hw_util = &pg->pghw_stats;
uint64_t old_utilization = hw_util->pghw_util;
hrtime_t now;
hrtime_t time_delta;
uint64_t utilization_delta;
ASSERT(MUTEX_HELD(&cpu_lock));
now = gethrtime();
pg_hwtype = pg->pghw_hw;
/*
* Initialize running total utilization and times for PG to 0
*/
hw_util->pghw_util = 0;
hw_util->pghw_time_running = 0;
hw_util->pghw_time_stopped = 0;
/*
* Iterate over all CPUs in the PG and aggregate utilization, running
* time and stopped time.
*/
PG_CPU_ITR_INIT(pg, cpu_iter);
while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info;
cu_cntr_info_t *cntr_info;
cu_cntr_stats_t *stats;
if (cu_cpu_info == NULL)
continue;
/*
* Update utilization data for the CPU and then
* aggregate per CPU running totals for PG
*/
(void) cu_cpu_update(cpu, B_TRUE);
cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];
if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
continue;
hw_util->pghw_util += stats->cs_value_total;
hw_util->pghw_time_running += stats->cs_time_running;
hw_util->pghw_time_stopped += stats->cs_time_stopped;
/*
* If counters are stopped now, the pghw_time_stopped was last
* updated at cs_time_start time. Add the time passed since then
* to the stopped time.
*/
if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
hw_util->pghw_time_stopped +=
now - stats->cs_time_start;
}
/*
* Compute per PG instruction rate and maximum rate
*/
time_delta = now - hw_util->pghw_time_stamp;
hw_util->pghw_time_stamp = now;
if (old_utilization == 0)
return;
/*
* Calculate change in utilization over sampling period and set this to
* 0 if the delta would be 0 or negative which may happen if any CPUs go
* offline during the sampling period
*/
if (hw_util->pghw_util > old_utilization)
utilization_delta = hw_util->pghw_util - old_utilization;
else
utilization_delta = 0;
/*
* Update utilization rate if the interval between samples is
* sufficient.
*/
ASSERT(cu_sample_interval_min > CU_SCALE);
if (time_delta > CU_SAMPLE_INTERVAL_MIN)
hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);
/*
* Update the maximum observed rate
*/
if (hw_util->pghw_rate_max < hw_util->pghw_rate)
hw_util->pghw_rate_max = hw_util->pghw_rate;
}