/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
/*
* CMT scheduler / dispatcher support
*
* This file implements CMT scheduler support using Processor Groups.
* The CMT processor group class creates and maintains the CMT class
* specific processor group pg_cmt_t.
*
* ---------------------------- <-- pg_cmt_t *
* | pghw_t |
* ----------------------------
* | CMT class specific data |
* | - hierarchy linkage |
* | - CMT load balancing data|
* | - active CPU group/bitset|
* ----------------------------
*
* The scheduler/dispatcher leverages knowledge of the performance
* relevant CMT sharing relationships existing between cpus to implement
* optimized affinity and load balancing policies.
*
* Load balancing policy seeks to improve performance by minimizing
* contention over shared processor resources / facilities, while the
* affinity policies seek to improve cache and TLB utilization.
*
* The CMT PGs created by this class are already arranged into a
* hierarchy (which is done in the pghw layer). To implement the top-down
* CMT load balancing algorithm, the CMT PGs additionally maintain
* parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
* each PG can have at most one parent, and siblings are the group of PGs
* sharing the same parent.
*
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a single top level group of PGs to balance across. On NUMA systems,
 * multiple top level groups are instantiated, and the top level balancing
 * begins by balancing across the CMT PGs within their respective (per lgroup)
 * top level groups.
*/
typedef struct cmt_lgrp {
group_t cl_pgs; /* Top level group of active CMT PGs */
int cl_npgs; /* # of top level PGs in the lgroup */
lgrp_handle_t cl_hand; /* lgroup's platform handle */
struct cmt_lgrp *cl_next; /* next cmt_lgrp */
} cmt_lgrp_t;
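/* List of cmt_lgrp_t's (one per lgroup), built as lgroups are encountered */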
static cmt_lgrp_t *cmt_lgrps = NULL;
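/* Non-zero until the boot CPU (CPU0) has completed CMT initialization */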
static int is_cpu0 = 1;
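/* Set to non-zero to disable CMT scheduling/dispatcher support */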
static int cmt_sched_disabled = 0;
static pg_cid_t pg_cmt_class_id; /* PG class id */
static pg_t *pg_cmt_alloc();
static void pg_cmt_free(pg_t *);
static void pg_cmt_cpu_init(cpu_t *);
static void pg_cmt_cpu_fini(cpu_t *);
static void pg_cmt_cpu_active(cpu_t *);
static void pg_cmt_cpu_inactive(cpu_t *);
static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static void pg_cmt_hier_pack(pg_cmt_t **, int);
static int pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t);
/*
* Macro to test if PG is managed by the CMT PG class
*/
#define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
/*
* CMT PG ops
*/
struct pg_ops pg_ops_cmt = {
pg_cmt_alloc,
pg_cmt_free,
pg_cmt_cpu_init,
pg_cmt_cpu_fini,
pg_cmt_cpu_active,
pg_cmt_cpu_inactive,
pg_cmt_cpupart_in,
NULL, /* cpupart_out */
pg_cmt_cpupart_move,
pg_cmt_cpu_belongs,
};
/*
* Initialize the CMT PG class
*/
void
pg_cmt_class_init(void)
{
if (cmt_sched_disabled)
return;
pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}
/*
* Called to indicate a new CPU has started up so
* that either t0 or the slave startup thread can
* be accounted for.
*/
void
pg_cmt_cpu_startup(cpu_t *cp)
{
PG_NRUN_UPDATE(cp, 1);
}
/*
 * Adjust the CMT load in the CMT PGs to which the CPU belongs.
* Note that "n" can be positive in the case of increasing
* load, or negative in the case of decreasing load.
*/
void
pg_cmt_load(cpu_t *cp, int n)
{
pg_cmt_t *pg;
pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
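	/*
	 * Walk up the CPU's CMT lineage from leaf to root,
	 * applying the load delta at each level.
	 */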
while (pg != NULL) {
ASSERT(IS_CMT_PG(pg));
atomic_add_32(&pg->cmt_nrunning, n);
pg = pg->cmt_parent;
}
}
/*
* Return non-zero if thread can migrate between "from" and "to"
* without a performance penalty
*/
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
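	/*
	 * Migration is considered penalty free when both CPUs
	 * share the same cache (identical cpu_cacheid).
	 */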
if (from->cpu_physid->cpu_cacheid ==
to->cpu_physid->cpu_cacheid)
return (1);
return (0);
}
/*
* CMT class specific PG allocation
*/
static pg_t *
pg_cmt_alloc(void)
{
return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}
/*
* Class specific PG de-allocation
*/
static void
pg_cmt_free(pg_t *pg)
{
ASSERT(pg != NULL);
ASSERT(IS_CMT_PG(pg));
kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}
/*
* Return 1 if CMT load balancing policies should be
* implemented across instances of the specified hardware
* sharing relationship.
*/
static int
pg_cmt_load_bal_hw(pghw_type_t hw)
{
if (hw == PGHW_IPIPE ||
hw == PGHW_FPU ||
hw == PGHW_CHIP)
return (1);
else
return (0);
}
/*
 * Return 1 if thread affinity policies should be implemented
 * for instances of the specified hardware sharing relationship.
*/
static int
pg_cmt_affinity_hw(pghw_type_t hw)
{
if (hw == PGHW_CACHE)
return (1);
else
return (0);
}
/*
 * Return 1 if CMT scheduling policies should be implemented
* for the specified hardware sharing relationship.
*/
static int
pg_cmt_hw(pghw_type_t hw)
{
return (pg_cmt_load_bal_hw(hw) ||
pg_cmt_affinity_hw(hw));
}
/*
* CMT class callback for a new CPU entering the system
*/
static void
pg_cmt_cpu_init(cpu_t *cp)
{
pg_cmt_t *pg;
group_t *cmt_pgs;
int level, max_level, nlevels;
pghw_type_t hw;
pg_t *pg_cache = NULL;
pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
lgrp_handle_t lgrp_handle;
cmt_lgrp_t *lgrp;
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* A new CPU is coming into the system.
* Interrogate the platform to see if the CPU
* has any performance relevant CMT sharing
* relationships
*/
cmt_pgs = &cp->cpu_pg->cmt_pgs;
cp->cpu_pg->cmt_lineage = NULL;
bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
max_level = nlevels = 0;
for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
/*
* We're only interested in CMT hw sharing relationships
*/
if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
continue;
/*
* Find (or create) the PG associated with
* the hw sharing relationship in which cp
* belongs.
*
* Determine if a suitable PG already
* exists, or if one needs to be created.
*/
pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
if (pg == NULL) {
/*
* Create a new one.
* Initialize the common...
*/
pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
/* ... physical ... */
pghw_init((pghw_t *)pg, cp, hw);
/*
* ... and CMT specific portions of the
* structure.
*/
bitset_init(&pg->cmt_cpus_actv_set);
group_create(&pg->cmt_cpus_actv);
} else {
ASSERT(IS_CMT_PG(pg));
}
/* Add the CPU to the PG */
pg_cpu_add((pg_t *)pg, cp);
/*
* Ensure capacity of the active CPUs group/bitset
*/
group_expand(&pg->cmt_cpus_actv,
GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
if (cp->cpu_seqid >=
bitset_capacity(&pg->cmt_cpus_actv_set)) {
bitset_resize(&pg->cmt_cpus_actv_set,
cp->cpu_seqid + 1);
}
/*
* Build a lineage of CMT PGs for load balancing
*/
if (pg_cmt_load_bal_hw(hw)) {
level = pghw_level(hw);
cpu_cmt_hier[level] = pg;
if (level > max_level)
max_level = level;
nlevels++;
}
/* Cache this for later */
if (hw == PGHW_CACHE)
pg_cache = (pg_t *)pg;
}
/*
* Pack out any gaps in the constructed lineage.
* Gaps may exist where the architecture knows
* about a hardware sharing relationship, but such a
* relationship either isn't relevant for load
* balancing or doesn't exist between CPUs on the system.
*/
pg_cmt_hier_pack(cpu_cmt_hier, max_level + 1);
/*
 * For each of the PGs in the CPU's lineage:
 *	- Add an entry in the CPU sorted CMT PG group
 *	  which is used for top down CMT load balancing
 *	- Tie the PG into the CMT hierarchy by connecting
 *	  it to its parent and siblings.
*/
group_expand(cmt_pgs, nlevels);
/*
* Find the lgrp that encapsulates this CPU's CMT hierarchy
*/
lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
lgrp = pg_cmt_find_lgrp(lgrp_handle);
for (level = 0; level < nlevels; level++) {
uint_t children;
int err;
pg = cpu_cmt_hier[level];
err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
ASSERT(err == 0);
if (level == 0)
cp->cpu_pg->cmt_lineage = (pg_t *)pg;
if (pg->cmt_siblings != NULL) {
/* Already initialized */
ASSERT(pg->cmt_parent == NULL ||
pg->cmt_parent == cpu_cmt_hier[level + 1]);
ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
pg->cmt_siblings == pg->cmt_parent->cmt_children);
continue;
}
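		/*
		 * A PG at the top of the lineage has no parent;
		 * its siblings are the other top level PGs in the
		 * CPU's lgroup.
		 */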
if ((level + 1) == nlevels) {
pg->cmt_parent = NULL;
pg->cmt_siblings = &lgrp->cl_pgs;
children = ++lgrp->cl_npgs;
} else {
pg->cmt_parent = cpu_cmt_hier[level + 1];
/*
* A good parent keeps track of their children.
* The parent's children group is also the PG's
* siblings.
*/
if (pg->cmt_parent->cmt_children == NULL) {
pg->cmt_parent->cmt_children =
kmem_zalloc(sizeof (group_t), KM_SLEEP);
group_create(pg->cmt_parent->cmt_children);
}
pg->cmt_siblings = pg->cmt_parent->cmt_children;
children = ++pg->cmt_parent->cmt_nchildren;
}
pg->cmt_hint = 0;
group_expand(pg->cmt_siblings, children);
}
/*
* Cache the chip and core IDs in the cpu_t->cpu_physid structure
* for fast lookups later.
*/
if (cp->cpu_physid) {
cp->cpu_physid->cpu_chipid =
pg_plat_hw_instance_id(cp, PGHW_CHIP);
cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
/*
* If this cpu has a PG representing shared cache, then set
* cpu_cacheid to that PG's logical id
*/
if (pg_cache)
cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
}
/* CPU0 only initialization */
if (is_cpu0) {
pg_cmt_cpu_startup(cp);
is_cpu0 = 0;
}
}
/*
* Class callback when a CPU is leaving the system (deletion)
*/
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
group_iter_t i;
pg_cmt_t *pg;
group_t *pgs, *cmt_pgs;
lgrp_handle_t lgrp_handle;
cmt_lgrp_t *lgrp;
pgs = &cp->cpu_pg->pgs;
cmt_pgs = &cp->cpu_pg->cmt_pgs;
/*
* Find the lgroup that encapsulates this CPU's CMT hierarchy
*/
lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
lgrp = pg_cmt_find_lgrp(lgrp_handle);
/*
* First, clean up anything load balancing specific for each of
* the CPU's PGs that participated in CMT load balancing
*/
pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
while (pg != NULL) {
/*
* Remove the PG from the CPU's load balancing lineage
*/
(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
/*
 * If it's about to become empty, destroy its children
 * group, and remove its reference from its siblings.
* This is done here (rather than below) to avoid removing
* our reference from a PG that we just eliminated.
*/
if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
if (pg->cmt_children != NULL)
group_destroy(pg->cmt_children);
if (pg->cmt_siblings != NULL) {
if (pg->cmt_siblings == &lgrp->cl_pgs)
lgrp->cl_npgs--;
else
pg->cmt_parent->cmt_nchildren--;
}
}
pg = pg->cmt_parent;
}
ASSERT(GROUP_SIZE(cmt_pgs) == 0);
/*
* Now that the load balancing lineage updates have happened,
 * remove the CPU from all its PGs (destroying any that become
* empty).
*/
group_iter_init(&i);
while ((pg = group_iterate(pgs, &i)) != NULL) {
if (IS_CMT_PG(pg) == 0)
continue;
pg_cpu_delete((pg_t *)pg, cp);
/*
 * Deleting the CPU from the PG changes the CPU's
 * PG group over which we are actively iterating.
 * Re-initialize the iteration.
*/
group_iter_init(&i);
if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
/*
* The PG has become zero sized, so destroy it.
*/
group_destroy(&pg->cmt_cpus_actv);
bitset_fini(&pg->cmt_cpus_actv_set);
pghw_fini((pghw_t *)pg);
pg_destroy((pg_t *)pg);
}
}
}
/*
* Class callback when a CPU is entering a cpu partition
*/
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
group_t *pgs;
pg_t *pg;
group_iter_t i;
ASSERT(MUTEX_HELD(&cpu_lock));
pgs = &cp->cpu_pg->pgs;
/*
* Ensure that the new partition's PG bitset
 * is large enough for all CMT PGs to which cp
* belongs
*/
group_iter_init(&i);
while ((pg = group_iterate(pgs, &i)) != NULL) {
if (IS_CMT_PG(pg) == 0)
continue;
if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
}
}
/*
* Class callback when a CPU is actually moving partitions
*/
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
cpu_t *cpp;
group_t *pgs;
pg_t *pg;
group_iter_t pg_iter;
pg_cpu_itr_t cpu_iter;
boolean_t found;
ASSERT(MUTEX_HELD(&cpu_lock));
pgs = &cp->cpu_pg->pgs;
group_iter_init(&pg_iter);
/*
 * Iterate over the CPU's CMT PGs
*/
while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
if (IS_CMT_PG(pg) == 0)
continue;
/*
* Add the PG to the bitset in the new partition.
*/
bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
/*
* Remove the PG from the bitset in the old partition
* if the last of the PG's CPUs have left.
*/
found = B_FALSE;
PG_CPU_ITR_INIT(pg, cpu_iter);
while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
if (cpp == cp)
continue;
if (cpp->cpu_part->cp_id == oldpp->cp_id) {
found = B_TRUE;
break;
}
}
if (!found)
bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
}
}
/*
* Class callback when a CPU becomes active (online)
*
* This is called in a context where CPUs are paused
*/
static void
pg_cmt_cpu_active(cpu_t *cp)
{
int err;
group_iter_t i;
pg_cmt_t *pg;
group_t *pgs;
ASSERT(MUTEX_HELD(&cpu_lock));
pgs = &cp->cpu_pg->pgs;
group_iter_init(&i);
/*
* Iterate over the CPU's PGs
*/
while ((pg = group_iterate(pgs, &i)) != NULL) {
if (IS_CMT_PG(pg) == 0)
continue;
err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
ASSERT(err == 0);
/*
* If this is the first active CPU in the PG, and it
* represents a hardware sharing relationship over which
* CMT load balancing is performed, add it as a candidate
 * for balancing with its siblings.
*/
if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
ASSERT(err == 0);
}
/*
 * Notate the CPU in the PG's active CPU bitset.
 * Also notate the PG as being active in its associated
 * partition.
*/
bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
}
}
/*
* Class callback when a CPU goes inactive (offline)
*
* This is called in a context where CPUs are paused
*/
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
int err;
group_t *pgs;
pg_cmt_t *pg;
cpu_t *cpp;
group_iter_t i;
pg_cpu_itr_t cpu_itr;
boolean_t found;
ASSERT(MUTEX_HELD(&cpu_lock));
pgs = &cp->cpu_pg->pgs;
group_iter_init(&i);
while ((pg = group_iterate(pgs, &i)) != NULL) {
if (IS_CMT_PG(pg) == 0)
continue;
/*
 * Remove the CPU from the CMT PG's active CPU group
 * and bitset.
*/
err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
ASSERT(err == 0);
bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
/*
* If there are no more active CPUs in this PG over which
* load was balanced, remove it as a balancing candidate.
*/
if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
ASSERT(err == 0);
}
/*
* Assert the number of active CPUs does not exceed
* the total number of CPUs in the PG
*/
ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
/*
* Update the PG bitset in the CPU's old partition
*/
found = B_FALSE;
PG_CPU_ITR_INIT(pg, cpu_itr);
while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
if (cpp == cp)
continue;
if (cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
found = B_TRUE;
break;
}
}
if (!found) {
bitset_del(&cp->cpu_part->cp_cmt_pgs,
((pg_t *)pg)->pg_id);
}
}
}
/*
* Return non-zero if the CPU belongs in the given PG
*/
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
cpu_t *pg_cpu;
pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
ASSERT(pg_cpu != NULL);
/*
* The CPU belongs if, given the nature of the hardware sharing
* relationship represented by the PG, the CPU has that
* relationship with some other CPU already in the PG
*/
if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
return (1);
return (0);
}
/*
 * Pack the CPU's CMT hierarchy.
 * The hierarchy order is preserved.
*/
static void
pg_cmt_hier_pack(pg_cmt_t *hier[], int sz)
{
int i, j;
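	/*
	 * For each empty (NULL) slot, shift the next non-NULL
	 * entry down into it, preserving the relative order.
	 */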
for (i = 0; i < sz; i++) {
if (hier[i] != NULL)
continue;
for (j = i; j < sz; j++) {
if (hier[j] != NULL) {
hier[i] = hier[j];
hier[j] = NULL;
break;
}
}
if (j == sz)
break;
}
}
/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 * If the right one doesn't yet exist, create one
 * and add it to the cmt_lgrps list.
*/
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
cmt_lgrp_t *lgrp;
ASSERT(MUTEX_HELD(&cpu_lock));
lgrp = cmt_lgrps;
while (lgrp != NULL) {
if (lgrp->cl_hand == hand)
return (lgrp);
lgrp = lgrp->cl_next;
}
/*
* Haven't seen this lgrp yet
*/
lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
lgrp->cl_hand = hand;
lgrp->cl_npgs = 0;
lgrp->cl_next = cmt_lgrps;
cmt_lgrps = lgrp;
group_create(&lgrp->cl_pgs);
return (lgrp);
}