/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
/*
* CMT dispatcher policies
*
* This file implements CMT dispatching policies using Processor Groups.
*
* The scheduler/dispatcher leverages knowledge of the performance-relevant
* CMT sharing relationships existing between CPUs to implement load
* balancing and coalescence thread placement policies.
*
* Load balancing policy seeks to improve performance by minimizing
* contention over shared processor resources / facilities. Coalescence
* policies improve resource utilization and ultimately power efficiency.
*
* On NUMA systems, the dispatcher will generally perform load balancing and
* coalescence within (and not across) lgroups. This is because there isn't
* much sense in trying to correct an imbalance by sending a thread outside
* of its home, if it would attempt to return home a short while later.
* The dispatcher will implement CMT policy across lgroups however, if
* it can do so with a thread homed to the root lgroup, since root homed
* threads have no lgroup affinity.
*/
/*
 * Return non-zero if, given the policy, we should migrate from running
 * somewhere "here" to somewhere "there".
 *
 * here		PG the thread is currently associated with
 * there	candidate PG to migrate to
 * policy	policy to apply; CMT_BALANCE and CMT_COALESCE must not
 *		both be set (asserted below)
 * self		non-zero if the thread being placed is curthread
 *
 * Returns 1 if migrating to "there" is preferred, 0 otherwise. 0 is also
 * returned when the policy has neither CMT_BALANCE nor CMT_COALESCE set.
 */
static int
cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
    int self)
{
	uint32_t here_util, there_util;

	here_util = here->cmt_utilization;
	there_util = there->cmt_utilization;

	/*
	 * This assumes that curthread's utilization is "1".
	 *
	 * Only discount it when there is something to discount. The
	 * utilization is unsigned, so decrementing a sampled value of 0
	 * would wrap around to UINT32_MAX and make "here" appear fully
	 * loaded, skewing both policy comparisons below.
	 */
	if (self && here_util > 0 &&
	    bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
		here_util--;	/* Ignore curthread's effect */

	/*
	 * Load balancing and coalescence are conflicting policies
	 */
	ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
	    (CMT_BALANCE|CMT_COALESCE));

	if (policy & CMT_BALANCE) {
		/*
		 * Balance utilization
		 *
		 * If the target is comparatively underutilized
		 * (either in an absolute sense, or scaled by capacity),
		 * then choose to balance.
		 */
		if ((here_util > there_util) ||
		    (here_util == there_util &&
		    (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
			return (1);
		}
	} else if (policy & CMT_COALESCE) {
		/*
		 * Attempt to drive group utilization up to capacity:
		 * move only toward a busier group that still has room.
		 */
		if (there_util > here_util &&
		    there_util < CMT_CAPACITY(there))
			return (1);
	}

	return (0);
}
/*
 * Perform multi-level CMT load balancing of running threads.
 *
 * tp is the thread being enqueued. The caller must hold tp's thread lock
 * (asserted below).
 * cp is a hint CPU, against which CMT load balancing will be performed.
 *
 * Each level of cp's CMT lineage is examined in turn. At every level a
 * sibling PG that has CPU resources in tp's partition is picked, with the
 * search starting at a pseudo-random index, and cmt_should_migrate()
 * decides under that level's policy whether tp should move there. The
 * walk stops at the first level that favors migration.
 *
 * Returns cp, or a CPU better than cp with respect to balancing
 * running thread load. cp is returned unchanged when it has no CMT PGs,
 * when no level favors migration, or when the chosen PG offers no idle
 * CPU in tp's partition.
 */
cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int		hint, i, cpu, nsiblings;
	int		self = 0;
	uint_t		tgt_size;
	group_t		*cmt_pgs, *siblings;
	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
	int		level = 0;
	cpu_t		*newcp;
	extern cmt_lgrp_t *cmt_root;

	ASSERT(THREAD_LOCK_HELD(tp));

	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPU's CMT lineage.
	 * If the thread is homed to the root lgroup, perform
	 * top level balancing against other top level PGs
	 * in the system. Otherwise, start with the default
	 * top level siblings group, which is within the leaf lgroup.
	 */
	pg = GROUP_ACCESS(cmt_pgs, level);
	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
		siblings = &cmt_root->cl_pgs;
	else
		siblings = pg->cmt_siblings;

	/*
	 * Traverse down the lineage until we find a level that needs
	 * balancing, or we get to the end.
	 */
	for (;;) {
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */

		/*
		 * A lone PG has nobody to balance against. An empty
		 * group is skipped as well, so that the modulo below
		 * can never divide by zero.
		 */
		if (nsiblings <= 1)
			goto next_level;

		hint = CPU_PSEUDO_RANDOM() % nsiblings;

		/*
		 * Find a balancing candidate from among our siblings.
		 * "hint" is a hint for where to start looking; the scan
		 * wraps around and ends once it is back at "hint".
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			if (++i >= nsiblings)
				i = 0;
		} while (i != hint);

		if (tpg == NULL)
			goto next_level; /* no candidates at this level */

		/*
		 * Decide if we should migrate from the current PG to a
		 * target PG given a policy
		 */
		if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
			break;

		/* Rejected: make sure a stale target is never used below */
		tpg = NULL;

next_level:
		if (++level == GROUP_SIZE(cmt_pgs))
			break;

		pg = GROUP_ACCESS(cmt_pgs, level);
		siblings = pg->cmt_siblings;
	}

	if (tpg == NULL)
		return (cp);	/* no level favored migration */

	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
	if (tgt_size == 0)
		return (cp);	/* no active CPUs; avoid modulo by zero */

	/*
	 * Select an idle CPU from the target. The scan starts at a
	 * pseudo-random index and wraps around; a CPU qualifies if it
	 * belongs to the thread's partition and its dispatch priority
	 * is -1. If none qualifies, cp is left as the caller's hint.
	 */
	hint = CPU_PSEUDO_RANDOM() % tgt_size;
	cpu = hint;
	do {
		newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
		if (newcp->cpu_part == tp->t_cpupart &&
		    newcp->cpu_dispatch_pri == -1) {
			cp = newcp;
			break;
		}
		if (++cpu == tgt_size)
			cpu = 0;
	} while (cpu != hint);

	return (cp);
}