/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* These functions implement the process of commitment for a pool
* configuration. This process can be described as taking instructions
* from a static configuration file and using the information about
* the target system contained in the dynamic configuration to make
* decisions about how best to allocate resources to meet the
* constraints specified in the static configuration file.
*
* Mechanically, this process relies upon ordering the individual
* components of the file and stepping through the lists of components
* and taking actions depending on their type and which file they are
* part of.
*
* Configuration components can be broken down into different types
* which are then treated according to the following table:
*
* Element Type: system || pool || res_comp || res_agg
* Action:       If the element is a required element, then create or
*               update it (don't destroy required elements in the
*               static configuration); otherwise manipulate the
*               dynamic configuration to create, destroy or update
*               the element on the system.
*
* Element Type: comp
* Action:       Create, destroy or update the static configuration
*               component.
*
* The treatment of the different elements reflects the fact that all
* elements other than comp are configurable and thus libpool can
* create, destroy and modify these elements at will. comp elements
* reflect the disposition of the system, these elements can be moved
* around but they can't be created or destroyed in the dynamic
* configuration in the commit process. comp elements can be created
* and destroyed in the static configuration file as a result of a
* commit operation, since it's possible for a comp to not appear in
* the dynamic configuration. For instance, if the static
* configuration file was created on a different machine or after a DR
* operation which has removed or added components.
*
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <errno.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <pool.h>
#include "pool_internal.h"
#include "pool_impl.h"
/*
 * Two-argument minimum/maximum helpers. Each argument may be evaluated
 * twice, so do not pass expressions that have side effects.
 */
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
/*
 * Numeric codes for the supported allocation methods. resource_allocate()
 * maps the allocate-method name onto one of these and switches on it.
 */
#define POA_IMPORTANCE_NUM 0
#define POA_SURPLUS_TO_DEFAULT_NUM 1
/*
* This resource specific structure is used to determine allocation of resources
* during resource set allocation. Each set will receive its min, plus
* some number of dealt resources based on the global allocation policy.
*/
typedef struct res_info {
pool_resource_t *ri_res; /* Resource set this record describes */
uint64_t ri_min; /* Resource set's low watermark */
uint64_t ri_max; /* Resource set's high watermark */
uint64_t ri_oldsize; /* Size of resource set at the start */
uint64_t ri_newsize; /* New resource set size allocated */
uint64_t ri_pinned; /* Count of pinned resources in set */
uint64_t ri_dealt; /* Count of resources dealt to set */
int64_t ri_transfer; /* oldsize - newsize */
/* The signed quantity of resources */
/* to transfer into or out of this */
/* resource set */
/* + transfer: transfer resources out */
/* - transfer: transfer resources in */
} res_info_t;
/*
* diff_and_fix operations
*/
/*
 * Create, delete and update primitives applied to individual elements.
 */
static int commit_create(pool_conf_t *, pool_elem_t **);
static int commit_delete(pool_elem_t *);
static int commit_update(pool_elem_t *, pool_elem_t *, int);
/*
* configuration commit processing
*/
static int diff_and_fix(pool_conf_t *, pool_conf_t *);
static int process_elem_lt(pool_elem_t *, pool_conf_t *);
static int process_elem_gt(pool_elem_t *, pool_conf_t *,
pool_conf_t *);
static int process_lists(int, pool_conf_t *,
pool_conf_t *, int);
static pool_elem_t **get_elem_list(const pool_conf_t *, int, uint_t *);
/*
 * Resource allocation. Several of the functions declared here are defined
 * later in this file without the static keyword; because these static
 * declarations come first, the functions still have internal linkage.
 */
static int share_resources(pool_conf_t *);
static int resource_allocate(const char *, pool_resource_t **,
uint_t);
static int resource_allocate_default(pool_resource_t **, uint_t);
static int pset_allocate_imp(pool_resource_t **, uint_t);
/* qsort() comparators used by pset_allocate_imp() */
static int resource_compare_by_descending_importance(const void *,
const void *);
static int compute_size_to_transfer(const void *, const void *);
/* Management of the "_importance" property on resources */
static int set_importance_cb(pool_conf_t *, pool_t *, void *);
static int unset_importance_cb(pool_conf_t *, pool_t *, void *);
static int add_importance_props(pool_conf_t *);
static int remove_importance_props(pool_conf_t *);
/* Property walk callbacks passed to pool_walk_properties() */
static int clone_element(pool_conf_t *, pool_elem_t *,
const char *, pool_value_t *, void *);
static int clean_element(pool_conf_t *, pool_elem_t *,
const char *, pool_value_t *, void *);
/*
 * commit_create() creates, in the configuration conf, a new element
 * corresponding to the element *e1. Only pools and resources require
 * any action; system and component elements are accepted as a no-op.
 *
 * For pools and resources the new element is created with the name of
 * the source element, the source's properties are copied to it with
 * clone_element(), the new element is paired with the source element
 * (pool_set_pair()) and *e1 is updated to point at the new element.
 *
 * Returns PO_SUCCESS, or PO_FAIL if the element class is unknown, the
 * element's name cannot be obtained, or any libpool call fails.
 */
static int
commit_create(pool_conf_t *conf, pool_elem_t **e1)
{
	pool_resource_t *res;
	pool_t *pool;
	const char *res_type;
	pool_elem_t *src = *e1;
	uint64_t smin, smax, dmax;
	pool_value_t val = POOL_VALUE_INITIALIZER;
	char *name;

	switch (pool_elem_class(src)) {
	case PEC_SYSTEM:	/* NO-OP */
		break;
	case PEC_POOL:
		/* Without a name there is nothing sensible to create. */
		if ((name = elem_get_name(src)) == NULL)
			return (PO_FAIL);
		pool = pool_create(conf, name);
		free(name);
		if (pool == NULL)
			return (PO_FAIL);
		/*
		 * Now copy the properties from the original pool to the
		 * new one
		 */
		if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(pool),
		    clone_element) != PO_SUCCESS)
			return (PO_FAIL);
		/*
		 * Add a pointer to the src element which can be
		 * updated with a sys_id when the sys_id is allocated
		 * to the created element.
		 */
		pool_set_pair(TO_ELEM(pool), src);
		*e1 = TO_ELEM(pool);
		break;
	case PEC_RES_COMP:
	case PEC_RES_AGG:
		if ((name = elem_get_name(src)) == NULL)
			return (PO_FAIL);
		res_type = pool_elem_class_string(src);
		res = pool_resource_create(conf, res_type, name);
		free(name);
		if (res == NULL)
			return (PO_FAIL);
		/*
		 * Need to do some ordering of property updates.
		 * Compare the values of source min/max and
		 * destination min/max. If smin < dmax then update the
		 * smin first, else update the max first.
		 */
		if (resource_get_min(pool_elem_res(src), &smin) != PO_SUCCESS ||
		    resource_get_max(pool_elem_res(src), &smax) != PO_SUCCESS ||
		    resource_get_max(res, &dmax) != PO_SUCCESS)
			return (PO_FAIL);
		if (smin < dmax) {
			pool_value_set_uint64(&val, smin);
			if (pool_put_ns_property(TO_ELEM(res), c_min_prop,
			    &val) != PO_SUCCESS)
				return (PO_FAIL);
		} else {
			pool_value_set_uint64(&val, smax);
			if (pool_put_ns_property(TO_ELEM(res), c_max_prop,
			    &val) != PO_SUCCESS)
				return (PO_FAIL);
		}
		/*
		 * Now copy the properties from the original resource
		 * to the new one
		 */
		if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(res),
		    clone_element) != PO_SUCCESS)
			return (PO_FAIL);
		/*
		 * Add a pointer to the src element which can be
		 * updated with a sys_id when the sys_id is allocated
		 * to the created element.
		 */
		pool_set_pair(TO_ELEM(res), src);
		*e1 = TO_ELEM(res);
		break;
	case PEC_COMP:		/* NO-OP */
		break;
	default:
		return (PO_FAIL);
	}
	return (PO_SUCCESS);
}
/*
 * commit_delete() removes the element pe from its own configuration.
 * Temporary elements are left alone. Pools and resources are destroyed;
 * system and component elements need no action.
 *
 * Returns the result of the destroy call, 0 when nothing had to be
 * done, or PO_FAIL for an unrecognised element class.
 */
static int
commit_delete(pool_elem_t *pe)
{
	/* Temporary elements are never deleted by a commit. */
	if (elem_is_tmp(pe))
		return (PO_SUCCESS);

	switch (pool_elem_class(pe)) {
	case PEC_POOL: {
		pool_t *doomed_pool = pool_elem_pool(pe);

		return (pool_destroy(TO_CONF(pe), doomed_pool));
	}
	case PEC_RES_COMP:
	case PEC_RES_AGG: {
		pool_resource_t *doomed_res = pool_elem_res(pe);

		return (pool_resource_destroy(TO_CONF(pe), doomed_res));
	}
	case PEC_SYSTEM:
	case PEC_COMP:
		/* Nothing to destroy for these classes. */
		return (0);
	default:
		return (PO_FAIL);
	}
}
/*
 * commit_update() updates element e2 from element e1 and then refreshes
 * e1 from e2. process_lists() calls it with e1 taken from the static
 * configuration and e2 from the dynamic configuration.
 *
 * pass == 0: properties are updated.
 *	- If e1 is a component and the resources owning e1 and e2 have
 *	  different names, e2 is first moved (within e2's configuration)
 *	  to the resource that has the same class and name as e1's owner.
 *	- Removable properties are stripped from e2 (clean_element()).
 *	- For resources, either the min or the max is written first so
 *	  that the property updates are applied in a workable order.
 *	- Properties are cloned e1 -> e2 and then e2 -> e1.
 *
 * pass != 0: if e1 is a pool, e2 is associated with the "pset"
 *	resources in e2's configuration that match, by class and name,
 *	the resources e1 is associated with. Where no such resource
 *	exists the result of get_default_resource() is used instead.
 *
 * Returns PO_SUCCESS, or PO_FAIL if a name cannot be obtained, the
 * target resource for a component move cannot be found, or a libpool
 * call fails.
 */
static int
commit_update(pool_elem_t *e1, pool_elem_t *e2, int pass)
{
	if (pass == 0) {
		pool_resource_t *res1;
		pool_resource_t *res2;

		if (pool_elem_class(e1) == PEC_COMP) {
			res1 = pool_get_owning_resource(TO_CONF(e1),
			    pool_elem_comp(e1));
			res2 = pool_get_owning_resource(TO_CONF(e2),
			    pool_elem_comp(e2));
			if (pool_elem_compare_name(TO_ELEM(res1),
			    TO_ELEM(res2)) != 0) {
				char *name;
				const pool_resource_t *newres;
				pool_component_t *comps[2] = { NULL };

				comps[0] = pool_elem_comp(e2);
				name = elem_get_name(TO_ELEM(res1));
				if (name == NULL)
					return (PO_FAIL);
				newres = pool_get_resource(TO_CONF(e2),
				    pool_elem_class_string(TO_ELEM(res1)),
				    name);
				free(name);
				/*
				 * Fail cleanly rather than rely on an
				 * assert() that is compiled out of
				 * non-debug builds.
				 */
				if (newres == NULL)
					return (PO_FAIL);
#ifdef DEBUG
				dprintf("transferring: res, comp\n");
				pool_elem_dprintf(TO_ELEM(newres));
				pool_elem_dprintf(e2);
#endif	/* DEBUG */
				/* The result of the move is not checked. */
				(void) pool_resource_xtransfer(TO_CONF(e2),
				    res2, (pool_resource_t *)newres, comps);
			}
		}
		if (pool_walk_properties(TO_CONF(e2), e2, NULL,
		    clean_element) != PO_SUCCESS) {
			return (PO_FAIL);
		}
		/*
		 * Need to do some ordering of property updates if the
		 * element to be updated is a resource. Compare the
		 * values of source min/max and destination
		 * min/max. If smin < dmax then update the smin first,
		 * else update the max first.
		 */
		if (pool_elem_class(e1) == PEC_RES_COMP ||
		    pool_elem_class(e1) == PEC_RES_AGG) {
			uint64_t smin, smax, dmax;
			pool_value_t val = POOL_VALUE_INITIALIZER;

			if (resource_get_min(pool_elem_res(e1), &smin) !=
			    PO_SUCCESS ||
			    resource_get_max(pool_elem_res(e1), &smax) !=
			    PO_SUCCESS ||
			    resource_get_max(pool_elem_res(e2), &dmax) !=
			    PO_SUCCESS)
				return (PO_FAIL);
			if (smin < dmax) {
				pool_value_set_uint64(&val, smin);
				if (pool_put_ns_property(e2, c_min_prop,
				    &val) != PO_SUCCESS)
					return (PO_FAIL);
			} else {
				pool_value_set_uint64(&val, smax);
				if (pool_put_ns_property(e2, c_max_prop,
				    &val) != PO_SUCCESS)
					return (PO_FAIL);
			}
		}
		/*
		 * This next couple of steps needs some
		 * explanation. The first walk copies the properties
		 * of e1 (the static configuration element) to e2
		 * (the dynamic configuration element). The second
		 * walk copies the properties of e2 back to e1. This
		 * ensures that updates from the static configuration
		 * element are applied to the dynamic configuration
		 * and that the static configuration element is then
		 * refreshed with the latest values from the dynamic
		 * configuration element. The enforcing of
		 * permissions is performed in clone_element by its
		 * choice of property manipulation function.
		 */
		if (pool_walk_properties(TO_CONF(e1), e1, e2, clone_element) !=
		    PO_SUCCESS) {
			return (PO_FAIL);
		}
		if (pool_walk_properties(TO_CONF(e2), e2, e1, clone_element) !=
		    PO_SUCCESS) {
			return (PO_FAIL);
		}
	} else {
		if (pool_elem_class(e1) == PEC_POOL) {
			pool_resource_t **rs;
			uint_t nelem;
			uint_t i;
			pool_value_t val = POOL_VALUE_INITIALIZER;
			pool_value_t *pvals[] = { NULL, NULL };

			/* Restrict the query to resources of type "pset". */
			pvals[0] = &val;
			if (pool_value_set_string(&val, "pset") != PO_SUCCESS ||
			    pool_value_set_name(&val, c_type) != PO_SUCCESS)
				return (PO_FAIL);
			if ((rs = pool_query_pool_resources(TO_CONF(e1),
			    pool_elem_pool(e1), &nelem, pvals)) != NULL) {
				for (i = 0; i < nelem; i++) {
					const pool_resource_t *tgt_res;
					char *res_name =
					    elem_get_name(TO_ELEM(rs[i]));

					/*
					 * Without a name the lookup below
					 * would quietly select the default
					 * resource; report failure instead.
					 */
					if (res_name == NULL) {
						free(rs);
						return (PO_FAIL);
					}
					if ((tgt_res = pool_get_resource(
					    TO_CONF(e2), pool_elem_class_string(
					    TO_ELEM(rs[i])), res_name)) ==
					    NULL) {
						tgt_res = get_default_resource(
						    rs[i]);
					}
					free(res_name);
					if (pool_associate(TO_CONF(e2),
					    pool_elem_pool(e2), tgt_res) !=
					    PO_SUCCESS) {
						free(rs);
						return (PO_FAIL);
					}
				}
				free(rs);
			}
		}
	}
	return (PO_SUCCESS);
}
/*
 * diff_and_fix() reconciles the static configuration stc with the
 * dynamic configuration dyn by running process_lists() over each
 * element class in a fixed order and then sharing out the resources
 * in both configurations.
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
static int
diff_and_fix(pool_conf_t *stc, pool_conf_t *dyn)
{
	/*
	 * The ordering of the steps is significant: the system element,
	 * then the pools, then the resources, then the resource
	 * components and finally the pools again (pass 1, which updates
	 * associations rather than properties).
	 *
	 * TODO
	 * PEC_RES_COMP are the only type of resources
	 * currently. When PEC_RES_AGG resources are added they must
	 * also be processed.
	 */
	static const struct {
		int	step_type;
		int	step_pass;
	} steps[] = {
		{ PEC_SYSTEM,	0 },
		{ PEC_POOL,	0 },
		{ PEC_RES_COMP,	0 },
		{ PEC_COMP,	0 },
		{ PEC_POOL,	1 }
	};
	size_t k;

	for (k = 0; k < sizeof (steps) / sizeof (steps[0]); k++) {
		if (process_lists(steps[k].step_type, stc, dyn,
		    steps[k].step_pass) != PO_SUCCESS)
			return (PO_FAIL);
	}

	/*
	 * Share the resources. This is done for both configurations
	 * (dynamic first) so that they still look the same afterwards.
	 */
	if (share_resources(dyn) != PO_SUCCESS ||
	    share_resources(stc) != PO_SUCCESS)
		return (PO_FAIL);

	return (PO_SUCCESS);
}
/*
 * process_elem_lt() handles an element pe for which process_lists()
 * found no matching element in the dynamic configuration. Components
 * are destroyed; any other element that is not a default element is
 * created in dyn via commit_create(). Default elements are left alone.
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
static int
process_elem_lt(pool_elem_t *pe, pool_conf_t *dyn)
{
	if (pool_elem_class(pe) == PEC_COMP) {
		int rv = pool_component_destroy(pool_elem_comp(pe));

		return (rv == PO_FAIL ? PO_FAIL : PO_SUCCESS);
	}

	if (elem_is_default(pe))
		return (PO_SUCCESS);

	if (commit_create(dyn, &pe) != PO_SUCCESS)
		return (PO_FAIL);
	return (PO_SUCCESS);
}
/*
 * process_elem_gt() handles an element pe of the dynamic configuration
 * for which process_lists() found no matching element in the static
 * configuration.
 *
 *	- A component is copied into stc, under the resource with the
 *	  same class and name as its owner in dyn, or, if there is no
 *	  such resource, under the one returned by
 *	  resource_by_sysid(stc, PS_NONE, ...).
 *	- A default pool or resource is re-created in stc and its
 *	  properties are cloned.
 *	- Anything else is deleted with commit_delete().
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
static int
process_elem_gt(pool_elem_t *pe, pool_conf_t *stc, pool_conf_t *dyn)
{
	char *label;

	if (pool_elem_class(pe) == PEC_COMP) {
		pool_value_t owner_name = POOL_VALUE_INITIALIZER;
		pool_resource_t *dyn_owner;
		const pool_resource_t *stc_owner;
		const pool_component_t *copy;
		const char *raw_name;
		const char *kind;
		char *name_copy;

		/*
		 * Find the matching parent in the static configuration.
		 * It may not exist, in which case the component goes
		 * into the fallback resource looked up below.
		 */
		dyn_owner = pool_get_owning_resource(dyn, pool_elem_comp(pe));
		if (pool_get_ns_property(TO_ELEM(dyn_owner), "name",
		    &owner_name) == POC_INVAL)
			return (PO_FAIL);
		if (pool_value_get_string(&owner_name, &raw_name) == PO_FAIL)
			return (PO_FAIL);
		if ((name_copy = strdup(raw_name)) == NULL)
			return (PO_FAIL);

		kind = pool_elem_class_string(TO_ELEM(dyn_owner));
		stc_owner = pool_get_resource(stc, kind, name_copy);
		free(name_copy);
		if (stc_owner == NULL)
			stc_owner = resource_by_sysid(stc, PS_NONE, kind);

		/*
		 * Copy the dynamic configuration's component into the
		 * static configuration.
		 */
		copy = pool_component_create(stc, stc_owner,
		    elem_get_sysid(pe));
		if (copy == NULL)
			return (PO_FAIL);
		if (pool_walk_properties(TO_CONF(pe), pe, TO_ELEM(copy),
		    clone_element) != PO_SUCCESS)
			return (PO_FAIL);
		return (PO_SUCCESS);
	}

	if (!elem_is_default(pe)) {
		if (commit_delete(pe) != PO_SUCCESS)
			return (PO_FAIL);
		return (PO_SUCCESS);
	}

	/* A default element missing from stc: re-create it there. */
	if ((label = elem_get_name(pe)) == NULL)
		return (PO_FAIL);

	switch (pool_elem_class(pe)) {
	case PEC_POOL: {
		pool_t *twin = pool_create(stc, label);

		free(label);
		if (twin == NULL)
			return (PO_FAIL);
		if (pool_walk_properties(TO_CONF(pe), pe, TO_ELEM(twin),
		    clone_element) != PO_SUCCESS)
			return (PO_FAIL);
		break;
	}
	case PEC_RES_AGG:
	case PEC_RES_COMP: {
		pool_resource_t *twin = pool_resource_create(stc,
		    pool_elem_class_string(pe), label);

		free(label);
		if (twin == NULL)
			return (PO_FAIL);
		if (pool_walk_properties(TO_CONF(pe), pe, TO_ELEM(twin),
		    clone_element) != PO_SUCCESS)
			return (PO_FAIL);
		break;
	}
	default:
		free(label);
		break;
	}
	return (PO_SUCCESS);
}
/*
 * process_lists() fetches the elements of the given type from both the
 * static (stc) and dynamic (dyn) configurations, sorts both lists with
 * qsort_elem_compare() and walks them in step:
 *
 *	- element only in the static list:	process_elem_lt()
 *	- element only in the dynamic list:	process_elem_gt()
 *	- element in both lists:		commit_update()
 *
 * Two default elements are always treated as a match regardless of
 * their names. The pass parameter is handed to commit_update() to
 * select between property updates and association updates.
 *
 * Processing stops at the first failure. Returns PO_SUCCESS/PO_FAIL.
 */
static int
process_lists(int type, pool_conf_t *stc, pool_conf_t *dyn, int pass)
{
	uint_t n_stc = 0, n_dyn = 0;
	uint_t si = 0, di = 0;
	pool_elem_t **stc_list, **dyn_list;
	int rv = PO_SUCCESS;

	stc_list = get_elem_list(stc, type, &n_stc);
	if (stc_list == NULL)
		return (PO_FAIL);
	qsort(stc_list, n_stc, sizeof (pool_elem_t *), qsort_elem_compare);

	dyn_list = get_elem_list(dyn, type, &n_dyn);
	if (dyn_list == NULL) {
		free(stc_list);
		return (PO_FAIL);
	}
	qsort(dyn_list, n_dyn, sizeof (pool_elem_t *), qsort_elem_compare);

	/* Merge step: both lists still have unprocessed elements. */
	while (rv == PO_SUCCESS && si < n_stc && di < n_dyn) {
		int order;

		if (elem_is_default(stc_list[si]) &&
		    elem_is_default(dyn_list[di]))
			order = 0;
		else
			order = pool_elem_compare_name(stc_list[si],
			    dyn_list[di]);

		if (order < 0) {
			rv = process_elem_lt(stc_list[si], dyn);
			si++;
		} else if (order > 0) {
			rv = process_elem_gt(dyn_list[di], stc, dyn);
			di++;
		} else {
			if (commit_update(stc_list[si], dyn_list[di], pass) !=
			    PO_SUCCESS)
				rv = PO_FAIL;
			si++;
			di++;
		}
	}

	/* Whatever is left in the static list has no dynamic partner. */
	while (rv == PO_SUCCESS && si < n_stc) {
		rv = process_elem_lt(stc_list[si], dyn);
		si++;
	}

	/* Whatever is left in the dynamic list has no static partner. */
	while (rv == PO_SUCCESS && di < n_dyn) {
		rv = process_elem_gt(dyn_list[di], stc, dyn);
		di++;
	}

	free(stc_list);
	free(dyn_list);
	return (rv);
}
/*
 * get_elem_list() returns an allocated array holding the elements of
 * the requested type in conf and stores the number of elements in
 * *nelem. The caller must free() the array.
 *
 * The arrays returned by the type specific queries are cast directly
 * to arrays of pool_elem_t pointers. For PEC_RES_COMP the resource
 * array is additionally filtered, in place, down to the elements
 * whose class is PEC_RES_COMP.
 *
 * An unsupported type causes abort(). Returns NULL on failure.
 */
static pool_elem_t **
get_elem_list(const pool_conf_t *conf, int type, uint_t *nelem)
{
	pool_elem_t **list = NULL;

	if (type == PEC_SYSTEM) {
		/* The system element is a list of exactly one. */
		list = malloc(sizeof (pool_elem_t *));
		if (list == NULL)
			return (NULL);
		*nelem = 1;
		list[0] = pool_conf_to_elem(conf);
		return (list);
	}

	if (type == PEC_POOL) {
		pool_t **pools = pool_query_pools(conf, nelem, NULL);

		if (pools != NULL)
			list = (pool_elem_t **)pools;
		return (list);
	}

	if (type == PEC_RES_COMP) {
		pool_resource_t **all = pool_query_resources(conf, nelem,
		    NULL);
		uint_t src;
		uint_t kept = 0;

		if (all == NULL)
			return (NULL);
		list = (pool_elem_t **)all;
		/* Compact the array, keeping only PEC_RES_COMP elements. */
		for (src = 0; src < *nelem; src++) {
			pool_elem_t *candidate = TO_ELEM(all[src]);

			if (pool_elem_class(candidate) == PEC_RES_COMP)
				list[kept++] = candidate;
		}
		*nelem = kept;
		return (list);
	}

	if (type == PEC_COMP) {
		pool_component_t **comps = pool_query_components(conf, nelem,
		    NULL);

		if (comps != NULL)
			list = (pool_elem_t **)comps;
		return (list);
	}

	abort();
	/* NOTREACHED */
	return (list);
}
/*
 * share_resources() distributes resources between the resource sets of
 * conf. The "_importance" properties are first added to the resources,
 * then the resources of type "pset" are handed to resource_allocate(),
 * and finally the "_importance" properties are removed again (also on
 * failure).
 *
 * Returns PO_SUCCESS/PO_FAIL
 */
static int
share_resources(pool_conf_t *conf)
{
	pool_value_t type_filter = POOL_VALUE_INITIALIZER;
	pool_value_t *filter[] = { NULL, NULL };
	pool_resource_t **psets;
	uint_t npsets;
	int rv = PO_SUCCESS;

	/* Build a query that selects resources whose type is "pset". */
	filter[0] = &type_filter;
	if (pool_value_set_string(filter[0], "pset") != PO_SUCCESS ||
	    pool_value_set_name(filter[0], c_type) != PO_SUCCESS)
		return (PO_FAIL);

	if (add_importance_props(conf) != PO_SUCCESS) {
		(void) remove_importance_props(conf);
		return (PO_FAIL);
	}

	/*
	 * 'pool.importance' defines the importance of a pool;
	 * resources inherit the importance of the pool that
	 * is associated with them. If more than one pool is
	 * associated with a resource, the importance of the
	 * resource is the maximum importance of all
	 * associated pools. '_importance' on resources is used
	 * to determine who gets extra.
	 */
	psets = pool_query_resources(conf, &npsets, filter);
	if (psets != NULL &&
	    resource_allocate("pset", psets, npsets) != PO_SUCCESS)
		rv = PO_FAIL;

	free(psets);
	(void) remove_importance_props(conf);
	return (rv);
}
/*
 * resource_allocate() selects the allocation routine for the nelem
 * resources in res, based on the "allocate-method" property of the
 * system element of the configuration that res[0] belongs to. When
 * the property is not a string, POA_IMPORTANCE is assumed.
 *
 *	POA_IMPORTANCE		pset_allocate_imp(), and only when type
 *				names the PREC_PSET resource class
 *	POA_SURPLUS_TO_DEFAULT	resource_allocate_default()
 *
 * Any other method name sets POE_INVALID_CONF and fails.
 * Returns PO_SUCCESS/PO_FAIL.
 */
int
resource_allocate(const char *type, pool_resource_t **res, uint_t nelem)
{
	pool_value_t val = POOL_VALUE_INITIALIZER;
	const char *method_name = POA_IMPORTANCE;
	pool_elem_t *sys_elem;
	uint64_t method;
	int ret;

	sys_elem = pool_conf_to_elem(TO_CONF(TO_ELEM(res[0])));
	if (pool_get_ns_property(sys_elem, "allocate-method", &val) ==
	    POC_STRING)
		(void) pool_value_get_string(&val, &method_name);

	/* Translate the method name into its numeric code. */
	if (strcmp(POA_IMPORTANCE, method_name) == 0) {
		method = POA_IMPORTANCE_NUM;
	} else if (strcmp(POA_SURPLUS_TO_DEFAULT, method_name) == 0) {
		method = POA_SURPLUS_TO_DEFAULT_NUM;
	} else {
		pool_seterror(POE_INVALID_CONF);
		return (PO_FAIL);
	}

	if (method == POA_SURPLUS_TO_DEFAULT_NUM)
		return (resource_allocate_default(res, nelem));

	/*
	 * POA_IMPORTANCE_NUM
	 *
	 * TODO: Add support for new resource types
	 */
	if (pool_resource_elem_class_from_string(type) == PREC_PSET)
		ret = pset_allocate_imp(res, nelem);
	else
		ret = PO_FAIL;
	return (ret);
}
/*
 * resource_allocate_default() implements the "surplus to default"
 * allocation: every non-default set ends up with its minimum and all
 * surplus is left in (or moved to) the default set.
 *
 * res	 - array of resource sets to balance
 * nelem - number of entries in res; a single set needs no balancing
 *
 * First, every non-default set larger than its effective minimum (the
 * larger of its min and its pinned count) has the excess transferred
 * to the default set. Then every non-default set smaller than its min
 * is topped up from the default set.
 *
 * Returns PO_SUCCESS, or PO_FAIL if a resource attribute cannot be
 * read, a transfer fails, or a transfer is required but res contains
 * no default resource set (POE_INVALID_CONF is set in that case).
 */
int
resource_allocate_default(pool_resource_t **res, uint_t nelem)
{
	res_info_t *res_info;
	uint_t j;
	pool_resource_t *default_res = NULL;
	int ret = PO_FAIL;

	if (nelem == 1)
		return (PO_SUCCESS);

	if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) {
		return (PO_FAIL);
	}

	/* Load current resource values. */
	for (j = 0; j < nelem; j++) {
		if (default_res == NULL &&
		    resource_is_default(res[j]) == PO_TRUE)
			default_res = res[j];

		if (resource_get_max(res[j],
		    &res_info[j].ri_max) == PO_FAIL ||
		    resource_get_min(res[j],
		    &res_info[j].ri_min) == PO_FAIL ||
		    resource_get_size(res[j],
		    &res_info[j].ri_oldsize) == PO_FAIL ||
		    resource_get_pinned(res[j],
		    &res_info[j].ri_pinned) == PO_FAIL)
			goto out;
		res_info[j].ri_res = res[j];
	}

	/*
	 * Firstly, for all resources that have size greater than min,
	 * transfer all movable size above min to the default resource.
	 */
	for (j = 0; j < nelem; j++) {
		uint64_t real_min;
		uint64_t num;

		/* compute the real minimum number of resources */
		real_min = MAX(res_info[j].ri_pinned, res_info[j].ri_min);
		if (res_info[j].ri_res == default_res ||
		    res_info[j].ri_oldsize <= real_min)
			continue;

		/*
		 * A transfer is needed, so a default set must exist.
		 * Checking here (rather than up front) keeps lists
		 * that need no transfers working as before.
		 */
		if (default_res == NULL) {
			pool_seterror(POE_INVALID_CONF);
			goto out;
		}
		num = res_info[j].ri_oldsize - real_min;
		if (pool_resource_transfer(TO_CONF(TO_ELEM(default_res)),
		    res_info[j].ri_res, default_res, num) != PO_SUCCESS)
			goto out;
	}

	/*
	 * Now, transfer resources below min from the default.
	 */
	for (j = 0; j < nelem; j++) {
		/*
		 * We don't want to interfere with resources which are
		 * reserved
		 */
		if (res_info[j].ri_res == default_res ||
		    res_info[j].ri_oldsize >= res_info[j].ri_min)
			continue;

		if (default_res == NULL) {
			pool_seterror(POE_INVALID_CONF);
			goto out;
		}
		if (pool_resource_transfer(TO_CONF(TO_ELEM(default_res)),
		    default_res, res_info[j].ri_res,
		    res_info[j].ri_min - res_info[j].ri_oldsize) !=
		    PO_SUCCESS)
			goto out;
	}

	ret = PO_SUCCESS;
out:
	free(res_info);
	return (ret);
}
/*
 * Allocate cpus to pset resource sets, favoring sets with higher importance.
 *
 * Step 1: Sort resource sets by decreasing importance, and load each set's
 *	current size (oldsize), min, max, and number of pinned cpus.
 *	Compute the total number of cpus by totaling oldsize.
 *
 * Step 2: Compute the newsize for each set:
 *
 *	Give each set its min number of cpus. This min may be greater than
 *	its pset.min due to pinned cpus. If there are more cpus than the total
 *	of all mins, then the surplus cpus are dealt round-robin to all sets
 *	(up to their max) in order of decreasing importance. A set may be
 *	skipped during dealing because it started with more than its min due to
 *	pinned cpus. The dealing stops when there are no more cpus or all
 *	sets are at their max. If all sets are at their max, any remaining cpus
 *	are given to the default set.
 *
 * Step 3: Transfer cpus from sets with (oldsize > newsize) to sets with
 *	(oldsize < newsize).
 *
 * res	 - array of resource sets; it is re-ordered in place by importance
 * nelem - number of entries in res
 *
 * Returns PO_SUCCESS, or PO_FAIL when memory cannot be allocated
 * (POE_SYSTEM), when a resource attribute cannot be read, when the sum
 * of the effective minimums exceeds the resources available or surplus
 * has to go to a default set that is not in res (POE_INVALID_CONF), or
 * when a transfer fails.
 */
int
pset_allocate_imp(pool_resource_t **res, uint_t nelem)
{
	res_info_t *res_info;
	res_info_t *default_res_info = NULL;
	const pool_resource_t *default_res = NULL;
	uint64_t tot_resources = 0;	/* total count of resources */
	uint64_t tot_min = 0;		/* total of all resource set mins */
	uint64_t num_to_deal = 0;	/* total resources above mins to deal */
	uint64_t sets_maxed = 0;	/* number of resource sets at or */
					/* above their max */
	uint64_t sets_finished = 0;	/* number of resource sets that have */
					/* size == newsize */
	uint64_t deal;
	int donor, receiver;
	uint_t j;
	int ret = PO_FAIL;

	/* Nothing to balance; also keeps donor (nelem - 1) in range. */
	if (nelem == 0)
		return (PO_SUCCESS);

	/*
	 * Build list of res_info_t's
	 */
	if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) {
		pool_seterror(POE_SYSTEM);
		return (PO_FAIL);
	}

	/* Order resources by importance, most important being first */
	qsort(res, nelem, sizeof (pool_resource_t *),
	    resource_compare_by_descending_importance);

	for (j = 0; j < nelem; j++) {
		/* Track which resource is the default */
		if (default_res == NULL &&
		    resource_is_default(res[j]) == PO_TRUE) {
			default_res = res[j];
			default_res_info = &(res_info[j]);
		}

		/* Load sets' current values */
		if (resource_get_max(res[j], &res_info[j].ri_max) == PO_FAIL ||
		    resource_get_min(res[j], &res_info[j].ri_min) == PO_FAIL ||
		    resource_get_size(res[j], &res_info[j].ri_oldsize) ==
		    PO_FAIL ||
		    resource_get_pinned(res[j],
		    &res_info[j].ri_pinned) == PO_FAIL)
			goto out;

		/* Start each set's newsize out at their min. */
		res_info[j].ri_newsize = res_info[j].ri_min;

		/* pre-deal pinned resources that exceed min */
		if (res_info[j].ri_pinned > res_info[j].ri_min) {
			res_info[j].ri_newsize = res_info[j].ri_pinned;
			res_info[j].ri_dealt =
			    res_info[j].ri_newsize - res_info[j].ri_min;
		}

		/*
		 * A set that starts at (or above) its max can never be
		 * dealt to, so count it as maxed now. Otherwise the
		 * dealing loop below could never see every set maxed
		 * and would not terminate while surplus remained.
		 */
		if (res_info[j].ri_newsize >= res_info[j].ri_max)
			sets_maxed++;

		res_info[j].ri_res = res[j];

		/* Compute total number of resources to deal out */
		tot_resources += res_info[j].ri_oldsize;
		tot_min += res_info[j].ri_newsize;

#ifdef DEBUG
		dprintf("res allocation details\n");
		pool_elem_dprintf(TO_ELEM(res[j]));
		dprintf("size=%llu\n", res_info[j].ri_oldsize);
#endif	/* DEBUG */
	}

	/*
	 * The minimums cannot be met with the resources present. Fail
	 * now rather than let the unsigned subtraction below wrap.
	 */
	if (tot_min > tot_resources) {
		pool_seterror(POE_INVALID_CONF);
		goto out;
	}
	num_to_deal = tot_resources - tot_min;

	/*
	 * Deal one resource to each set, and then another, until all
	 * resources are dealt or all sets are at their max.
	 */
	for (deal = 1; num_to_deal > 0 && sets_maxed < nelem; deal++) {
		for (j = 0; j < nelem; j++) {
			/*
			 * Skip this resource set if it has already been
			 * pre-dealt a resource due to pinned resources.
			 */
			if (res_info[j].ri_dealt >= deal)
				continue;

			if (res_info[j].ri_newsize < res_info[j].ri_max) {
				res_info[j].ri_dealt++;
				res_info[j].ri_newsize++;
				if (res_info[j].ri_newsize ==
				    res_info[j].ri_max)
					sets_maxed++;

				num_to_deal--;
				if (num_to_deal == 0)
					break;
			}
		}
	}

	/*
	 * If all resource sets are at their max, deal the remaining to the
	 * default resource set.
	 */
	if ((sets_maxed == nelem) && (num_to_deal > 0)) {
		if (default_res_info == NULL) {
			/* No default set to absorb the surplus. */
			pool_seterror(POE_INVALID_CONF);
			goto out;
		}
		default_res_info->ri_dealt += num_to_deal;
		default_res_info->ri_newsize += num_to_deal;
	}

	/*
	 * Record how much each set has to give up (+) or receive (-).
	 * compute_size_to_transfer() computes the same values while
	 * sorting; setting them here means the result does not depend
	 * on the comparator being invoked for every element.
	 */
	for (j = 0; j < nelem; j++) {
		res_info[j].ri_transfer = (int64_t)res_info[j].ri_oldsize -
		    (int64_t)res_info[j].ri_newsize;
	}

	/*
	 * Sort so that resource sets needing resources precede resource
	 * sets that have extra resources.
	 */
	qsort(res_info, nelem, sizeof (res_info_t),
	    compute_size_to_transfer);

	/*
	 * The donor index starts at the end of the resource set list and
	 * walks up. The receiver index starts at the beginning of the
	 * resource set list and walks down. Cpus are transferred from the
	 * donors to the receivers until all sets have transfer == 0.
	 */
	donor = nelem - 1;
	receiver = 0;

	/* Number of sets with transfer == 0 */
	sets_finished = 0;

	/* Transfer resources so that each set's size becomes newsize */
	for (;;) {
		uint64_t ntrans;

		if (donor == receiver) {
			/* The last set must already be balanced. */
			if (res_info[donor].ri_transfer != 0)
				goto out;
			sets_finished++;
			break;
		}
		if (res_info[donor].ri_transfer == 0) {
			sets_finished++;
			donor--;
			continue;
		}
		if (res_info[receiver].ri_transfer == 0) {
			sets_finished++;
			receiver++;
			continue;
		}

		/* Transfer resources from the donor set to the receiver */
		ntrans = MIN(res_info[donor].ri_transfer,
		    -res_info[receiver].ri_transfer);

		if (pool_resource_transfer(
		    TO_CONF(TO_ELEM(res_info[donor].ri_res)),
		    res_info[donor].ri_res, res_info[receiver].ri_res,
		    ntrans) != PO_SUCCESS)
			goto out;

		res_info[donor].ri_transfer -= ntrans;
		res_info[receiver].ri_transfer += ntrans;
	}

	if (sets_finished == nelem)
		ret = PO_SUCCESS;
out:
	free(res_info);
	return (ret);
}
/*
* Used as a qsort parameter to help order resources in terms of their
* importance, higher importance being first.
*/
int
resource_compare_by_descending_importance(const void *arg1, const void *arg2)
{
pool_elem_t *elem1;
pool_elem_t *elem2;
pool_resource_t **res1 = (pool_resource_t **)arg1;
pool_resource_t **res2 = (pool_resource_t **)arg2;
pool_value_t val = POOL_VALUE_INITIALIZER;
int64_t i1 = 0, i2 = 0;
elem1 = TO_ELEM(*res1);
elem2 = TO_ELEM(*res2);
if (pool_get_property(TO_CONF(elem1), elem1, "_importance", &val) ==
POC_INT)
(void) pool_value_get_int64(&val, &i1);
if (pool_get_property(TO_CONF(elem2), elem2, "_importance", &val) ==
POC_INT)
(void) pool_value_get_int64(&val, &i2);
return (i1 > i2 ? -1 : (i1 < i2 ? 1 : 0));
}
/*
* Sort in increasing order so that resource sets with extra resources are at
* the end and resource sets needing resources are at the beginning.
*/
int
compute_size_to_transfer(const void *arg1, const void *arg2)
{
res_info_t *r1 = (res_info_t *)arg1, *r2 = (res_info_t *)arg2;
r1->ri_transfer = (int64_t)r1->ri_oldsize - (int64_t)r1->ri_newsize;
r2->ri_transfer = (int64_t)r2->ri_oldsize - (int64_t)r2->ri_newsize;
return (r1->ri_transfer > r2->ri_transfer ? 1 :
(r1->ri_transfer < r2->ri_transfer ? -1 : 0));
}
/*
 * set_importance_cb() is the per-pool callback used by
 * add_importance_props(). It reads the pool's "pool.importance" and
 * writes it as "_importance" on every resource associated with the
 * pool, unless the resource already carries a larger "_importance".
 *
 * Fails with POE_INVALID_CONF if "pool.importance" is not an integer.
 * Returns PO_SUCCESS/PO_FAIL
 */
/*ARGSUSED*/
static int
set_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused)
{
	pool_value_t val = POOL_VALUE_INITIALIZER;
	pool_resource_t **res;
	pool_resource_t **cur;
	int64_t importance;
	uint_t nelem;

	if (pool_get_property(conf, TO_ELEM(pool), "pool.importance", &val) !=
	    POC_INT) {
		pool_seterror(POE_INVALID_CONF);
		return (PO_FAIL);
	}
	(void) pool_value_get_int64(&val, &importance);

	res = pool_query_pool_resources(conf, pool, &nelem, NULL);
	if (res == NULL)
		return (PO_FAIL);

	/* The loop stops at the first NULL entry of the array. */
	for (cur = res; *cur != NULL; cur++) {
		pool_resource_t *r = *cur;
		pool_elem_t *elem = TO_ELEM(r);
		int64_t old_importance = INT64_MIN;

		if (pool_get_property(conf, elem, "_importance", &val) ==
		    POC_INT)
			(void) pool_value_get_int64(&val, &old_importance);

		/* Keep whichever importance is larger. */
		if (old_importance > importance)
			continue;
		(void) pool_value_set_int64(&val, importance);
		(void) pool_put_property(conf, elem, "_importance", &val);
	}

	free(res);
	return (PO_SUCCESS);
}
/*
 * unset_importance_cb() is the per-pool callback used by
 * remove_importance_props(). It removes the "_importance" property
 * from each resource associated with the pool, stopping at the first
 * removal that fails.
 *
 * Returns PO_SUCCESS/PO_FAIL
 */
/*ARGSUSED*/
static int
unset_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused)
{
	pool_resource_t **res;
	pool_resource_t **cur;
	uint_t nelem;
	int rv = PO_SUCCESS;

	res = pool_query_pool_resources(conf, pool, &nelem, NULL);
	if (res == NULL)
		return (PO_FAIL);

	/* The loop stops at the first NULL entry of the array. */
	for (cur = res; *cur != NULL; cur++) {
		pool_resource_t *r = *cur;

		if (pool_rm_property(conf, TO_ELEM(r), "_importance") ==
		    PO_FAIL) {
			rv = PO_FAIL;
			break;
		}
	}

	free(res);
	return (rv);
}
/*
 * add_importance_props() walks every pool in conf with
 * set_importance_cb(), which creates the "_importance" props on the
 * resources associated with each pool.
 *
 * Returns the result of the walk (PO_SUCCESS/PO_FAIL).
 */
static int
add_importance_props(pool_conf_t *conf)
{
	int walk_status;

	walk_status = pool_walk_pools(conf, NULL, set_importance_cb);
	return (walk_status);
}
/*
 * remove_importance_props() walks every pool in conf with
 * unset_importance_cb(), which removes the "_importance" props from
 * the resources associated with each pool.
 *
 * Returns the result of the walk (PO_SUCCESS/PO_FAIL).
 */
static int
remove_importance_props(pool_conf_t *conf)
{
	int walk_status;

	walk_status = pool_walk_pools(conf, NULL, unset_importance_cb);
	return (walk_status);
}
/*
 * pool_conf_commit_sys() commits the configuration conf to the system.
 * The dynamic configuration is opened read/write, conf is optionally
 * validated (when validate is PO_TRUE), diff_and_fix() reconciles the
 * two configurations and the dynamic configuration's provider is asked
 * to commit the result. The dynamic configuration is closed and freed
 * on every path once it has been opened.
 *
 * A diff_and_fix() failure is reported as POE_INVALID_CONF.
 *
 * Returns PO_SUCCESS/PO_FAIL
 */
int
pool_conf_commit_sys(pool_conf_t *conf, int validate)
{
	pool_conf_t *dyn;
	int status = PO_FAIL;
	int reconcile_failed = 0;

	if ((dyn = pool_conf_alloc()) == NULL)
		return (PO_FAIL);

	if (pool_conf_open(dyn, pool_dynamic_location(), PO_RDWR) !=
	    PO_SUCCESS) {
		/* Never opened, so there is nothing to close. */
		pool_conf_free(dyn);
		return (PO_FAIL);
	}

	if (validate == PO_TRUE &&
	    pool_conf_validate(conf, POV_RUNTIME) != PO_SUCCESS)
		goto done;

	/*
	 * Now try to make the two things "the same".
	 */
	if (diff_and_fix(conf, dyn) != PO_SUCCESS) {
		reconcile_failed = 1;
		goto done;
	}

	if (dyn->pc_prov->pc_commit(dyn) == PO_SUCCESS)
		status = PO_SUCCESS;

done:
	(void) pool_conf_close(dyn);
	pool_conf_free(dyn);
	/* Set last so that the close above cannot override it. */
	if (reconcile_failed)
		pool_seterror(POE_INVALID_CONF);
	return (status);
}
/*
 * clone_element() is a pool_walk_properties() callback that copies the
 * property (name, pv) of element pe onto the target element passed in
 * user. Properties that the provider reports as read-only are skipped.
 * A property whose name contains ".temporary" is not copied as a
 * value; the target is marked temporary with pool_set_temporary()
 * instead.
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
/* ARGSUSED */
static int
clone_element(pool_conf_t *conf, pool_elem_t *pe, const char *name,
    pool_value_t *pv, void *user)
{
	pool_elem_t *tgt = (pool_elem_t *)user;
	const pool_prop_t *info;
	int rv;

#ifdef DEBUG
	dprintf("Cloning %s from %s\n",
	    pool_conf_location(TO_CONF(TO_ELEM(tgt))),
	    pool_conf_location(TO_CONF(pe)));
	assert(TO_CONF(TO_ELEM(tgt)) != TO_CONF(pe));
	dprintf("clone_element: Processing %s\n", name);
	pool_value_dprintf(pv);
#endif	/* DEBUG */

	/* Read-only properties are never copied. */
	info = provider_get_prop(pe, name);
	if (info != NULL && prop_is_readonly(info) == PO_TRUE)
		return (PO_SUCCESS);

	/* The temporary property needs special handling */
	if (strstr(name, ".temporary") != NULL)
		rv = pool_set_temporary(TO_CONF(tgt), tgt);
	else
		rv = pool_put_property(TO_CONF(tgt), tgt, name, pv);

	return (rv == PO_FAIL ? PO_FAIL : PO_SUCCESS);
}
/*
 * clean_element() is a pool_walk_properties() callback that removes the
 * property called name from element pe. Two kinds of property are left
 * in place: any property whose name contains ".temporary", and any
 * property known to the provider that is not optional.
 *
 * pv and user are unused.
 *
 * Returns PO_SUCCESS if the property was kept or removed, PO_FAIL if
 * the removal failed.
 */
/* ARGSUSED3 */
static int
clean_element(pool_conf_t *conf, pool_elem_t *pe, const char *name,
    pool_value_t *pv, void *user)
{
	const pool_prop_t *prop;

	/*
	 * Some properties should be ignored
	 */
	if (strstr(name, ".temporary") != NULL ||
	    ((prop = provider_get_prop(pe, name)) != NULL &&
	    prop_is_optional(prop) == PO_FALSE))
		return (PO_SUCCESS);

	/*
	 * Report failure as PO_FAIL, like the other callbacks in this
	 * file, rather than as the value of a boolean comparison.
	 */
	if (pool_rm_property(conf, pe, name) == PO_FAIL)
		return (PO_FAIL);
	return (PO_SUCCESS);
}