meta_db_balance.c revision 2791f8b95893f7d64b6f89703e7af240aa84a33f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Database location balancing code.
*/
#include <meta.h>
#include <sdssc.h>
/*
* Starting replicas-per-controller value tried by the balancing code in
* this file; one of its search loops counts down from this value to 1.
*/
#define MD_MINBALREP 2
/*
* Stuff for DB balancing.
*/
/*
 * Operation codes used by the DB balancing code.
 * NOTE(review): presumably the pending per-drive operation (none, add,
 * delete); the code that stores these values is not visible here - confirm.
 */
typedef enum md_ctlr_ops_t {
	DRV_NOP = 0,	/* no operation */
	DRV_ADD = 1,	/* add */
	DRV_DEL = 2	/* delete */
} md_ctlr_ops_t;
/*
* drive flag fields (bits of md_ctlr_drv_t.drv_flags)
*
* DRV_F_ERROR is set on a drive after a replica operation on it fails;
* drives carrying it are skipped by the add/delete loops.
* DRV_F_INDISKSET is tested on failure paths: when it is clear the failure
* is returned to the caller, when it is set the error is cleared and the
* loop moves on.
*/
#define DRV_F_ERROR 0x1
#define DRV_F_INDISKSET 0x2
/*
 * Per-drive bookkeeping node; nodes are chained through drv_next.
 *
 * NOTE(review): code elsewhere in this file dereferences drv_dnp and
 * drv_dbsize through this type, but neither member is declared in the
 * definition as it appears here - confirm the definition is complete.
 */
typedef struct md_ctlr_drv_t {
	/* Bitmask of DRV_F_* flags. */
	int			drv_flags;
	/* Number of replicas counted on this drive. */
	int			drv_dbcnt;
	/*
	 * Scratch copy of drv_dbcnt, adjusted while counting the replicas
	 * that would be added or deleted.
	 */
	int			drv_new_dbcnt;
	/* Next node in the chain. */
	struct md_ctlr_drv_t	*drv_next;
} md_ctlr_drv_t;
/*
 * Per-controller bookkeeping node; nodes are chained through ctl_next.
 */
typedef struct md_ctlr_ctl_t {
	/* Number of replicas counted on this controller's drives. */
	int			ctl_dbcnt;
	/* Number of "usable" drives; zero means the controller is skipped. */
	int			ctl_drcnt;
	/* Next node in the chain. */
	struct md_ctlr_ctl_t	*ctl_next;
} md_ctlr_ctl_t;
/*
* NOTE(review): the function name, most of the parameter list and the
* statements guarding each return are absent from this text. Visible
* contract only: takes an int dbcnt among its arguments, returns -1 on
* each visible failure path and 0 at the end.
*/
static int
int dbcnt,
)
{
return (-1);
return (-1);
return (-1);
}
return (-1);
}
return (0);
}
/*
* NOTE(review): the function name, parameter list and the statements
* guarding each return are absent from this text. Visible contract only:
* returns -1 on each visible failure path and 0 at the end.
*/
static int
)
{
return (-1);
return (-1);
return (-1);
}
return (0);
}
/*
* NOTE(review): the function name, parameter list and the loop/conditions
* are absent from this text. Visible contract only: returns 1 from inside
* what appears to be a loop body and 0 when it falls through.
*/
static int
{
continue;
return (1);
}
return (0);
}
/*
* NOTE(review): the function name and parts of the parameter list and body
* are absent from this text; this summary covers only what is visible.
*
* Works on a caller-supplied list head: tries to locate the controller,
* then the drive on that controller. A drive whose slice information
* cannot be obtained is left out (error cleared, 0 returned). The visible
* failure paths return -1, the other visible exits return 0. The final
* comment describes adding a missing controller and recursing, but that
* code is not visible here.
*/
static int
int dbcnt,
int indiskset,
int with_bus,
int errored,
)
{
md_ctlr_drv_t **dpp;
char *cmp_name_1, *cmp_name_2;
int not_found;
/*
* The user must pass in a list head.
*/
/*
* A failure to get the slice information can occur
* because the drive has failed, if this is the
* case then there is nothing that can be done
* with this drive, so do not include it in the
* list of drives. Clear the error and return.
*/
mdclrerror(ep);
return (0);
}
return (-1);
return (-1);
errored = 1;
errored = 1;
} else
/*
* Try to locate ctlr.
*/
&cmp_name_2);
not_found = 1;
} else
not_found = 0;
if (not_found)
continue;
/*
* Found ctlr, try to locate the drive.
*/
(void) sdssc_convert_cluster_path(
&cmp_name_2);
if (not_found)
continue;
/*
* Found drive, must be deleting.
*/
if (indiskset)
if (errored) {
mdclrerror(ep);
}
return (0);
}
/*
* The ctlr was found, but not the drive, so add
* the drive
*/
if (indiskset) {
if (errored) {
mdclrerror(ep);
}
} else {
if (errored) {
return (-1);
}
}
return (0);
}
/*
* No ctlr was located, so add the ctlr, then recurse to add the
* drive to the ctlr.
*/
}
/*
* NOTE(review): the function name, part of the parameter list and several
* statements are absent from this text; this summary covers only what is
* visible.
*
* Adds one replica to a drive on controller c.
*
* Visible return values:
*	 1 - a replica was added (drv_dbcnt and ctl_dbcnt incremented)
*	 0 - c has no usable drives, or maxdb grew past minimum_replicas
*	     without a drive being chosen
*	-1 - a visible failure path, including a failure on a drive that
*	     is not flagged DRV_F_INDISKSET
*/
static int
md_ctlr_ctl_t *c,
int minimum_replicas,
)
{
md_ctlr_drv_t *d;
int maxdb = 0;
/*
* If this ctrl has no "usable" drives, assert() or just return if
* assert()'s are turned off.
*/
if (c->ctl_drcnt == 0) {
assert(0);
return (0);
}
/*
* Determine the largest DB count on a drive.
*/
/*
* Make sure we start at a reasonable number
*/
if (maxdb == 0)
maxdb = 1;
/*
* Add a replica to a drive on this ctrl.
*/
/*CONSTCOND*/
while (1) {
/*
* If this drive is being deleted, skip it.
*/
continue;
if (d->drv_flags & DRV_F_ERROR)
continue;
/*
* Make sure that the replicas are distributed across
* the drives.
*/
continue;
/*
* See if the drive already has replicas,
* if it does, then delete the existing
* replica(s) and re-add n+1 replicas to the drive.
*/
/* ==== Vulnerability - no DB's start ==== */
if (d->drv_dbcnt > 0) {
d->drv_flags |= DRV_F_ERROR;
if (! (d->drv_flags & DRV_F_INDISKSET))
return (-1);
mdclrerror(ep);
continue;
}
}
if (d->drv_dbcnt) {
/*
* We have to bring the replica
* in the drive to the previous
* status by adding the original no
* of replicas to the drive since
* the addition of (drv_dbcnt+1) no
* of replicas has failed. If we
* leave it at this state, we might
* end up having no replicas at
* all for the diskset.
*/
d->drv_dbcnt, d->drv_dbsize,
&nep) == -1) {
d->drv_dbcnt = 0;
mdclrerror(&nep);
}
}
return (-1);
continue;
d->drv_flags |= DRV_F_ERROR;
if (! (d->drv_flags & DRV_F_INDISKSET))
return (-1);
mdclrerror(ep);
continue;
}
d->drv_dbcnt++;
c->ctl_dbcnt++;
/* ==== Vulnerability - no DB's end ==== */
return (1);
}
maxdb++;
if (maxdb > minimum_replicas)
return (0);
}
/*NOTREACHED*/
}
/*
* NOTE(review): the function name, part of the parameter list and several
* statements are absent from this text; this summary covers only what is
* visible.
*
* Deletes one replica from a drive on controller c.
*
* Visible return values:
*	 1 - a replica was removed (drv_dbcnt and ctl_dbcnt decremented)
*	 0 - c has no usable drives, no drive holds a replica (maxdb == 0),
*	     or maxdb dropped to zero without a drive being chosen
*	-1 - a visible failure path, including a failure on a drive that
*	     is not flagged DRV_F_INDISKSET
*/
static int
md_ctlr_ctl_t *c,
)
{
md_ctlr_drv_t *d;
int maxdb = 0;
/*
* If this ctrl has no "usable" drives, assert() or just return if
* assert()'s are turned off.
*/
if (c->ctl_drcnt == 0) {
assert(0);
return (0);
}
/*
* Determine the largest DB count on a drive.
*/
if (maxdb == 0)
return (0);
/*
* Delete a replica from a drive on this ctrl.
*/
/*CONSTCOND*/
while (1) {
/*
* If this drive is being deleted, skip it.
*/
continue;
/*
* Make sure that there are replicas on this drive to
* delete.
*/
if (d->drv_dbcnt == 0)
continue;
if (d->drv_flags & DRV_F_ERROR)
continue;
/*
* We need to keep the DB's distributed across the
* drives.
*/
continue;
/*
* Delete all the replicas on the drive.
*/
/* ==== Vulnerability - no DB's start ==== */
d->drv_flags |= DRV_F_ERROR;
if (! (d->drv_flags & DRV_F_INDISKSET))
return (-1);
mdclrerror(ep);
continue;
}
d->drv_dbcnt--;
c->ctl_dbcnt--;
/*
* If there is still a dbcnt for this drive, then add
* back the needed DB's.
*/
if (d->drv_dbcnt > 0) {
d->drv_dbcnt = 0;
if (mdismddberror(ep,
return (-1);
d->drv_flags |= DRV_F_ERROR;
if (! (d->drv_flags & DRV_F_INDISKSET))
return (-1);
mdclrerror(ep);
continue;
}
}
/* ==== Vulnerability - no DB's end ==== */
return (1);
}
maxdb--;
if (maxdb <= 0)
return (0);
}
/*NOTREACHED*/
}
/*
* NOTE(review): the function name, parameter list, loops and parts of the
* conditions are absent from this text; this summary covers only what is
* visible.
*
* Deletes the replicas on selected drives; drives with drv_dbcnt == 0 are
* skipped. If the delete fails on a drive flagged DRV_F_INDISKSET the
* error is cleared and processing continues; otherwise -1 is returned.
* Returns 0 when it runs to completion.
*/
static int
{
md_ctlr_ctl_t *c;
md_ctlr_drv_t *d;
if (! (d->drv_flags & DRV_F_ERROR) &&
continue;
if (d->drv_dbcnt == 0)
continue;
if (meta_replicaslice(d->drv_dnp,
return (-1);
return (-1);
/*
* Delete the replicas listed.
*/
ep) == -1) {
if (d->drv_flags & DRV_F_INDISKSET) {
mdclrerror(ep);
continue;
}
return (-1);
}
}
}
return (0);
}
/*
* NOTE(review): the function name, parameter list and loops are absent
* from this text. Visibly it releases drive nodes (d) and controller nodes
* (c) with Free(); presumably it walks the whole controller list - confirm.
*/
static void
{
Free(d);
}
Free(c);
}
}
/*
* NOTE(review): the function name, most of the parameter list and many
* statements are absent from this text; this summary covers only what is
* visible.
*
* Per the comments below, adds the drives currently in the set and then
* the drives being operated on to the ctlr list. min_dbsize is a function
* static that is only computed while it is still 0, so a non-zero value
* is reused by later calls. Returns 0 on success and -1 on the visible
* failure paths.
*/
static int
int with_bus,
)
{
md_drive_desc *d;
static daddr_t min_dbsize = 0;
if (min_dbsize == 0) {
if (! metaislocalset(sp)) {
return (-1);
if (MD_MNSET_DESC(sd))
}
mdclrerror(ep);
} else
min_dbsize = nblks;
}
return (-1);
mdclrerror(ep);
}
/*
* Add drives currently in the set to the ctlr list.
*/
if (this_dbsize == 0)
return (-1);
}
/*
* Add the drives that are being operated on to the ctlr list.
*/
return (-1);
return (0);
}
/*
* NOTE(review): the function name, the loops over drives and several
* conditions are absent from this text; this summary covers only what is
* visible.
*
* Dry-run counterpart of the add/delete routines: instead of touching
* replicas it adjusts *db_cnt and the chosen drive's drv_new_dbcnt.
*
*	c			controller to count on
*	adding			non-zero to count an add, zero to un-count
*				a delete
*	db_cnt			running total, incremented or decremented
*	minimum_replicas	upper bound for maxdb when adding
*
* Visible return values:
*	 0 - c has no usable drives, nothing to delete (maxdb == 0 and
*	     not adding), or one replica was counted/un-counted
*	-1 - maxdb went past minimum_replicas (adding) or dropped to zero
*	     (deleting) without a drive being chosen
*/
static int
md_ctlr_ctl_t *c,
int adding,
int *db_cnt,
int minimum_replicas
)
{
md_ctlr_drv_t *d;
int maxdb = 0;
/*
* If this ctrl has no "usable" drives, nothing to do.
*/
if (c->ctl_drcnt == 0)
return (0);
/*
* Determine the largest DB count on a drive.
*/
maxdb = d->drv_new_dbcnt;
/*
* Make sure we start at a reasonable number
*/
if (maxdb == 0) {
if (!adding)
return (0);
maxdb = 1;
}
/*
* Count or Un-Count replicas that would be
* added or deleted respectively.
*/
/*CONSTCOND*/
while (1) {
/*
* If this drive is being deleted, skip it.
*/
continue;
/*
* If the drive is errored and adding, skip it.
*/
continue;
/*
* Make sure that the replicas are distributed across
* the drives.
*/
if (adding) {
if (d->drv_new_dbcnt >= maxdb)
continue;
} else {
if (d->drv_new_dbcnt == 0)
continue;
if (d->drv_new_dbcnt < maxdb)
continue;
}
/*
* Count or Un-Count replicas here.
*/
if (adding) {
if (meta_replicaslice(d->drv_dnp,
mdclrerror(&mde);
continue;
}
if (! partp)
continue;
continue;
(*db_cnt)++;
d->drv_new_dbcnt++;
} else {
(*db_cnt)--;
d->drv_new_dbcnt--;
}
return (0);
}
/*
* This should make sure they get spread
* around. This is to emulate the {add,del}_replica
* routines.
*/
if (adding) {
maxdb++;
if (maxdb > minimum_replicas)
return (-1);
} else {
maxdb--;
if (maxdb <= 0)
return (-1);
}
}
/*NOTREACHED*/
}
/*
* NOTE(review): the function name, part of the parameter list and the
* loops are absent from this text; this summary covers only what is
* visible.
*
* Counts usable controllers (ctl_drcnt > 0) and seeds each drive's
* drv_new_dbcnt from drv_dbcnt. With more than two usable controllers it
* counts the replicas that would be added and un-counts those that would
* be deleted; otherwise it only counts additions. Returns total_cnt, or -1
* on the visible failure paths.
*/
static int
int min_reps
)
{
md_ctlr_ctl_t *c;
md_ctlr_drv_t *d;
int db_cnt;
int uctlrs = 0;
int total_cnt = 0;
/*
* Count the number of controllers,
* counting the replicas is slightly different based
* on the controller count.
*/
if (c->ctl_drcnt > 0) {
uctlrs++;
d->drv_new_dbcnt = d->drv_dbcnt;
}
if (uctlrs > 2) {
if (c->ctl_drcnt == 0)
continue;
/*
* Count the replicas that would be added.
*/
if (count_replica_on_ctl(c, TRUE,
return (-1);
/*
* Un-Count the replicas that would be deleted.
*/
if (count_replica_on_ctl(c, FALSE,
return (-1);
}
} else {
if (c->ctl_drcnt == 0)
continue;
/*
* Count the replicas that would be added.
*/
if (count_replica_on_ctl(c, TRUE,
return (-1);
}
}
return (total_cnt);
}
/*
* NOTE(review): the function name, most of the parameter list, several
* local declarations and many statements are absent from this text; this
* summary covers only what is visible.
*
* Chooses a layout strategy and reports the per-controller replica target
* through *minimum_replicas. Per the comments below it builds the ctlr
* list with or without SSA-100 busses as separate controllers, depending
* on how many usable controllers there are and how the bus controllers are
* spread over the real ones. Returns 0 once *minimum_replicas is set
* (falling back to 1 at the end) and -1 on the visible failure paths.
*/
static int
int *minimum_replicas,
)
{
int n;
int rctlrs = 0;
int uctlrs;
int ructlrs;
int octlrs;
int save_done;
char save_cname[16];
char *cmp_name_1, *cmp_name_2;
int reps;
md_ctlr_ctl_t *c;
/*
* Build a ctlr list with SSA-100 busses NOT as separate controllers.
*/
return (-1);
/*
* Determine what controllers are usable in the sense of being able to
* add a replica to a drive on the controller.
* Also find the minimum number of drives on a controller.
*/
if (c->ctl_drcnt > 0) {
rctlrs++;
if (prevcnt == 0)
issame = 0;
}
}
goto cont;
/*
* If here: Handling 3 or more controllers most
* likely with non-symmetrical number of
* disks. The number of replicas will be
* the minimum number of disks on a controller.
*
* The main point is to ensure that a
* controller does not have more than half
* of the replicas.
*/
/*
* Can we find fewer than the maximum replicas by reducing the
* number of replicas per drive.
*/
for (n = drvcnt; n > 0; n--) {
*minimum_replicas = n;
return (0);
}
}
cont:
/*
* Build a ctlr list with SSA-100 busses as separate controllers.
*
* If Here: Try to put 2 replicas per controller/bus
* If that doesn't work put 1 replica per controller/bus
*/
return (-1);
/*
* If the number of "real" controllers is 2, special handling may be
* needed.
*/
if (rctlrs != 2) {
goto other;
}
/*
* Determine what controllers are usable in the sense of being able to
* add a replica to a drive on the controller.
* Also find the minimum number of drives on a controller.
*/
drvcnt = ~0U;
uctlrs = 0;
if (c->ctl_drcnt > 0) {
uctlrs++;
}
}
/*
* If the number of controllers is not changed, continue with original
* strategy.
*/
goto other;
}
/*
* Check the distribution of bus ctlrs across real controllers.
*/
ructlrs = 0;
octlrs = 0;
save_done = 0;
if (c->ctl_drcnt == 0)
continue;
if (! save_done) {
save_done = 1;
}
&cmp_name_1);
octlrs++;
else
ructlrs++;
}
/*
* Take the largest of the counts
*/
/*
* If the distribution of bus controllers is half of the total, then
* this layout strategy will work, do it.
*/
goto other;
}
/*
* If here, there is a distribution of bus controllers that will cause
* the real controller distribution to be unbalanced, so a different
* strategy is used.
*/
/*
* Build the ctlr list with SSA-100 busses NOT as separate controllers.
*/
return (-1);
/*
* Make ctl_drcnt limit the number of replicas
*/
/*
* Try at least MD_MINBALREP's per controller after changing ctl_drcnt
*/
/*
* Can we find fewer than the maximum replicas by reducing the number
* of replicas per drive.
*/
for (n = drvcnt; n > 0; n--) {
*minimum_replicas = n;
return (0);
}
}
/*
* Build a ctlr list with SSA-100 busses NOT as separate controllers.
*
* If Here: Try to put 2 replicas per controller (not on busses)
* If that doesn't work put 1 replica per controller
*/
return (-1);
/*
* Can we find fewer than the maximum replicas by reducing the
* number of replicas per drive.
*/
for (n = MD_MINBALREP; n > 0; n--) {
*minimum_replicas = n;
return (0);
}
}
/*
* Return a ctrl list that does not include the SSA-100 buses as
* separate controllers. This will create fewer separate controllers.
*/
*minimum_replicas = 1;
return (0);
}
/*
* NOTE(review): the function name, part of the parameter list, the loops
* and the calls that assign err are absent from this text; this summary
* covers only what is visible.
*
* For each controller with usable drives, loops while
* (ctl_dbcnt - multiple_reps) is below min_reps, stopping early when err
* is 0. Returns -1 on the visible failure paths and 0 otherwise.
*/
static int
int min_reps,
)
{
md_ctlr_ctl_t *c;
int err;
int multiple_reps = 0;
md_ctlr_drv_t *d;
if (c->ctl_drcnt == 0)
continue;
/*
* check for multiple databases on a disk and compensate
*/
if (d->drv_dbcnt)
}
/*
* remove the number of multiple databases count from the
* total db count. This enables us to rebalance if one of
* the disks has a large enough slice for 2 metadb's. If we
* then add a disk with a smaller slice into the set, we want
* that disk to get a replica on it. If we just compare to
* ctl_dbcnt, it won't.
*/
while ((c->ctl_dbcnt - multiple_reps) <
min_reps) {
return (-1);
if (err == 0)
break;
}
return (-1);
if (err == 0)
break;
}
}
return (0);
}
/*
* NOTE(review): the function name, part of the parameter list, the loops,
* the right-hand side of the while condition and the calls that assign err
* are absent from this text; this summary covers only what is visible.
*
* For each controller with usable drives, loops on a comparison involving
* (ctl_dbcnt - multiple_reps), stopping early when err is 0. Returns -1 on
* the visible failure paths and 0 otherwise.
*/
static int
int min_reps,
)
{
md_ctlr_ctl_t *c;
int err;
int multiple_reps = 0;
md_ctlr_drv_t *d;
if (c->ctl_drcnt == 0)
continue;
/*
* check for multiple databases on a disk and compensate
*/
if (d->drv_dbcnt)
}
/*
* remove the number of multiple databases count from the
* total db count. This enables us to rebalance if one of
* the disks has a large enough slice for 2 metadb's. If we
* then add a disk with a smaller slice into the set, we want
* that disk to get a replica on it. If we just compare to
* ctl_dbcnt, it won't.
*/
while ((c->ctl_dbcnt - multiple_reps) <
return (-1);
if (err == 0)
break;
}
return (-1);
if (err == 0)
break;
}
}
return (0);
}
/*
* NOTE(review): the function name, parameter list, loops and most of the
* conditions are absent from this text; this summary covers only what is
* visible.
*
* Retry predicate: returns FALSE when a flag test compares equal to
* DRV_F_ERROR alone, and TRUE otherwise. Presumably that test matches an
* errored drive that is not in the diskset - confirm.
*/
static int
)
{
md_ctlr_ctl_t *c;
md_ctlr_drv_t *d;
return (TRUE);
/*
* retry if all the errored drives are already in the diskset.
*/
== DRV_F_ERROR)
return (FALSE);
}
}
return (TRUE);
}
/*
* NOTE(review): the function name, parameter list, some local declarations
* and most of the calls are absent from this text; this summary covers
* only what is visible.
*
* The only non-static function in this file. Counts the usable
* controllers, adds replicas to achieve a balance (a different path when
* there are more than two usable controllers), deletes the replicas on
* drives so marked, optionally retries, then frees the ctlr list with
* free_ctlr_lst(). Returns -1 on the visible early failure, otherwise
* rval.
*/
int
)
{
int min_reps;
int uctlrs = 0;
int retry = 0;
int rval = 0;
return (-1);
/*
* Determine what controllers are usable in the sense of being able to
* add a replica to a drive on the controller.
*/
if (c->ctl_drcnt > 0)
uctlrs++;
/*
* Add replicas to achieve a balance.
*/
if (uctlrs > 2)
else
if (rval) {
mdclrerror(ep);
rval = 0;
}
}
/*
* Delete all the replicas from drives that are so marked.
*/
if (! rval)
if (retry) {
if (uctlrs > 2)
else
mdclrerror(ep);
rval = 0;
}
}
/*
* Free up the ctlr list.
*/
free_ctlr_lst(&cl);
return (rval);
}