/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <sys/lvm/md_mirror.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/mdmn_commd.h>
extern int md_status;
extern kmutex_t md_status_mx;
extern kmutex_t md_mx;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern major_t md_major;
extern md_ops_t mirror_md_ops;
extern kmem_cache_t *mirror_child_cache; /* mirror child memory pool */
extern mdq_anchor_t md_mto_daemon;
extern daemon_request_t mirror_timeout;
extern md_resync_t md_cpr_resync;
extern clock_t md_hz;
extern int md_mtioctl_cnt;
extern kmem_cache_t *mirror_parent_cache;
#ifdef DEBUG
extern int mirror_debug_flag;
#endif
/*
 * Tunable resync thread timeout. This is used as the time interval for
 * updating the resync progress in the mddb. This allows restartable resyncs
 * to be continued across a system reboot.
* Default is to update the resync progress every 5 minutes.
*/
int md_mirror_resync_update_intvl = MD_DEF_MIRROR_RESYNC_INTVL;
/*
 * Settable mirror resync buffer size. Specified in 512-byte
 * blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
*/
int md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
/*
* Tunables for dirty region processing when
* closing down a mirror.
*
* Dirty region processing during close of a
* mirror is basically monitoring the state
* of the resync region bitmaps and the number
* of outstanding i/o's per submirror to
* determine that there are no more dirty
* regions left over.
*
 * The approach taken is a retry loop of up
 * to md_mirror_rr_cleans iterations to
 * monitor the progress.
*
* There are two methods of polling the progress
* on dirty bitmap processing: busy-waits and
* non-busy-waits.
*
* Busy-waits are used at the beginning to
 * determine the final state as quickly as
* possible; md_mirror_rr_polls defines the
* number of busy-waits.
*
 * If the busy-waits are exhausted with
 * dirty regions still left over, the retry
 * logic switches over to non-busy-waits,
 * giving relief to an obviously heavily
 * loaded system. The sleep interval is
 * defined by the tunable
 * md_mirror_rr_sleep_timo in seconds.
*
* The number of non-busy-waits is given by:
* md_mirror_rr_cleans - md_mirror_rr_polls.
*
* The values were found by testing on a
* 'typical' system and may require tuning
 * to meet specific customers' requirements.
*/
int md_mirror_rr_cleans = 13;
int md_mirror_rr_polls = 3;
int md_mirror_rr_sleep_timo = 1;
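/*
 * Illustrative budget with the defaults above: mirror_process_unit_resync()
 * below makes md_mirror_rr_polls (3) busy-wait passes over
 * process_resync_regions(), then up to md_mirror_rr_cleans -
 * md_mirror_rr_polls (13 - 3 = 10) further passes with a
 * md_mirror_rr_sleep_timo (1) second delay between them, before giving up
 * with "Could not clean resync regions".
 */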
/*
* The value is not #defined because it will be computed
* in the future.
*/
int md_max_xfer_bufsz = 2048;
/*
* mirror_generate_rr_bitmap:
 * -------------------------
 * Generate a compressed bitmap (md_mn_msg_rr_clean_t) of the to-be-cleared
 * regions of the dirty bitmap associated with mirror 'un'.
*
* Input:
* un - mirror unit to get bitmap data from
* *msgp - location to return newly allocated md_mn_msg_rr_clean_t
 * *activep - location to return # of active i/os
*
* Returns:
* 1 => dirty bits cleared from un_dirty_bm and DRL flush required
* *msgp contains bitmap of to-be-cleared bits
* 0 => no bits cleared
* *msgp == NULL
*/
static int
mirror_generate_rr_bitmap(mm_unit_t *un, md_mn_msg_rr_clean_t **msgp,
int *activep)
{
unsigned int i, next_bit, data_bytes, start_bit;
int cleared_dirty = 0;
/* Skip any initial 0s. */
retry_dirty_scan:
if ((start_bit = un->un_rr_clean_start_bit) >= un->un_rrd_num)
un->un_rr_clean_start_bit = start_bit = 0;
/*
* Handle case where NO bits are set in PERNODE_DIRTY but the
* un_dirty_bm[] map does have entries set (after a 1st resync)
*/
for (; start_bit < un->un_rrd_num &&
!IS_PERNODE_DIRTY(md_mn_mynode_id, start_bit, un) &&
(un->un_pernode_dirty_sum[start_bit] != (uchar_t)0); start_bit++)
;
if (start_bit >= un->un_rrd_num) {
if (un->un_rr_clean_start_bit == 0) {
return (0);
} else {
un->un_rr_clean_start_bit = 0;
goto retry_dirty_scan;
}
}
/* how much to fit into this message */
data_bytes = MIN(howmany(un->un_rrd_num - start_bit, NBBY),
MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES);
(*msgp) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes),
KM_SLEEP);
(*msgp)->rr_nodeid = md_mn_mynode_id;
(*msgp)->rr_mnum = MD_SID(un);
MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp, start_bit, data_bytes);
next_bit = MIN(start_bit + data_bytes * NBBY, un->un_rrd_num);
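	/*
	 * Example (illustrative numbers): for a mirror with 10000 resync
	 * regions and start_bit == 0, howmany(10000, NBBY) = 1250 bytes are
	 * needed; if MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES is smaller, only the
	 * first data_bytes * NBBY regions are scanned on this call and the
	 * remainder is picked up on a later call via un_rr_clean_start_bit
	 * (updated at the bottom of this function).
	 */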
for (i = start_bit; i < next_bit; i++) {
if (un->c.un_status & MD_UN_KEEP_DIRTY && IS_KEEPDIRTY(i, un)) {
continue;
}
if (!IS_REGION_DIRTY(i, un)) {
continue;
}
if (un->un_outstanding_writes[i] != 0) {
(*activep)++;
continue;
}
/*
* Handle the case where a resync has completed and we still
* have the un_dirty_bm[] entries marked as dirty (these are
* the most recent DRL re-read from the replica). They need
* to be cleared from our un_dirty_bm[] but they will not have
* corresponding un_pernode_dirty[] entries set unless (and
* until) further write()s have been issued to the area.
* This handles the case where only the un_dirty_bm[] entry is
* set. Without this we'd not clear this region until a local
* write is issued to the affected area.
*/
if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un) ||
(un->un_pernode_dirty_sum[i] == (uchar_t)0)) {
if (!IS_GOING_CLEAN(i, un)) {
SET_GOING_CLEAN(i, un);
(*activep)++;
continue;
}
/*
* Now we've got a flagged pernode_dirty, _or_ a clean
* bitmap entry to process. Update the bitmap to flush
* the REGION_DIRTY / GOING_CLEAN bits when we send the
* cross-cluster message.
*/
cleared_dirty++;
setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp), i - start_bit);
} else {
/*
 * Not marked as dirty in our pernode bitmap, so skip
 * any update to this region. We simply bump the
 * active count by the outstanding
 * un_pernode_dirty_sum[] entries for it, which stops
 * us from leaving the mirror permanently dirty.
*/
(*activep) += (int)un->un_pernode_dirty_sum[i];
}
}
if (!cleared_dirty) {
kmem_free(*msgp, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes));
*msgp = NULL;
}
un->un_rr_clean_start_bit = next_bit;
return (cleared_dirty);
}
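/*
 * Usage note: both callers (process_resync_regions_owner() and
 * process_resync_regions_non_owner() below) invoke this with un_resync_mx
 * and this node's entry of un_pernode_dirty_mx held; on a return of 1 the
 * caller owns *msgp and must kmem_free() it once the bitmap has been
 * transmitted or committed.
 */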
/*
* There are three paths into here:
*
* md_daemon -> check_resync_regions -> prr
* mirror_internal_close -> mirror_process_unit_resync -> prr
* mirror_set_capability -> mirror_process_unit_resync -> prr
*
* The first one is a kernel daemon, the other two result from system calls.
* Thus, only the first case needs to deal with kernel CPR activity. This
* is indicated by the cprinfop being non-NULL for kernel daemon calls, and
* NULL for system call paths.
*/
static int
process_resync_regions_non_owner(mm_unit_t *un, callb_cpr_t *cprinfop)
{
int i, start, end;
int cleared_dirty = 0;
/* Number of reasons why we cannot proceed with shutting down the mirror. */
int active = 0;
set_t setno = MD_UN2SET(un);
md_mn_msg_rr_clean_t *rmsg;
md_mn_kresult_t *kres;
int rval;
minor_t mnum = MD_SID(un);
mdi_unit_t *ui = MDI_UNIT(mnum);
md_mn_nodeid_t owner_node;
/*
* We drop the readerlock here to assist lock ordering with
* update_resync. Once we have the un_rrp_inflight_mx, we
* can re-acquire it.
*/
md_unit_readerexit(ui);
/*
* Resync region processing must be single threaded. We can't use
* un_resync_mx for this purpose since this mutex gets released
* when blocking on un_resync_cv.
*/
mutex_enter(&un->un_rrp_inflight_mx);
(void) md_unit_readerlock(ui);
mutex_enter(&un->un_resync_mx);
rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], RW_READER);
cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
if (cleared_dirty) {
owner_node = un->un_mirror_owner;
mutex_exit(&un->un_resync_mx);
/*
 * Transmit the 'to-be-cleared' bitmap to the mirror owner
 * (MD_MSGF_DIRECTED). Receipt of the message will cause the
 * owner to update the on-disk DRL.
*/
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
/* release readerlock before sending message */
md_unit_readerexit(ui);
if (cprinfop) {
mutex_enter(&un->un_prr_cpr_mx);
CALLB_CPR_SAFE_BEGIN(cprinfop);
}
rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_CLEAN,
MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_KSEND_NORETRY|
MD_MSGF_DIRECTED, un->un_mirror_owner,
(char *)rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg), kres);
if (cprinfop) {
CALLB_CPR_SAFE_END(cprinfop, &un->un_prr_cpr_mx);
mutex_exit(&un->un_prr_cpr_mx);
}
/* reacquire readerlock after message */
(void) md_unit_readerlock(ui);
if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
(kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
/* if commd is gone, no point in printing a message */
if (md_mn_is_commd_present())
mdmn_ksend_show_error(rval, kres, "RR_CLEAN");
kmem_free(kres, sizeof (md_mn_kresult_t));
kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
mutex_exit(&un->un_rrp_inflight_mx);
return (active);
}
kmem_free(kres, sizeof (md_mn_kresult_t));
/*
* If ownership changed while we were sending, we probably
* sent the message to the wrong node. Leave fixing that for
* the next cycle.
*/
if (un->un_mirror_owner != owner_node) {
mutex_exit(&un->un_rrp_inflight_mx);
return (active);
}
/*
 * Now that we've sent the message, clear the transmitted
 * bits from the pernode_dirty arrays. The bits are ONLY
 * cleared on a successful send; a failed send leaves them
 * untouched.
*/
cleared_dirty = 0;
start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
mutex_enter(&un->un_resync_mx);
rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
RW_READER);
for (i = start; i < end; i++) {
if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
i - start)) {
if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un)) {
un->un_pernode_dirty_sum[i]--;
CLR_PERNODE_DIRTY(md_mn_mynode_id, i,
un);
}
if (IS_REGION_DIRTY(i, un)) {
cleared_dirty++;
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
}
}
}
rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
}
mutex_exit(&un->un_resync_mx);
mutex_exit(&un->un_rrp_inflight_mx);
return (active);
}
static int
process_resync_regions_owner(mm_unit_t *un)
{
int i, start, end;
int cleared_dirty = 0;
/* Number of reasons why we cannot proceed with shutting down the mirror. */
int active = 0;
set_t setno = MD_UN2SET(un);
int mnset = MD_MNSET_SETNO(setno);
md_mn_msg_rr_clean_t *rmsg;
minor_t mnum = MD_SID(un);
mdi_unit_t *ui = MDI_UNIT(mnum);
/*
* We drop the readerlock here to assist lock ordering with
* update_resync. Once we have the un_rrp_inflight_mx, we
* can re-acquire it.
*/
md_unit_readerexit(ui);
/*
* Resync region processing must be single threaded. We can't use
* un_resync_mx for this purpose since this mutex gets released
* when blocking on un_resync_cv.
*/
mutex_enter(&un->un_rrp_inflight_mx);
(void) md_unit_readerlock(ui);
mutex_enter(&un->un_resync_mx);
un->un_waiting_to_clear++;
while (un->un_resync_flg & MM_RF_STALL_CLEAN)
cv_wait(&un->un_resync_cv, &un->un_resync_mx);
un->un_waiting_to_clear--;
if (mnset) {
rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
RW_READER);
cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
if (cleared_dirty) {
/*
* Clear the bits from the pernode_dirty arrays.
* If that results in any being cleared from the
* un_dirty_bm, commit it.
*/
cleared_dirty = 0;
start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
for (i = start; i < end; i++) {
if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
i - start)) {
if (IS_PERNODE_DIRTY(md_mn_mynode_id, i,
un)) {
un->un_pernode_dirty_sum[i]--;
CLR_PERNODE_DIRTY(
md_mn_mynode_id, i, un);
}
if (un->un_pernode_dirty_sum[i] == 0) {
cleared_dirty++;
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
}
}
}
kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
}
rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
} else {
for (i = 0; i < un->un_rrd_num; i++) {
if (un->c.un_status & MD_UN_KEEP_DIRTY)
if (IS_KEEPDIRTY(i, un))
continue;
if (!IS_REGION_DIRTY(i, un))
continue;
if (un->un_outstanding_writes[i] != 0) {
active++;
continue;
}
if (!IS_GOING_CLEAN(i, un)) {
SET_GOING_CLEAN(i, un);
active++;
continue;
}
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
cleared_dirty++;
}
}
if (cleared_dirty) {
un->un_resync_flg |= MM_RF_GATECLOSED;
mutex_exit(&un->un_resync_mx);
mddb_commitrec_wrapper(un->un_rr_dirty_recid);
mutex_enter(&un->un_resync_mx);
un->un_resync_flg &= ~MM_RF_GATECLOSED;
if (un->un_waiting_to_mark != 0 ||
un->un_waiting_to_clear != 0) {
active++;
cv_broadcast(&un->un_resync_cv);
}
}
mutex_exit(&un->un_resync_mx);
mutex_exit(&un->un_rrp_inflight_mx);
return (active);
}
static int
process_resync_regions(mm_unit_t *un, callb_cpr_t *cprinfop)
{
int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
/*
* For a mirror we can only update the on-disk resync-record if we
* currently own the mirror. If we are called and there is no owner we
* bail out before scanning the outstanding_writes[] array.
* NOTE: we only need to check here (before scanning the array) as we
* are called with the readerlock held. This means that a change
* of ownership away from us will block until this resync check
* has completed.
*/
if (mnset && (MD_MN_NO_MIRROR_OWNER(un) ||
(!MD_MN_MIRROR_OWNER(un) && !md_mn_is_commd_present_lite()))) {
return (0);
} else if (mnset && !MD_MN_MIRROR_OWNER(un)) {
return (process_resync_regions_non_owner(un, cprinfop));
} else {
return (process_resync_regions_owner(un));
}
}
/*
 * Function callable from other modules that provides the
 * ability to clean up the dirty region bitmap on demand. Used
 * on last close of a unit to avoid massive device resyncs
 * when coming back after rolling large amounts of data to
 * a mirror (e.g. at umount with logging).
*/
void
mirror_process_unit_resync(mm_unit_t *un)
{
int cleans = 0;
while (process_resync_regions(un, NULL)) {
cleans++;
if (cleans >= md_mirror_rr_cleans) {
cmn_err(CE_NOTE,
"Could not clean resync regions\n");
break;
}
if (cleans > md_mirror_rr_polls) {
/*
 * We did not finish within md_mirror_rr_polls
 * iterations. Give the system some relief and
 * switch over to non-busy-waits.
*/
delay(md_mirror_rr_sleep_timo * md_hz);
}
}
}
static void
check_resync_regions(daemon_request_t *timeout)
{
mdi_unit_t *ui;
mm_unit_t *un;
md_link_t *next;
callb_cpr_t cprinfo;
rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
if (md_get_setstatus(next->ln_setno) & MD_SET_STALE)
continue;
un = MD_UNIT(next->ln_id);
/*
* Register this resync thread with the CPR mechanism. This
* allows us to detect when the system is suspended and so
* keep track of the RPC failure condition.
*/
CALLB_CPR_INIT(&cprinfo, &un->un_prr_cpr_mx, callb_md_mrs_cpr,
"check_resync_regions");
ui = MDI_UNIT(next->ln_id);
(void) md_unit_readerlock(ui);
/*
* Do not clean up resync regions if it is an ABR
* mirror, or if a submirror is offline (we will use the resync
* region to resync when back online) or if there is only one
* submirror.
*/
if ((ui->ui_tstate & MD_ABR_CAP) ||
(un->c.un_status & MD_UN_OFFLINE_SM) || (un->un_nsm < 2)) {
md_unit_readerexit(ui);
/* Remove this thread from the CPR callback table. */
mutex_enter(&un->un_prr_cpr_mx);
CALLB_CPR_EXIT(&cprinfo);
continue;
}
(void) process_resync_regions(un, &cprinfo);
md_unit_readerexit(ui);
/* Remove this thread from the CPR callback table. */
mutex_enter(&un->un_prr_cpr_mx);
CALLB_CPR_EXIT(&cprinfo);
}
rw_exit(&mirror_md_ops.md_link_rw.lock);
/* We are done */
mutex_enter(&mirror_timeout.dr_mx);
timeout->dr_pending = 0;
mutex_exit(&mirror_timeout.dr_mx);
}
static void
md_mirror_timeout(void *throwaway)
{
mutex_enter(&mirror_timeout.dr_mx);
if (!mirror_timeout.dr_pending) {
mirror_timeout.dr_pending = 1;
daemon_request(&md_mto_daemon, check_resync_regions,
(daemon_queue_t *)&mirror_timeout, REQ_OLD);
}
if (mirror_md_ops.md_head != NULL)
mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
throwaway, (int)MD_MDELAY*hz);
else
mirror_timeout.dr_timeout_id = 0;
mutex_exit(&mirror_timeout.dr_mx);
}
void
resync_start_timeout(set_t setno)
{
if (md_get_setstatus(setno) & MD_SET_STALE)
return;
mutex_enter(&mirror_timeout.dr_mx);
if (mirror_timeout.dr_timeout_id == 0)
mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
(void *)NULL, (int)MD_MDELAY*hz);
mutex_exit(&mirror_timeout.dr_mx);
}
static void
offlined_to_attached(mm_unit_t *un)
{
int i;
int changed = 0;
if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
return;
for (i = 0; i < NMIRROR; i++) {
if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
mirror_set_sm_state(&un->un_sm[i],
&un->un_smic[i], SMS_ATTACHED, 1);
changed++;
}
if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC)) {
mirror_set_sm_state(&un->un_sm[i],
&un->un_smic[i], SMS_ATTACHED_RESYNC, 1);
changed++;
}
}
if (changed != 0) {
un->c.un_status &= ~MD_UN_OFFLINE_SM;
mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
}
}
static void
get_unit_resync(mm_unit_t *un)
{
mddb_recstatus_t status;
struct optim_resync *orp;
if (un->un_rr_dirty_recid == 0) {
offlined_to_attached(un);
return;
}
status = mddb_getrecstatus(un->un_rr_dirty_recid);
if ((status == MDDB_NORECORD) || (status == MDDB_NODATA)) {
un->un_rr_dirty_recid = 0;
offlined_to_attached(un);
return;
}
mddb_setrecprivate(un->un_rr_dirty_recid, MD_PRV_GOTIT);
orp = (struct optim_resync *)mddb_getrecaddr(un->un_rr_dirty_recid);
un->un_dirty_bm = orp->or_rr;
}
static int
create_unit_resync(mm_unit_t *un, int snarfing)
{
diskaddr_t tb;
int i;
int blksize; /* rr size in blocks */
int num_rr;
mddb_recid_t recid;
size_t size; /* bitmap size */
optim_resync_t *orp;
mddb_type_t typ1;
set_t setno;
tb = un->c.un_total_blocks;
if (((tb + MD_MIN_RR_SIZE)/ MD_MIN_RR_SIZE) > MD_DEF_NUM_RR) {
blksize = (int)(tb / MD_DEF_NUM_RR);
num_rr = (int)((tb + (blksize)) / (blksize));
} else {
blksize = MD_MIN_RR_SIZE;
num_rr = (int)((tb + MD_MIN_RR_SIZE) / MD_MIN_RR_SIZE);
}
size = howmany(num_rr, NBBY) + sizeof (*orp) - sizeof (orp->or_rr);
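	/*
	 * Sizing sketch (symbolic, so as not to assume the constants'
	 * values): a large device gets roughly MD_DEF_NUM_RR regions of
	 * tb/MD_DEF_NUM_RR blocks each, a small one gets regions of
	 * MD_MIN_RR_SIZE blocks. The record then carries one bit per
	 * region, i.e. howmany(num_rr, NBBY) bytes, plus the optim_resync
	 * header net of the or_rr placeholder already counted above.
	 */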
setno = MD_UN2SET(un);
typ1 = (mddb_type_t)md_getshared_key(setno,
mirror_md_ops.md_driver.md_drivername);
recid = mddb_createrec(size, typ1, RESYNC_REC,
MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
if (recid < 0) {
if (snarfing && !(md_get_setstatus(setno) & MD_SET_STALE)) {
md_set_setstatus(setno, MD_SET_STALE);
cmn_err(CE_WARN, "md: state database is stale");
}
return (-1);
}
un->un_rr_dirty_recid = recid;
orp = (optim_resync_t *)mddb_getrecaddr(recid);
orp->or_magic = OR_MAGIC;
orp->or_blksize = blksize;
orp->or_num = num_rr;
un->un_rrd_blksize = blksize;
un->un_rrd_num = num_rr;
un->un_dirty_bm = orp->or_rr;
if (snarfing)
for (i = 0; i < howmany(num_rr, NBBY); i++)
orp->or_rr[i] = 0xFF;
if (!snarfing) {
mddb_commitrec_wrapper(recid);
mirror_commit(un, NO_SUBMIRRORS, 0);
return (0);
}
mddb_setrecprivate(recid, MD_PRV_PENDCOM);
mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
return (0);
}
int
unit_setup_resync(mm_unit_t *un, int snarfing)
{
int err;
int syncable;
int i;
mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
int nonABR = 1; /* cleared if ABR is marked in ui_tstate */
un->un_dirty_bm = NULL;
un->un_rs_buffer = NULL;
mutex_init(&un->un_rrp_inflight_mx, "rrp mx", MUTEX_DEFAULT, NULL);
mutex_init(&un->un_resync_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&un->un_resync_cv, NULL, CV_DEFAULT, NULL);
un->un_resync_flg = 0;
un->un_waiting_to_mark = 0;
un->un_waiting_to_commit = 0;
un->un_waiting_to_clear = 0;
un->un_goingclean_bm = NULL;
un->un_goingdirty_bm = NULL;
un->un_outstanding_writes = NULL;
un->un_resync_bm = NULL;
if (snarfing)
get_unit_resync(un);
if (un->un_rr_dirty_recid == 0) {
/*
 * If this is a MN diskset, we are snarfing, and this node is
 * not the master, do not delete any records on snarf of the
 * mirror records (create_unit_resync deletes records).
*
* Master node should have already handled this case.
*/
if (MD_MNSET_SETNO(MD_UN2SET(un)) && snarfing &&
md_set[MD_UN2SET(un)].s_am_i_master == 0) {
#ifdef DEBUG
cmn_err(CE_NOTE, "unit_setup_resync: no rr for %s on"
" nodeid %d\n", md_shortname(MD_SID(un)),
md_set[MD_UN2SET(un)].s_nodeid);
#endif
return (-1);
}
if ((err = create_unit_resync(un, snarfing)) != 0)
return (err);
}
un->un_goingclean_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
un->un_rrd_num, NBBY)), KM_SLEEP);
un->un_goingdirty_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
un->un_rrd_num, NBBY)), KM_SLEEP);
un->un_outstanding_writes = (short *)kmem_zalloc(
(uint_t)un->un_rrd_num * sizeof (short), KM_SLEEP);
un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
un->un_rrd_num, NBBY)), KM_SLEEP);
/*
* Allocate pernode bitmap for this node. All other nodes' maps will
* be created 'on-the-fly' in the ioctl message handler
*/
if (MD_MNSET_SETNO(MD_UN2SET(un))) {
un->un_pernode_dirty_sum =
(uchar_t *)kmem_zalloc(un->un_rrd_num, KM_SLEEP);
if (md_mn_mynode_id > 0) {
un->un_pernode_dirty_bm[md_mn_mynode_id-1] = (uchar_t *)
kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)),
KM_SLEEP);
}
/*
* Allocate taskq to process deferred (due to locking) RR_CLEAN
* requests.
*/
un->un_drl_task = (ddi_taskq_t *)md_create_taskq(MD_UN2SET(un),
MD_SID(un));
}
if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
return (0);
/*
 * Only mark a mirror which has an associated DRL as requiring a resync.
 * For ABR mirrors we need not set up the resync record bitmap.
*/
if (ui && (ui->ui_tstate & MD_ABR_CAP))
nonABR = 0;
for (i = 0, syncable = 0; i < NMIRROR; i++) {
if (nonABR) {
if ((SUBMIRROR_IS_READABLE(un, i) ||
SMS_BY_INDEX_IS(un, i,
(SMS_OFFLINE | SMS_OFFLINE_RESYNC))))
syncable++;
}
}
if (snarfing && un->un_pass_num && (syncable > 1)) {
bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm,
howmany(un->un_rrd_num, NBBY));
un->c.un_status |= (MD_UN_OPT_NOT_DONE | MD_UN_WAR);
un->c.un_status &= ~MD_UN_OFFLINE_SM;
for (i = 0; i < NMIRROR; i++) {
if ((SUBMIRROR_IS_READABLE(un, i)) ||
SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC))
un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
mirror_set_sm_state(&un->un_sm[i],
&un->un_smic[i], SMS_OFFLINE_RESYNC, 1);
mddb_setrecprivate(un->c.un_record_id,
MD_PRV_PENDCOM);
}
}
}
return (0);
}
/*
* resync_kill_pending:
* -------------------
* Determine if the resync thread has been requested to terminate.
* Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
* MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
* MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
*
* Returns:
* 0 Kill not pending
* 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
*
 * Note: this routine may block:
* the writerlock for <ui> will be dropped and reacquired if <mx_type>
* is set to MD_WRITER_HELD.
* the readerlock for <ui> will be dropped and reacquired if <mx_type>
* is set to MD_READER_HELD.
*/
static int
resync_kill_pending(
mm_unit_t *un,
mdi_unit_t *ui,
uint_t mx_type)
{
int retval = 0;
/* Ensure that we don't block with any mutex held */
if (mx_type == MD_WRITER_HELD) {
md_unit_writerexit(ui);
} else if (mx_type == MD_READER_HELD) {
md_unit_readerexit(ui);
}
mutex_enter(&un->un_rs_thread_mx);
while (un->un_rs_thread_flags & (MD_RI_BLOCK|MD_RI_BLOCK_OWNER)) {
cv_wait(&un->un_rs_thread_cv, &un->un_rs_thread_mx);
if (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN))
break;
}
/* Determine if we've been asked to abort or shutdown gracefully */
if (un->un_rs_thread_flags & MD_RI_KILL) {
un->c.un_status |= MD_UN_RESYNC_CANCEL;
retval = 1;
} else if (un->un_rs_thread_flags & MD_RI_SHUTDOWN) {
retval = 1;
}
mutex_exit(&un->un_rs_thread_mx);
/* Reacquire mutex if dropped on entry */
if (mx_type == MD_WRITER_HELD) {
(void) md_unit_writerlock(ui);
} else if (mx_type == MD_READER_HELD) {
(void) md_unit_readerlock(ui);
}
return (retval);
}
/*
* resync_read_buffer:
* ------------------
* Issue the resync source read for the specified start block and size.
* This will cause the mirror strategy routine to issue a write-after-read
* once this request completes successfully.
* If 'flag_err' is set we expect to see a write error flagged in the b_error
* field of the buffer created for this i/o request. If clear we do not expect
* to see the error flagged for write failures.
* Read failures will always set the B_ERROR bit which will stop the resync
* immediately.
*/
static int
resync_read_buffer(mm_unit_t *un, diskaddr_t blk, size_t cnt, int flag_err)
{
md_mcs_t *sp;
buf_t *bp;
int ret = 0;
sp = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
mirror_child_init(sp);
bp = &sp->cs_buf;
bp->b_edev = makedevice(md_major, MD_SID(un));
bp->b_flags = B_READ;
bp->b_lblkno = blk;
bp->b_bcount = dbtob(cnt);
bp->b_un.b_addr = un->un_rs_buffer;
md_unit_readerexit(MDI_UNIT(MD_SID(un)));
(void) md_mirror_strategy(bp, MD_STR_NOTTOP | MD_STR_MAPPED |
MD_STR_WAR | (flag_err ? MD_STR_FLAG_ERR : 0), NULL);
(void) biowait(bp);
(void) md_unit_readerlock(MDI_UNIT(MD_SID(un)));
if (bp->b_flags & B_ERROR) {
ret = 1;
}
kmem_cache_free(mirror_child_cache, sp);
return (ret);
}
/*
* send_mn_resync_done_message
*
 * At the end of a resync, send a message to all nodes to indicate that
 * the resync is complete. The argument, flags, takes the following
 * values:
 *
 * RESYNC_ERR - an error occurred that terminated the resync
 * CLEAR_OPT_NOT_DONE - just need to clear the OPT_NOT_DONE flag
 *
 * unit writerlock set on entry
 * The message is suppressed if the thread is marked as shutting down
 * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed
 * [un->c.un_status & MD_UN_RESYNC_CANCEL], unless an error terminated
 * the resync [flags & RESYNC_ERR], in which case it is always sent.
*
*/
static void
send_mn_resync_done_message(
mm_unit_t *un,
int flags
)
{
md_mn_msg_resync_t *rmsg = un->un_rs_msg;
set_t setno;
mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
md_mn_kresult_t *kres;
int dont_send = 0;
int rval;
int nretries = 0;
/*
 * Only send the message if this resync thread is still active. This
 * handles the case where an ownership change to a different node
 * during a resync could otherwise cause multiple spurious resync_done
 * messages when the resync completes. This happens because only one
 * node is the resync owner but other nodes will have their
 * resync_unit thread blocked in 'resync_kill_pending'.
*/
mutex_enter(&un->un_rs_thread_mx);
dont_send = (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN)) ? 1
: 0;
mutex_exit(&un->un_rs_thread_mx);
dont_send |= (un->c.un_status & MD_UN_RESYNC_CANCEL) ? 1 : 0;
/*
* Always send a message if we've encountered an error that terminated
* the resync.
*/
if (flags & RESYNC_ERR)
dont_send = 0;
if (dont_send) {
#ifdef DEBUG
if (mirror_debug_flag) {
printf("Don't send resync done message, mnum = %x,"
" type = %x, flags = %d\n", MD_SID(un),
un->un_rs_type, flags);
}
#endif /* DEBUG */
return;
}
#ifdef DEBUG
if (mirror_debug_flag) {
printf("send resync done message, mnum = %x, type = %x\n",
MD_SID(un), un->un_rs_type);
}
#endif
rmsg->msg_resync_mnum = MD_SID(un);
rmsg->msg_resync_type = un->un_rs_type;
rmsg->msg_originator = md_mn_mynode_id;
rmsg->msg_resync_flags = 0;
if (flags & RESYNC_ERR)
rmsg->msg_resync_flags |= MD_MN_RS_ERR;
if (flags & CLEAR_OPT_NOT_DONE)
rmsg->msg_resync_flags |= MD_MN_RS_CLEAR_OPT_NOT_DONE;
setno = MD_MIN2SET(MD_SID(un));
md_unit_writerexit(ui);
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
smrd_msg:
mutex_enter(&un->un_rs_cpr_mx);
CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_PHASE_DONE,
MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
mutex_exit(&un->un_rs_cpr_mx);
/* if the node hasn't yet joined, it's OK. */
if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
(kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
mdmn_ksend_show_error(rval, kres, "RESYNC_PHASE_DONE");
/* If we're shutting down already, pause things here. */
if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
while (!md_mn_is_commd_present()) {
delay(md_hz);
}
/*
* commd is now available again. Retry the message once.
* If this fails we panic as the system is in an
* unexpected state.
*/
if (nretries++ == 0)
goto smrd_msg;
}
cmn_err(CE_PANIC, "ksend_message failure: RESYNC_PHASE_DONE");
}
kmem_free(kres, sizeof (md_mn_kresult_t));
(void) md_unit_writerlock(ui);
}
/*
* send_mn_resync_next_message
*
 * Send a message to all nodes indicating the next region to be resynced.
* The message contains the region to be resynced and the current position in
* the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
* On entry the unit readerlock is held.
*/
static void
send_mn_resync_next_message(
mm_unit_t *un,
diskaddr_t currentblk,
size_t rsize,
int flags
)
{
md_mn_msg_resync_t *rmsg = un->un_rs_msg;
set_t setno;
md_mn_kresult_t *kres;
mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
int rval;
md_mps_t *ps;
mm_submirror_t *sm;
int smi;
int nretries = 0;
ASSERT(rmsg != NULL);
#ifdef DEBUG
if (mirror_debug_flag) {
printf("send resync next message, mnum = %x, start=%lld, "
"size=%ld, type=%x, done=%lld, 2_do=%lld\n",
MD_SID(un), currentblk, rsize, un->un_rs_type,
un->un_rs_resync_done, un->un_rs_resync_2_do);
}
#endif
rmsg->msg_resync_mnum = MD_SID(un);
rmsg->msg_resync_type = un->un_rs_type;
rmsg->msg_resync_start = currentblk;
rmsg->msg_resync_rsize = rsize;
rmsg->msg_resync_done = un->un_rs_resync_done;
rmsg->msg_resync_2_do = un->un_rs_resync_2_do;
rmsg->msg_originator = md_mn_mynode_id;
if (flags & MD_FIRST_RESYNC_NEXT)
rmsg->msg_resync_flags = MD_MN_RS_FIRST_RESYNC_NEXT;
/*
 * Copy the current submirror state and flags into the message. This
 * provides a means of keeping all nodes that are currently active in
 * the cluster synchronised with regard to their submirror state
 * settings. If we did not pass this information here, the only time
 * every node would get its submirror state updated would be at the
 * end of a resync phase, which can be a significant amount of time
 * for large metadevices.
*/
for (smi = 0; smi < NMIRROR; smi++) {
sm = &un->un_sm[smi];
rmsg->msg_sm_state[smi] = sm->sm_state;
rmsg->msg_sm_flags[smi] = sm->sm_flags;
}
setno = MD_MIN2SET(MD_SID(un));
md_unit_readerexit(ui);
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
smrn_msg:
mutex_enter(&un->un_rs_cpr_mx);
CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG,
0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
mutex_exit(&un->un_rs_cpr_mx);
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
mdmn_ksend_show_error(rval, kres, "RESYNC_NEXT");
/* If we're shutting down already, pause things here. */
if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
while (!md_mn_is_commd_present()) {
delay(md_hz);
}
/*
* commd is now available again. Retry the message once.
* If this fails we panic as the system is in an
* unexpected state.
*/
if (nretries++ == 0)
goto smrn_msg;
}
cmn_err(CE_PANIC, "ksend_message failure: RESYNC_NEXT");
}
kmem_free(kres, sizeof (md_mn_kresult_t));
(void) md_unit_readerlock(ui);
ps = un->un_rs_prev_overlap;
/* Allocate previous overlap reference if needed */
if (ps == NULL) {
ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
ps->ps_un = un;
ps->ps_ui = ui;
ps->ps_firstblk = 0;
ps->ps_lastblk = 0;
ps->ps_flags = 0;
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
un->un_rs_prev_overlap = ps;
md_unit_writerexit(ui);
(void) md_unit_readerlock(ui);
}
ps->ps_firstblk = currentblk;
ps->ps_lastblk = currentblk + rsize - 1;
}
static int
resync_read_blk_range(
mm_unit_t *un,
diskaddr_t currentblk,
diskaddr_t stopbefore,
uint_t type,
int flags
)
{
size_t copysize; /* limited by max xfer buf size */
size_t rsize; /* size of resync block (for MN) */
set_t setno;
diskaddr_t newstop;
diskaddr_t rs_startblk;
uint_t rs_type;
int flags1 = flags & MD_FIRST_RESYNC_NEXT;
rs_type = un->un_rs_type;
rs_startblk = currentblk;
if (stopbefore > un->c.un_total_blocks)
stopbefore = un->c.un_total_blocks;
if (currentblk < un->un_resync_startbl)
currentblk = un->un_resync_startbl;
copysize = un->un_rs_copysize;
rsize = MD_DEF_RESYNC_BLK_SZ;
setno = MD_MIN2SET(MD_SID(un));
while (currentblk < stopbefore) {
/*
 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and,
 * if this is a MN device and MD_SEND_MESS_XMIT is set, send a
 * RESYNC_NEXT message to all nodes.
*/
if ((currentblk + MD_DEF_RESYNC_BLK_SZ) > stopbefore)
rsize = stopbefore - currentblk;
if (MD_MNSET_SETNO(setno) && (flags & MD_SEND_MESS_XMIT)) {
un->un_resync_startbl = currentblk;
rs_startblk = currentblk;
send_mn_resync_next_message(un, currentblk, rsize,
flags1);
if (flags1)
flags1 = 0;
/* check to see if we've been asked to terminate */
if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
? 1:0);
/*
* Check to see if another node has completed this
* block, if so either the type or the resync region
* will have changed. If the resync type has changed,
* just exit.
* If the resync region has changed, reset currentblk
* to the start of the current resync region and
* continue.
*/
if (un->un_rs_type != rs_type)
return (0);
if (un->un_rs_prev_overlap->ps_firstblk >
rs_startblk) {
currentblk =
un->un_rs_prev_overlap->ps_firstblk;
continue;
}
}
newstop = currentblk + rsize;
while (currentblk < newstop) {
if ((currentblk + copysize) > stopbefore)
copysize = (size_t)(stopbefore - currentblk);
if (resync_read_buffer(un, currentblk, copysize,
(flags & MD_RESYNC_FLAG_ERR)))
return (1);
/* resync_read_buffer releases/grabs a new lock */
un = (mm_unit_t *)MD_UNIT(MD_SID(un));
currentblk += copysize;
/* check to see if we've been asked to terminate */
if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
? 1:0);
if (MD_MNSET_SETNO(setno)) {
/*
* Check to see if another node has completed
* this block, see above
*/
if (un->un_rs_type != rs_type)
return (0);
if (un->un_rs_prev_overlap->ps_firstblk >
rs_startblk)
currentblk =
un->un_rs_prev_overlap->ps_firstblk;
}
}
}
return (0);
}
static void
optimized_resync(mm_unit_t *un)
{
mdi_unit_t *ui;
minor_t mnum;
int rr, smi;
int resync_regions;
uchar_t *dirtyregions;
diskaddr_t first, stopbefore;
int err;
int cnt;
sm_state_t state;
int broke_out = 0;
set_t setno;
uint_t old_rs_type = un->un_rs_type;
uint_t old_rs_done;
uint_t flags1 = MD_FIRST_RESYNC_NEXT|MD_RESYNC_FLAG_ERR;
size_t start_rr;
mnum = MD_SID(un);
ui = MDI_UNIT(mnum);
setno = MD_UN2SET(un);
if (!(un->c.un_status & MD_UN_OPT_NOT_DONE)) {
/*
 * We aren't marked as needing a resync, so for multi-node
 * sets flag the completion to ensure all nodes see the same
* metadevice state. This is a problem when a new node joins
* an existing set as it has to perform a 'metasync -r' and
* we have to step through all of the resync phases. If we
* don't do this the nodes that were already in the set will
* have the metadevices marked as 'Okay' but the joining node
* will have 'Needs Maintenance' which is unclearable.
*/
if (MD_MNSET_SETNO(setno)) {
send_mn_resync_done_message(un, CLEAR_OPT_NOT_DONE);
}
return;
}
/*
 * No need for an optimized resync if ABR is set; clear rs_type and
 * flags and exit.
*/
if (ui->ui_tstate & MD_ABR_CAP) {
un->un_rs_type = MD_RS_NONE;
un->c.un_status &= ~(MD_UN_OPT_NOT_DONE | MD_UN_WAR);
return;
}
un->un_rs_dropped_lock = 1;
un->c.un_status |= MD_UN_WAR;
resync_regions = un->un_rrd_num;
dirtyregions = un->un_resync_bm;
md_unit_writerexit(ui);
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
un = (mm_unit_t *)md_unit_readerlock(ui);
/* check to see if we've been asked to terminate */
if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
if (un->c.un_status & MD_UN_RESYNC_CANCEL)
broke_out = RESYNC_ERR;
}
/*
* Check that we are still performing an optimized
* resync. If not, another node must have completed it
* so we have no more work to do.
*/
if (un->un_rs_type != old_rs_type) {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
return;
}
/*
* If rs_resync_done is non-zero, we must be completing an optimized
* resync that has already been partially done on another node.
* Therefore clear the bits in resync_bm for the resync regions
 * already done. If un_rs_resync_done is zero, calculate resync_2_do.
*/
if (un->un_rs_resync_done > 0) {
BLK_TO_RR(start_rr, un->un_resync_startbl, un);
for (rr = 0; rr < start_rr && rr < resync_regions; rr++)
CLR_KEEPDIRTY(rr, un);
} else {
un->un_rs_resync_2_do = 0;
for (rr = 0; rr < resync_regions; rr++)
if (isset(dirtyregions, rr))
un->un_rs_resync_2_do++;
}
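	/*
	 * Note: for an optimized resync, un_rs_resync_done and
	 * un_rs_resync_2_do are measured in resync regions, not blocks;
	 * un_rs_resync_done is advanced by one per region in the loop below
	 * (when this node does the work itself).
	 */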
for (rr = 0; (rr < resync_regions) && (broke_out != RESYNC_ERR); rr++) {
if (isset(dirtyregions, rr)) {
RR_TO_BLK(first, rr, un);
RR_TO_BLK(stopbefore, rr+1, un);
old_rs_type = un->un_rs_type;
old_rs_done = un->un_rs_resync_done;
err = resync_read_blk_range(un, first, stopbefore,
MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
flags1 = MD_RESYNC_FLAG_ERR;
/* resync_read_blk_range releases/grabs a new lock */
un = (mm_unit_t *)MD_UNIT(mnum);
if (err) {
broke_out = RESYNC_ERR;
break;
}
/*
* Check that we are still performing an optimized
* resync. If not, another node must have completed it
* so we have no more work to do.
*/
if (un->un_rs_type != old_rs_type) {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
return;
}
/*
* If resync_done has increased, we must have
* blocked in resync_read_blk_range while another node
* continued with the resync. Therefore clear resync_bm
* for the blocks that have been resynced on another
* node and update rr to the next RR to be done.
*/
if (old_rs_done < un->un_rs_resync_done) {
int i;
BLK_TO_RR(start_rr, un->un_resync_startbl - 1,
un);
for (i = rr; i < start_rr; i++)
CLR_KEEPDIRTY(i, un);
rr = start_rr;
} else
un->un_rs_resync_done++;
for (smi = 0, cnt = 0; smi < NMIRROR; smi++)
if (SUBMIRROR_IS_WRITEABLE(un, smi) &&
!(SMS_BY_INDEX_IS(un, smi, SMS_ALL_ERRED)))
cnt++;
if (cnt < 2) {
broke_out = RESYNC_ERR;
break;
}
CLR_KEEPDIRTY(rr, un);
/* Check to see if we've completed the resync cleanly */
if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
break;
/*
* Check that we haven't exceeded un_rs_resync_2_do. If
 * we have, we've completed the resync.
*/
if (un->un_rs_resync_done > un->un_rs_resync_2_do)
break;
}
}
md_unit_readerexit(ui);
un = (mm_unit_t *)md_unit_writerlock(ui);
/*
 * If this is a MN set, send a message to all nodes to indicate that
 * the resync phase is complete. The processing of the message will
 * update the mirror state.
*/
if (MD_MNSET_SETNO(setno)) {
send_mn_resync_done_message(un, broke_out);
} else {
if (!broke_out)
un->c.un_status &= ~MD_UN_WAR;
un->c.un_status &= ~MD_UN_KEEP_DIRTY;
setno = MD_UN2SET(un);
for (smi = 0; smi < NMIRROR; smi++) {
un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE_RESYNC)) {
state = (broke_out ? SMS_OFFLINE : SMS_RUNNING);
mirror_set_sm_state(&un->un_sm[smi],
&un->un_smic[smi], state, broke_out);
mirror_commit(un, NO_SUBMIRRORS, 0);
}
if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE))
un->c.un_status |= MD_UN_OFFLINE_SM;
}
}
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
if (broke_out) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
}
}
}
/*
* recalc_resync_done
*
 * This function deals with a change in the value of un_rs_resync_2_do in a
 * component resync. This may change if we are restarting a component
 * resync on a single node having rebooted with a different value of
 * md_resync_bufsz, or if we are running in a multi-node set where nodes
 * have different values of md_resync_bufsz.
 * If there is a change in un_rs_resync_2_do, we need to recalculate
 * un_rs_resync_done as a proportion of the new resync_2_do:
 *
 * if un_resync_startbl is set:
 *     (un_resync_startbl - initblock) / (blk_size + skip)
 * if it is not set, scale the old value:
 *     (un_rs_resync_done / un_rs_resync_2_do) * resync_2_do
 *
 * In addition we need to deal with the overflow case by using a scaling
 * factor to keep the intermediate products within 64 bits.
 */
static void
recalc_resync_done(mm_unit_t *un, size_t resync_2_do, diskaddr_t initblock,
u_longlong_t blk_size, u_longlong_t skip)
{
diskaddr_t x;
uint_t factor = 1;
/*
* If resync_2_do has not yet been calculated, no need to modify
* resync_done
*/
if (un->un_rs_resync_2_do == 0) {
return;
}
if (un->un_rs_resync_2_do == resync_2_do)
return; /* No change, so nothing to do */
/*
* If un_rs_startbl is set, another node must have already started
* this resync and hence we can calculate resync_done from
* resync_startbl
*/
if (un->un_resync_startbl) {
un->un_rs_resync_done = (un->un_resync_startbl - initblock) /
(blk_size + skip);
return;
}
/*
* un_resync_startbl is not set so we must calculate it from
* un_rs_resync_done.
* If the larger of the two values of resync_2_do is greater than 32
* bits, calculate a factor to divide by to ensure that we don't
* overflow 64 bits when calculating the new value for resync_done
*/
x = (un->un_rs_resync_2_do > resync_2_do) ? un->un_rs_resync_2_do :
resync_2_do;
while (x > INT32_MAX) {
x = x >> 1;
factor = factor << 1;
}
un->un_rs_resync_done = ((un->un_rs_resync_done/factor) *
(resync_2_do/factor)) /
((un->un_rs_resync_2_do + (factor * factor) - 1)/
(factor * factor));
}
static void
check_comp_4_resync(mm_unit_t *un, int smi, int ci)
{
mdi_unit_t *ui;
minor_t mnum;
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
size_t count;
u_longlong_t skip;
u_longlong_t size;
u_longlong_t blk_size;
diskaddr_t initblock;
diskaddr_t block;
diskaddr_t frag = 0;
md_m_shared_t *shared;
int err;
set_t setno;
int broke_out = 0;
int blks;
uint_t old_rs_type = un->un_rs_type;
diskaddr_t old_rs_done;
uint_t flags1 = MD_FIRST_RESYNC_NEXT;
diskaddr_t resync_2_do;
mnum = MD_SID(un);
ui = MDI_UNIT(mnum);
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
setno = MD_UN2SET(un);
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
if (shared->ms_state != CS_RESYNC) {
SET_RS_TYPE_NONE(un->un_rs_type);
return;
}
if (shared->ms_flags & MDM_S_RS_TRIED) {
SET_RS_TYPE_NONE(un->un_rs_type);
return;
}
(void) (*(smic->sm_get_bcss))
(sm->sm_dev, sm, ci, &initblock, &count, &skip, &size);
if ((count == 1) && (skip == 0)) {
count = (size_t)(size / un->un_rs_copysize);
if ((frag = (size - (count * un->un_rs_copysize))) != 0)
count++;
size = (u_longlong_t)un->un_rs_copysize;
}
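	/*
	 * Worked example (illustrative numbers): a contiguous component
	 * (count == 1, skip == 0) of size 1000 blocks with un_rs_copysize
	 * of 128 gives count = 7, frag = 1000 - 7 * 128 = 104, so count is
	 * bumped to 8 and size becomes 128; the final iteration sends the
	 * 104-block fragment (see the '(count == 1) && frag' test below).
	 */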
blk_size = size; /* Save block size for this resync */
ASSERT(count >= 1);
resync_2_do = count;
/*
* If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
* gives the proportion of the resync that has already been done.
* If un_rs_copysize has changed since this previous partial resync,
* either because this node has been rebooted with a different value
* for md_resync_bufsz or because another node with a different value
* for md_resync_bufsz performed the previous resync, we need to
* recalculate un_rs_resync_done as a proportion of our value of
* resync_2_do.
*/
recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
/*
* For MN mirrors we need to send a message to all nodes indicating
* the next region to be resynced. For a component resync, the size of
* the contiguous region that is processed by resync_read_blk_range()
 * may be as small as the interleave size.
* Therefore, rather than sending the message within
* resync_read_blk_range(), we will send a message every
* MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
* the number of blocks. Then, if we are restarting a resync, round
* un_rs_resync_done down to the previous resync region boundary. This
* ensures that we send a RESYNC_NEXT message before resyncing any
 * blocks.
*/
if (MD_MNSET_SETNO(setno)) {
blks = ((MD_DEF_RESYNC_BLK_SZ + blk_size + skip - 1)/
(blk_size + skip));
un->un_rs_resync_done = (un->un_rs_resync_done/blks) * blks;
}
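	/*
	 * Illustrative: with a per-iteration stride of (blk_size + skip)
	 * blocks, blks = ceil(MD_DEF_RESYNC_BLK_SZ / (blk_size + skip)), so
	 * a RESYNC_NEXT message goes out once every blks iterations rather
	 * than on every small contiguous region.
	 */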
/*
* un_rs_resync_done is the number of ('size' + 'skip') increments
* already resynced from the base 'block'
* un_rs_resync_2_do is the number of iterations in
* this component resync.
*/
ASSERT(count >= un->un_rs_resync_done);
un->un_rs_resync_2_do = (diskaddr_t)count;
un->c.un_status |= MD_UN_WAR;
sm->sm_flags |= MD_SM_RESYNC_TARGET;
md_unit_writerexit(ui);
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
un = (mm_unit_t *)md_unit_readerlock(ui);
/* check to see if we've been asked to terminate */
if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
if (un->c.un_status & MD_UN_RESYNC_CANCEL)
broke_out = RESYNC_ERR;
}
/*
* Check that we are still performing the same component
* resync. If not, another node must have completed it
* so we have no more work to do.
*/
if (un->un_rs_type != old_rs_type) {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
return;
}
/*
* Adjust resync_done, resync_2_do, start of resync area and count to
* skip already resync'd data. We need to recalculate resync_done as
* we have dropped the unit lock above and may have lost ownership to
* another node, with a different resync buffer size and it may have
* sent us new values of resync_done and resync_2_do based on its
* resync buffer size
*/
recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
un->un_rs_resync_2_do = resync_2_do;
count -= un->un_rs_resync_done;
block = initblock + ((blk_size + skip) * (int)un->un_rs_resync_done);
un->un_rs_dropped_lock = 1;
while ((count > 0) && (broke_out != RESYNC_ERR)) {
old_rs_done = un->un_rs_resync_done;
/*
* For MN mirrors send a message to the other nodes. This
* message includes the size of the region that must be blocked
* for all writes
*/
if (MD_MNSET_SETNO(setno)) {
if ((un->un_rs_resync_done % blks) == 0) {
un->un_resync_startbl = block;
send_mn_resync_next_message(un, block,
(blk_size+skip)*blks, flags1);
flags1 = 0;
/*
* check to see if we've been asked to
* terminate
*/
if (resync_kill_pending(un,
MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
if (un->c.un_status &
MD_UN_RESYNC_CANCEL) {
broke_out = RESYNC_ERR;
break;
}
}
/*
* Check that we are still performing the same
* component resync. If not, another node must
* have completed it so we have no more work to
* do. Also reset count to remaining resync as
 * we may have lost ownership in
* send_mn_resync_next_message while another
* node continued with the resync and
* incremented resync_done.
*/
if (un->un_rs_type != old_rs_type) {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
return;
}
/*
* recalculate resync_done, resync_2_do
* We need to recalculate resync_done as
* we have dropped the unit lock in
* send_mn_resync_next_message above and may
* have lost ownership to another node, with a
* different resync buffer size and it may have
* sent us new values of resync_done and
* resync_2_do based on its resync buffer size
*/
recalc_resync_done(un, resync_2_do, initblock,
blk_size, skip);
un->un_rs_resync_2_do = resync_2_do;
count = un->un_rs_resync_2_do -
un->un_rs_resync_done;
/*
* Adjust start of resync area to skip already
* resync'd data
*/
block = initblock + ((blk_size + skip) *
(int)un->un_rs_resync_done);
old_rs_done = un->un_rs_resync_done;
}
}
err = resync_read_blk_range(un, block, block + size,
MD_READER_HELD, MD_RESYNC_FLAG_ERR);
/* resync_read_blk_range releases/grabs a new lock */
un = (mm_unit_t *)MD_UNIT(mnum);
if (err) {
broke_out = RESYNC_ERR;
break;
}
/*
* If we are no longer resyncing this component, return as
* another node has progressed the resync.
*/
if (un->un_rs_type != old_rs_type) {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
return;
}
/*
* recalculate resync_done, resync_2_do. We need to recalculate
* resync_done as we have dropped the unit lock in
* resync_read_blk_range above and may have lost ownership to
* another node, with a different resync buffer size and it may
* have sent us new values of resync_done and resync_2_do based
* on its resync buffer size
*/
recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
un->un_rs_resync_2_do = resync_2_do;
/*
* Reset count to remaining resync as we may have blocked in
* resync_read_blk_range while another node continued
* with the resync and incremented resync_done. Also adjust
* start of resync area to skip already resync'd data.
*/
count = un->un_rs_resync_2_do - un->un_rs_resync_done;
block = initblock +((blk_size + skip) *
(int)un->un_rs_resync_done);
/*
 * If we are picking up from another node, we retry the last
 * block; otherwise we step on to the next block.
*/
if (old_rs_done == un->un_rs_resync_done) {
block += blk_size + skip;
un->un_rs_resync_done++;
count--;
}
if ((count == 1) && frag)
size = frag;
if (shared->ms_state == CS_ERRED) {
err = 1;
broke_out = RESYNC_ERR;
break;
}
/* Check to see if we've completed the resync cleanly */
if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
break;
}
md_unit_readerexit(ui);
un = (mm_unit_t *)md_unit_writerlock(ui);
/*
 * If this is a MN set, send a message to all nodes to indicate that
 * the resync phase is complete. The processing of the message will
 * update the mirror state.
*/
if (MD_MNSET_SETNO(setno)) {
send_mn_resync_done_message(un, broke_out);
} else {
un->c.un_status &= ~MD_UN_WAR;
sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
if (err)
shared->ms_flags |= MDM_S_RS_TRIED;
else
/*
* As we don't transmit the changes,
* no need to drop the lock.
*/
set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
MD_STATE_NO_XMIT, (IOLOCK *)NULL);
}
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
if (broke_out) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
SVM_TAG_METADEVICE, setno, MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
SET_RS_TYPE_NONE(un->un_rs_type);
}
}
static void
submirror_resync(mm_unit_t *un)
{
mdi_unit_t *ui;
minor_t mnum;
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
int smi;
diskaddr_t chunk;
diskaddr_t curblk;
int err;
int cnt;
set_t setno;
int broke_out = 0;
int i;
int flags1 = MD_FIRST_RESYNC_NEXT;
int compcnt;
mnum = MD_SID(un);
ui = MDI_UNIT(mnum);
setno = MD_UN2SET(un);
/*
 * If the submirror index is non-zero, we are continuing a resync,
 * so restart from the last submirror marked as being resynced.
*/
if (RS_SMI(un->un_rs_type) != 0) {
smi = RS_SMI(un->un_rs_type);
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
if (!SMS_IS(sm, SMS_ATTACHED_RESYNC)) {
for (smi = 0; smi < NMIRROR; smi++) {
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
break;
}
}
} else {
for (smi = 0; smi < NMIRROR; smi++) {
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
break;
}
}
if (smi == NMIRROR) {
SET_RS_TYPE_NONE(un->un_rs_type);
return;
}
/*
* If we've only got one component we can fail on a resync write
* if an error is encountered. This stops an unnecessary read of the
* whole mirror on a target write error.
*/
compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
if (compcnt == 1)
flags1 |= MD_RESYNC_FLAG_ERR;
un->c.un_status |= MD_UN_WAR;
sm->sm_flags |= MD_SM_RESYNC_TARGET;
SET_RS_SMI(un->un_rs_type, smi);
md_unit_writerexit(ui);
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
un = (mm_unit_t *)md_unit_readerlock(ui);
un->un_rs_dropped_lock = 1;
/* check to see if we've been asked to terminate */
if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
if (un->c.un_status & MD_UN_RESYNC_CANCEL)
broke_out = RESYNC_ERR;
}
/*
* Check that we are still performing the same submirror
* resync. If not, another node must have completed it
* so we have no more work to do.
*/
if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
return;
}
/* if > 1TB mirror, increase percent done granularity */
if (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS)
chunk = un->c.un_total_blocks / 1000;
else
chunk = un->c.un_total_blocks / 100;
if (chunk == 0)
chunk = un->c.un_total_blocks;
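	/*
	 * Illustrative: the chunk is the progress-reporting granularity,
	 * i.e. 1% of the mirror (1/100) for devices up to
	 * MD_MAX_BLKS_FOR_SMALL_DEVS and 0.1% (1/1000) above that, with
	 * un_rs_resync_done checkpointed once per chunk in the loop below.
	 */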
/*
* If a MN set, round the chunk size up to a multiple of
* MD_DEF_RESYNC_BLK_SZ
*/
if (MD_MNSET_SETNO(setno)) {
chunk = ((chunk + MD_DEF_RESYNC_BLK_SZ)/MD_DEF_RESYNC_BLK_SZ)
* MD_DEF_RESYNC_BLK_SZ;
if (chunk > un->c.un_total_blocks)
chunk = un->c.un_total_blocks;
}
/*
* Handle restartable resyncs that continue from where the previous
* resync left off. The new resync range is from un_rs_resync_done ..
* un_rs_resync_2_do
*/
curblk = 0;
if (un->un_rs_resync_done == 0) {
un->un_rs_resync_2_do = un->c.un_total_blocks;
} else {
curblk = un->un_rs_resync_done;
}
while ((curblk != un->c.un_total_blocks) && (broke_out != RESYNC_ERR)) {
diskaddr_t rs_done;
rs_done = un->un_rs_resync_done;
err = resync_read_blk_range(un, curblk, curblk + chunk,
MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
flags1 = (compcnt == 1 ? MD_RESYNC_FLAG_ERR : 0);
/* resync_read_blk_range releases/grabs a new lock */
un = (mm_unit_t *)MD_UNIT(mnum);
if (err) {
broke_out = RESYNC_ERR;
break;
}
/*
* If we are no longer executing a submirror resync, return
* as another node has completed the submirror resync.
*/
if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
md_unit_readerexit(ui);
(void) md_unit_writerlock(ui);
return;
}
/*
* If resync_done has changed, we must have blocked
* in resync_read_blk_range while another node
* continued with the resync so restart from resync_done.
*/
if (rs_done != un->un_rs_resync_done) {
curblk = un->un_rs_resync_done;
} else {
curblk += chunk;
un->un_rs_resync_done = curblk;
}
if ((curblk + chunk) > un->c.un_total_blocks)
chunk = un->c.un_total_blocks - curblk;
for (i = 0, cnt = 0; i < NMIRROR; i++)
if (SUBMIRROR_IS_WRITEABLE(un, i) &&
!SMS_BY_INDEX_IS(un, i, SMS_ALL_ERRED) &&
(un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET))
cnt++;
if (cnt == 0) {
broke_out = RESYNC_ERR;
break;
}
/* Check to see if we've completed the resync cleanly */
if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
break;
}
md_unit_readerexit(ui);
un = (mm_unit_t *)md_unit_writerlock(ui);
/*
 * If this is a MN set, send a message to all nodes to indicate that
 * the resync phase is complete. The processing of the message will
 * update the mirror state.
*/
if (MD_MNSET_SETNO(setno)) {
send_mn_resync_done_message(un, broke_out);
} else {
sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
if (err) {
mirror_set_sm_state(sm, smic, SMS_ATTACHED, 1);
} else {
mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
}
un->c.un_status &= ~MD_UN_WAR;
mirror_commit(un, SMI2BIT(smi), 0);
}
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
if (broke_out) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
SVM_TAG_METADEVICE, setno, MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
}
}
static void
component_resync(mm_unit_t *un)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
int ci;
int i;
int compcnt;
/*
* Handle the case where we are picking up a partially complete
* component resync. In this case un_rs_type contains the submirror
* and component index of where we should restart the resync.
*/
while (un->un_rs_type != MD_RS_COMPONENT) {
i = RS_SMI(un->un_rs_type);
ci = RS_CI(un->un_rs_type);
check_comp_4_resync(un, i, ci);
if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
MD_WRITER_HELD))
return;
/*
 * If we have no current resync, continue to scan submirrors and
 * components. If the resync has moved on to another component,
 * restart it; if the resync is no longer a component resync,
 * just exit.
*/
if (RS_TYPE(un->un_rs_type) == MD_RS_NONE)
break;
if (RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT)
return;
}
/* Now continue scanning _all_ submirrors and components */
for (i = 0; i < NMIRROR; i++) {
sm = &un->un_sm[i];
smic = &un->un_smic[i];
if (!SMS_IS(sm, SMS_RUNNING | SMS_LIMPING))
continue;
compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
for (ci = 0; ci < compcnt; ci++) {
SET_RS_SMI(un->un_rs_type, i);
SET_RS_CI(un->un_rs_type, ci);
SET_RS_TYPE(un->un_rs_type, MD_RS_COMPONENT);
check_comp_4_resync(un, i, ci);
/* Bail out if we've been asked to abort/shutdown */
if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
MD_WRITER_HELD))
return;
/*
* Now check if another node has continued with the
* resync. If we are no longer in a component resync,
* exit; otherwise update to the current component - 1
* so that the next call of check_comp_4_resync() will
* resync the current component.
*/
if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
(RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT))
return;
else {
if (RS_SMI(un->un_rs_type) != i) {
i = RS_SMI(un->un_rs_type);
ci = RS_CI(un->un_rs_type) - 1;
} else if (RS_CI(un->un_rs_type) != ci)
ci = RS_CI(un->un_rs_type) - 1;
}
}
}
}
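/*
* reset_comp_flags:
* ----------------
* Clear the MDM_S_RS_TRIED flag on every component of each in-use
* submirror so that a subsequent resync will consider all components
* again.
*/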
static void
reset_comp_flags(mm_unit_t *un)
{
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
md_m_shared_t *shared;
int ci;
int i;
int compcnt;
for (i = 0; i < NMIRROR; i++) {
sm = &un->un_sm[i];
smic = &un->un_smic[i];
if (!SMS_IS(sm, SMS_INUSE))
continue;
compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
for (ci = 0; ci < compcnt; ci++) {
shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
(sm->sm_dev, sm, ci);
shared->ms_flags &= ~MDM_S_RS_TRIED;
}
}
}
/*
* resync_progress_thread:
* ----------------------
* Thread started on first resync of a unit which simply blocks until woken up
* by a cv_signal, and then updates the mddb for the mirror unit record. This
* saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
* so that an aborted resync can be continued after an intervening reboot.
*/
static void
resync_progress_thread(minor_t mnum)
{
mm_unit_t *un = MD_UNIT(mnum);
mdi_unit_t *ui = MDI_UNIT(mnum);
set_t setno = MD_MIN2SET(mnum);
while (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
mutex_enter(&un->un_rs_progress_mx);
cv_wait(&un->un_rs_progress_cv, &un->un_rs_progress_mx);
mutex_exit(&un->un_rs_progress_mx);
if (un->un_rs_progress_flags & MD_RI_KILL)
break;
/*
* Commit mirror unit if we're the Master node in a multi-node
* environment
*/
if (MD_MNSET_SETNO(setno) && md_set[setno].s_am_i_master) {
(void) md_unit_readerlock(ui);
mirror_commit(un, NO_SUBMIRRORS, 0);
md_unit_readerexit(ui);
}
}
thread_exit();
}
/*
* resync_progress:
* ---------------
* Timeout handler for updating the progress of the resync thread.
* Simply wake up the resync progress daemon which will then mirror_commit() the
* unit structure to the mddb. This snapshots the current progress of the resync.
*/
static void
resync_progress(void *arg)
{
mm_unit_t *un = (mm_unit_t *)arg;
mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
uint_t active;
mutex_enter(&un->un_rs_progress_mx);
cv_signal(&un->un_rs_progress_cv);
mutex_exit(&un->un_rs_progress_mx);
/* schedule the next timeout if the resync is still marked active */
(void) md_unit_readerlock(ui);
active = un->c.un_status & MD_UN_RESYNC_ACTIVE ? 1 : 0;
md_unit_readerexit(ui);
if (active) {
un->un_rs_resync_to_id = timeout(resync_progress, un,
(clock_t)(drv_usectohz(60000000) *
md_mirror_resync_update_intvl));
}
}
/*
* resync_unit:
* -----------
* Resync thread which drives all forms of resync (optimized, component,
* submirror). Must handle thread suspension and kill to allow multi-node
* resync to run without undue ownership changes.
*
* For a MN set, the resync mechanism is as follows:
*
* When a resync is started, either via metattach, metaonline, metareplace,
* metasync or by a hotspare kicking in, a message is sent to all nodes, which
* calls mirror_resync_thread. If there is currently no mirror owner, the
* master node sends a CHOOSE_OWNER message to the handler on the master. This
* chooses a mirror owner and sends a CHANGE_OWNER message requesting the
* selected node to become the owner.
* If this node is not the owner, it sets itself to block in
* resync_kill_pending(). If there is no owner, all nodes will block until the
* chosen owner is selected, at which point the owner unblocks itself. So, on
* entry to this function only one node will continue past
* resync_kill_pending().
* Once the resync thread is started, it basically cycles through the optimized,
* component and submirror resyncs until there is no more work to do.
*
* For an ABR mirror, once a mirror owner is chosen it will complete the resync
* unless the node dies, in which case a new owner will be chosen and it will
* have to complete the resync from the point at which the previous owner died.
* To do this we broadcast a RESYNC_NEXT message before each region to be
* resynced and this message contains the address and length of the region
* being resynced and the current progress through the resync. The size of
* this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
* block size to limit the amount of inter-node traffic. The RESYNC_NEXT
* message also indicates to all other nodes that all writes to this block
* must be blocked until the next RESYNC_NEXT message is received. This ensures
* that no node can write to a block that is being resynced. For all MN
* mirrors we also block the whole resync region on the resync owner node so
* that all writes to the resync region are blocked on all nodes. There is a
* difference here between a MN set and a regular set in that for a MN set
* we protect the mirror from writes to the current resync block by blocking
* a larger region. For a regular set we just block writes to the current
* resync block.
*
* For a non-ABR mirror the same RESYNC_NEXT message is sent with an
* additional purpose. In this case, there is only one mirror owner at a time
* and rather than continually switching ownership between the chosen mirror
* owner and the node that is writing to the mirror, we move the resync to the
* mirror owner. When we switch ownership, we block the old owner and unblock
* the resync thread on the new owner. To enable the new owner to continue the
* resync, all nodes need to have the latest resync status. Then, following each
* resync write, we check to see if the resync state has changed and if it
* has this must be because we have lost ownership to another node(s) for a
* period and then have become owner again later in the resync process. If we
* are still dealing with the same resync, we just adjust addresses and counts
* and then continue. If the resync has moved on to a different type, for
* example from an optimized to a submirror resync, we move on to process the
* resync described by rs_type and continue from the position described by
* resync_done and resync_startbl.
*
* Note that for non-ABR mirrors it is possible for a write to be made on a
* non resync-owner node without a change of ownership. This is the case when
* the mirror has a soft part created on it and a write in ABR mode is made
* to that soft part. Therefore we still need to block writes to the resync
* region on all nodes.
*
* Sending the latest resync state to all nodes also enables them to continue
* a resync in the event that the mirror owner dies. If a mirror owner for
* a non-ABR mirror has died, there will be dirty resync regions. Therefore,
* regardless of whether another type of resync was in progress, we must first
* do an optimized resync to clean up the dirty regions before continuing
* with the interrupted resync.
*
* The resync status is held in the unit structure
* On disk
* un_rs_resync_done The number of contiguous resync blocks done so far
* un_rs_resync_2_do The total number of contiguous resync blocks
* un_rs_type The resync type (inc submirror and component numbers)
* In core
* un_resync_startbl The address of the current resync block being processed
*
* In the event that the whole cluster fails we need to just use
* un_rs_resync_done to restart the resync and to ensure that this is
* periodically written to disk, we have a thread which writes the record
* to disk every 5 minutes. As the granularity of un_rs_resync_done is
* usually coarse (for an optimized resync 1001 is the max value) there is
* little point in writing this more frequently.
*/
static void
resync_unit(minor_t mnum)
{
mdi_unit_t *ui;
mm_unit_t *un;
md_error_t mde = mdnullerror;
int mn_resync = 0;
int resync_finish = 0;
set_t setno = MD_MIN2SET(mnum);
uint_t old_rs_type = MD_RS_NONE;
uint_t old_rs_done = 0, old_rs_2_do = 0;
uint_t old_rs_startbl = 0;
int block_resync = 1;
char cpr_name[23]; /* Unique CPR name */
int rs_copysize;
char *rs_buffer;
int nretries = 0;
resync_restart:
#ifdef DEBUG
if (mirror_debug_flag)
printf("Resync started (mnum = %x)\n", mnum);
#endif
/*
* increment the mirror resync count
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
md_cpr_resync.md_mirror_resync++;
mutex_exit(&md_cpr_resync.md_resync_mutex);
ui = MDI_UNIT(mnum);
un = MD_UNIT(mnum);
rs_copysize = un->un_rs_copysize;
if (rs_copysize == 0) {
/*
* Don't allow buffer size to fall outside the
* range 0 < bufsize <= md_max_xfer_bufsz.
*/
if (md_resync_bufsz <= 0)
md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
rs_copysize = MIN(md_resync_bufsz, md_max_xfer_bufsz);
}
rs_buffer = kmem_zalloc(dbtob(rs_copysize), KM_SLEEP);
un = md_unit_writerlock(ui);
un->un_rs_copysize = rs_copysize;
un->un_rs_buffer = rs_buffer;
if (MD_MNSET_SETNO(setno)) {
/*
* Register this resync thread with the CPR mechanism. This
* allows us to detect when the system is suspended and so
* keep track of the RPC failure condition.
*/
(void) snprintf(cpr_name, sizeof (cpr_name),
"mirror_resync%x", mnum);
CALLB_CPR_INIT(&un->un_rs_cprinfo, &un->un_rs_cpr_mx,
callb_md_mrs_cpr, cpr_name);
if (ui->ui_tstate & MD_RESYNC_NOT_DONE) {
/*
* If this is the first resync following the initial
* snarf (MD_RESYNC_NOT_DONE still set) and we've
* been started outside a reconfig step (e.g. by being
* added to an existing set) we need to query the
* existing submirror state for this mirror.
* The set_status flags will have MD_SET_MN_MIR_STATE_RC
* set if we've been through a step4 reconfig, so only
* query the master if this isn't (yet) set. In this
* case we must continue the resync thread as there is
* not guaranteed to be a currently running resync on
* any of the other nodes. Worst case is that we will
* initiate an ownership change to this node and then
* find that there is no resync to perform. However, we
* will then have correct status across the cluster.
*/
if (!md_set[setno].s_am_i_master) {
if (!(md_get_setstatus(setno) &
MD_SET_MN_MIR_STATE_RC)) {
mirror_get_status(un, NULL);
block_resync = 0;
#ifdef DEBUG
if (mirror_debug_flag) {
mm_submirror_t *sm;
int i;
for (i = 0; i < NMIRROR; i++) {
sm = &un->un_sm[i];
printf(
"sm[%d] state=%4x"
" flags=%4x\n", i,
sm->sm_state,
sm->sm_flags);
}
}
#endif
}
}
ui->ui_tstate &= ~MD_RESYNC_NOT_DONE;
}
/*
* For MN set, if we have an owner, then start the resync on it.
* If there is no owner the master must send a message to
* choose the owner. This message will contain the current
* resync count and it will only be sent to the master, where
* the resync count will be used to choose the next node to
* perform a resync, by cycling through the nodes in the set.
* The message handler will then send a CHANGE_OWNER message to
* all nodes, and on receipt of that message, the chosen owner
* will issue a SET_OWNER ioctl to become the owner. This ioctl
* will be requested to spawn a thread to issue the
* REQUEST_OWNER message to become the owner which avoids the
* need for concurrent ioctl requests.
* After sending the message, we will block waiting for one
* of the nodes to become the owner and start the resync
*/
if (MD_MN_NO_MIRROR_OWNER(un)) {
/*
* There is no owner, block and then the master will
* choose the owner. Only perform this if 'block_resync'
* is set.
*/
if (block_resync) {
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
mutex_exit(&un->un_rs_thread_mx);
}
if (md_set[setno].s_am_i_master) {
md_unit_writerexit(ui);
(void) mirror_choose_owner(un, NULL);
(void) md_unit_writerlock(ui);
}
} else {
/* There is an owner, block if we are not it */
if (!MD_MN_MIRROR_OWNER(un)) {
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
mutex_exit(&un->un_rs_thread_mx);
}
}
}
/*
* Start a timeout chain to update the resync progress to the mddb.
* This will run every md_mirror_resync_update_intvl minutes and allows
* a resync to be continued over a reboot.
*/
ASSERT(un->un_rs_resync_to_id == 0);
un->un_rs_resync_to_id = timeout(resync_progress, un,
(clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl));
/*
* Handle resync restart from the last logged position. The contents
* of un_rs_resync_2_do and un_rs_resync_done are dependent on the
* type of resync that was in progress.
*/
if (MD_MNSET_SETNO(setno)) {
switch ((uint_t)RS_TYPE(un->un_rs_type)) {
case MD_RS_NONE:
case MD_RS_OPTIMIZED:
case MD_RS_COMPONENT:
case MD_RS_SUBMIRROR:
case MD_RS_ABR:
break;
default:
un->un_rs_type = MD_RS_NONE;
}
/* Allocate a resync message, if required */
if (un->un_rs_msg == NULL) {
un->un_rs_msg = (md_mn_msg_resync_t *)kmem_zalloc(
sizeof (md_mn_msg_resync_t), KM_SLEEP);
}
mn_resync = 1;
}
/* Check to see if we've been requested to block/kill */
if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
goto bail_out;
}
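/*
* Main resync loop. An optimized resync is always performed first,
* followed by any outstanding component and then submirror resync.
* The loop is repeated while un_rs_dropped_lock indicates that the
* unit lock was dropped during this pass.
*/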
do {
un->un_rs_dropped_lock = 0;
/*
* Always perform an optimized resync first as this will bring
* the mirror into an available state in the shortest time.
* If we are resuming an interrupted resync, other than an
* optimized resync, we save the type and amount done so that
* we can resume the appropriate resync after the optimized
* resync has completed.
*/
if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
(RS_TYPE(un->un_rs_type) != MD_RS_OPTIMIZED)) {
old_rs_type = un->un_rs_type;
old_rs_done = un->un_rs_resync_done;
old_rs_2_do = un->un_rs_resync_2_do;
old_rs_startbl = un->un_resync_startbl;
}
SET_RS_TYPE(un->un_rs_type, MD_RS_OPTIMIZED);
/*
* If we are continuing a resync that is not an
* OPTIMIZED one, then we start from the beginning when
* doing this optimized resync
*/
if (RS_TYPE(old_rs_type) != MD_RS_OPTIMIZED) {
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
optimized_resync(un);
/* Check to see if we've been requested to block/kill */
if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
goto bail_out;
}
un = (mm_unit_t *)MD_UNIT(mnum);
/*
* If another node has moved the resync on, we must
* restart the correct resync
*/
if (mn_resync &&
(RS_TYPE(un->un_rs_type) != MD_RS_NONE)) {
old_rs_type = un->un_rs_type;
old_rs_done = un->un_rs_resync_done;
old_rs_2_do = un->un_rs_resync_2_do;
old_rs_startbl = un->un_resync_startbl;
}
/*
* Restore previous resync progress or move onto a
* component resync.
*/
if (RS_TYPE(old_rs_type) != MD_RS_NONE) {
un->un_rs_type = old_rs_type;
un->un_rs_resync_done = old_rs_done;
un->un_rs_resync_2_do = old_rs_2_do;
un->un_resync_startbl = old_rs_startbl;
} else {
un->un_rs_type = MD_RS_COMPONENT;
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
if (RS_TYPE(un->un_rs_type) == MD_RS_COMPONENT) {
component_resync(un);
/* Check to see if we've been requested to block/kill */
if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
goto bail_out;
}
un = (mm_unit_t *)MD_UNIT(mnum);
/*
* If we have moved on from a component resync, another
* node must have completed it and started a submirror
* resync, so leave the resync state alone. For non
* multi-node sets we move onto the submirror resync.
*/
if (mn_resync) {
if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
un->un_rs_type = MD_RS_SUBMIRROR;
un->un_rs_resync_done =
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
} else {
un->un_rs_type = MD_RS_SUBMIRROR;
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
}
if (RS_TYPE(un->un_rs_type) == MD_RS_SUBMIRROR) {
submirror_resync(un);
/* Check to see if we've been requested to block/kill */
if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
goto bail_out;
}
un = (mm_unit_t *)MD_UNIT(mnum);
/*
* If we have moved on from a submirror resync, another
* node must have completed it and started a different
* resync, so leave the resync state alone
*/
if (mn_resync) {
if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
un->un_rs_resync_done =
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
} else {
/* If non-MN mirror, reinitialize state */
un->un_rs_type = MD_RS_NONE;
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
}
} while (un->un_rs_dropped_lock);
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
mutex_exit(&un->un_rs_thread_mx);
resync_finish = 1;
bail_out:
#ifdef DEBUG
if (mirror_debug_flag)
printf("Resync stopped (mnum = %x), resync_finish = %d\n",
mnum, resync_finish);
#endif
kmem_free(un->un_rs_buffer, dbtob(un->un_rs_copysize));
mutex_enter(&un->un_rs_progress_mx);
un->un_rs_progress_flags |= MD_RI_KILL;
cv_signal(&un->un_rs_progress_cv);
mutex_exit(&un->un_rs_progress_mx);
/*
* For MN Set, send a RESYNC_FINISH if this node completed the resync.
* There is no need to grow unit here, it will be done in the
* handler for the RESYNC_FINISH message together with resetting
* MD_UN_RESYNC_ACTIVE.
*/
if (mn_resync) {
if (resync_finish) {
/*
* Normal resync completion. Issue a RESYNC_FINISH
* message if we're part of a multi-node set.
*/
md_mn_kresult_t *kres;
md_mn_msg_resync_t *rmsg;
int rval;
rmsg = (md_mn_msg_resync_t *)un->un_rs_msg;
md_unit_writerexit(ui);
rmsg->msg_resync_mnum = mnum;
rmsg->msg_resync_type = 0;
rmsg->msg_resync_done = 0;
rmsg->msg_resync_2_do = 0;
rmsg->msg_originator = md_mn_mynode_id;
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
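/* Retry point if the RESYNC_FINISH message fails due to a commd outage */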
smrf_msg:
mutex_enter(&un->un_rs_cpr_mx);
CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
rval = mdmn_ksend_message(setno,
MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, 0,
(char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
CALLB_CPR_SAFE_END(&un->un_rs_cprinfo,
&un->un_rs_cpr_mx);
mutex_exit(&un->un_rs_cpr_mx);
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
mdmn_ksend_show_error(rval, kres,
"RESYNC_FINISH");
/* If we're shutting down, pause things here. */
if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
while (!md_mn_is_commd_present()) {
delay(md_hz);
}
/*
* commd is now available again. Retry
* the message once. If this fails we
* panic as the system is in an
* unexpected state.
*/
if (nretries++ == 0)
goto smrf_msg;
}
cmn_err(CE_PANIC,
"ksend_message failure: RESYNC_FINISH");
}
kmem_free(kres, sizeof (md_mn_kresult_t));
(void) md_unit_writerlock(ui);
}
/*
* If the resync has been cancelled, clear flags, reset owner
* for ABR mirror and release the resync region parent
* structure.
*/
if (un->c.un_status & MD_UN_RESYNC_CANCEL) {
md_mps_t *ps;
if (ui->ui_tstate & MD_ABR_CAP) {
/* Resync finished, if ABR set owner to NULL */
mutex_enter(&un->un_owner_mx);
un->un_mirror_owner = 0;
mutex_exit(&un->un_owner_mx);
}
un->c.un_status &= ~(MD_UN_RESYNC_CANCEL |
MD_UN_RESYNC_ACTIVE);
ps = un->un_rs_prev_overlap;
if (ps != NULL) {
/* Remove previous overlap resync region */
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
/*
* Release the overlap range reference
*/
un->un_rs_prev_overlap = NULL;
kmem_cache_free(mirror_parent_cache,
ps);
}
}
/*
* Release resync message buffer. This will be reallocated on
* the next invocation of the resync_unit thread.
*/
if (un->un_rs_msg) {
kmem_free(un->un_rs_msg, sizeof (md_mn_msg_resync_t));
un->un_rs_msg = NULL;
}
} else {
/* For non-MN sets deal with any pending grows */
un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
if (un->c.un_status & MD_UN_GROW_PENDING) {
if ((mirror_grow_unit(un, &mde) != 0) ||
(! mdismderror(&mde, MDE_GROW_DELAYED))) {
un->c.un_status &= ~MD_UN_GROW_PENDING;
}
}
}
reset_comp_flags(un);
un->un_resync_completed = 0;
mirror_commit(un, NO_SUBMIRRORS, 0);
md_unit_writerexit(ui);
/*
* Stop the resync progress thread.
*/
if (un->un_rs_resync_to_id != 0) {
(void) untimeout(un->un_rs_resync_to_id);
un->un_rs_resync_to_id = 0;
}
/*
* Calling mirror_internal_close() makes further reference to un / ui
* dangerous. If we are the only consumer of the mirror it is possible
* for a metaclear to be processed after completion of the m_i_c()
* routine. As we need to handle the case where another resync has been
* scheduled for the mirror, we raise the open count on the device
* which protects against the close / metaclear / lock => panic scenario.
*/
(void) md_unit_incopen(MD_SID(un), FREAD|FWRITE, OTYP_LYR);
(void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
/*
* decrement the mirror resync count
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
md_cpr_resync.md_mirror_resync--;
mutex_exit(&md_cpr_resync.md_resync_mutex);
/*
* Remove the thread reference as we're about to exit. This allows a
* subsequent mirror_resync_unit() to start a new thread.
* If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
* called to start a new resync, so reopen the mirror and go back to
* the start.
*/
(void) md_unit_writerlock(ui);
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
mutex_exit(&un->un_rs_thread_mx);
if (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
md_unit_writerexit(ui);
if (mirror_internal_open(MD_SID(un), (FREAD|FWRITE),
OTYP_LYR, 0, (IOLOCK *)NULL) == 0) {
/* Release the reference grabbed above */
(void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0,
(IOLOCK *)NULL);
goto resync_restart;
}
(void) md_unit_writerlock(ui);
cmn_err(CE_NOTE,
"Could not open metadevice (%x) for resync\n",
MD_SID(un));
}
un->un_rs_thread = NULL;
md_unit_writerexit(ui);
/*
* Check for hotspares once we've cleared the resync thread reference.
* If there are any errored units a poke_hotspares() will result in
* a call to mirror_resync_unit() which we need to allow to start.
*/
(void) poke_hotspares();
/*
* Remove this thread from the CPR callback table.
*/
if (mn_resync) {
mutex_enter(&un->un_rs_cpr_mx);
CALLB_CPR_EXIT(&un->un_rs_cprinfo);
}
/*
* Remove the extra reference to the unit we generated above. After
* this call it is *unsafe* to reference either ui or un as they may
* no longer be allocated.
*/
(void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
thread_exit();
}
/*
* mirror_resync_unit:
* ------------------
* Start a resync for the given mirror metadevice. Save the resync thread ID in
* un->un_rs_thread for later manipulation.
*
* Returns:
* 0 Success
* !=0 Error
*/
/*ARGSUSED*/
int
mirror_resync_unit(
minor_t mnum,
md_resync_ioctl_t *ri,
md_error_t *ep,
IOLOCK *lockp
)
{
mdi_unit_t *ui;
mm_unit_t *un;
set_t setno = MD_MIN2SET(mnum);
ui = MDI_UNIT(mnum);
if (md_get_setstatus(setno) & MD_SET_STALE)
return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
if (mirror_internal_open(mnum, (FREAD|FWRITE), OTYP_LYR, 0, lockp)) {
return (mdmderror(ep, MDE_MIRROR_OPEN_FAILURE, mnum));
}
if (lockp) {
un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
} else {
un = (mm_unit_t *)md_unit_writerlock(ui);
}
/*
* Check to see if we're attempting to start a resync while one is
* already running.
*/
if (un->c.un_status & MD_UN_RESYNC_ACTIVE ||
un->un_rs_thread != NULL) {
/*
* Ensure RESYNC_ACTIVE is set; it may not be if the resync thread
* is in the process of terminating. Setting the flag will
* cause the resync thread to return to the beginning.
*/
un->c.un_status |= MD_UN_RESYNC_ACTIVE;
if (lockp) {
md_ioctl_writerexit(lockp);
} else {
md_unit_writerexit(ui);
}
(void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
return (0);
}
un->c.un_status |= MD_UN_RESYNC_ACTIVE;
un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
if ((ri) && (ri->ri_copysize > 0) &&
(ri->ri_copysize <= md_max_xfer_bufsz))
un->un_rs_copysize = ri->ri_copysize;
else
un->un_rs_copysize = 0;
/* Start the resync progress thread off */
un->un_rs_progress_flags = 0;
(void) thread_create(NULL, 0, resync_progress_thread,
(caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
/*
* We have to store the thread ID in the unit structure so do not
* drop writerlock until the thread is active. This means resync_unit
* may spin on its first md_unit_readerlock(), but deadlock won't occur.
*/
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
mutex_exit(&un->un_rs_thread_mx);
un->un_rs_thread = thread_create(NULL, 0, resync_unit,
(caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, 60);
if (un->un_rs_thread == (kthread_id_t)NULL) {
un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
if (lockp) {
md_ioctl_writerexit(lockp);
} else {
md_unit_writerexit(ui);
}
(void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
return (mdmderror(ep, MDE_MIRROR_THREAD_FAILURE, mnum));
} else {
if (lockp) {
md_ioctl_writerexit(lockp);
} else {
md_unit_writerexit(ui);
}
}
return (0);
}
/*
* mirror_ioctl_resync:
* -------------------
* Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
* or kill the resync thread associated with the specified unit.
* Can return with locks held since mdioctl will free any locks
* that are marked in lock->l_flags.
*
* Returns:
* 0 Success
* !=0 Error Code
*/
int
mirror_ioctl_resync(
md_resync_ioctl_t *ri,
IOLOCK *lock
)
{
minor_t mnum = ri->ri_mnum;
mm_unit_t *un;
uint_t bits;
mm_submirror_t *sm;
mm_submirror_ic_t *smic;
int smi;
kt_did_t tid;
set_t setno = MD_MIN2SET(mnum);
mdclrerror(&ri->mde);
if ((setno >= md_nsets) ||
(MD_MIN2UNIT(mnum) >= md_nunits)) {
return (mdmderror(&ri->mde, MDE_INVAL_UNIT, mnum));
}
/* RD_LOCK flag grabs the md_ioctl_readerlock */
un = mirror_getun(mnum, &ri->mde, RD_LOCK, lock);
if (un == NULL) {
return (mdmderror(&ri->mde, MDE_UNIT_NOT_SETUP, mnum));
}
if (un->c.un_type != MD_METAMIRROR) {
return (mdmderror(&ri->mde, MDE_NOT_MM, mnum));
}
if (un->un_nsm < 2) {
return (0);
}
/*
* Determine the action to take based on the ri_flags field:
* MD_RI_BLOCK: Block current resync thread
* MD_RI_UNBLOCK: Unblock resync thread
* MD_RI_KILL: Abort resync thread
* MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
* without using rpc.mdcommd messages.
* any other: Start resync thread
*/
switch (ri->ri_flags & (MD_RI_BLOCK|MD_RI_UNBLOCK|MD_RI_KILL)) {
case MD_RI_BLOCK:
/* Halt resync thread by setting flag in un_rs_thread_flags */
if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
return (0);
}
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags |= MD_RI_BLOCK;
mutex_exit(&un->un_rs_thread_mx);
return (0);
case MD_RI_UNBLOCK:
/*
* Restart resync thread by clearing flag in un_rs_thread_flags and
* cv_signal'ing the blocked thread.
*/
if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
return (0);
}
mutex_enter(&un->un_rs_thread_mx);
un->un_rs_thread_flags &= ~MD_RI_BLOCK;
cv_signal(&un->un_rs_thread_cv);
mutex_exit(&un->un_rs_thread_mx);
return (0);
case MD_RI_KILL:
/* Abort resync thread. */
if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
return (0);
}
mutex_enter(&un->un_rs_thread_mx);
tid = un->un_rs_thread ? (un->un_rs_thread)->t_did : 0;
un->un_rs_thread_flags &= ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
un->un_rs_thread_flags |= MD_RI_KILL;
cv_signal(&un->un_rs_thread_cv);
mutex_exit(&un->un_rs_thread_mx);
if (tid != 0) {
if (!(ri->ri_flags & MD_RI_NO_WAIT)) {
md_ioctl_readerexit(lock);
thread_join(tid);
un->un_rs_thread_flags &= ~MD_RI_KILL;
un->un_rs_thread = NULL;
cmn_err(CE_WARN, "md: %s: Resync cancelled\n",
md_shortname(MD_SID(un)));
}
}
return (0);
}
md_ioctl_readerexit(lock);
bits = 0;
for (smi = 0; smi < NMIRROR; smi++) {
sm = &un->un_sm[smi];
smic = &un->un_smic[smi];
if (!SMS_IS(sm, SMS_ATTACHED))
continue;
mirror_set_sm_state(sm, smic, SMS_ATTACHED_RESYNC, 1);
bits |= SMI2BIT(smi);
}
if (bits != 0)
mirror_commit(un, bits, 0);
/*
* If we are resyncing a mirror in a MN set and the rpc.mdcommd
* can be used, we do not start the resync at this point.
* Instead, the metasync command that issued the ioctl
* will send a RESYNC_STARTING message to start the resync thread. The
* reason we do it this way is to ensure that the metasync ioctl is
* executed on all nodes before the resync thread is started.
*
* If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
* don't use rpc.mdcommd, but just start the resync thread. This
* flag is set on a node when it is being added to a diskset
* so that the resync threads are started on the newly added node.
*/
if ((!(MD_MNSET_SETNO(setno))) ||
(ri->ri_flags & MD_RI_RESYNC_FORCE_MNSTART)) {
return (mirror_resync_unit(mnum, ri, &ri->mde, lock));
} else {
return (0);
}
}
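/*
* mirror_mark_resync_region_non_owner:
* -----------------------------------
* Mark the resync regions covering [startblk..endblk] dirty on a node
* that is not the mirror owner. The pernode dirty bitmap is updated
* locally and an RR_DIRTY message is sent to the owner so that it can
* update the on-disk DRL.
*
* Returns:
* 0 Success
* 1 The RR_DIRTY message could not be delivered
*/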
int
mirror_mark_resync_region_non_owner(struct mm_unit *un,
diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
{
int no_change;
size_t start_rr;
size_t current_rr;
size_t end_rr;
md_mn_msg_rr_dirty_t *rr;
md_mn_kresult_t *kres;
set_t setno = MD_UN2SET(un);
int rval;
md_mn_nodeid_t node_idx = source_node - 1;
mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
md_mn_nodeid_t owner_node;
minor_t mnum = MD_SID(un);
if (un->un_nsm < 2)
return (0);
/*
* Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
* not, allocate it and then fill the [start..end] entries.
* Update un_pernode_dirty_sum if we've gone 0->1.
* Update un_dirty_bm if the corresponding entries are clear.
*/
rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
if (un->un_pernode_dirty_bm[node_idx] == NULL) {
un->un_pernode_dirty_bm[node_idx] =
(uchar_t *)kmem_zalloc(
(uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
}
rw_exit(&un->un_pernode_dirty_mx[node_idx]);
BLK_TO_RR(end_rr, endblk, un);
BLK_TO_RR(start_rr, startblk, un);
no_change = 1;
mutex_enter(&un->un_resync_mx);
rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
un->un_outstanding_writes[current_rr]++;
if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) {
un->un_pernode_dirty_sum[current_rr]++;
SET_PERNODE_DIRTY(source_node, current_rr, un);
}
CLR_GOING_CLEAN(current_rr, un);
if (!IS_REGION_DIRTY(current_rr, un)) {
no_change = 0;
SET_REGION_DIRTY(current_rr, un);
SET_GOING_DIRTY(current_rr, un);
} else if (IS_GOING_DIRTY(current_rr, un))
no_change = 0;
}
rw_exit(&un->un_pernode_dirty_mx[node_idx]);
mutex_exit(&un->un_resync_mx);
if (no_change) {
return (0);
}
/*
* If we have dirty regions to commit, send a
* message to the owning node so that the
* in-core bitmap gets updated appropriately.
* TODO: make this a kmem_cache pool to improve
* alloc/free performance ???
*/
kres = (md_mn_kresult_t *)kmem_alloc(sizeof (md_mn_kresult_t),
KM_SLEEP);
rr = (md_mn_msg_rr_dirty_t *)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t),
KM_SLEEP);
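/*
* Retry point in case the mirror owner changes while the RR_DIRTY
* message is in flight.
*/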
resend_mmrr:
owner_node = un->un_mirror_owner;
rr->rr_mnum = mnum;
rr->rr_nodeid = md_mn_mynode_id;
rr->rr_range = (ushort_t)start_rr << 16;
rr->rr_range |= (ushort_t)end_rr & 0xFFFF;
/* release readerlock before sending message */
md_unit_readerexit(ui);
rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_DIRTY,
MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_DIRECTED,
un->un_mirror_owner, (char *)rr,
sizeof (md_mn_msg_rr_dirty_t), kres);
/* reacquire readerlock on message completion */
(void) md_unit_readerlock(ui);
/* if the message send failed, note it, and pass an error back up */
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
/* if commd is gone, no point in printing a message */
if (md_mn_is_commd_present())
mdmn_ksend_show_error(rval, kres, "RR_DIRTY");
kmem_free(kres, sizeof (md_mn_kresult_t));
kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
return (1);
}
/*
* if the owner changed while we were sending the message, and it's
* not us, the new mirror owner won't yet have done the right thing
* with our data. Let him know. If we became the owner, we'll
* deal with that differently below. Note that receiving a message
* about another node twice won't hurt anything.
*/
if (un->un_mirror_owner != owner_node && !MD_MN_MIRROR_OWNER(un))
goto resend_mmrr;
kmem_free(kres, sizeof (md_mn_kresult_t));
kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
mutex_enter(&un->un_resync_mx);
/*
* If we became the owner while we were sending the message,
* we have dirty bits in the un_pernode_dirty_bm that aren't yet reflected
* in the un_dirty_bm, as it was re-read from disk, and our bits
* are also not reflected in the on-disk DRL. Fix that now.
*/
if (MD_MN_MIRROR_OWNER(un)) {
rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
mirror_copy_rr(howmany(un->un_rrd_num, NBBY),
un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm);
rw_exit(&un->un_pernode_dirty_mx[node_idx]);
un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
mutex_exit(&un->un_resync_mx);
mddb_commitrec_wrapper(un->un_rr_dirty_recid);
mutex_enter(&un->un_resync_mx);
un->un_resync_flg &= ~(MM_RF_COMMITING | MM_RF_GATECLOSED);
cv_broadcast(&un->un_resync_cv);
}
for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
CLR_GOING_DIRTY(current_rr, un);
mutex_exit(&un->un_resync_mx);
return (0);
}
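/*
* mirror_mark_resync_region_owner:
* -------------------------------
* Mark the resync regions covering [startblk..endblk] dirty on the
* mirror owner (or on a non-multi-node set). Updates the in-core
* bitmaps and, when a commit is needed, flushes the resync record to
* the mddb, using un_resync_cv to gate concurrent markers and cleaners.
*/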
int
mirror_mark_resync_region_owner(struct mm_unit *un,
diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
{
int no_change;
size_t start_rr;
size_t current_rr;
size_t end_rr;
int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
md_mn_nodeid_t node_idx = source_node - 1;
if (un->un_nsm < 2)
return (0);
/*
* Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
* not, allocate it and then fill the [start..end] entries.
* Update un_pernode_dirty_sum if we've gone 0->1.
* Update un_dirty_bm if the corresponding entries are clear.
*/
if (mnset) {
rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
if (un->un_pernode_dirty_bm[node_idx] == NULL) {
un->un_pernode_dirty_bm[node_idx] =
(uchar_t *)kmem_zalloc(
(uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
}
rw_exit(&un->un_pernode_dirty_mx[node_idx]);
}
mutex_enter(&un->un_resync_mx);
if (mnset)
rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
no_change = 1;
BLK_TO_RR(end_rr, endblk, un);
BLK_TO_RR(start_rr, startblk, un);
for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
if (!mnset || source_node == md_mn_mynode_id)
un->un_outstanding_writes[current_rr]++;
if (mnset) {
if (!IS_PERNODE_DIRTY(source_node, current_rr, un))
un->un_pernode_dirty_sum[current_rr]++;
SET_PERNODE_DIRTY(source_node, current_rr, un);
}
CLR_GOING_CLEAN(current_rr, un);
if (!IS_REGION_DIRTY(current_rr, un))
no_change = 0;
if (IS_GOING_DIRTY(current_rr, un))
no_change = 0;
}
if (mnset)
rw_exit(&un->un_pernode_dirty_mx[node_idx]);
if (no_change) {
mutex_exit(&un->un_resync_mx);
return (0);
}
un->un_waiting_to_mark++;
while (un->un_resync_flg & MM_RF_GATECLOSED) {
if (panicstr)
return (1);
cv_wait(&un->un_resync_cv, &un->un_resync_mx);
}
un->un_waiting_to_mark--;
no_change = 1;
for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
if (!IS_REGION_DIRTY(current_rr, un)) {
SET_REGION_DIRTY(current_rr, un);
SET_GOING_DIRTY(current_rr, un);
no_change = 0;
} else {
if (IS_GOING_DIRTY(current_rr, un))
no_change = 0;
}
}
if (no_change) {
if (un->un_waiting_to_mark == 0 || un->un_waiting_to_clear != 0)
cv_broadcast(&un->un_resync_cv);
mutex_exit(&un->un_resync_mx);
return (0);
}
un->un_resync_flg |= MM_RF_COMMIT_NEEDED;
un->un_waiting_to_commit++;
while (un->un_waiting_to_mark != 0 &&
!(un->un_resync_flg & MM_RF_GATECLOSED)) {
if (panicstr)
return (1);
cv_wait(&un->un_resync_cv, &un->un_resync_mx);
}
if (un->un_resync_flg & MM_RF_COMMIT_NEEDED) {
un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
un->un_resync_flg &= ~MM_RF_COMMIT_NEEDED;
mutex_exit(&un->un_resync_mx);
mddb_commitrec_wrapper(un->un_rr_dirty_recid);
mutex_enter(&un->un_resync_mx);
un->un_resync_flg &= ~MM_RF_COMMITING;
cv_broadcast(&un->un_resync_cv);
}
while (un->un_resync_flg & MM_RF_COMMITING) {
if (panicstr)
return (1);
cv_wait(&un->un_resync_cv, &un->un_resync_mx);
}
for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
CLR_GOING_DIRTY(current_rr, un);
if (--un->un_waiting_to_commit == 0) {
un->un_resync_flg &= ~MM_RF_GATECLOSED;
cv_broadcast(&un->un_resync_cv);
}
mutex_exit(&un->un_resync_mx);
return (0);
}
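/*
* mirror_mark_resync_region:
* -------------------------
* Dispatch to the owner or non-owner variant depending on whether this
* node currently owns the mirror.
*/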
int
mirror_mark_resync_region(struct mm_unit *un,
diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
{
int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
if (mnset && !MD_MN_MIRROR_OWNER(un)) {
return (mirror_mark_resync_region_non_owner(un, startblk,
endblk, source_node));
} else {
return (mirror_mark_resync_region_owner(un, startblk, endblk,
source_node));
}
}
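/*
* mirror_resize_resync_regions:
* ----------------------------
* Rebuild the resync region bitmaps for a new total block count. The
* region count is kept within MD_MAX_NUM_RR by doubling the region
* block size as needed, and the old dirty state, outstanding write
* counts and pernode summaries are folded into the new, coarser
* regions. A new RESYNC_REC is committed to the mddb and the old
* record deleted.
*/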
int
mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
{
short *owp;
optim_resync_t *orp;
uint_t rr_mult = 1;
uint_t old_nregions, new_nregions;
int old_bm_size, new_bm_size;
size_t size;
mddb_recid_t recid, old_recid;
uchar_t *old_dirty_bm;
int i, j;
mddb_type_t typ1;
set_t setno = MD_UN2SET(un);
uchar_t *old_pns;
old_nregions = un->un_rrd_num;
new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
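/* Halve the region count (doubling the region size) until it fits */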
while (new_nregions > MD_MAX_NUM_RR) {
new_nregions >>= 1;
rr_mult <<= 1;
}
new_bm_size = howmany(new_nregions, NBBY);
old_bm_size = howmany(old_nregions, NBBY);
size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
typ1 = (mddb_type_t)md_getshared_key(setno,
mirror_md_ops.md_driver.md_drivername);
recid = mddb_createrec(size, typ1, RESYNC_REC,
MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
if (recid < 0)
return (-1);
orp = (struct optim_resync *)mddb_getrecaddr(recid);
ASSERT(orp != NULL);
orp->or_magic = OR_MAGIC; /* Magic # */
orp->or_blksize = un->un_rrd_blksize; /* Same block size */
orp->or_num = new_nregions; /* New number of regions */
old_dirty_bm = un->un_dirty_bm;
un->un_dirty_bm = orp->or_rr;
kmem_free((caddr_t)un->un_goingdirty_bm, old_bm_size);
un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
kmem_free((caddr_t)un->un_goingclean_bm, old_bm_size);
un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
kmem_free((caddr_t)un->un_resync_bm, old_bm_size);
un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
owp = un->un_outstanding_writes;
un->un_outstanding_writes = (short *)kmem_zalloc(
new_nregions * sizeof (short), KM_SLEEP);
old_pns = un->un_pernode_dirty_sum;
if (old_pns)
un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(new_nregions,
KM_SLEEP);
/*
* Now translate the old records into the new
* records
*/
for (i = 0; i < old_nregions; i++) {
/*
* only bring forward the
* outstanding write counters and the dirty bits and also
* the pernode_summary counts
*/
if (!isset(old_dirty_bm, i))
continue;
setbit(un->un_dirty_bm, (i / rr_mult));
un->un_outstanding_writes[(i / rr_mult)] += owp[i];
if (old_pns)
un->un_pernode_dirty_sum[(i / rr_mult)] += old_pns[i];
}
kmem_free((caddr_t)owp, old_nregions * sizeof (short));
if (old_pns)
kmem_free((caddr_t)old_pns, old_nregions);
/*
* Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
*/
for (j = 0; j < MD_MNMAXSIDES; j++) {
rw_enter(&un->un_pernode_dirty_mx[j], RW_WRITER);
old_dirty_bm = un->un_pernode_dirty_bm[j];
if (old_dirty_bm) {
un->un_pernode_dirty_bm[j] = (uchar_t *)kmem_zalloc(
new_bm_size, KM_SLEEP);
for (i = 0; i < old_nregions; i++) {
if (!isset(old_dirty_bm, i))
continue;
setbit(un->un_pernode_dirty_bm[j],
(i / rr_mult));
}
kmem_free((caddr_t)old_dirty_bm, old_bm_size);
}
rw_exit(&un->un_pernode_dirty_mx[j]);
}
/* Save the old record id */
old_recid = un->un_rr_dirty_recid;
/* Update the mirror unit struct */
un->un_rr_dirty_recid = recid;
un->un_rrd_num = new_nregions;
un->un_rrd_blksize = un->un_rrd_blksize * rr_mult;
orp->or_blksize = un->un_rrd_blksize;
/*
* NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
* instead of using mddb_commitrecs_wrapper, is that you cannot
* atomically commit optimized records.
*/
mddb_commitrec_wrapper(recid);
mddb_commitrec_wrapper(un->c.un_record_id);
mddb_deleterec_wrapper(old_recid);
return (0);
}
/* lockp can be NULL for !MN disksets */
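/*
* mirror_add_resync_regions:
* -------------------------
* Grow the resync region bitmaps to cover new_tb total blocks, keeping
* the existing region block size. Larger in-core arrays are allocated,
* the old contents are copied forward, and a new RESYNC_REC is
* committed to the mddb in place of the old record.
*/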
int
mirror_add_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
{
uchar_t *old;
short *owp;
optim_resync_t *orp;
uint_t old_nregions, new_nregions;
int old_bm_size, new_bm_size;
size_t size;
mddb_recid_t recid, old_recid;
mddb_type_t typ1;
set_t setno = MD_UN2SET(un);
int i;
old_nregions = un->un_rrd_num;
new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
new_bm_size = howmany(new_nregions, NBBY);
old_bm_size = howmany(old_nregions, NBBY);
size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
typ1 = (mddb_type_t)md_getshared_key(setno,
mirror_md_ops.md_driver.md_drivername);
recid = mddb_createrec(size, typ1, RESYNC_REC,
MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
if (recid < 0)
return (-1);
orp = (struct optim_resync *)mddb_getrecaddr(recid);
ASSERT(orp != NULL);
orp->or_magic = OR_MAGIC; /* Magic # */
orp->or_blksize = un->un_rrd_blksize; /* Same block size */
orp->or_num = new_nregions; /* New number of regions */
/* Copy the old bm over the new bm */
bcopy((caddr_t)un->un_dirty_bm, (caddr_t)orp->or_rr, old_bm_size);
/*
* Create new bigger incore arrays, copy, and free old ones:
* un_goingdirty_bm
* un_goingclean_bm
* un_resync_bm
* un_outstanding_writes
* un_pernode_dirty_sum
* un_pernode_dirty_bm[]
*/
old = un->un_goingdirty_bm;
un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
bcopy((caddr_t)old, (caddr_t)un->un_goingdirty_bm, old_bm_size);
kmem_free((caddr_t)old, old_bm_size);
old = un->un_goingclean_bm;
un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
bcopy((caddr_t)old, (caddr_t)un->un_goingclean_bm, old_bm_size);
kmem_free((caddr_t)old, old_bm_size);
old = un->un_resync_bm;
un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
bcopy((caddr_t)old, (caddr_t)un->un_resync_bm, old_bm_size);
kmem_free((caddr_t)old, old_bm_size);
owp = un->un_outstanding_writes;
un->un_outstanding_writes = (short *)kmem_zalloc(
(uint_t)new_nregions * sizeof (short), KM_SLEEP);
bcopy((caddr_t)owp, (caddr_t)un->un_outstanding_writes,
old_nregions * sizeof (short));
kmem_free((caddr_t)owp, (old_nregions * sizeof (short)));
old = un->un_pernode_dirty_sum;
if (old) {
un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(
new_nregions, KM_SLEEP);
bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_sum,
old_nregions);
kmem_free((caddr_t)old, old_nregions);
}
for (i = 0; i < MD_MNMAXSIDES; i++) {
rw_enter(&un->un_pernode_dirty_mx[i], RW_WRITER);
old = un->un_pernode_dirty_bm[i];
if (old) {
un->un_pernode_dirty_bm[i] = (uchar_t *)kmem_zalloc(
new_bm_size, KM_SLEEP);
bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_bm[i],
old_bm_size);
kmem_free((caddr_t)old, old_bm_size);
}
rw_exit(&un->un_pernode_dirty_mx[i]);
}
/* Save the old record id */
old_recid = un->un_rr_dirty_recid;
/* Update the mirror unit struct */
un->un_rr_dirty_recid = recid;
un->un_rrd_num = new_nregions;
un->un_dirty_bm = orp->or_rr;
/*
* NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
* instead of using mddb_commitrecs_wrapper, is that you cannot
* atomically commit optimized records.
*/
mddb_commitrec_wrapper(recid);
mddb_commitrec_wrapper(un->c.un_record_id);
mddb_deleterec_wrapper(old_recid);
return (0);
}
/*
* mirror_copy_rr:
* --------------
* Combine the dirty record bitmap with the in-core resync bitmap. This allows
* us to carry a resync over an ownership change.
*/
void
mirror_copy_rr(int sz, uchar_t *src, uchar_t *dest)
{
int i;
for (i = 0; i < sz; i++)
*dest++ |= *src++;
}
/*
* mirror_set_dirty_rr:
* -------------------
* Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
* For the owning node (DRL/mirror owner) update the on-disk RR if needed.
* Called on every clean->dirty transition for the originating writer node.
* Note: only the non-owning nodes will initiate this message and it is only
* the owning node that has to process it.
*/
int
mirror_set_dirty_rr(md_mn_rr_dirty_params_t *iocp)
{
minor_t mnum = iocp->rr_mnum;
mm_unit_t *un;
int start = (int)iocp->rr_start;
int end = (int)iocp->rr_end;
set_t setno = MD_MIN2SET(mnum);
md_mn_nodeid_t orignode = iocp->rr_nodeid; /* 1-based */
diskaddr_t startblk, endblk;
mdclrerror(&iocp->mde);
if ((setno >= md_nsets) ||
(MD_MIN2UNIT(mnum) >= md_nunits)) {
return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
}
/* Must have _NO_ ioctl lock set if we update the RR on-disk */
un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
if (un == NULL) {
return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
}
if (un->c.un_type != MD_METAMIRROR) {
return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
}
if (orignode < 1 || orignode >= MD_MNMAXSIDES) {
return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
}
if (un->un_nsm < 2) {
return (0);
}
/*
* Only process this message if we're the owner of the mirror.
*/
if (!MD_MN_MIRROR_OWNER(un)) {
return (0);
}
RR_TO_BLK(startblk, start, un);
RR_TO_BLK(endblk, end, un);
return (mirror_mark_resync_region_owner(un, startblk, endblk,
orignode));
}
/*
* mirror_clean_rr_bits:
* --------------------
* Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
* Once _all_ references are removed (pernode_dirty_sum[x] == 0) this region
* is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
* nodes. Callable from ioctl / interrupt / whatever context.
* un_resync_mx is held on entry.
*/
static void
mirror_clean_rr_bits(
md_mn_rr_clean_params_t *iocp)
{
minor_t mnum = iocp->rr_mnum;
mm_unit_t *un;
uint_t cleared_bits;
md_mn_nodeid_t node = iocp->rr_nodeid - 1;
md_mn_nodeid_t orignode = iocp->rr_nodeid;
int i, start, end;
un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
cleared_bits = 0;
start = MDMN_RR_CLEAN_PARAMS_START_BIT(iocp);
end = start + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp) * NBBY;
rw_enter(&un->un_pernode_dirty_mx[node], RW_READER);
for (i = start; i < end; i++) {
if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp), i - start)) {
if (IS_PERNODE_DIRTY(orignode, i, un)) {
un->un_pernode_dirty_sum[i]--;
CLR_PERNODE_DIRTY(orignode, i, un);
}
if (un->un_pernode_dirty_sum[i] == 0) {
cleared_bits++;
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
}
}
}
rw_exit(&un->un_pernode_dirty_mx[node]);
if (cleared_bits) {
/*
* We can only be called iff we are the mirror owner, however
* as this is a (potentially) decoupled routine the ownership
* may have moved from us by the time we get to execute the
* bit clearing. Hence we still need to check for being the
* owner before flushing the DRL to the replica.
*/
if (MD_MN_MIRROR_OWNER(un)) {
mutex_exit(&un->un_resync_mx);
mddb_commitrec_wrapper(un->un_rr_dirty_recid);
mutex_enter(&un->un_resync_mx);
}
}
}
/*
* mirror_drl_task:
* ---------------
* Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
* We need to obtain exclusive access to the un_resync_cv and then clear the
* necessary bits.
* On completion, we must also free the passed in argument as it is allocated
* at the end of the ioctl handler and won't be freed on completion.
*/
static void
mirror_drl_task(void *arg)
{
md_mn_rr_clean_params_t *iocp = (md_mn_rr_clean_params_t *)arg;
minor_t mnum = iocp->rr_mnum;
mm_unit_t *un;
un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
mutex_enter(&un->un_rrp_inflight_mx);
mutex_enter(&un->un_resync_mx);
un->un_waiting_to_clear++;
while (un->un_resync_flg & MM_RF_STALL_CLEAN)
cv_wait(&un->un_resync_cv, &un->un_resync_mx);
un->un_waiting_to_clear--;
un->un_resync_flg |= MM_RF_GATECLOSED;
mirror_clean_rr_bits(iocp);
un->un_resync_flg &= ~MM_RF_GATECLOSED;
if (un->un_waiting_to_mark != 0 || un->un_waiting_to_clear != 0) {
cv_broadcast(&un->un_resync_cv);
}
mutex_exit(&un->un_resync_mx);
mutex_exit(&un->un_rrp_inflight_mx);
kmem_free((caddr_t)iocp, MDMN_RR_CLEAN_PARAMS_SIZE(iocp));
}
/*
* mirror_set_clean_rr:
* -------------------
* Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
* Once _all_ references are removed (pernode_dirty_sum[x] == 0) this region
* is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
* nodes.
*
* Only the mirror-owner need process this message as it is the only RR updater.
* Non-owner nodes issue this request, but as we have no point-to-point message
* support we will receive the message on all nodes.
*/
int
mirror_set_clean_rr(md_mn_rr_clean_params_t *iocp)
{
minor_t mnum = iocp->rr_mnum;
mm_unit_t *un;
set_t setno = MD_MIN2SET(mnum);
md_mn_nodeid_t node = iocp->rr_nodeid - 1;
int can_clear = 0;
md_mn_rr_clean_params_t *newiocp;
int rval = 0;
mdclrerror(&iocp->mde);
if ((setno >= md_nsets) ||
(MD_MIN2UNIT(mnum) >= md_nunits)) {
return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
}
/* Must have _NO_ ioctl lock set if we update the RR on-disk */
un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
if (un == NULL) {
return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
}
if (un->c.un_type != MD_METAMIRROR) {
return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
}
if (un->un_nsm < 2) {
return (0);
}
/*
* Check to see if we're the mirror owner. If not, there's nothing
* for us to do.
*/
if (!MD_MN_MIRROR_OWNER(un)) {
return (0);
}
/*
* Process the to-be-cleaned bitmap. We need to update the pernode_dirty
* bits and pernode_dirty_sum[n], and if, and only if, the sum goes to 0
* we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
* we can just defer this cleaning until the next process_resync_regions
* timeout.
*/
rw_enter(&un->un_pernode_dirty_mx[node], RW_WRITER);
if (un->un_pernode_dirty_bm[node] == NULL) {
un->un_pernode_dirty_bm[node] = (uchar_t *)kmem_zalloc(
howmany(un->un_rrd_num, NBBY), KM_SLEEP);
}
rw_exit(&un->un_pernode_dirty_mx[node]);
/*
* See if we can simply clear the un_dirty_bm[] entries. If we're not
* the issuing node _and_ we aren't in the process of marking/clearing
* the RR bitmaps, we can simply update the bits as needed.
* If we're the owning node and _not_ the issuing node, we should also
* sync the RR if we clear any bits in it.
*/
mutex_enter(&un->un_resync_mx);
can_clear = (un->un_resync_flg & MM_RF_STALL_CLEAN) ? 0 : 1;
if (can_clear) {
un->un_resync_flg |= MM_RF_GATECLOSED;
mirror_clean_rr_bits(iocp);
un->un_resync_flg &= ~MM_RF_GATECLOSED;
if (un->un_waiting_to_mark != 0 ||
un->un_waiting_to_clear != 0) {
cv_broadcast(&un->un_resync_cv);
}
}
mutex_exit(&un->un_resync_mx);
/*
* If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
* we must schedule a blocking call to update the DRL on this node.
* As we're invoked from an ioctl we are going to have the original data
* disappear (kmem_free) once we return. So, copy the data into a new
* structure and let the taskq routine release it on completion.
*/
if (!can_clear) {
size_t sz = MDMN_RR_CLEAN_PARAMS_SIZE(iocp);
newiocp = (md_mn_rr_clean_params_t *)kmem_alloc(sz, KM_SLEEP);
bcopy(iocp, newiocp, sz);
if (ddi_taskq_dispatch(un->un_drl_task, mirror_drl_task,
newiocp, DDI_NOSLEEP) != DDI_SUCCESS) {
kmem_free(newiocp, sz);
rval = ENOMEM; /* probably starvation */
}
}
return (rval);
}