/* mirror_resync.c revision bf85a12b7c81d0745d5a8aff65baeff50006cde9 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/sysmacros.h>
extern int md_status;
extern kmutex_t md_status_mx;
extern md_ops_t mirror_md_ops;
extern mdq_anchor_t md_mto_daemon;
extern daemon_request_t mirror_timeout;
extern md_resync_t md_cpr_resync;
extern int md_mtioctl_cnt;
extern kmem_cache_t *mirror_parent_cache;
#ifdef DEBUG
extern int mirror_debug_flag;
#endif
/*
* Tunable resync thread timeout. This is used as the time interval for updating
* the resync progress to the mddb. This allows restartable resyncs to be
* continued across a system reboot.
* Default is to update the resync progress every 5 minutes.
*/
/*
* Settable mirror resync buffer size. Specified in 512 byte
* blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
*/
/*
* Tunables for dirty region processing when
* closing down a mirror.
*
* Dirty region processing during close of a
* mirror is basically monitoring the state
* of the resync region bitmaps and the number
* of outstanding i/o's per submirror to
* determine that there are no more dirty
* regions left over.
*
* The approach taken is a retry logic over
* md_mirror_rr_cleans iterations to monitor
* the progress.
*
* There are two methods of polling the progress
* on dirty bitmap processing: busy-waits and
* non-busy-waits.
*
* Busy-waits are used at the beginning to
* determine the final state as quick as
* possible; md_mirror_rr_polls defines the
* number of busy-waits.
*
* In case the number of busy-waits got exhausted
* with dirty regions left over, the retry logic
* switches over to non-busy-waits, thus giving
* relief to an obviously heavily loaded system.
* The timeout value is defined by the tunable
* md_mirror_rr_sleep_timo in seconds.
*
* The number of non-busy-waits is given by:
* md_mirror_rr_cleans - md_mirror_rr_polls.
*
* The values were found by testing on a
* 'typical' system and may require tuning
* to meet specific customer's requirements.
*/
/* Total retry iterations for dirty-region drain at mirror close (see above). */
int md_mirror_rr_cleans = 13;
/* First md_mirror_rr_polls of those iterations busy-wait. */
int md_mirror_rr_polls = 3;
/* Sleep interval (seconds) for the remaining non-busy-wait iterations. */
int md_mirror_rr_sleep_timo = 1;
/*
 * The value is not #defined because it will be computed
 * in the future.
 */
int md_max_xfer_bufsz = 2048;
/*
* mirror_generate_rr_bitmap:
* -------------------
* Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
* bitmap associated with mirror 'un'
*
* Input:
* un - mirror unit to get bitmap data from
* *msgp - location to return newly allocated md_mn_msg_rr_clean_t
*
* Returns:
* 1 => dirty bits cleared from un_dirty_bm and DRL flush required
* *msgp contains bitmap of to-be-cleared bits
* 0 => no bits cleared
* *msgp == NULL
*/
/*
 * NOTE(review): this fragment is truncated -- the function name, most of
 * the parameter list, the enclosing region-scan loop, the message
 * allocation and the pernode-dirty bookkeeping are missing, so it does
 * not compile as-is.  Per the block comment above it should be
 * mirror_generate_rr_bitmap(un, msgp, activep); restore the body from the
 * full revision noted at the top of the file.
 */
static int
int *activep)
{
int cleared_dirty = 0;
/* Skip any initial 0s. */
/*
 * Handle case where NO bits are set in PERNODE_DIRTY but the
 * un_dirty_bm[] map does have entries set (after a 1st resync)
 */
;
if (un->un_rr_clean_start_bit == 0) {
return (0);
} else {
/* Wrap the scan back to bit 0 and retry once. */
un->un_rr_clean_start_bit = 0;
goto retry_dirty_scan;
}
}
/* how much to fit into this message */
KM_SLEEP);
continue;
}
if (!IS_REGION_DIRTY(i, un)) {
continue;
}
/* In-flight writes keep the region dirty; count it as still active. */
if (un->un_outstanding_writes[i] != 0) {
(*activep)++;
continue;
}
/*
 * Handle the case where a resync has completed and we still
 * have the un_dirty_bm[] entries marked as dirty (these are
 * the most recent DRL re-read from the replica). They need
 * to be cleared from our un_dirty_bm[] but they will not have
 * corresponding un_pernode_dirty[] entries set unless (and
 * until) further write()s have been issued to the area.
 * This handles the case where only the un_dirty_bm[] entry is
 * set. Without this we'd not clear this region until a local
 * write is issued to the affected area.
 */
if (!IS_GOING_CLEAN(i, un)) {
/* First pass: mark GOING_CLEAN; actual clear happens next pass. */
SET_GOING_CLEAN(i, un);
(*activep)++;
continue;
}
/*
 * Now we've got a flagged pernode_dirty, _or_ a clean
 * bitmap entry to process. Update the bitmap to flush
 * the REGION_DIRTY / GOING_CLEAN bits when we send the
 * cross-cluster message.
 */
} else {
/*
 * Not marked as active in the pernode bitmap, so skip
 * any update to this. We just increment the 0 count
 * and adjust the active count by any outstanding
 * un_pernode_dirty_sum[] entries. This means we don't
 * leave the mirror permanently dirty.
 */
}
}
if (!cleared_dirty) {
}
return (cleared_dirty);
}
/*
* There are three paths into here:
*
* md_daemon -> check_resync_regions -> prr
* mirror_internal_close -> mirror_process_unit_resync -> prr
* mirror_set_capability -> mirror_process_unit_resync -> prr
*
* The first one is a kernel daemon, the other two result from system calls.
* Thus, only the first case needs to deal with kernel CPR activity. This
* is indicated by the cprinfop being non-NULL for kernel daemon calls, and
* NULL for system call paths.
*/
/*
 * NOTE(review): fragment is truncated -- the function name/parameters,
 * the mutex acquisitions described by the comments below, and the message
 * send itself are missing.  Appears to be the mirror-owner path of
 * resync-region processing (see dispatch at the end of this group).
 */
static int
{
int cleared_dirty = 0;
/* Number of reasons why we can not proceed shutting down the mirror. */
int active = 0;
int rval;
/*
 * We drop the readerlock here to assist lock ordering with
 * update_resync. Once we have the un_rrp_inflight_mx, we
 * can re-acquire it.
 */
/*
 * Resync region processing must be single threaded. We can't use
 * un_resync_mx for this purpose since this mutex gets released
 * when blocking on un_resync_cv.
 */
(void) md_unit_readerlock(ui);
if (cleared_dirty) {
/*
 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
 * Receipt of the message will cause the mirror owner to
 * update the on-disk DRL.
 */
/* release readerlock before sending message */
if (cprinfop) {
}
if (cprinfop) {
}
/* reacquire readerlock after message */
(void) md_unit_readerlock(ui);
/*
 * NOTE(review): the sense of this test looks inverted relative to
 * the comment below ("if commd is gone") -- presumably the original
 * read !md_mn_is_commd_present(); confirm against full source.
 */
/* if commd is gone, no point in printing a message */
if (md_mn_is_commd_present())
return (active);
}
/*
 * If ownership changed while we were sending, we probably
 * sent the message to the wrong node. Leave fixing that for
 * the next cycle.
 */
return (active);
}
/*
 * Now that we've sent the message, clear them from the
 * pernode_dirty arrays. These are ONLY cleared on a
 * successful send, and failure has no impact.
 */
cleared_dirty = 0;
i - start)) {
un->un_pernode_dirty_sum[i]--;
un);
}
if (IS_REGION_DIRTY(i, un)) {
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
}
}
}
}
return (active);
}
/*
 * NOTE(review): fragment is truncated -- function name/parameters and the
 * mutex/commit calls are missing.  Appears to be the local (non-owner or
 * non-MN) path of resync-region processing.  Returns the number of
 * reasons the mirror cannot yet be considered clean ('active').
 */
static int
{
int cleared_dirty = 0;
/* Number of reasons why we can not proceed shutting down the mirror. */
int active = 0;
/*
 * We drop the readerlock here to assist lock ordering with
 * update_resync. Once we have the un_rrp_inflight_mx, we
 * can re-acquire it.
 */
/*
 * Resync region processing must be single threaded. We can't use
 * un_resync_mx for this purpose since this mutex gets released
 * when blocking on un_resync_cv.
 */
(void) md_unit_readerlock(ui);
if (mnset) {
if (cleared_dirty) {
/*
 * Clear the bits from the pernode_dirty arrays.
 * If that results in any being cleared from the
 * un_dirty_bm, commit it.
 */
cleared_dirty = 0;
i - start)) {
if (IS_PERNODE_DIRTY(md_mn_mynode_id, i,
un)) {
un->un_pernode_dirty_sum[i]--;
md_mn_mynode_id, i, un);
}
/* Region fully clean once no node has it marked dirty. */
if (un->un_pernode_dirty_sum[i] == 0) {
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
}
}
}
}
} else {
/*
 * Non-multinode path (intact): two-pass clean protocol.  A dirty
 * region with no outstanding writes is first marked GOING_CLEAN;
 * only if it is still quiescent on the next pass is it cleared.
 */
for (i = 0; i < un->un_rrd_num; i++) {
if (IS_KEEPDIRTY(i, un))
continue;
if (!IS_REGION_DIRTY(i, un))
continue;
if (un->un_outstanding_writes[i] != 0) {
active++;
continue;
}
if (!IS_GOING_CLEAN(i, un)) {
SET_GOING_CLEAN(i, un);
active++;
continue;
}
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
}
}
if (cleared_dirty) {
/* Waiters imply the DRL is still in flux; stay active. */
if (un->un_waiting_to_mark != 0 ||
un->un_waiting_to_clear != 0) {
active++;
}
}
return (active);
}
/*
 * NOTE(review): fragment is truncated -- the function name/parameters and
 * the guard condition (presumably "no current mirror owner") before the
 * first return are missing; the dangling "} else {" does not compile.
 */
static int
{
/*
 * For a mirror we can only update the on-disk resync-record if we
 * currently own the mirror. If we are called and there is no owner we
 * bail out before scanning the outstanding_writes[] array.
 * NOTE: we only need to check here (before scanning the array) as we
 * are called with the readerlock held. This means that a change
 * of ownership away from us will block until this resync check
 * has completed.
 */
return (0);
} else {
return (process_resync_regions_owner(un));
}
}
/*
* Function that is callable from other modules to provide
* ability to cleanup dirty region bitmap on demand. Used
* on last close of a unit to avoid massive device resyncs
* when coming back after rolling large amounts of data to
* a mirror (e.g. at umount with logging).
*/
/*
 * NOTE(review): fragment is truncated -- the function name/parameters,
 * the retry loop header, the cmn_err() call around the string below and
 * the delay/poll calls are missing.  Implements the retry logic
 * described above: md_mirror_rr_polls busy-waits, then
 * (md_mirror_rr_cleans - md_mirror_rr_polls) timed waits of
 * md_mirror_rr_sleep_timo seconds.
 */
void
{
int cleans = 0;
cleans++;
if (cleans >= md_mirror_rr_cleans) {
"Could not clean resync regions\n");
break;
}
if (cleans > md_mirror_rr_polls) {
/*
 * We did not make it with md_mirror_rr_polls
 * iterations. Give the system relief and
 * switch over to non-busy-wait.
 */
}
}
}
/*
 * NOTE(review): fragment is truncated -- the function name (per the
 * string literal, presumably check_resync_regions), the unit-walk loop,
 * the CPR callback registration call and the skip conditions are
 * missing.  Runs from the md daemon; clears dr_pending when the pass
 * completes.
 */
static void
{
mdi_unit_t *ui;
continue;
/*
 * Register this resync thread with the CPR mechanism. This
 * allows us to detect when the system is suspended and so
 * keep track of the RPC failure condition.
 */
"check_resync_regions");
(void) md_unit_readerlock(ui);
/*
 * Do not clean up resync regions if it is an ABR
 * mirror, or if a submirror is offline (we will use the resync
 * region to resync when back online) or if there is only one
 * submirror.
 */
continue;
}
/* Remove this thread from the CPR callback table. */
}
/* We are done */
timeout->dr_pending = 0;
}
/*
 * NOTE(review): both fragments below are truncated -- the daemon-request
 * dispatch inside md_mirror_timeout and the body of the arming function
 * (name missing; presumably schedules md_mirror_timeout via timeout(9F)
 * when dr_timeout_id == 0) were dropped.
 */
static void
md_mirror_timeout(void *throwaway)
{
if (!mirror_timeout.dr_pending) {
}
else
}
void
{
return;
if (mirror_timeout.dr_timeout_id == 0)
}
/*
 * NOTE(review): three truncated static helpers follow.  First: a scan
 * over the NMIRROR submirrors counting 'changed' state transitions and
 * committing if any occurred (loop bodies missing).
 */
static void
{
int i;
int changed = 0;
return;
for (i = 0; i < NMIRROR; i++) {
changed++;
}
changed++;
}
}
if (changed != 0) {
}
}
/*
 * NOTE(review): truncated -- releases the optimized-resync dirty record;
 * the mddb record lookup/delete calls between the recid checks are
 * missing.
 */
static void
{
struct optim_resync *orp;
if (un->un_rr_dirty_recid == 0) {
return;
}
un->un_rr_dirty_recid = 0;
return;
}
}
/*
 * NOTE(review): truncated -- creates/snarfs the resync record sized in
 * 512-byte blocks; record allocation, the recid assignment and both
 * branch bodies are missing.
 */
static int
{
int i;
int blksize; /* rr size in blocks */
int num_rr;
} else {
}
if (recid < 0) {
}
return (-1);
}
if (snarfing)
if (!snarfing) {
return (0);
}
return (0);
}
/*
 * NOTE(review): fragment is truncated -- the function name/parameters
 * (presumably unit_setup_resync(un, snarfing)), the record-creation call
 * whose result lands in 'err', the pernode bitmap kmem_zalloc target and
 * the DRL/ABR branch headers are all missing.  Initializes the per-unit
 * resync state and, for MN sets, this node's pernode dirty bitmap.
 */
int
{
int err;
int syncable;
int i;
un->un_resync_flg = 0;
un->un_waiting_to_mark = 0;
un->un_waiting_to_commit = 0;
un->un_waiting_to_clear = 0;
if (snarfing)
if (un->un_rr_dirty_recid == 0) {
/*
 * If a MN diskset and snarfing and this node is not the
 * master, do not delete any records on snarf of the
 * mirror records (create_unit_resync deletes records).
 *
 * Master node should have already handled this case.
 */
#ifdef DEBUG
#endif
return (-1);
}
return (err);
}
/*
 * Allocate pernode bitmap for this node. All other nodes' maps will
 * be created 'on-the-fly' in the ioctl message handler
 */
if (md_mn_mynode_id > 0) {
KM_SLEEP);
}
/*
 * Allocate taskq to process deferred (due to locking) RR_CLEAN
 * requests.
 */
}
return (0);
/*
 * Only mark mirror which has an associated DRL as requiring a resync.
 * For ABR mirrors we need not set the resync record bitmap up.
 */
nonABR = 0;
if (nonABR) {
if ((SUBMIRROR_IS_READABLE(un, i) ||
SMS_BY_INDEX_IS(un, i,
(SMS_OFFLINE | SMS_OFFLINE_RESYNC))))
syncable++;
}
}
for (i = 0; i < NMIRROR; i++) {
if ((SUBMIRROR_IS_READABLE(un, i)) ||
}
}
}
return (0);
}
/*
* resync_kill_pending:
* -------------------
* Determine if the resync thread has been requested to terminate.
* Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
* MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
* MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
*
* Returns:
* 0 Kill not pending
* 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
*
* Note: this routine may block
* the writerlock for <ui> will be dropped and reacquired if <mx_type>
* is set to MD_WRITER_HELD.
* the readerlock for <ui> will be dropped and reacquired if <mx_type>
* is set to MD_READER_HELD.
*/
/*
 * NOTE(review): fragment is truncated -- the function name
 * (resync_kill_pending per the contract comment above), remaining
 * parameters, the blocking loop on un_rs_thread_flags and the
 * abort/shutdown tests that set retval are missing.  Returns 1 when the
 * resync thread should terminate.
 */
static int
mdi_unit_t *ui,
{
int retval = 0;
/* Ensure that we don't block with any mutex held */
if (mx_type == MD_WRITER_HELD) {
} else if (mx_type == MD_READER_HELD) {
}
break;
}
/* Determine if we've been asked to abort or shutdown gracefully */
retval = 1;
retval = 1;
}
/* Reacquire mutex if dropped on entry */
if (mx_type == MD_WRITER_HELD) {
(void) md_unit_writerlock(ui);
} else if (mx_type == MD_READER_HELD) {
(void) md_unit_readerlock(ui);
}
return (retval);
}
/*
* resync_read_buffer:
* ------------------
* Issue the resync source read for the specified start block and size.
* This will cause the mirror strategy routine to issue a write-after-read
* once this request completes successfully.
* If 'flag_err' is set we expect to see a write error flagged in the b_error
* field of the buffer created for this i/o request. If clear we do not expect
* to see the error flagged for write failures.
* Read failures will always set the B_ERROR bit which will stop the resync
* immediately.
*/
/*
 * NOTE(review): fragment is truncated -- the function name/parameters
 * (resync_read_buffer per the comment above), the buf setup and the
 * strategy/biowait calls are missing; only the error flagging and return
 * survive.  Returns 1 on i/o error, 0 on success.
 */
static int
{
int ret = 0;
ret = 1;
}
return (ret);
}
/*
* send_mn_resync_done_message
*
* At the end of a resync, send a message to all nodes to indicate that
* the resync is complete. The argument, flags, has the following values
*
* RESYNC_ERR - if an error occurred that terminated the resync
* CLEAR_OPT_NOT_DONE - Just need to clear the OPT_NOT_DONE flag
*
* unit writerlock set on entry
* Only send the message if the thread is not marked as shutting down:
* [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
* [un->c.un_status & MD_UN_RESYNC_CANCEL]
* or if there has been an error that terminated the resync:
* flags & RESYNC_ERR
*
*/
/*
 * NOTE(review): fragment is truncated -- the function name
 * (send_mn_resync_done_message per the comment above), the leading
 * parameters, the dont_send computation, the rmsg allocation and the
 * ksend_message retry loop are missing.
 */
static void
int flags
)
{
int dont_send = 0;
int rval;
/*
 * Only send the message if this resync thread is still active. This
 * handles the case where ownership changes to different nodes during
 * a resync can cause multiple spurious resync_done messages to occur
 * when the resync completes. This happens because only one node is
 * the resync owner but other nodes will have their resync_unit thread
 * blocked in 'resync_kill_pending'
 */
: 0;
/*
 * Always send a message if we've encountered an error that terminated
 * the resync.
 */
if (flags & RESYNC_ERR)
dont_send = 0;
if (dont_send) {
#ifdef DEBUG
if (mirror_debug_flag) {
printf("Don't send resync done message, mnum = %x,"
}
#endif /* DEBUG */
return;
}
#ifdef DEBUG
if (mirror_debug_flag) {
printf("send resync done message, mnum = %x, type = %x\n",
}
#endif
rmsg->msg_resync_flags = 0;
if (flags & RESYNC_ERR)
if (flags & CLEAR_OPT_NOT_DONE)
/* if the node hasn't yet joined, it's Ok. */
/* If we're shutting down already, pause things here. */
while (!md_mn_is_commd_present()) {
}
}
}
(void) md_unit_writerlock(ui);
}
/*
* send_mn_resync_next_message
*
* Sent a message to all nodes indicating the next region to be resynced.
* The message contains the region to be resynced and the current position in
* the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
* On entry the unit readerlock is held.
*/
/*
 * NOTE(review): fragment is truncated -- the function name
 * (send_mn_resync_next_message per the comment above), the leading
 * parameters, the message setup, the per-submirror state copy loop and
 * the ksend_message call are missing.  Drops and reacquires the unit
 * lock around the send (see trailing readerlock).
 */
static void
int flags
)
{
int rval;
int smi;
#ifdef DEBUG
if (mirror_debug_flag) {
printf("send resync next message, mnum = %x, start=%lld, "
"size=%ld, type=%x, done=%lld, 2_do=%lld\n",
}
#endif
if (flags & MD_FIRST_RESYNC_NEXT)
/*
 * Copy current submirror state and flags into message. This provides
 * a means of keeping all nodes that are currently active in the cluster
 * synchronised with regards to their submirror state settings. If we
 * did not pass this information here, the only time every node gets
 * submirror state updated is at the end of a resync phase. This can be
 * a significant amount of time for large metadevices.
 */
}
/* If we're shutting down already, pause things here. */
while (!md_mn_is_commd_present()) {
}
}
}
(void) md_unit_readerlock(ui);
/* Allocate previous overlap reference if needed */
ps->ps_firstblk = 0;
ps->ps_lastblk = 0;
(void) md_unit_writerlock(ui);
(void) md_unit_readerlock(ui);
}
}
/*
 * NOTE(review): fragment is truncated -- the function name/parameters,
 * the currentblk initialization, the send_mn_resync_next_message call
 * (implied by "flags1"), the resync_read_buffer call and the
 * kill-pending checks are missing.  Walks [currentblk, stopbefore) in
 * MD_DEF_RESYNC_BLK_SZ chunks; returns 1 on error, 0 otherwise.
 */
static int
int flags
)
{
while (currentblk < stopbefore) {
/*
 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and
 * if a MN device and sendflag is set, send a RESYNC_MESSAGE
 * to all nodes.
 */
flags1);
if (flags1)
flags1 = 0;
/* check to see if we've been asked to terminate */
? 1:0);
/*
 * Check to see if another node has completed this
 * block, if so either the type or the resync region
 * will have changed. If the resync type has changed,
 * just exit.
 * If the resync region has changed, reset currentblk
 * to the start of the current resync region and
 * continue.
 */
return (0);
rs_startblk) {
continue;
}
}
while (currentblk < newstop) {
(flags & MD_RESYNC_FLAG_ERR)))
return (1);
currentblk += copysize;
/* check to see if we've been asked to terminate */
? 1:0);
if (MD_MNSET_SETNO(setno)) {
/*
 * Check to see if another node has completed
 * this block, see above
 */
return (0);
}
}
}
return (0);
}
/*
 * NOTE(review): fragment is truncated -- the function name/parameters
 * (the optimized-resync driver), the main region loop, the resync_bm
 * scans, the kill-pending checks and the state/notify calls are
 * missing.  Walks the optimized resync regions, clearing bits already
 * completed by other nodes, and signals phase completion at the end.
 */
static void
{
mdi_unit_t *ui;
int resync_regions;
int err;
int cnt;
int broke_out = 0;
/*
 * We aren't marked as needing a resync so for multi-node
 * sets we flag the completion so that all nodes see the same
 * metadevice state. This is a problem when a new node joins
 * an existing set as it has to perform a 'metasync -r' and
 * we have to step through all of the resync phases. If we
 * don't do this the nodes that were already in the set will
 * have the metadevices marked as 'Okay' but the joining node
 * will have 'Needs Maintenance' which is unclearable.
 */
if (MD_MNSET_SETNO(setno)) {
}
return;
}
/*
 * No need for optimized resync if ABR set, clear rs_type and flags
 * and exit
 */
return;
}
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
}
/* check to see if we've been asked to terminate */
}
/*
 * Check that we are still performing an optimized
 * resync. If not, another node must have completed it
 * so we have no more work to do.
 */
(void) md_unit_writerlock(ui);
return;
}
/*
 * If rs_resync_done is non-zero, we must be completing an optimized
 * resync that has already been partially done on another node.
 * Therefore clear the bits in resync_bm for the resync regions
 * already done. If resync_startbl is zero, calculate 2_do.
 */
if (un->un_rs_resync_done > 0) {
} else {
un->un_rs_resync_2_do = 0;
un->un_rs_resync_2_do++;
}
if (err) {
break;
}
/*
 * Check that we are still performing an optimized
 * resync. If not, another node must have completed it
 * so we have no more work to do.
 */
(void) md_unit_writerlock(ui);
return;
}
/*
 * If resync_done has increased, we must have
 * blocked in resync_read_blk_range while another node
 * continued with the resync. Therefore clear resync_bm
 * for the blocks that have been resynced on another
 * node and update rr to the next RR to be done.
 */
int i;
un);
CLR_KEEPDIRTY(i, un);
} else
un->un_rs_resync_done++;
cnt++;
if (cnt < 2) {
break;
}
/* Check to see if we've completed the resync cleanly */
break;
/*
 * Check that we haven't exceeded un_rs_resync_2_do. If
 * we have we've completed the resync.
 */
break;
}
}
/*
 * If MN set send message to all nodes to indicate resync
 * phase is complete. The processing of the message will update the
 * mirror state
 */
if (MD_MNSET_SETNO(setno)) {
} else {
if (!broke_out)
}
}
}
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
if (broke_out) {
} else {
}
}
}
/*
* recalc_resync_done
*
* This function deals with a change in value of un_rs_resync_2_do in a
* component resync. This may change if we are restarting a component
* resync on a single node having rebooted with a different value of
* md_resync_bufsz or if we are running in a multi-node with nodes having
* different values of md_resync_bufsz.
* If there is a change in un_rs_resync_2_do, we need to recalculate
* the value of un_rs_resync_done given the new value for resync_2_do.
* We have to calculate a new value for resync_done to be either
* if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
* or if it is not set, we need to calculate it from un_rs_resync_done,
* (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
* In addition we need to deal with the overflow case by using a factor to
* prevent overflow
*/
/*
 * NOTE(review): fragment is truncated -- the function name/parameters
 * (recalc_resync_done per the comment above), the no-change comparison,
 * the startbl-based computation and the factor initialization are
 * missing.  The surviving loop halves x until it fits in 32 bits to
 * avoid 64-bit overflow in the scaling below.
 */
static void
{
diskaddr_t x;
/*
 * If resync_2_do has not yet been calculated, no need to modify
 * resync_done
 */
if (un->un_rs_resync_2_do == 0) {
return;
}
return; /* No change, so nothing to do */
/*
 * If un_rs_startbl is set, another node must have already started
 * this resync and hence we can calculate resync_done from
 * resync_startbl
 */
if (un->un_resync_startbl) {
return;
}
/*
 * un_resync_startbl is not set so we must calculate it from
 * un_rs_resync_done.
 * If the larger of the two values of resync_2_do is greater than 32
 * bits, calculate a factor to divide by to ensure that we don't
 * overflow 64 bits when calculating the new value for resync_done
 */
while (x > INT32_MAX) {
x = x >> 1;
}
(resync_2_do/factor)) /
}
/*
 * NOTE(review): fragment is truncated -- the function name/parameters
 * (the component-resync driver), the initial state checks, the
 * sm_get_bcss component-geometry loop header, the main copy loop and
 * most lock/message calls are missing.  Drives a single-component
 * resync, periodically sending RESYNC_NEXT messages on MN sets and
 * recalculating progress whenever the unit lock was dropped.
 */
static void
{
mdi_unit_t *ui;
diskaddr_t frag = 0;
int err;
int broke_out = 0;
int blks;
return;
}
return;
}
(void) (*(smic->sm_get_bcss))
count++;
}
resync_2_do = count;
/*
 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
 * gives the proportion of the resync that has already been done.
 * If un_rs_copysize has changed since this previous partial resync,
 * either because this node has been rebooted with a different value
 * for md_resync_bufsz or because another node with a different value
 * for md_resync_bufsz performed the previous resync, we need to
 * recalculate un_rs_resync_done as a proportion of our value of
 * resync_2_do.
 */
/*
 * For MN mirrors we need to send a message to all nodes indicating
 * the next region to be resynced. For a component resync, the size of
 * the contiguous region that is processed by resync_read_blk_range()
 * may be small if there is the interleave size.
 * Therefore, rather than sending the message within
 * resync_read_blk_range(), we will send a message every
 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
 * the number of blocks. Then, if we are restarting a resync, round
 * un_rs_resync_done down to the previous resync region boundary. This
 * ensures that we send a RESYNC_NEXT message before resyncing any
 * blocks
 */
if (MD_MNSET_SETNO(setno)) {
}
/*
 * un_rs_resync_done is the number of ('size' + 'skip') increments
 * already resynced from the base 'block'
 * un_rs_resync_2_do is the number of iterations in
 * this component resync.
 */
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
}
/* check to see if we've been asked to terminate */
}
/*
 * Check that we are still performing the same component
 * resync. If not, another node must have completed it
 * so we have no more work to do.
 */
(void) md_unit_writerlock(ui);
return;
}
/*
 * Adjust resync_done, resync_2_do, start of resync area and count to
 * skip already resync'd data. We need to recalculate resync_done as
 * we have dropped the unit lock above and may have lost ownership to
 * another node, with a different resync buffer size and it may have
 * sent us new values of resync_done and resync_2_do based on its
 * resync buffer size
 */
/*
 * For MN mirrors send a message to the other nodes. This
 * message includes the size of the region that must be blocked
 * for all writes
 */
if (MD_MNSET_SETNO(setno)) {
flags1 = 0;
/*
 * check to see if we've been asked to
 * terminate
 */
if (resync_kill_pending(un,
break;
}
}
/*
 * Check that we are still performing the same
 * component resync. If not, another node must
 * have completed it so we have no more work to
 * do. Also reset count to remaining resync as
 * we may have lost ownership in in
 * send_mn_resync_next_message while another
 * node continued with the resync and
 * incremented resync_done.
 */
(void) md_unit_writerlock(ui);
return;
}
/*
 * recalculate resync_done, resync_2_do
 * We need to recalculate resync_done as
 * we have dropped the unit lock in
 * send_mn_resync_next_message above and may
 * have lost ownership to another node, with a
 * different resync buffer size and it may have
 * sent us new values of resync_done and
 * resync_2_do based on its resync buffer size
 */
/*
 * Adjust start of resync area to skip already
 * resync'd data
 */
(int)un->un_rs_resync_done);
}
}
if (err) {
break;
}
/*
 * If we are no longer resyncing this component, return as
 * another node has progressed the resync.
 */
(void) md_unit_writerlock(ui);
return;
}
/*
 * recalculate resync_done, resync_2_do. We need to recalculate
 * resync_done as we have dropped the unit lock in
 * resync_read_blk_range above and may have lost ownership to
 * another node, with a different resync buffer size and it may
 * have sent us new values of resync_done and resync_2_do based
 * on its resync buffer size
 */
/*
 * Reset count to remaining resync as we may have blocked in
 * resync_read_blk_range while another node continued
 * with the resync and incremented resync_done. Also adjust
 * start of resync area to skip already resync'd data.
 */
(int)un->un_rs_resync_done);
/*
 * If we are picking up from another node, we retry the last
 * block otherwise step on to the next block
 */
un->un_rs_resync_done++;
count--;
}
err = 1;
break;
}
/* Check to see if we've completed the resync cleanly */
break;
}
/*
 * If MN set send message to all nodes to indicate resync
 * phase is complete. The processing of the message will update the
 * mirror state
 */
if (MD_MNSET_SETNO(setno)) {
} else {
if (err)
else
/*
 * As we don't transmit the changes,
 * no need to drop the lock.
 */
}
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
if (broke_out) {
} else {
}
}
}
/*
 * NOTE(review): fragment is truncated -- the function name/parameters
 * (the submirror-resync driver), the restart-scan loops, the chunk size
 * computation and the resync_read_blk_range calls are missing.  Resyncs
 * a whole submirror in 'chunk'-sized pieces, restarting from
 * un_rs_resync_done when continuing a previous resync.
 */
static void
{
mdi_unit_t *ui;
int smi;
int err;
int cnt;
int broke_out = 0;
int i;
int flags1 = MD_FIRST_RESYNC_NEXT;
int compcnt;
/*
 * If the submirror_index is non-zero, we are continuing a resync
 * so restart resync from last submirror marked as being resynced.
 */
break;
}
}
} else {
break;
}
}
return;
}
/*
 * If we've only got one component we can fail on a resync write
 * if an error is encountered. This stops an unnecessary read of the
 * whole mirror on a target write error.
 */
if (compcnt == 1)
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
}
/* check to see if we've been asked to terminate */
}
/*
 * Check that we are still performing the same submirror
 * resync. If not, another node must have completed it
 * so we have no more work to do.
 */
(void) md_unit_writerlock(ui);
return;
}
/* if > 1TB mirror, increase percent done granularity */
else
if (chunk == 0)
/*
 * If a MN set, round the chunk size up to a multiple of
 * MD_DEF_RESYNC_BLK_SZ
 */
if (MD_MNSET_SETNO(setno)) {
}
/*
 * Handle restartable resyncs that continue from where the previous
 * resync left off. The new resync range is from un_rs_resync_done ..
 * un_rs_resync_2_do
 */
curblk = 0;
if (un->un_rs_resync_done == 0) {
} else {
}
if (err) {
break;
}
/*
 * If we are no longer executing a submirror resync, return
 * as another node has completed the submirror resync.
 */
(void) md_unit_writerlock(ui);
return;
}
/*
 * If resync_done has changed, we must have blocked
 * in resync_read_blk_range while another node
 * continued with the resync so restart from resync_done.
 */
} else {
}
if (SUBMIRROR_IS_WRITEABLE(un, i) &&
cnt++;
if (cnt == 0) {
break;
}
/* Check to see if we've completed the resync cleanly */
break;
}
/*
 * If MN set send message to all nodes to indicate resync
 * phase is complete. The processing of the message will update the
 * mirror state
 */
if (MD_MNSET_SETNO(setno)) {
} else {
if (err) {
} else {
}
}
/* For MN sets, resync NOTIFY is done when processing resync messages */
if (!MD_MNSET_SETNO(setno)) {
if (broke_out) {
} else {
}
}
}
/*
 * NOTE(review): two truncated scanners.  First: continues or restarts a
 * partially-complete component resync (un_rs_type carries the submirror
 * and component index); the per-submirror/component loop headers and the
 * check_comp_4_resync calls are missing.
 */
static void
{
int ci;
int i;
int compcnt;
/*
 * Handle the case where we are picking up a partially complete
 * component resync. In this case un_rs_type contains the submirror
 * and component index of where we should restart the resync.
 */
return;
/*
 * If we have no current resync, contine to scan submirror and
 * components. If the resync has moved on to another component,
 * restart it and if the resync is no longer a component
 * resync, just exit
 */
break;
return;
}
/* Now continue scanning _all_ submirrors and components */
for (i = 0; i < NMIRROR; i++) {
continue;
return;
/*
 * Now check if another node has continued with the
 * resync, if we are no longer in component resync,
 * exit, otherwise update to the current component - 1
 * so that the next call of check_comp_4 resync() will
 * resync the current component.
 */
return;
else {
}
}
}
}
/*
 * NOTE(review): truncated -- walks all NMIRROR submirrors (inner
 * per-component body missing).
 */
static void
{
int ci;
int i;
int compcnt;
for (i = 0; i < NMIRROR; i++) {
continue;
}
}
}
/*
* resync_progress_thread:
* ----------------------
* Thread started on first resync of a unit which simply blocks until woken up
* by a cv_signal, and then updates the mddb for the mirror unit record. This
* saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
* so that an aborted resync can be continued after an intervening reboot.
*/
static void
{
break;
/*
* Commit mirror unit if we're the Master node in a multi-node
* environment
*/
(void) md_unit_readerlock(ui);
}
}
thread_exit();
}
/*
* resync_progress:
* ---------------
* Timeout handler for updating the progress of the resync thread.
* Simply wake up the resync progress daemon which will then mirror_commit() the
* unit structure to the mddb. This snapshots the current progress of the resync
*/
static void
resync_progress(void *arg)
{
/* schedule the next timeout if the resync is still marked active */
(void) md_unit_readerlock(ui);
/* 'active' presumably derived from the unit's resync flags -- confirm */
if (active) {
}
}
/*
* resync_unit:
* -----------
* Resync thread which drives all forms of resync (optimized, component,
* submirror). Must handle thread suspension and kill to allow multi-node
* resync to run without undue ownership changes.
*
* For a MN set, the resync mechanism is as follows:
*
* When a resync is started, either via metattach, metaonline, metareplace,
* metasync or by a hotspare kicking in, a message is sent to all nodes, which
* calls mirror_resync_thread. If there is currently no mirror owner, the
* master node sends a CHOOSE_OWNER message to the handler on the master. This
* chooses a mirror owner and sends a CHANGE_OWNER message requesting the
* selected node to become the owner.
* If this node is not the owner it sets itself to block in resync_kill_pending
* and if there is no owner all nodes will block until the chosen owner is
* selected, in which case it will unblock itself. So, on entry to this
* function only one node will continue past resync_kill_pending().
* Once the resync thread is started, it basically cycles through the optimized,
* component and submirrors resyncs until there is no more work to do.
*
* For an ABR mirror, once a mirror owner is chosen it will complete the resync
* unless the nodes dies in which case a new owner will be chosen and it will
* have to complete the resync from the point at which the previous owner died.
* To do this we broadcast a RESYNC_NEXT message before each region to be
* resynced and this message contains the address and length of the region
* being resynced and the current progress through the resync. The size of
* this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
* block size to limit the amount of inter node traffic. The RESYNC_NEXT
* message also indicates to all other nodes that all writes to this block
* must be blocked until the next RESYNC_NEXT message is received. This ensures
* that no node can write to a block that is being resynced. For all MN
* mirrors we also block the whole resync region on the resync owner node so
* that all writes to the resync region are blocked on all nodes. There is a
* difference here between a MN set and a regular set in that for a MN set
* we protect the mirror from writes to the current resync block by blocking
* a larger region. For a regular set we just block writes to the current
* resync block.
*
* For a non-ABR mirror the same RESYNC_NEXT message is sent with an
* additional purpose. In this case, there is only one mirror owner at a time
* and rather than continually switching ownership between the chosen mirror
* owner and the node that is writing to the mirror, we move the resync to the
* mirror owner. When we switch ownership, we block the old owner and unblock
* the resync thread on the new owner. To enable the new owner to continue the
* resync, all nodes need to have the latest resync status, Then, following each
* resync write, we check to see if the resync state has changed and if it
* has this must be because we have lost ownership to another node(s) for a
* period and then have become owner again later in the resync process. If we
* are still dealing with the same resync, we just adjust addresses and counts
* and then continue. If the resync has moved on to a different type, for
* example from an optimized to a submirror resync, we move on to process the
* resync described by rs_type and continue from the position described by
* resync_done and resync_startbl.
*
* Note that for non-ABR mirrors it is possible for a write to be made on a
* non resync-owner node without a change of ownership. This is the case when
* the mirror has a soft part created on it and a write in ABR mode is made
* to that soft part. Therefore we still need to block writes to the resync
* region on all nodes.
*
* Sending the latest resync state to all nodes also enables them to continue
* a resync in the event that the mirror owner dies. If a mirror owner for
* a non-ABR mirror has died, there will be dirty resync regions. Therefore,
* regardless of whether another type of resync was in progress, we must first
* do an optimized resync to clean up the dirty regions before continuing
* with the interrupted resync.
*
* The resync status is held in the unit structure
* On disk
* un_rs_resync_done The number of contiguous resync blocks done so far
* un_rs_resync_2_do The total number of contiguous resync blocks
* un_rs_type The resync type (inc submirror and component numbers)
* In core
* un_resync_startbl The address of the current resync block being processed
*
* In the event that the whole cluster fails we need to just use
* un_rs_resync_done to restart the resync and to ensure that this is
* periodically written to disk, we have a thread which writes the record
* to disk every 5 minutes. As the granularity of un_rs_resync_done is
* usually coarse ( for an optimized resync 1001 is the max value) there is
* little point in writing this more frequently.
*/
/*
* NOTE(review): parameter list elided; per the block comment above this is
* resync_unit(), the resync thread body. It registers with CPR for the unit
* 'mnum' (see the "mirror_resync%x" tag below), drives the optimized /
* component / submirror resync passes, and reports completion via a
* RESYNC_FINISH message on multi-node sets.
*/
static void
{
mdi_unit_t *ui;
int mn_resync = 0;	/* set once we know this is a MN-set resync */
int resync_finish = 0;	/* set when the resync loop completes normally */
uint_t old_rs_startbl = 0;
int block_resync = 1;
int rs_copysize;
char *rs_buffer;	/* resync copy buffer -- size governed by md_resync_bufsz */
#ifdef DEBUG
if (mirror_debug_flag)
#endif
/*
* increment the mirror resync count
*/
if (rs_copysize == 0) {
/*
* Don't allow buffer size to fall outside the
* range 0 < bufsize <= md_max_xfer_bufsz.
*/
if (md_resync_bufsz <= 0)
}
if (MD_MNSET_SETNO(setno)) {
/*
* Register this resync thread with the CPR mechanism. This
* allows us to detect when the system is suspended and so
* keep track of the RPC failure condition.
*/
"mirror_resync%x", mnum);
/*
* If this is the first resync following the initial
* snarf (MD_RESYNC_NOT_DONE still set) and we've
* been started outside a reconfig step (e.g. by being
* added to an existing set) we need to query the
* existing submirror state for this mirror.
* The set_status flags will have MD_MN_SET_MIR_STATE_RC
* set if we've been through a step4 reconfig, so only
* query the master if this isn't (yet) set. In this
* case we must continue the resync thread as there is
* not guaranteed to be a currently running resync on
* any of the other nodes. Worst case is that we will
* initiate an ownership change to this node and then
* find that there is no resync to perform. However, we
* will then have correct status across the cluster.
*/
if (!(md_get_setstatus(setno) &
block_resync = 0;
#ifdef DEBUG
if (mirror_debug_flag) {
int i;
for (i = 0; i < NMIRROR; i++) {
"sm[%d] state=%4x"
" flags=%4x\n", i,
}
}
#endif
}
}
}
/*
* For MN set, if we have an owner, then start the resync on it.
* If there is no owner the master must send a message to
* choose the owner. This message will contain the current
* resync count and it will only be sent to the master, where
* the resync count will be used to choose the next node to
* perform a resync, by cycling through the nodes in the set.
* The message handler will then send a CHANGE_OWNER message to
* all nodes, and on receipt of that message, the chosen owner
* will issue a SET_OWNER ioctl to become the owner. This ioctl
* will be requested to spawn a thread to issue the
* REQUEST_OWNER message to become the owner which avoids the
* need for concurrent ioctl requests.
* After sending the message, we will block waiting for one
* of the nodes to become the owner and start the resync
*/
if (MD_MN_NO_MIRROR_OWNER(un)) {
/*
* There is no owner, block and then the master will
* choose the owner. Only perform this if 'block_resync'
* is set.
*/
if (block_resync) {
}
(void) md_unit_writerlock(ui);
}
} else {
/* There is an owner, block if we are not it */
if (!MD_MN_MIRROR_OWNER(un)) {
}
}
}
/*
* Start a timeout chain to update the resync progress to the mddb.
* This will run every md_mirror_resync_update_intvl minutes and allows
* a resync to be continued over a reboot.
*/
/*
* Handle resync restart from the last logged position. The contents
* of un_rs_resync_2_do and un_rs_resync_done are dependent on the
* type of resync that was in progress.
*/
if (MD_MNSET_SETNO(setno)) {
case MD_RS_NONE:
case MD_RS_OPTIMIZED:
case MD_RS_COMPONENT:
case MD_RS_SUBMIRROR:
case MD_RS_ABR:
break;
default:
}
/* Allocate a resync message, if required */
sizeof (md_mn_msg_resync_t), KM_SLEEP);
}
mn_resync = 1;
}
goto bail_out;
}
/*
* Main resync loop: repeat a full pass whenever the unit lock was
* dropped mid-pass (un_rs_dropped_lock is re-checked at the bottom).
*/
do {
un->un_rs_dropped_lock = 0;
/*
* Always perform an optimized resync first as this will bring
* the mirror into an available state in the shortest time.
* If we are resuming an interrupted resync, other than an
* optimized resync, we save the type and amount done so that
* we can resume the appropriate resync after the optimized
* resync has completed.
*/
}
/*
* If we are continuing a resync that is not an
* OPTIMIZED one, then we start from the beginning when
* doing this optimized resync
*/
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
goto bail_out;
}
/*
* If another node has moved the resync on, we must
* restart the correct resync
*/
if (mn_resync &&
}
/*
* Restore previous resync progress or move onto a
* component resync.
*/
} else {
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
goto bail_out;
}
/*
* If we have moved on from a component resync, another
* node must have completed it and started a submirror
* resync, so leave the resync state alone. For non
* multi-node sets we move onto the submirror resync.
*/
if (mn_resync) {
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
} else {
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
}
goto bail_out;
}
/*
* If we have moved on from a submirror resync, another
* node must have completed it and started a different
* resync, so leave the resync state alone
*/
if (mn_resync) {
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
} else {
/* If non-MN mirror, reinitialize state */
un->un_rs_resync_done = 0;
un->un_rs_resync_2_do = 0;
un->un_resync_startbl = 0;
}
}
} while (un->un_rs_dropped_lock);
resync_finish = 1;
#ifdef DEBUG
if (mirror_debug_flag)
printf("Resync stopped (mnum = %x), resync_finish = %d\n",
#endif
/*
* For MN Set, send a RESYNC_FINISH if this node completed the resync.
* There is no need to grow unit here, it will be done in the
* handler for the RESYNC_FINISH message together with resetting
* MD_UN_RESYNC_ACTIVE.
*/
if (mn_resync) {
if (resync_finish) {
/*
* Normal resync completion. Issue a RESYNC_FINISH
* message if we're part of a multi-node set.
*/
int rval;
rmsg->msg_resync_type = 0;
rmsg->msg_resync_done = 0;
rmsg->msg_resync_2_do = 0;
&un->un_rs_cpr_mx);
"RESYNC_FINISH");
/* If we're shutting down, pause things here. */
while (!md_mn_is_commd_present()) {
}
}
"ksend_message failure: RESYNC_FINISH");
}
(void) md_unit_writerlock(ui);
}
/*
* If the resync has been cancelled, clear flags, reset owner
* for ABR mirror and release the resync region parent
* structure.
*/
/* Resync finished, if ABR set owner to NULL */
un->un_mirror_owner = 0;
}
/* Remove previous overlap resync region */
/*
* Release the overlap range reference
*/
ps);
}
}
/*
* Release resync message buffer. This will be reallocated on
* the next invocation of the resync_unit thread.
*/
}
} else {
/* For non-MN sets deal with any pending grows */
}
}
}
un->un_resync_completed = 0;
/*
* Stop the resync progress thread.
*/
if (un->un_rs_resync_to_id != 0) {
un->un_rs_resync_to_id = 0;
}
/*
* Calling mirror_internal_close() makes further reference to un / ui
* dangerous. If we are the only consumer of the mirror it is possible
* for a metaclear to be processed after completion of the m_i_c()
* routine. As we need to handle the case where another resync has been
* scheduled for the mirror, we raise the open count on the device
* which protects against the close / metaclear / lock => panic scenario
*/
/*
* decrement the mirror resync count
*/
/*
* Remove the thread reference as we're about to exit. This allows a
* subsequent mirror_resync_unit() to start a new thread.
* If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
* called to start a new resync, so reopen the mirror and go back to
* the start.
*/
(void) md_unit_writerlock(ui);
/* Release the reference grabbed above */
goto resync_restart;
}
(void) md_unit_writerlock(ui);
"Could not open metadevice (%x) for resync\n",
}
/*
* Check for hotspares once we've cleared the resync thread reference.
* If there are any errored units a poke_hotspares() will result in
* a call to mirror_resync_unit() which we need to allow to start.
*/
(void) poke_hotspares();
/*
* Remove this thread from the CPR callback table.
*/
if (mn_resync) {
}
/*
* Remove the extra reference to the unit we generated above. After
* this call it is *unsafe* to reference either ui or un as they may
* no longer be allocated.
*/
thread_exit();
}
/*
* mirror_resync_unit:
* ------------------
* Start a resync for the given mirror metadevice. Save the resync thread ID in
* un->un_rs_thread for later manipulation.
*
* Returns:
* 0 Success
* !=0 Error
*/
/*ARGSUSED*/
/*
* NOTE(review): most of the parameter list is elided in this revision;
* per the block comment above this is mirror_resync_unit(), which starts
* (or re-activates) the resync thread for a mirror. Returns 0 on success,
* non-zero on error.
*/
int
md_error_t *ep,
)
{
mdi_unit_t *ui;
}
if (lockp) {
} else {
}
/*
* Check to see if we're attempting to start a resync while one is
* already running.
*/
/*
* Ensure RESYNC_ACTIVE set, it may not be if the resync thread
* is in the process of terminating, setting the flag will
* cause the resync thread to return to the beginning
*/
if (lockp) {
} else {
}
/* Existing thread will pick the work up; nothing more to do. */
return (0);
}
else
un->un_rs_copysize = 0;
/* Start the resync progress thread off */
un->un_rs_progress_flags = 0;
/*
* We have to store the thread ID in the unit structure so do not
* drop writerlock until the thread is active. This means resync_unit
* may spin on its first md_unit_readerlock(), but deadlock won't occur.
*/
if (lockp) {
} else {
}
} else {
if (lockp) {
} else {
}
}
return (0);
}
/*
* mirror_ioctl_resync:
* -------------------
* Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
* or kill the resync thread associated with the specified unit.
* Can return with locks held since mdioctl will free any locks
* that are marked in lock->l_flags.
*
* Returns:
* 0 Success
* !=0 Error Code
*/
/*
* NOTE(review): parameter list elided; per the block comment above this is
* mirror_ioctl_resync(), the MD_IOCSETSYNC handler.
*/
int
)
{
int smi;	/* submirror index used in the bitmap scan below */
}
/* RD_LOCK flag grabs the md_ioctl_readerlock */
}
}
return (0);
}
/*
* Determine the action to take based on the ri_flags field:
* MD_RI_BLOCK: Block current resync thread
* MD_RI_UNBLOCK: Unblock resync thread
* MD_RI_KILL: Abort resync thread
* MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
* without using rpc.mdcommd messages.
* any other: Start resync thread
*/
case MD_RI_BLOCK:
/* Halt resync thread by setting flag in un_rs_flags */
return (0);
}
return (0);
case MD_RI_UNBLOCK:
/*
* Restart resync thread by clearing flag in un_rs_flags and
* cv_signal'ing the blocked thread.
*/
return (0);
}
return (0);
case MD_RI_KILL:
/* Abort resync thread. */
return (0);
}
/* 'tid' presumably holds the resync thread id captured above -- confirm */
if (tid != 0) {
}
}
return (0);
}
bits = 0;
continue;
}
if (bits != 0)
/*
* If we are resyncing a mirror in a MN set and the rpc.mdcommd
* can be used, we do not start the resync at this point.
* Instead, the metasync command that issued the ioctl
* will send a RESYNC_STARTING message to start the resync thread. The
* reason we do it this way is to ensure that the metasync ioctl is
* executed on all nodes before the resync thread is started.
*
* If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
* don't use rpc.mdcommd, but just start the resync thread. This
* flag is set on a node when it is being added to a diskset
* so that the resync threads are started on the newly added node.
*/
if ((!(MD_MNSET_SETNO(setno))) ||
} else {
return (0);
}
}
/*
* NOTE(review): name/parameters elided. The resend_mmrr label target and the
* RR_DIRTY message traffic suggest this is the non-owner-node path for
* marking a resync region dirty -- TODO confirm against the complete source.
* Returns 0 on success, 1 if the message send to the owner failed.
*/
int
{
int no_change;
int rval;
return (0);
/*
* Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
* not, allocate it and then fill the [start..end] entries.
* Update un_pernode_dirty_sum if we've gone 0->1.
* Update un_dirty_bm if the corresponding entries are clear.
*/
(uchar_t *)kmem_zalloc(
}
no_change = 1;
}
no_change = 0;
no_change = 0;
}
if (no_change) {
/* Every region in [start..end] was already dirty; nothing to send. */
return (0);
}
/*
* If we have dirty regions to commit, send a
* message to the owning node so that the
* in-core bitmap gets updated appropriately.
* TODO: make this a kmem_cache pool to improve
*/
KM_SLEEP);
KM_SLEEP);
/* release readerlock before sending message */
sizeof (md_mn_msg_rr_dirty_t), kres);
/* reacquire readerlock on message completion */
(void) md_unit_readerlock(ui);
/* if the message send failed, note it, and pass an error back up */
/* if commd is gone, no point in printing a message */
if (md_mn_is_commd_present())
return (1);
}
/*
* if the owner changed while we were sending the message, and it's
* not us, the new mirror owner won't yet have done the right thing
* with our data. Let him know. If we became the owner, we'll
* deal with that differently below. Note that receiving a message
* about another node twice won't hurt anything.
*/
goto resend_mmrr;
/*
* If we became the owner while we were sending the message,
* we have dirty bits in the un_pernode_bm that aren't yet reflected
* in the un_dirty_bm, as it was re-read from disk, and our bits
* are also not reflected in the on-disk DRL. Fix that now.
*/
if (MD_MN_MIRROR_OWNER(un)) {
}
return (0);
}
/*
* NOTE(review): name/parameters elided. The 'mnset' conditionals and the
* waiting_to_mark / waiting_to_commit hand-off suggest this is the
* owner-node (and non-MN) path for marking a resync region dirty -- TODO
* confirm against the complete source. Returns 0 on success, 1 if
* interrupted (panicstr checks below).
*/
int
{
int no_change;
return (0);
/*
* Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
* not, allocate it and then fill the [start..end] entries.
* Update un_pernode_dirty_sum if we've gone 0->1.
* Update un_dirty_bm if the corresponding entries are clear.
*/
if (mnset) {
(uchar_t *)kmem_zalloc(
}
}
if (mnset)
no_change = 1;
if (mnset) {
}
no_change = 0;
no_change = 0;
}
if (mnset)
if (no_change) {
return (0);
}
/* Register as a marker so committers can coordinate with us. */
un->un_waiting_to_mark++;
if (panicstr)
return (1);
}
un->un_waiting_to_mark--;
no_change = 1;
no_change = 0;
} else {
no_change = 0;
}
}
if (no_change) {
return (0);
}
/* Wait for concurrent markers before committing the RR to disk. */
while (un->un_waiting_to_mark != 0 &&
if (panicstr)
return (1);
}
}
if (panicstr)
return (1);
}
if (--un->un_waiting_to_commit == 0) {
}
return (0);
}
/*
* NOTE(review): name/parameters elided. Appears to be a thin dispatcher that
* forwards (startblk, endblk, source_node) to one of the two mark-resync-
* region variants above -- TODO confirm against the complete source.
*/
int
{
endblk, source_node));
} else {
source_node));
}
}
/*
* NOTE(review): name/parameters elided. This coarsens the resync-region
* bitmaps when the region count exceeds MD_MAX_NUM_RR: the region count is
* halved (and rr_mult doubled) until it fits, then the old per-region state
* is folded into the new, coarser records. Returns 0 on success, -1 if the
* new optimized record cannot be created (recid < 0).
*/
int
{
short *owp;	/* presumably the outstanding-writes array -- confirm */
int old_bm_size, new_bm_size;
int i, j;
while (new_nregions > MD_MAX_NUM_RR) {
new_nregions >>= 1;
rr_mult <<= 1;
}
if (recid < 0)
return (-1);
new_nregions * sizeof (short), KM_SLEEP);
if (old_pns)
KM_SLEEP);
/*
* Now translate the old records into the new
* records
*/
for (i = 0; i < old_nregions; i++) {
/*
* only bring forward the
* outstanding write counters and the dirty bits and also
* the pernode_summary counts
*/
if (!isset(old_dirty_bm, i))
continue;
if (old_pns)
}
if (old_pns)
/*
* Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
*/
for (j = 0; j < MD_MNMAXSIDES; j++) {
if (old_dirty_bm) {
for (i = 0; i < old_nregions; i++) {
if (!isset(old_dirty_bm, i))
continue;
(i / rr_mult));
}
}
}
/* Save the old record id */
/* Update the mirror unit struct */
/*
* NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
* instead of using mddb_commitrecs_wrapper, is that you cannot
* atomically commit optimized records.
*/
return (0);
}
/* lockp can be NULL for !MN disksets */
/*
* NOTE(review): name/parameters elided. Grows the resync-region bitmaps to a
* larger region count (the counterpart of the shrink routine above),
* reallocating the listed in-core arrays and copying old contents forward.
* Returns 0 on success, -1 if the new optimized record cannot be created.
*/
int
{
short *owp;	/* presumably the outstanding-writes array -- confirm */
int old_bm_size, new_bm_size;
int i;
if (recid < 0)
return (-1);
/* Copy the old bm over the new bm */
/*
* Create new bigger incore arrays, copy, and free old ones:
* un_goingdirty_bm
* un_goingclean_bm
* un_resync_bm
* un_outstanding_writes
* un_pernode_dirty_sum
* un_pernode_dirty_bm[]
*/
old_nregions * sizeof (short));
if (old) {
}
for (i = 0; i < MD_MNMAXSIDES; i++) {
if (old) {
}
}
/* Save the old record id */
/* Update the mirror unit struct */
/*
* NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
* instead of using mddb_commitrecs_wrapper, is that you cannot
* atomically commit optimized records.
*/
return (0);
}
/*
* mirror_copy_rr:
* --------------
* Combine the dirty record bitmap with the in-core resync bitmap. This allows
* us to carry a resync over an ownership change.
*/
/*
* NOTE(review): parameter list elided; per the block comment above this is
* mirror_copy_rr(), OR-merging 'sz' bytes of the dirty-region bitmap into
* the in-core resync bitmap.
*/
void
{
int i;
for (i = 0; i < sz; i++)
}
/*
* mirror_set_dirty_rr:
* -------------------
* Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
* Called on every clean->dirty transition for the originating writer node.
* Note: only the non-owning nodes will initiate this message and it is only
* the owning node that has to process it.
*/
/*
* NOTE(review): parameter list elided; per the block comment above this is
* mirror_set_dirty_rr(). Returns 0 in all visible paths.
*/
int
{
}
/* Must have _NO_ ioctl lock set if we update the RR on-disk */
}
}
}
return (0);
}
/*
* Only process this message if we're the owner of the mirror.
*/
if (!MD_MN_MIRROR_OWNER(un)) {
return (0);
}
orignode));
}
/*
* mirror_clean_rr_bits:
* --------------------
* Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
* Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
* is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
* nodes. Callable from ioctl / interrupt / whatever context.
* un_resync_mx is held on entry.
*/
/*
* NOTE(review): parameter list elided; per the block comment above this is
* mirror_clean_rr_bits(). Decrements per-node dirty reference counts and,
* for regions whose count reaches zero, clears the dirty/going-clean bits;
* any cleared bits are flushed to the replica if we still own the mirror.
*/
static void
{
cleared_bits = 0;
un->un_pernode_dirty_sum[i]--;
}
if (un->un_pernode_dirty_sum[i] == 0) {
/* Last reference gone: region is now cleanable. */
cleared_bits++;
CLR_REGION_DIRTY(i, un);
CLR_GOING_CLEAN(i, un);
}
}
}
if (cleared_bits) {
/*
* We can only be called iff we are the mirror owner, however
* as this is a (potentially) decoupled routine the ownership
* may have moved from us by the time we get to execute the
* bit clearing. Hence we still need to check for being the
* owner before flushing the DRL to the replica.
*/
if (MD_MN_MIRROR_OWNER(un)) {
}
}
}
/*
* mirror_drl_task:
* ---------------
* Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
* We need to obtain exclusive access to the un_resync_cv and then clear the
* necessary bits.
* On completion, we must also free the passed in argument as it is allocated
* at the end of the ioctl handler and won't be freed on completion.
*/
/*
* Taskq service routine (see block comment above): takes exclusive access
* to the resync state, clears the requested DRL bits, and frees 'arg',
* which was allocated by the MD_MN_RR_CLEAN ioctl handler.
*/
static void
mirror_drl_task(void *arg)
{
}
}
/*
* mirror_set_clean_rr:
* -------------------
* Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
* Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
* is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
* nodes.
*
* Only the mirror-owner need process this message as it is the only RR updater.
* Non-owner nodes issue this request, but as we have no point-to-point message
* support we will receive the message on all nodes.
*/
/*
* NOTE(review): parameter list elided; per the block comment above this is
* mirror_set_clean_rr(). Returns 0 for non-owner / no-op paths, otherwise
* 'rval'.
*/
int
{
int can_clear = 0;	/* set if the un_dirty_bm[] bits can be cleared inline */
int rval = 0;
}
/* Must have _NO_ ioctl lock set if we update the RR on-disk */
}
}
return (0);
}
/*
* Check to see if we're the mirror owner. If not, there's nothing
* for us to do.
*/
if (!MD_MN_MIRROR_OWNER(un)) {
return (0);
}
/*
* Process the to-be-cleaned bitmap. We need to update the pernode_dirty
* bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
* we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
* we can just defer this cleaning until the next process_resync_regions
* timeout.
*/
}
/*
* See if we can simply clear the un_dirty_bm[] entries. If we're not
* the RR bitmaps, we can simply update the bits as needed.
* If we're the owning node and _not_ the issuing node, we should also
* sync the RR if we clear any bits in it.
*/
if (can_clear) {
if (un->un_waiting_to_mark != 0 ||
un->un_waiting_to_clear != 0) {
}
}
/*
* If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
* we must schedule a blocking call to update the DRL on this node.
* As we're invoked from an ioctl we are going to have the original data
* disappear (kmem_free) once we return. So, copy the data into a new
* structure and let the taskq routine release it on completion.
*/
if (!can_clear) {
}
}
return (rval);
}