/* sd_ft.c revision fcf3ce441efd61da9bb2884968af01cb7c1452cc */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/nsc_thread.h>
#include "sd_bcache.h"
#include "sd_ft.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_misc.h"
static volatile int _sd_ft_exit = 0;
static kcondvar_t _sd_ft_cv;
int _sd_node_recovery; /* node recovery in progress */
/*
* _sd_async_recovery:
* 0 = flush and wait
* 1 = clone and async-write
* 2 = quicksort, clone, and async-write
* quicksort allows contiguous blocks to be joined,
* which may greatly improve recovery time for raid devices.
* if kmem_alloc fails, acts as _sd_async_recovery == 1
*/
static int _sd_async_recovery = 2;
/*
* flag to inhibit reset of remote SCSI buses and sending of
* nodedown callback if mirror was deconfigured properly.
* - prevents trashing any I/O that may be happening on the mirror
* node during a normal shutdown and prevents undesired simckd failover.
*/
static int mirror_clean_shutdown = 0;
/*
* Forward declare all statics that are used before defined to enforce
* parameter checking
* Some (if not all) of these could be removed if the code were reordered
*/
static void _sd_health_thread(void);
static void _sd_cache_recover(void);
static int _sd_ft_clone(ss_centry_info_t *, int);
static void _sd_remote_enable(void);
static void sdbc_setmodeandftdata();
static void _sd_cd_discard_mirror(int cd);
static int _sd_failover_file_open(void);
static void _sd_failover_done(void);
static void _sd_wait_for_dirty(void);
static void _sdbc_clear_warm_start(void);
static int sdbc_recover_vol(ss_vol_t *, int);
void _ncall_poke(int);
int _sdbc_ft_hold_io;
extern int sdbc_use_dmchain;
/*
* _sdbc_ft_unload - cache is being unloaded (or failed to load).
*/
void
_sdbc_ft_unload(void)
{
	/* no fault-tolerance state to tear down in this revision */
}
/*
 * _sdbc_ft_load - cache is being loaded; set up any fault-tolerance data
 * that we need. Return 0 if we succeed. If we fail return -1 (don't
 * need to do the unload step as we expect our caller to do that).
 */
int
_sdbc_ft_load(void)
{
	/*
	 * Static ft data starts out zeroed (BSS), so no explicit
	 * initialization is required here.
	 */
	return (0);
}
/*
 * _sdbc_ft_configure - start the fault-tolerance health daemon.
 * Returns the result of nsc_create_process().
 */
int
_sdbc_ft_configure(void)
{
	int rc;

	/* allow the health thread's main loop to run before spawning it */
	_sd_ft_exit = 1;
	rc = nsc_create_process(
	    (void (*)(void *))_sd_health_thread, 0, TRUE);
	return (rc);
}
/*
 * _sdbc_ft_deconfigure - stop fault-tolerance processing.
 * Clears the run flag read by _sd_health_thread and cancels any
 * recovery-in-progress indication.  Order preserved: the health
 * thread polls these concurrently.
 */
void
_sdbc_ft_deconfigure(void)
{
_sd_ft_exit = 0;
_sd_node_recovery = 0;
}
/*
* _sd_health_thread -- daemon thread on each node watches if mirror
* node to has crashed, and it needs to flush the mirrors cache entries.
* Note we do *not* detect that the node has come up again, but wait
* for the node to inform us that it is up via _sd_cache_reenable().
*/
/*
 * NOTE(review): this function body appears to have lost source lines in
 * this revision (orphaned string-literal fragments at e.g. the DEBUG
 * message, an "else" with no statement, missing cmn_err calls).  It will
 * not compile as-is; restore from the upstream revision before use.
 * The comments below annotate only the surviving structure.
 */
static void
_sd_health_thread(void)
{
int warm_started = 0;
/* clear _sd_ft_data in case this is a cache re-enable w/o unload */
#ifdef DEBUG
"is %s. Fast writes %s",
"disabled" : "enabled");
#endif
/* CONSTCOND */
while (1) {
/* _sd_ft_exit == 0 means deconfigure was requested: exit thread */
if (_sd_ft_exit == 0) {
return;
}
/* NB evaluation order is important here for nvmem systems */
if (_sd_is_mirror_crashed() ||
(warm_started = _sdbc_warm_start())) {
/*
* Hash invalidate here. We do not want data from
* previous failover incarnation to be cache hits, if
* the 2 failover happens within a short time
*/
/*
* don't change mirror state when warm starting
* nvmem systems. _sd_mirror_down() is called in
* in _sd_remote_enable() on nvmem systems if the
* media is down.
*/
if (!warm_started)
if (!mirror_clean_shutdown)
else
(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
if (!warm_started) {
/* was FAST */
_sd_node_recovery = 0;
/* was FAST */
/* Assume other side is still up */
"sdbc(_sd_health_thread)"
"Safestore is down. Fast writes %s",
"disabled" : "enabled");
continue;
/* Wait for cache to drain and panic */
"sdbc(_sd_health_thread)"
" dirty blocks flushed");
continue;
}
/* was FAST */
_sd_node_recovery = 1;
/* was FAST */
if (!SAFESTORE_LOCAL(sdbc_safestore))
"sdbc(_sd_health_thread)"
" Cache on node %d is down. "
"Fast writes %s",
"disabled" : "enabled");
"sdbc(_sd_health_thread)"
" Cache recovery in progress");
_sd_node_recovery = 0;
_sdbc_clear_warm_start(); /* nvmem systems */
"sdbc(_sd_health_thread) %s Cache recovery done",
"asynchronous" : "synchronous");
/* restore previous state */
if (warm_started && !_sd_is_mirror_down()) {
(void) _sd_clear_node_hint(NSC_FORCED_WRTHRU);
"sdbc(_sd_health_thread) Fast writes %s",
"disabled" : "enabled");
}
warm_started = 0;
} else if (_sd_is_mirror_node_down()) {
}
}
}
/*
* _sdbc_recovery_io_wait - wait for i/o being done directly
* out of safe storage to complete. If the i/o does not make any
* progress within about 25 seconds we return EIO otherwise return 0.
*
*/
/*
 * NOTE(review): lines are missing here (the "} else {" has no matching
 * "if", and the delay/progress-check statements are gone) -- this body
 * will not compile as-is.  Restore from upstream before use.
 */
static
int
_sdbc_recovery_io_wait(void)
{
int tries = 0;
int last_numio = 0;
/*
* Wait for numio to reach 0.
* If numio has not changed for 85+ seconds,
* break & pin blocks
*/
while (_sd_ft_data.fi_numio > 0) {
if (++tries > 512) break;
} else {
tries = 0;
}
}
/* i/o never drained: report failure so blocks get pinned */
if (_sd_ft_data.fi_numio != 0) {
return (EIO);
}
return (0);
}
#if defined(_SD_FAULT_RES)
/*
* _sd_recovery_wait()
* while _sd_node_recovery is set, accesses to mirrored devices will block
* (_sd_node_recovery-1) is count of blocked threads.
*/
/*
 * NOTE(review): body is truncated -- "blk" is read uninitialized and the
 * blocking/wakeup logic described in the comment above is missing.
 * Restore from upstream before use.
 */
int
_sd_recovery_wait(void)
{
int blk;
if (blk)
if (!_sd_cache_initialized)
return (EINVAL);
return (0);
}
/*
* _sd_recovery_wblk_wait - wait for recovery i/o to a device
* to cease. If the file is closed or the cache is disabled
* first return an error otherwise return 0.
*
* A device is being recovered from our point of view either
* during failover or by putting a disk back online after
* a disk failure.
*
* This code is used to delay access to a device while recovery
* writes are in progress from either a failover or while flushing
* i/o after a failed disk has been repaired.
*/
/*
 * NOTE(review): the while-condition is cut short and the loop body /
 * writer-spawn logic is missing; the two returns below are the surviving
 * error (file closed / cache disabled) and success paths.  Restore from
 * upstream before use.
 */
int
_sd_recovery_wblk_wait(int cd)
{
while (_sd_cache_initialized &&
/* spawn writer if none */
}
return (EINVAL);
return (0);
}
/*
* Recover from a crash of another node:
*
* 1) Open all remote files
* 2) Allocate other node's buffers and new buffer headers
* 3) Flush all dirty buffers to disk
* 4) Deallocate resources
*/
/*
 * NOTE(review): the open/flush steps (2-4 of the algorithm described
 * above) and the cmn_err call for the trailing message are missing;
 * cblocks_processed is never assigned.  Restore from upstream before use.
 */
static void
_sd_cache_recover(void)
{
int cblocks_processed;
/* was FAST */
_sd_ft_data.fi_numio = 0;
/* was FAST */
#ifdef _SD_DRIVE_RESP
if (!mirror_clean_shutdown)
#endif
/* allow cache config to proceed */
_sdbc_ft_hold_io = 0;
/* wait for sequential recovery to complete */
if (!_sd_async_recovery && cblocks_processed)
(void) _sdbc_recovery_io_wait();
if (cblocks_processed)
"sdbc %ssynchronous recovery complete "
"%d cache blocks processed",
}
/*
 * _sd_mirror_iodone - i/o completion callback for recovery writes.
 * NOTE(review): body reduced to placeholder comments in this revision;
 * presumably it decremented _sd_ft_data.fi_numio under protection --
 * confirm against the upstream source.
 */
void
_sd_mirror_iodone(void)
{
/* was FAST */
/* was FAST */
}
/*
* _sd_ft_clone -- clone cache block from ft area, retry write or pin.
*/
/*
 * NOTE(review): the signature line (per the forward declaration:
 * _sd_ft_clone(ss_centry_info_t *, int)) and much of the body are
 * missing -- this cannot compile as-is.  Restore from upstream.
 */
static int
{
return (-1);
}
/*
* allocate new cache entry and read data
*/
CACHE_BLOCK_SIZE, 0) == SS_ERR) {
"pinned data block failed. cannot recover "
/* _sd_process_failure ?? */
return (-1);
}
/*
* _sd_process_failure() adds to failed list & does pinned callback
* otherwise async flush
*/
(void) _sd_process_failure(ent);
} else {
}
if (async) {
} else {
/*
* this is sync write with asynchronous callback
* (queue to disk and return).
*/
}
}
return (0);
}
/*
* _sd_repin_cd - scan for dirty blocks held by mirror node.
*
* sdbc on this node is being attached to cd. If sdbc on other
* node had failed writes (pinnable or not) we need to take
* responsbility for them now here.
*/
/*
 * NOTE(review): the duplicated "return (0);" indicates lost lines
 * between them (the actual repin scan is gone).  Only the open-check
 * guard survives.  Restore from upstream before use.
 */
int
_sd_repin_cd(int cd)
{
if (!FILE_OPENED(cd))
return (EINVAL);
return (0);
return (0);
}
/*
 * _sd_cache_mirror_enable (name inferred from the message text below) --
 * mirror node reports its cache is up; refresh pinned data and re-enable
 * fast writes.
 * NOTE(review): the signature line and several cmn_err calls are missing
 * (orphaned format-string fragments remain).  Restore from upstream.
 */
static int
{
if (_sd_cache_initialized) {
if (host != _SD_MIRROR_HOST) {
"Configured mirror %x. Got message from %x",
return (-EINVAL);
}
/* block until any in-progress recovery completes */
if (_sd_node_recovery) (void) _sd_recovery_wait();
if (_sd_cache_initialized && _sd_is_mirror_down()) {
int i;
/* make sure any pinned data we have is now refreshed */
for (i = 0; i < sdbc_max_devs; i++)
if (FILE_OPENED(i))
(void) _sdbc_remote_store_pinned(i);
"sdbc(_sd_cache_mirror_enable) Cache on "
"mirror node %d is up. Fast writes enabled",
host);
(void) _sd_clear_node_hint(NSC_FORCED_WRTHRU);
}
}
return (_sd_cache_initialized);
}
/*
* two stage mirror disable:
* stage 0: set FORCED_WRTHRU hint (cache shutdown started)
* stage 1: mirror shutdown completed
*/
/*
 * Two-stage mirror disable handler (likely _sd_cache_mirror_disable --
 * the signature line is missing, so the name and parameters (host,
 * stage) are inferred from the surviving body).
 * NOTE(review): stage-1 handling and the else-branch body are missing.
 * Restore from upstream before use.
 */
static int
{
if (_sd_cache_initialized) {
if (host != _SD_MIRROR_HOST)
return (0);
if (stage == 0) {
/* stage 0: cache shutdown started -- force write-through */
(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
return (0);
}
} else {
}
return (0);
}
/*
* set the fault tolerant data to indicate the state
* of the safestore host. set mode to writethru if appropriate
*/
/*
 * sdbc_setmodeandftdata (name inferred from the forward declaration and
 * the comment above) -- record safestore host state and force
 * write-through mode where the safestore cannot protect data.
 * NOTE(review): the signature line and the trailing else-branch
 * statement are missing.  Restore from upstream before use.
 */
static void
{
/*
* if single node local safestore or ram safestore
* then mark host state as crashed/_SD_HOST_NONE and set writethru
*/
if (SAFESTORE_LOCAL(sdbc_safestore)) {
if (!SAFESTORE_SAFE(sdbc_safestore)) {
_sd_mirror_down(); /* mirror node down */
(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
} else {
/* nvmem safestore: force write-through only during warm start */
if (_sdbc_warm_start())
(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
}
} else
}
/*
 * _sd_remote_enable - contact the mirror node and derive local mirror
 * state from the reply (r).
 * NOTE(review): the call that assigns "r" and the first guard condition
 * are missing -- "r" is read uninitialized as shown.  Restore from
 * upstream before use.
 */
static void
_sd_remote_enable(void)
{
long r;
_sd_mirror_down(); /* mirror node down */
(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
return;
}
if (r == 1) { /* _sd_cache_initialized */
if (!_sd_is_mirror_crashed() &&
return;
}
if (r == ENOLINK)
_sd_mirror_down(); /* mirror node down */
else
_sd_mirror_cache_down(); /* mirror up, but no cache */
(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
}
/*
 * _sd_remote_disable - notify the mirror of a staged local shutdown.
 * NOTE(review): body is empty in this revision; presumably it forwarded
 * "stage" to the mirror node -- confirm against upstream.
 */
void
_sd_remote_disable(int stage)
{
}
/*
 * NOTE(review): the names of these two functions were lost; given the
 * #else branch below defines r_sd_ifs_cache_enable/disable, these are
 * presumably the _SD_FAULT_RES variants of the same pair -- confirm
 * against upstream.
 */
void
{
}
void
{
}
#else /* (_SD_FAULT_RES) */
/* no-op stub: fault tolerance compiled out */
void r_sd_ifs_cache_enable() { }
/* no-op stub: fault tolerance compiled out */
void r_sd_ifs_cache_disable() { }
#endif /* (_SD_FAULT_RES) */
/*
* invalidate cache hash table entries for given device
* or (-1) all devices belonging to mirrored node
*/
/*
 * NOTE(review): many lines missing here (hptr is never initialized, the
 * device-match condition is cut mid-expression, "(ent);" is an orphaned
 * call fragment, two bare "} else" arms have no statements).  Will not
 * compile as-is; restore from upstream before use.
 */
void
_sd_hash_invalidate_cd(int CD)
{
int i;
int cd;
/* walk every hash bucket */
for (i = 0; i < (_sd_htable->ht_size); i++) {
while (hptr) {
/*
* Skip if device doesn't match or pinned.
* (-1) skip attached cd's
*/
if ((CD != -1 &&
continue;
}
fl1:
_sd_htable))) {
/* another thread owns the entry -- retry until free */
if (SET_CENTRY_INUSE(ent)) {
goto fl1; /* try again */
}
/* cc_inuse is set, delete on block match */
(void)
_sd_hash_delete((struct _sd_hash_hd *)
ent, _sd_htable);
if (sdbc_use_dmchain) {
/* attempt to que head */
if (ent->cc_alloc_size_dm) {
(ent);
}
} else
} else
}
}
}
}
/*
* _sd_cd_online(cd,discard)
* clear local error state.
* if (discard && _attached != _SD_SELF_HOST) then release buffers.
* (add to dirty pending queue).
* returns:
* 0 success
* EINVAL invalid device or not failed
* EBUSY attached by this node, or by active mirror
*/
/*
 * _sd_cd_online (per the contract comment above: clear local error
 * state; optionally discard buffers) -- the signature line is missing,
 * as are the guard conditions for the bare "return" statements, the
 * loop header around num/cc_ent, and several locking calls.
 * NOTE(review): will not compile as-is; restore from upstream.
 */
static int
{
/*
* in the case where a failed device has been closed and
* then re-opened, sh_failed will be zero because it is
* cleared in _sd_open_cd(). hence the test for
* _pinned != _SD_SELF_HOST which allows the restore to
* proceed in this scenario.
*/
return (EINVAL);
return (EINVAL);
return (0);
if (_sd_nodes_configured > 1) {
/* can't discard while attached on multinode systems */
return (EBUSY);
if (!discard && /* attached by active mirror! */
return (EBUSY);
}
return (0);
}
/* prevent any new i/o from arriving for this cd */
if (!discard)
num = 0;
num++;
if (discard) {
/* was FAST */
/* was FAST */
if (wctl) {
}
continue;
}
/* Clear PEND_DIRTY, iocount & iostatus */
if (SET_CENTRY_INUSE(cc_ent) == 0) {
cc_ent->cc_iocount = 0;
} else {
/* was FAST */
cc_ent->cc_iocount = 0;
/* was FAST */
}
}
}
if (discard) {
return (0);
}
/* make sure data gets flushed in case there is no new I/O */
(void) _sd_wait_for_flush(cd);
cdi->cd_recovering = 0;
return (0);
}
#if defined(_SD_FAULT_RES)
/*
* This node has disk attached, discard pins held by mirror
*/
static void
_sd_cd_discard_mirror(int cd)
{
	/*
	 * NOTE(review): currently a no-op stub in this revision; the
	 * discard logic described above appears to have been lost --
	 * confirm against upstream before relying on this.
	 */
	(void) cd;
}
/*
 * NOTE(review): this function's name line and body are missing; only a
 * cache-initialized guard survives.  Restore from upstream.
 */
void
{
if (_sd_cache_initialized) {
}
}
/*
* _sd_failover_file_open -
* on failover, open devices which are not attached by this node.
*/
/*
 * NOTE(review): the per-device loop header, the cd_gl/rc declarations
 * and the open call itself are missing -- orphaned "continue" statements
 * and message fragments remain.  Will not compile as-is; restore from
 * upstream before use.
 */
static int
_sd_failover_file_open(void)
{
int cblocks_processed = 0;
extern ss_voldata_t *_sdbc_gl_file_info;
/*
* If the cd is open and reserved we certainly don't
* need to do it again. However the recovery code
* must be racing some other cache usage which could
* be bad. We really need to be able to lock out
* all cache activity for this cd that is not tied
* to the recovery process. This doesn't seem to be
* feasible in sdbc since a competing thread could
* already be finished doing an alloc_buf. If this
* hole is to be closed sd-ctl must be more in
* control of the failover process.
*/
continue;
/*
* this construct says that, on non-nvmem systems,
* if we are attempting to open a "local" device and
* nothing is pinned, then continue. i.e. open only
* remote devices or devices that have pinned data.
* for recovery on nvmem systems we open all devices.
*/
if ((!_sdbc_warm_start()) &&
continue;
if (!cd_gl->sv_volname ||
!cd_gl->sv_volname[0])
continue;
"Unable to open disk partition %s",
cd_gl->sv_volname);
continue;
}
if (rc == 0) {
}
}
return (cblocks_processed);
}
/*
 * sdbc_recover_vol (name from the forward declaration and the message
 * text below) -- replay a volume's dirty cache entries from safestore.
 * NOTE(review): the signature line, stream-open call, loop-exit
 * condition, and _sd_ft_clone invocation are missing.  Will not compile
 * as-is; restore from upstream before use.
 */
static int
{
int cblocks_processed = 0;
int err;
/* setup the key to get a volume directory stream of centrys */
"cannot recover volume %s",
cd_gl->sv_volname);
return (0);
}
/* cycle through the cdir getting resource tokens and reading centrys */
/*CONSTANTCONDITION*/
while (1) {
== SS_ERR) {
"cache entry read failure %s %p",
continue;
}
break; /* done */
/*
* this get into double caching consistency
* need to resolve this jgk
*/
/* should not happen */
continue;
}
/*
* note
* ss should return a stream of dirty blocks ordered
* by block number. if it turns out that ss will not support
* this then sorting for async recovery will have to be
* done here jgk
*/
continue;
/*
* clone mirror cache entry and do
* async I/O or sync I/O or pin if sh_failed
*/
}
if (cblocks_processed)
"sdbc(sdbc_recover_vol) %d cache blocks processed for volume %s",
return (cblocks_processed);
}
/*
* _sd_failover_done -
* mark failover open'd devices as requiring nsc_release()
* when all queued I/O's have drained.
*/
/*
 * NOTE(review): the per-cd loop and release-marking logic are missing
 * (the stray closing brace shows a lost loop body).  Restore from
 * upstream before use.
 */
static void
_sd_failover_done(void)
{
int cd;
}
}
#endif /* (_SD_FAULT_RES) */
/*
* _sd_uncommit - discard local buffer modifications
* clear the valid bits.
*/
/*
 * _sd_uncommit (per the contract comment above) -- discard local buffer
 * modifications by clearing valid bits.
 * NOTE(review): the name line and most parameters of the signature are
 * missing (only the trailing "int flag)" survives), as are the loop
 * over cache blocks and several conditions.  Will not compile as-is;
 * restore from upstream before use.
 */
int
int flag)
{
int cd;
int bits;
return (EINVAL);
}
/* nothing to do for a zero-length request */
if (fba_len == 0) {
return (NSC_DONE);
}
end_cblk_len = 0;
}
else
/*
* Check if remote write-cache spool is dirty,
* if not, we can just discard local valid bits.
*/
cc_len -= st_cblk_len;
}
#if defined(_SD_DEBUG)
if (cc_len != end_cblk_len)
#endif
if (cc_len) {
}
return (NSC_DONE);
}
/*
 * _sd_wait_for_dirty - block until no cache device has dirty write
 * blocks outstanding.
 * NOTE(review): the for-loop over cd and the delay inside the while are
 * missing.  Restore from upstream before use.
 */
static void
_sd_wait_for_dirty(void)
{
int cd;
while (_SD_CD_WBLK_USED(cd))
}
}
/*
* _sd_wait_for_flush - wait for all i/o for this cd to cease.
* This function assumes that no further i/o are being issued
* against this device. This assumption is enforced by sd-ctl
* when called from _sd_flush_cd. Recovery also uses this
* wait and it enforces this assumption (somewhat imperfectly)
* by using cd_recovering.
* We must see progress in getting i/o complete within 25 seconds
* or we will return an error. If we complete normally (all i/o done)
* we return 0.
*/
/*
 * NOTE(review): the polling loop header, the tries/last/inprogress
 * declarations, the progress comparison, and the cmn_err call are
 * missing -- orphaned message fragments and a dangling "else" remain.
 * Will not compile as-is; restore from upstream before use.
 */
int
_sd_wait_for_flush(int cd)
{
/* nothing outstanding: done immediately */
if (!(_SD_CD_WBLK_USED(cd)))
return (0);
/*
* Wait for WBLK_USED to reach 0.
* If unchanged for 32+ seconds returns EAGAIN
*/
break;
if (++tries > 128) {
"%s still has %d blocks pending %d"
" in progress (@ %lx)",
inprogress, nsc_lbolt());
return (EAGAIN);
}
} else {
tries = 0;
}
}
return (EIO);
else
return (0);
}
/* nonzero while the cache is recovering from a warm (nvmem) restart */
static int _sd_ft_warm_start;

/*
 * _sdbc_warm_start - report whether a warm start is in progress.
 */
int
_sdbc_warm_start(void)
{
	return (_sd_ft_warm_start);
}
/*
 * _sdbc_clear_warm_start - reset the warm-start flag (called once
 * recovery completes on nvmem systems).
 */
void
_sdbc_clear_warm_start(void)
{
_sd_ft_warm_start = 0;
}
/*
 * _sdbc_set_warm_start - flag that the next cache start is a warm
 * (nvmem) restart requiring recovery.
 */
void
_sdbc_set_warm_start(void)
{
_sd_ft_warm_start = 1;
}
/*ARGSUSED*/
/*
 * _ncall_poke - inter-node wakeup hook; a no-op in this build
 * (ARGSUSED: "host" is intentionally ignored).
 */
void
_ncall_poke(int host)
{
}