nfs4_recovery.c revision 108322fb1c3ed341aba9c80c9774df0ed9e35768
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* NFS Version 4 state recovery code.
*/
#include <nfs/nfs4_clnt.h>
/*
* Information that describes what needs to be done for recovery. It is
* passed to a client recovery thread as well as passed to various recovery
* routines. rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
* vnode(s) affected by recovery. rc_vp1 and rc_vp2 are references (use
* VN_HOLD) or NULL. rc_lost_rqst contains information about the lost
* lock or open/close request, and it holds reference counts for the
* various objects (vnode, etc.). The recovery thread also uses flags set
* in the mntinfo4_t or vnode_t to tell it what to do. rc_error is used
* to save the error that originally triggered the recovery event -- will
* later be used to set mi_error if recovery doesn't work. rc_bseqid_rqst
* contains information about the request that got NFS4ERR_BAD_SEQID, and
* it holds reference counts for the various objects (vnode, open owner,
* open stream, lock owner).
*/
typedef struct {
/*
 * Error that originally triggered the recovery event; saved so it can
 * later be used to set mi_error if recovery doesn't work.
 * NOTE(review): the descriptive comment for this struct also mentions
 * rc_mi, rc_vp1, rc_vp2, rc_lost_rqst and rc_bseqid_rqst, which are not
 * visible in this view of the definition -- confirm against the full
 * source.
 */
int rc_error;
} recov_info_t;
/*
* How long to wait before trying again if there is an error doing
* recovery, in seconds.
*/
static int recov_err_delay = 1;
/*
* How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
* errors. Expressed in seconds. Default is defined as
* NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
*/
time_t nfs4err_delay_time = 0;
/*
* Tuneable to limit how many times "exempt" ops go OTW
* after a recovery error. Exempt op hints are OH_CLOSE,
* OH_LOCKU, OH_DELEGRETURN. These previously always went
* OTW even after rnode was "dead" due to recovery errors.
*
* The tuneable below limits the number of times a start_fop
* invocation will retry the exempt hints. After the limit
* is reached, nfs4_start_fop will return an error just like
* it would for non-exempt op hints.
*/
int nfs4_max_recov_error_retry = 3;
/*
* Number of seconds the recovery thread should pause before retry when the
* filesystem has been forcibly unmounted.
*/
int nfs4_unmount_delay = 1;
#ifdef DEBUG
/*
* How long to wait (in seconds) between recovery operations on a given
* file. Normally zero, but could be set longer for testing purposes.
*/
static int nfs4_recovdelay = 0;
/*
* Switch that controls whether to go into the debugger when recovery
* fails.
*/
static int nfs4_fail_recov_stop = 0;
/*
* Tuneables to debug client namespace interaction with server
* mount points:
*
* nfs4_srvmnt_fail_cnt:
* number of times EACCES returned because client
* attempted to cross server mountpoint
*
* nfs4_srvmnt_debug:
* trigger console printf whenever client attempts
* to cross server mountpoint
*/
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif
/* forward references, in alphabetic order */
nfs4_error_t *);
static void errs_to_action(recov_info_t *,
static void flush_reinstate(nfs4_lost_rqst_t *);
static void free_milist(mntinfo4_t **, int);
nfs4_recov_state_t *, int, char *);
static void nfs4_recov_thread(recov_info_t *);
static void recov_bad_seqid(recov_info_t *);
nfs4_server_t *);
nfs4_server_t *);
vnode_t *);
/*
* Return non-zero if the given errno, status, and rpc status codes
* in the nfs4_error_t indicate that client recovery is needed.
* "stateful" indicates whether the call that got the error establishes or
* removes state on the server (open, close, lock, unlock, delegreturn).
*/
int
{
int recov = 0;
mntinfo4_t *mi;
/*
* Try failover if the error values justify it and if
* it's a failover mount. Don't try if the mount is in
* progress, failures are handled explicitly by nfs4rootvp.
*/
if (nfs4_try_failover(ep)) {
if (recov)
return (recov);
}
/*
* The server may have gotten the request, so for stateful
* ops we need to resynchronize and possibly back out the
* op.
*/
return (stateful);
}
return (0);
/* stat values are listed alphabetically */
/*
* There are two lists here: the errors for which we have code, and
* the errors for which we plan to have code before FCS. For the
* second list, print a warning message but don't attempt recovery.
*/
case NFS4ERR_BADHANDLE:
case NFS4ERR_BAD_SEQID:
case NFS4ERR_BAD_STATEID:
case NFS4ERR_DELAY:
case NFS4ERR_EXPIRED:
case NFS4ERR_FHEXPIRED:
case NFS4ERR_GRACE:
case NFS4ERR_OLD_STATEID:
case NFS4ERR_RESOURCE:
case NFS4ERR_STALE_CLIENTID:
case NFS4ERR_STALE_STATEID:
case NFS4ERR_WRONGSEC:
case NFS4ERR_STALE:
recov = 1;
break;
#ifdef DEBUG
case NFS4ERR_LEASE_MOVED:
case NFS4ERR_MOVED:
CE_WARN, "!Can't yet recover from NFS status %d",
break;
#endif
}
return (recov);
}
/*
* Some operations such as DELEGRETURN want to avoid invoking
* recovery actions that will only mark the file dead. If
* better handlers are invoked for any of these errors, this
* routine should be modified.
*/
int
{
if (status == NFS4ERR_BAD_SEQID ||
status == NFS4ERR_EXPIRED ||
status == NFS4ERR_BAD_STATEID ||
return (1);
return (0);
}
/*
* Transfer the state recovery information in recovp to mi's resend queue,
* and mark mi as having a lost state request.
*/
static void
{
"nfs4_enqueue_lost_rqst %p, op %d",
if (lrp->lr_putfirst)
else
}
/*
* Transfer the bad seqid recovery information in recovp to mi's
* bad seqid queue, and mark mi as having a bad seqid request.
*/
void
{
}
/*
* Initiate recovery.
*
* The nfs4_error_t contains the return codes that triggered a recovery
* attempt. mi, vp1, and vp2 refer to the filesystem and files that were
* being operated on. vp1 and vp2 may be NULL.
*
* Multiple calls are okay. If recovery is already underway, the call
* updates the information about what state needs recovery but does not
* start a new thread. The caller should hold mi->mi_recovlock as a reader
* for proper synchronization with any recovery thread.
*
* This will return TRUE if recovery was aborted, and FALSE otherwise.
*/
{
/*
* If there is lost state, we need to kick off recovery even if the
* filesystem has been unmounted or the zone is shutting down.
*/
if (gone) {
/* failed due to forced unmount, no new lost state */
}
/* some other failure, no existing lost state */
}
if (abort) {
"nfs4_start_recovery: fs unmounted"));
return (TRUE);
}
}
mi->mi_in_recovery++;
return (FALSE);
}
/*
* Internal version of nfs4_start_recovery. The difference is that the
* caller specifies the recovery action, rather than the errors leading to
* recovery.
*/
static void
{
mi->mi_in_recovery++;
}
static void
{
"start_recovery: mi %p, what %s", (void*)mi,
/*
* Bump the reference on the vfs so that we can pass it to the
* recovery thread.
*/
case NR_FAILOVER:
goto out_no_thread;
break;
case NR_CLIENTID:
/*
* If the filesystem has been unmounted, punt.
*/
goto out_no_thread;
/*
* If nobody else is working on the clientid, mark the
* clientid as being no longer set. Then mark the specific
* filesystem being worked on.
*/
if (!nfs4_server_in_recovery(sp)) {
}
if (recovp->rc_srv_reboot)
break;
case NR_OPENFILES:
if (recovp->rc_srv_reboot)
break;
case NR_WRONGSEC:
break;
case NR_EXPIRED:
goto out_no_thread; /* no further recovery possible */
case NR_BAD_STATEID:
goto out_no_thread; /* no further recovery possible */
case NR_FHEXPIRED:
case NR_BADHANDLE:
/*
* Recover the filehandle now, rather than using a
* separate thread. We can do this because filehandle
* recovery is independent of any other state, and because
* we know that we are not competing with the recovery
* thread at this time. recov_filehandle will deal with
* threads that are competing to recover this filehandle.
*/
goto out_no_thread; /* no further recovery needed */
case NR_STALE:
/*
* NFS4ERR_STALE handling
* recov_stale() could set MI4R_NEED_NEW_SERVER to
* indicate that we can and should failover.
*/
goto out_no_thread;
}
goto again;
case NR_BAD_SEQID:
if (recovp->rc_bseqid_rqst) {
break;
}
goto out_no_thread; /* no further recovery possible */
case NR_OLDSTATEID:
goto out_no_thread; /* no further recovery possible */
case NR_GRACE:
goto out_no_thread; /* no further action required for GRACE */
case NR_DELAY:
if (vp1)
goto out_no_thread; /* no further action required for DELAY */
case NR_LOST_STATE_RQST:
case NR_LOST_LOCK:
break;
default:
TAG_NONE, 0, 0);
goto out_no_thread;
}
/*
* If either file recently went through the same recovery, wait
* awhile. This is in case there is some sort of bug; we might not
* be able to recover properly, but at least we won't bombard the
* server with calls, and we won't tie up the client.
*/
/*
* If there's already a recovery thread, don't start another one.
*/
goto out_no_thread;
}
"start_recovery: starting new thread for mi %p", (void*)mi));
}
}
return;
/* not reached by thread creating call */
mi->mi_in_recovery--;
/*
* Free up resources that were allocated for us.
*/
}
static int
{
int err = 0;
/*
* If tuneable does not allow client to cross srv mountpoints and
* object is a stub, then check the op hint and return EACCES for
* any hint other than access, rddir, getattr, lookup.
*/
#ifdef DEBUG
"nfs4_check_srvstub: op=%d err=%d rp=%p vp=%p\n"
"va_nod=%llx r_mntd_fid=%llx\n"
"sv_fsid=(%llx:%llx) r_srv_fsid=(%llx:%llx)",
#endif
}
return (err);
}
static int
{
int error = 0;
int exempt;
return (0);
/*
* If there was a recovery error, then allow op hints "exempt" from
* recov errors to retry (currently 3 times). Either r_error or
* EIO is returned for non-exempt op hints.
*
* Error hierarchy:
* a) check for R4RECOVERR
* b) check for R4SRVSTUB (only if R4RECOVERR is not set).
*/
/*
* Check to make sure that we haven't already inc'd
* rs_num_retry_despite_err for current nfs4_start_fop
* instance. We don't want to double inc (if we were
* called with vp2, then the vp1 call could have
* already incremented).
*/
"nfs4_start_fop: %s %p DEAD, cnt=%d", str,
} else {
/*
* An ESTALE error on a non-regular file is not
* "sticky". Return the ESTALE error once, but
* clear the condition to allow future operations
* to go OTW. This will allow the client to
* recover if the server has merely unshared then
* re-shared the file system. For regular files,
* the unshare has destroyed the open state at the
* server and we aren't willing to do a reopen (yet).
*/
}
"nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
}
} else {
"nfs4_start_fop: %s %p SRVSTUB, error=%d", str,
}
return (error);
}
/*
* Initial setup code that every operation should call if it might invoke
* client recovery. Can block waiting for recovery to finish on a
* filesystem. Either vnode ptr can be NULL.
*
* Returns 0 if there are no outstanding errors. Can return an
* errno value under various circumstances (e.g., failed recovery, or
* interrupted while waiting for recovery to finish).
*
* There must be a corresponding call to nfs4_end_op() to free up any locks
* or resources allocated by this call (assuming this call succeeded),
* using the same rsp that's passed in here.
*
* The open and lock seqid synchronization must be stopped before calling this
* function, as it could lead to deadlock when trying to reopen a file or
* reclaim a lock. The synchronization is obtained with calls to:
* nfs4_start_open_seqid_sync()
* nfs4_start_lock_seqid_sync()
*
* *startrecovp is set TRUE if the caller should not bother with the
* over-the-wire call, and just initiate recovery for the given request.
* This is typically used for state-releasing ops if the filesystem has
* been forcibly unmounted. startrecovp may be NULL for
* non-state-releasing ops.
*/
int
{
#ifdef DEBUG
void *fop_caller;
#endif
#ifdef DEBUG
}
#endif
/*
* Process the items that may delay() based on server response
*/
if (error)
goto out;
if (error)
goto out;
}
/* Wait for a delegation recall to complete. */
if (error)
goto out;
/*
* Wait for any current recovery actions to finish. Note that a
* recovery thread can still start up after wait_for_recovery()
* finishes. We don't block out recovery operations until we
* acquire s_recovlock and mi_recovlock.
*/
if (error)
goto out;
/*
* Check to see if the rnode is already marked with a
* recovery error. If so, return it immediately. But
* always pass CLOSE, LOCKU, and DELEGRETURN so we can
* clean up state on the server.
*/
goto out;
}
goto out;
}
/*
* The lock order calls for us to acquire s_recovlock before
* mi_recovlock, but we have to hold mi_recovlock to look up sp (to
* prevent races with the failover/migration code). So acquire
* mi_recovlock, look up sp, drop mi_recovlock, acquire
* s_recovlock and mi_recovlock, then verify that sp is still the
* right object. XXX Can we find a simpler way to deal with this?
*/
goto out;
}
sp->s_otw_call_count++;
}
goto out;
}
}
goto out;
}
/*
* If the mntinfo4_t hasn't changed nfs4_server_ts then
* there's no point in double checking to make sure it
* has switched.
*/
/* try again */
}
sp->s_otw_call_count--;
}
goto get_sp;
} else {
}
}
}
}
/*
* If the filesystem uses volatile filehandles, obtain a lock so
* that we synchronize with renames. Exception: mount operations
* can change mi_fh_expire_type, which could be a problem, since
* the end_op code needs to be consistent with the start_op code
* about mi_rename_lock. Since mounts don't compete with renames,
* it's simpler to just not acquire the rename lock for mounts.
*/
goto out;
}
}
if (OH_IS_STATE_RELE(op)) {
/*
* For forced unmount, letting the request proceed will
* almost always delay response to the user, so hand it off
* to the recovery thread. For exiting lwp's, we don't
* have a good way to tell if the request will hang. We
* generally want processes to handle their own requests so
* that they can be done in parallel, but if there is
* already a recovery thread, hand the request off to it.
* This will improve user response at no cost to overall
* system throughput. For zone shutdown, we'd prefer
* the recovery thread to handle this as well.
*/
*startrecovp = TRUE;
*startrecovp = TRUE;
else
*startrecovp = FALSE;
} else
if (startrecovp != NULL)
*startrecovp = FALSE;
return (error);
out:
sp->s_otw_call_count--;
}
#ifdef DEBUG
#endif
return (error);
}
/*
* It is up to the caller to determine if rsp->rs_sp being NULL
* is detrimental or not.
*/
int
{
rsp->rs_num_retry_despite_err = 0;
}
/*
* Release any resources acquired by nfs4_start_op().
* 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
*
* The operation hint is used to avoid a deadlock by bypassing delegation
* return logic for writes, which are done while returning a delegation.
*/
void
{
#ifdef lint
/*
* The op hint isn't used any more, but might be in
* the future.
*/
#endif
#ifdef DEBUG
#endif
if (!needs_recov) {
/* may need to clear the delay interval */
rp->r_delay_interval = 0;
}
}
}
/*
* If the corresponding nfs4_start_op() found a sp,
* then there must still be a sp.
*/
sp->s_otw_call_count--;
} else {
}
}
void
{
}
/*
* If the filesystem is going through client recovery, block until
* finished.
* Exceptions:
* - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
* if the filesystem has been forcibly unmounted or the lwp is exiting.
*
* Return value:
* - 0 if no errors
* - EINTR if the call was interrupted
* - EIO if the filesystem has been forcibly unmounted (non-state-releasing
* op)
* - the errno value from the recovery thread, if recovery failed
*/
static int
{
int error = 0;
while (mi->mi_recovflags != 0) {
break;
break;
if (OH_IS_STATE_RELE(op_hint) &&
break;
lwp->lwp_nostop++;
/* XXX - use different cv? */
lwp->lwp_nostop--;
break;
}
lwp->lwp_nostop--;
}
"wait_for_recovery: fail since RECOV FAIL"));
!OH_IS_STATE_RELE(op_hint)) {
"wait_for_recovery: forced unmount"));
}
return (error);
}
/*
* If the client received NFS4ERR_GRACE for this particular mount,
* the client blocks here until it is time to try again.
*
* Return value:
* - 0 if wait was successful
* - EINTR if the call was interrupted
*/
int
{
int error = 0;
/* do an unprotected check to reduce mi_lock contention */
if (mi->mi_grace_wait != 0) {
if (mi->mi_grace_wait != 0) {
curtime = gethrestime_sec();
curtime = gethrestime_sec();
mi->mi_grace_wait = 0;
} else {
mi->mi_grace_wait = 0;
}
}
}
return (error);
}
/*
* If the client received NFS4ERR_DELAY for an operation on a vnode,
* the client blocks here until it is time to try again.
*
* Return value:
* - 0 if wait was successful
* - EINTR if the call was interrupted
*/
int
{
int error = 0;
/* do an unprotected check to reduce r_statelock contention */
if (rp->r_delay_wait != 0) {
if (rp->r_delay_wait != 0) {
}
curtime = gethrestime_sec();
curtime = gethrestime_sec();
rp->r_delay_wait = 0;
} else {
rp->r_delay_wait = 0;
}
}
}
return (error);
}
/*
* The recovery thread.
*/
static void
{
0, 0);
/*
* We don't really need protection here against failover or
* migration, since the current thread is the one that would make
* any changes, but hold mi_recovlock anyway for completeness (and
* to satisfy any ASSERTs).
*/
/*
* Do any necessary recovery, based on the information in recovp
* and any recovery flags.
*/
do {
"nfs4_recov_thread: file system has been "
"unmounted"));
"nfs4_recov_thread: zone shutting down"));
/*
* If the server has lost its state for us and
* the filesystem is unmounted, then the filesystem
* can be tossed, even if there are lost lock or
* lost state calls in the recovery queue.
*/
if (mi->mi_recovflags &
"nfs4_recov_thread: bailing out"));
recov_fail = TRUE;
}
/*
* We don't know if the server has any state for
* us, and the filesystem has been unmounted. If
* there are "lost state" recovery items, keep
* trying to process them until there are no more
* mounted filesystems for the server. Otherwise,
* bail out. The reason we don't mark the
* filesystem as failing recovery is in case we
* have to do "lost state" recovery later (e.g., a
* user process exits).
*/
done = 1;
break;
}
else {
}
if (!activesrv) {
"no active fs for server %p",
(void *)sp));
recov_fail = TRUE;
/*
* Mark the server instance as
* dead, so that nobody will attach
* a new filesystem.
*/
}
}
} else {
}
/*
* Check if we need to select a new server for a
* failover. Choosing a new server will force at
* least a check of the clientid.
*/
if (!recov_fail &&
} else
/*
* Check if we need to recover the clientid. This
* must be done before file and lock recovery, and it
* potentially affects the recovery threads for other
* filesystems, so it gets special treatment.
*/
} else {
/*
* Unset this flag in case another recovery
* thread successfully recovered the clientid
* for us already.
*/
}
}
/*
* Check if we need to get the security information.
*/
RW_WRITER, 0);
/*
* If error, nothing more can be done, stop
* the recovery.
*/
if (error) {
}
} else
/*
* Check if there's a bad seqid to recover.
*/
RW_WRITER, 0);
} else
/*
* Next check for recovery that affects the entire
* filesystem.
*/
} else
}
/*
* Send any queued state recovery requests.
*/
RW_WRITER, 0);
/* done */
}
} else {
}
/*
* See if there is anything more to do. If not, announce
* that we are done and exit.
*
* Need mi_recovlock to keep 'sp' valid. Must grab
* mi_recovlock before mi_lock to preserve lock ordering.
*/
/*
* We need to remove the lost requests before we
* unmark the mi as no longer doing recovery to
* avoid a race with a new thread putting new lost
* requests on the same mi (and the going away
* thread would remove the new lost requests).
*
* Move the lost requests to a local list since
* nfs4_remove_lost_rqst() drops mi_lock, and
* dropping the mi_lock would make our check to
* see if recovery is done no longer valid.
*/
sizeof (nfs4_lost_rqst_t),
done = 1;
/*
* Now officially free the "moved"
* lost requests.
*/
}
} else
/*
* If the filesystem has been forcibly unmounted, there is
* probably no point in retrying immediately. Furthermore,
* there might be user processes waiting for a chance to
* queue up "lost state" requests, so that they can exit.
* So pause here for a moment. Same logic for zone shutdown.
*/
}
} while (!done);
/*
* Return all recalled delegations
*/
mi->mi_in_recovery--;
/*
* Free up resources that were allocated for us.
*/
zthread_exit();
}
/*
* Log the end of recovery and notify any waiting threads.
*/
static void
{
}
/*
* State-specific recovery routines, by state.
*/
/*
* Failover.
*
* Replaces *spp with a reference to the new server, which must
* eventually be freed.
*/
static void
{
int error;
int oncethru = 0;
int index;
char *snames;
#ifdef lint
/*
* Lint can't follow the logic, so thinks that snames and len
* can be used before being set. They can't, but lint can't
* figure it out. To address the lint warning, initialize
* snames and len for lint.
*/
len = 0;
#endif
/*
* Ping the null NFS procedure of every server in
* the list until one responds. We always start
* at the head of the list and always skip the one
* that is current, since it's caused us a problem.
*/
*recov_fail = TRUE;
if (oncethru)
return;
}
continue;
}
continue;
if (error)
continue;
if (status == RPC_SUCCESS) {
break;
}
}
if (!oncethru) {
oncethru = 1;
}
}
}
if (oncethru) {
0, NULL);
}
#if DEBUG
#endif
/*
* Update server-dependent fields in the root vnode.
*/
(void) nfs4_free_data_reclaim(rp);
"recov_newserver: done with %s",
rnode4info(rp)));
} else
if (recovp->rc_srv_reboot)
mi->mi_failover++;
} else
}
/*
* Clientid.
*/
static void
{
int error = 0;
int still_stale;
int need_new_s;
/*
* Acquire the recovery lock and then verify that the clientid
* still needs to be recovered. (Note that s_recovlock is supposed
* to be acquired before s_lock.) Since the thread holds the
* recovery lock, no other thread will recover the clientid.
*/
if (still_stale) {
if (error != 0) {
/*
* nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
* if so, just return and let recov_thread drive
* failover.
*/
if (need_new_s) {
return;
}
/* don't destroy the nfs4_server, let umount do it */
}
}
if (error == 0) {
/*
* If still_stale isn't true, then another thread already
* recovered the clientid. And that thread that set the
* clientid will have initiated reopening files on all the
* filesystems for the server, so we should not initiate
* reopening for this filesystem here.
*/
if (still_stale) {
if (recovp->rc_srv_reboot)
}
}
if (error != 0) {
} else {
mntinfo4_t **milist;
int nummi, i;
/*
* Initiate recovery of open files for other filesystems.
* We create an array of filesystems, rather than just
* walking the filesystem list, to avoid deadlock issues
* with s_lock and mi_recovlock.
*/
for (i = 0; i < nummi; i++) {
RW_READER, 0);
}
}
}
}
/*
* Return an array of filesystems associated with the given server. The
* caller should call free_milist() to free the references and memory.
*/
static mntinfo4_t **
{
int nummi, i;
mntinfo4_t **milist;
nummi = 0;
nummi++;
}
return (milist);
}
/*
* Free the filesystem list created by make_milist().
*/
static void
{
int i;
for (i = 0; i < nummi; i++) {
}
}
/*
* Filehandle
*/
/*
* Lookup the filehandle for the given vnode and update the rnode if it has
* changed.
*
* Errors:
* - if the filehandle could not be updated because of an error that
* requires further recovery, initiate that recovery and return.
* - if the filehandle could not be updated because of a signal, pretend we
* succeeded and let someone else deal with it.
* - if the filehandle could not be updated and the filesystem has been
* forcibly unmounted, pretend we succeeded, and let the caller deal with
* the forced unmount (to retry or not to retry, that is the question).
* - if the filehandle could not be updated because of some other error,
* mark the rnode bad and return.
*/
static void
{
return;
}
/*
* If someone else is updating the filehandle, wait for them to
* finish and then let our caller retry.
*/
}
return;
}
if (action == NR_BADHANDLE) {
/* shouldn't happen */
}
/*
* If we get BADHANDLE or FHEXPIRED in their handler, something is
* broken. Don't try to recover, just mark the file dead.
*/
if (needrecov) {
/*
* Don't set r_error to ESTALE. Higher-level code (e.g.,
* cstatat_getvp()) retries on ESTALE, which would cause
* an infinite loop.
*/
}
}
/*
* Stale Filehandle
*/
/*
* A stale filehandle can happen when an individual file has
* been removed, or when an entire filesystem has been taken
* offline. To distinguish these cases, we do this:
* - if a GETATTR with the current filehandle is okay, we do
* nothing (this can happen with two-filehandle ops)
* - if the GETATTR fails, but a GETATTR of the root filehandle
* succeeds, mark the rnode with R4STALE, which will stop use
* - if the GETATTR fails, and a GETATTR of the root filehandle
* also fails, we consider the problem filesystem-wide, so:
* - if we can failover, we should
* - if we can't failover, we should mark both the original
* vnode and the root bad
*/
static void
{
char *fail_msg = "failed to recover from NFS4ERR_STALE";
"recov_stale: already marked dead, rp %s",
rnode4info(rp)));
return;
}
"recov_stale: already marked stale, rp %s",
rnode4info(rp)));
return;
}
/* Try a GETATTR on this vnode */
/*
* Handle non-STALE recoverable errors
*/
"recov_stale: error=%d, stat=%d seen on rp %s",
goto out;
}
/* Are things OK for this vnode? */
"recov_stale: file appears fine, rp %s",
rnode4info(rp)));
goto out;
}
/* Did we get an unrelated non-recoverable error? */
"recov_stale: unrelated fatal error, rp %s",
rnode4info(rp)));
goto out;
}
/*
* If we don't appear to be dealing with the root node, find it.
*/
nfs4_error_zinit(&e);
if (e.error) {
"recov_stale: can't find root node for rp %s",
rnode4info(rp)));
goto out;
}
}
/* Try a GETATTR on the root vnode */
nfs4_error_zinit(&e);
/* Try recovery? */
if (needrecov) {
(void) nfs4_start_recovery(&e,
OP_GETATTR, NULL);
"recov_stale: error=%d, stat=%d seen "
rnode4info(rp)));
}
}
/*
* Check to see if a failover attempt is warranted
* NB: nfs4_try_failover doesn't check for STALE
* because recov_stale gets a shot first. Now that
* recov_stale has failed, go ahead and try failover.
*
* If the getattr on the root filehandle was successful,
* then mark recovery as failed for 'vp' and exit.
*/
/*
* pass the original error to fail_recov, not
* the one from trying the root vnode.
*/
"recov_stale: root node OK, marking "
goto out;
}
}
/*
* Here, we know that both the original file and the
* root filehandle (which may be the same) are stale.
* We want to fail over if we can, and if we can't, we
* want to mark everything in sight bad.
*/
if (FAILOVER_MOUNT4(mi)) {
"recov_stale: failing over due to rp %s",
rnode4info(rp)));
} else {
/*
* Can't fail over, so mark things dead.
*
* If rootvp is set, we know we have a distinct
* non-root vnode which can be marked dead in
* the usual way.
*
* Then we want to mark the root vnode dead.
* Note that if rootvp wasn't set, our vp is
* actually the root vnode.
*/
"recov_stale: can't fail over, marking dead rp %s",
rnode4info(rp)));
} else {
}
/*
* Mark root dead, but quietly - since
* the root rnode is frequently recreated,
* we can encounter this at every access.
* Also mark recovery as failed on this VFS.
*/
"recov_stale: marking dead root rp %s",
rnode4info(rootrp)));
}
out:
if (rootvp)
}
/*
* Locks.
*/
/*
* Reclaim all the active (acquired) locks for the given file.
* If a process lost a lock, the process is sent a SIGLOST. This is not
* considered an error.
*
* Return values:
* Errors and status are returned via the nfs4_error_t parameter
* If an error indicates that recovery is needed, the caller is responsible
* for dealing with it.
*/
static void
{
return;
/*
* If we get an error that requires recovery actions, just bail out
* and let the top-level recovery code handle it.
*
* If we get some other error, kill the process that owned the lock
* and mark its remaining locks (if any) as belonging to NOPID, so
* that we don't make any more reclaim requests for that process.
*/
int did_reclaim = 1;
continue;
/*
* If we need to restart recovery, stop processing the
* list. Some errors would be recoverable under other
* circumstances, but if they happen here we just give up
* on the lock.
*/
break;
break;
}
/*
* In case the server isn't offering us a grace period, or
* if we missed it, we might have opened & locked from scratch.
* We need to ensure that the object hadn't been otherwise
* changed during this time, by comparing the changeinfo.
* We get passed the changeinfo from before the reopen by our
* caller, in pre_change.
* The changeinfo from after the reopen is in rp->r_change,
* courtesy of the GETATTR in the reopen.
* If they're different, then the file has changed, and we
* have to SIGLOST the app.
*/
}
0, 0);
else
0, 0);
/* Reinitialize the nfs4_error and continue */
}
}
}
/*
* Reclaim the given lock.
* If the lock can't be reclaimed, the process is sent SIGLOST, but this is
* not considered an error.
*
* Errors are returned via the nfs4_error_t parameter.
*/
static void
int *did_reclaimp)
{
return;
}
do {
/*
* This shouldn't affect other reclaims, so don't
* return an error.
*/
break;
}
}
/*
* Open files.
*/
/*
* Verifies if the nfsstat4 is a valid error for marking this vnode dead.
* Returns 1 if the error is valid; 0 otherwise.
*/
static int
{
/*
* We should not be marking non-regular files as dead,
* except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
*/
stat != NFS4ERR_BADNAME)
return (0);
return (1);
}
/*
* Failed attempting to recover a filehandle. If 'stat' is valid for 'vp',
* then mark the object dead. Since we've had to do a lookup for
* filehandle recovery, we will mark the object dead if we got NOENT.
*/
static void
{
return;
}
/*
* Recovery from a "shouldn't happen" error. In the long term, we'd like
* to mark only the data structure(s) that provided the bad value as being
* bad. But for now we'll just mark the entire file.
*/
static void
{
return;
}
/*
* Free up the information saved for a lost state request.
*/
static void
{
int have_sync_lock;
(CE_NOTE, "nfs4_free_lost_rqst:"));
case OP_OPEN:
if (filep->utf8string_val) {
}
break;
case OP_DELEGRETURN:
break;
case OP_CLOSE:
have_sync_lock = 1;
if (osp->os_pending_close) {
/* clean up the open file state. */
osp->os_pending_close = 0;
}
if (have_sync_lock)
break;
}
}
}
}
}
}
}
}
}
/*
* Remove any lost state requests and free them.
*/
static void
{
}
}
/*
* Reopen all the files for the given filesystem and reclaim any locks.
*/
static void
{
int remap;
char *fail_msg = "No such file or directory on replica";
/*
* This check is to allow a 10ms pause before we reopen files;
* it should allow the server time to have received the CB_NULL
* reply and update its internal structures such that (if
* applicable) we are granted a delegation on reopened files.
*/
}
if (NFS4_VOLATILE_FH(mi)) {
nfs4_remap_root(mi, &e, 0);
}
}
else
claim = CLAIM_NULL;
/*
* Get a snapshot of open files in the filesystem. Note
* that new opens will stall until the server's grace
* period is done.
*/
/*
* Since we are re-establishing state on the
* server, it's ok to blow away the saved lost
* requests since we don't need to reissue it.
*/
if (remap) {
NFS4_REMAP_CKATTRS, &e);
}
/*
* The current server does not have the file
* that is to be remapped. This is most
* likely due to an improperly maintained
* replica. The files that are missing from
* the server will be marked dead and logged
* in order to make sys admins aware of the
* problem.
*/
/*
* We've already handled the error so clear it.
*/
nfs4_error_zinit(&e);
continue;
int j;
break;
}
if (nfs4_needs_recovery(&e, TRUE,
(void) nfs4_start_recovery(&e, mi,
break;
}
}
#ifdef DEBUG
if (nfs4_recovdelay > 0)
#endif
(void) nfs4_start_recovery(&e, mi,
NULL);
break;
}
/*
* Check to see if we need to remap files passed in
* via the recovery arguments; this will have been
* done for open files. A failure here is not fatal.
*/
if (remap) {
&ignore);
&ignore);
}
}
}
if (reopenlist != NULL)
}
/*
* Resend the queued state recovery requests in "rqsts".
*
* When a resend fails with an error, the function returns early (see the
* "return" below) rather than continuing with the remaining requests.
*/
static void
{
#ifdef NOTYET
#endif
"nfs4_resend_lost_rqsts: resend request: for vp %p got "
/*
* If we get a recovery error that we can actually
* recover from (such as ETIMEDOUT, FHEXPIRED), we
* return and let the recovery thread redrive the call.
* Don't requeue unless the zone is still healthy.
*/
(nfs4_try_failover(&n4e) ||
/*
* For these three errors, we want to delay a bit
* instead of pounding the server into submission.
* We have to do this manually; the normal
* processing for these errors only works for
* non-recovery requests.
*/
} else {
/* Every other recoverable error starts a new recovery. */
(void) nfs4_start_recovery(&n4e,
}
return;
}
}
}
/*
* Resend the given op, and issue any necessary undo call.
* Errors are returned via the nfs4_error_t parameter.
*
* Only a resent OPEN can need an undo (a CLOSE); see the comments in the
* body for the cases that skip it.
*/
static void
{
case OP_OPEN:
break;
case OP_OPEN_DOWNGRADE:
break;
case OP_CLOSE:
/*
* Start with no access bits; the open stream's read and write
* share-access fields are consulted next.
*/
acc_bits = 0;
if (osp->os_share_acc_read)
if (osp->os_share_acc_write)
CLOSE_RESEND, 0, 0, 0);
break;
/* Lock and unlock requests share one path and go straight to "done". */
case OP_LOCK:
case OP_LOCKU:
goto done;
case OP_DELEGRETURN:
goto done;
default:
#ifdef DEBUG
#endif
/* An unrecognized op returns without reaching the code at "done". */
return;
}
/*
* No need to retry nor send an "undo" CLOSE in the
* event the server rebooted.
*/
goto done;
/*
* If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
* to undo. Undoing locking operations was handled by
* resend_lock().
*/
goto done;
/*
* If we get any other error for OPEN, then don't attempt
* to undo the resend of the open (since it was never
* successful!).
*/
goto done;
/*
* Now let's undo our OPEN.
*/
"nfs4close_one: for vp %p got error %d stat %d",
done:
}
/*
* Close a file that was opened via a resent OPEN.
* Most errors are passed back to the caller (via the return value and
* *statp), except for FHEXPIRED, which is retried.
*
* NOTE(review): the function is declared void, so the "return value"
* wording above looks stale -- confirm how errors are actually reported.
*
* It might be conceptually cleaner to push the CLOSE request onto the
* front of the resend queue, rather than sending it here. That would
* match the way we undo lost lock requests. On the other
* hand, we've already got something that works, and there's no reason to
* change it at this time.
*/
static void
{
/* Loop until the close either succeeds or fails with a non-retried error. */
for (;;) {
CLOSE_AFTER_RESEND, 0, 0, 0);
break; /* success; done */
break;
/* else retry FHEXPIRED */
}
}
/*
* Resend the given lost lock request. Return an errno value. If zero,
* *statp is set to the NFS status code for the call.
*
* NOTE(review): the function is declared void, so the "Return an errno
* value" wording above looks stale -- confirm how errors are reported.
*
* Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
* a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
* Let the recovery thread redrive the call if we get a recovery error that
* we can actually recover from.
*/
static void
{
"nfs4frlock for vp %p returned error %d, stat %d",
goto done;
goto done;
/*
* If we failed with a non-recovery error, send SIGLOST and
* mark the file dead.
*/
send_siglost = TRUE;
else {
/*
* Done with recovering LOST LOCK in the event the
* server rebooted or we've lost the lease.
*/
goto done;
}
/*
* BAD_STATEID on an unlock indicates that the server has
* forgotten about the lock anyway, so act like the call
* was successful.
*/
goto done;
/*
* If we got a recovery error that we don't actually
* recover from, send SIGLOST. If the filesystem was
* forcibly unmounted, we skip the SIGLOST because (a) it's
* unnecessary noise, and (b) there could be a new process
* with the same pid as the one that had generated the lost
* state request.
*/
send_siglost = TRUE;
goto done;
}
/*
* If the filesystem was forcibly unmounted, we
* still need to synchronize with the server and
* release state. Try again later.
*/
goto done;
/*
* If we get a recovery error that we can actually
* recover from (such as ETIMEDOUT, FHEXPIRED),
* return and let the recovery thread redrive the call.
*
* For the three errors below, we want to delay a bit
* instead of pounding the server into submission.
*/
goto done;
}
/* Common exit: deliver SIGLOST only if a path above requested it. */
done:
if (send_siglost) {
/*
* Must be root or the actual thread being issued the
* SIGLOST for this to work, so just become root.
*/
/*
* Flush any additional reinstantiation requests for
* this operation. Sending multiple SIGLOSTs to the user
* process is unlikely to help and may cause trouble.
*/
}
}
/*
* Remove any lock reinstantiation requests that correspond to the given
* lost request. We only remove items that follow lrp in the queue,
* assuming that lrp will be removed by the generic lost state code.
*/
static void
{
mntinfo4_t *mi;
/*
* If there are any more reinstantiation requests to get rid of,
* they should all be clustered at the front of the lost state
* queue.
*/
/* The scan stops at the first entry that does not qualify. */
break;
break;
"remove reinstantiation %p", (void *)lrp));
}
}
/*
* End of state-specific recovery routines.
*/
/*
* Allocate a lost request struct, initialize it from lost_rqstp (including
* bumping the reference counts for the referenced vnode, etc.), and hang
* it off of recovp.
*
* The recovery action chosen for the request is stored through "action".
*/
static void
{
/* A lost lock request maps to the NR_LOST_LOCK recovery action. */
*action = NR_LOST_LOCK;
/*
* Consume caller's utf8string
*/
/* The source length is zeroed as part of taking over the string. */
srcfp->utf8string_len = 0;
} else {
#ifdef DEBUG
lost_rqstp->lr_op);
#endif
/* Any other op is not saved; return without queueing anything. */
return;
}
else {
}
}
/*
* Map the given return values (errno and nfs4 status code) to a recovery
* action and fill in the following fields of recovp: rc_action,
* rc_srv_reboot, rc_stateid, rc_lost_rqst.
*
* A non-zero errno is handled first; otherwise the NFSv4 status code
* selects the action in the switch below.
*/
void
{
int try_f;
/*
* We start recovery for EINTR only in the lost lock
* NOTE(review): the rest of this sentence appears to be missing --
* confirm which cases are meant.
*/
if (lost_rqstp) {
}
if (try_f)
} else if (error != 0) {
} else {
/* No errno: decide based on the NFSv4 status code. */
switch (stat) {
#ifdef notyet
case NFS4ERR_LEASE_MOVED:
break;
case NFS4ERR_MOVED:
break;
#endif
case NFS4ERR_BADHANDLE:
break;
case NFS4ERR_BAD_SEQID:
/* bsep is optional; it is only used when non-NULL. */
if (bsep)
break;
case NFS4ERR_OLD_STATEID:
break;
case NFS4ERR_WRONGSEC:
break;
case NFS4ERR_FHEXPIRED:
break;
case NFS4ERR_BAD_STATEID:
/* sidp is optional; it is only used when non-NULL. */
if (sidp)
} else
break;
case NFS4ERR_EXPIRED:
/*
* The client's lease has expired, either due
* to a network partition or perhaps a client
* error. In either case, try an NR_CLIENTID
* style recovery. reboot remains false, since
* there is no evidence the server has rebooted.
* This will cause CLAIM_NULL opens and lock
* requests without the reclaim bit.
*/
nfs4_server_t *, sp,
mntinfo4_t *, mi,
break;
/* Both stale-ID errors share the same handling. */
case NFS4ERR_STALE_CLIENTID:
case NFS4ERR_STALE_STATEID:
break;
case NFS4ERR_RESOURCE:
/*
* If this had been a FAILOVER mount, then
* we'd have tried failover. Since it's not,
* just delay a while and retry.
*/
break;
case NFS4ERR_GRACE:
break;
case NFS4ERR_DELAY:
break;
case NFS4ERR_STALE:
break;
default:
0, 0);
break;
}
}
/* make sure action got set */
NULL);
}
/*
* Return the (held) credential for the process with the given pid.
* May return NULL (e.g., process not found).
*
* The caller receives a held credential and is presumably expected to
* release it when done -- confirm against the callers.
*/
static cred_t *
{
proc_t *p;
/* No credential to hand back. */
return (NULL);
}
/* p_crlock is taken around the access to the process's credential. */
mutex_enter(&p->p_crlock);
mutex_exit(&p->p_crlock);
return (cr);
}
/*
* Send SIGLOST to the given process and queue the event.
*
* The 'dump' boolean tells us whether this action should dump the
* in-kernel queue of recovery messages or not.
*/
void
{
proc_t *p;
/* Only act on the process if it was actually found. */
if (p)
}
/*
* Scan the lock list for entries that match the given pid. Change the
* pid in those that do to NOPID.
*
* NOTE(review): presumably this keeps later lock reclaims from being
* attributed to that pid -- confirm with the callers.
*/
static void
{
}
}
/*
* Mark a file as having failed recovery, after making a last-ditch effort
* to return any delegation.
*
* Sets r_error to EIO or ESTALE for the given vnode.
*/
void
{
#ifdef DEBUG
/* On DEBUG builds, enter the debugger when nfs4_fail_recov_stop is set. */
if (nfs4_fail_recov_stop)
debug_enter("nfs4_fail_recov");
#endif
return;
}
/*
* Set R4RECOVERRP to indicate that a recovery error is in
* progress. This will shut down reads and writes at the top
* half. Don't set R4RECOVERR until after we've returned the
* delegation, otherwise it will fail.
*/
}
/*
* recov_throttle: if the file had the same recovery action within the
* throttle interval, wait for the throttle interval to finish before
* proceeding.
*
* Side effects: updates the rnode with the current recovery information.
*/
static void
{
/* Sample the current time in seconds. */
curtime = gethrestime_sec();
"recov_throttle: now: (%d, %ld), last: (%d, %ld)",
/* Sample the time again; presumably this follows the wait -- confirm. */
curtime = gethrestime_sec();
}
}
/*
* React to NFS4ERR_GRACE by setting the time we'll permit
* the next call to this filesystem.
*
* The wait applies to the whole filesystem, not to a single vnode.
*/
void
{
/* Mark the time for the future */
}
/*
* React to NFS4ERR_DELAY by setting the time we'll permit
* the next call to this vnode.
*/
void
{
/*
* Calculate amount we should delay, initial
* delay will be short and then we will back off.
*/
/* A zero interval presumably means this is the first delay -- confirm. */
if (rp->r_delay_interval == 0)
else
/* calculate next interval value */
}
/*
* Build a single comma-separated string of server names.
*
* The caller is responsible for freeing the returned string.
*/
static char *
{
char *srvnames;
char *namep;
/*
* Calculate the length of the string required to hold all
* of the server names plus either a comma or a null
* character following each individual one.
*/
length = 0;
continue;
}
}
continue;
}
/* Each name is followed by a comma separator. */
*namep++ = ',';
}
/* Step back over the last comma and replace it with the terminator. */
*--namep = '\0';
return (srvnames);
}
/*
* NOTE(review): this definition has no header comment; document its
* purpose and arguments once they have been confirmed.
*/
static void
{
}
/*
* NOTE(review): this definition has no header comment; document its
* purpose and arguments once they have been confirmed.
*/
static void
{
}
/*
* We don't actually fully recover from NFS4ERR_BAD_SEQID. We
* simply mark the open owner and open stream (if provided) as "bad".
* Then future uses of these data structures will be limited to basically
* just cleaning up the internal client state (no going OTW).
*
* NOTE(review): the start of the following sentence appears to be
* missing -- confirm the intended wording.
* succeed so progress can be made.
*/
void
{
int error;
/*
* Handle all the bad seqid entries on mi's list.
*/
"recov_bad_seqid: mark oop %p lop %p as bad for "
"vp %p tag %s pid %d: last good seqid %d for tag %s",
/* The open owner is optional; only handle it when one was recorded. */
if (bad_oop) {
/* essentially reset the open owner */
}
/* Likewise the lock owner is only handled when one was recorded. */
if (bad_lop) {
0, NFS4ERR_BAD_SEQID);
}
}
}