/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/pathname.h>
#include <nfs/nfs_clnt.h>
/*
* The hash queues for the access to active and cached rnodes
* are organized as doubly linked lists.  A reader/writer lock
* for each hash bucket is used to control access and to synchronize
* lookups, additions, and deletions from the hash queue.
*
* The rnode freelist is organized as a doubly linked list with
* a head pointer. Additions and deletions are synchronized via
* a single mutex.
*
* In order to add an rnode to the free list, it must be hashed into
* a hash queue and the exclusive lock to the hash queue must be held.
* If an rnode is not hashed into a hash queue, then it is destroyed
* because it represents no valuable information that can be reused
* about the file. The exclusive lock to the hash queue must be
* held in order to prevent a lookup in the hash queue from finding
* the rnode, using it, and assuming that the rnode is not on the
* freelist. The lookup in the hash queue will have the hash queue
* locked, either exclusive or shared.
*
* The vnode reference count for each rnode is not allowed to drop
* below 1. This prevents external entities, such as the VM
* subsystem, from acquiring references to vnodes already on the
* freelist and then trying to place them back on the freelist
* when their reference is released.  This means that when an
* rnode is looked up in the hash queues, then either the rnode
* is removed from the freelist and that reference is transferred to
* the new reference or the vnode reference count must be incremented
* accordingly. The mutex for the freelist must be held in order to
* accurately test to see if the rnode is on the freelist or not.
* The hash queue lock might be held shared and it is possible that
* two different threads may race to remove the rnode from the
* freelist. This race can be resolved by holding the mutex for the
* freelist. Please note that the mutex for the freelist does not
* need to be held if the rnode is not on the freelist.  It cannot be
* placed on the freelist due to the requirement that the thread
* putting the rnode on the freelist must hold the exclusive lock
* to the hash queue and the thread doing the lookup in the hash
* queue is holding either a shared or exclusive lock to the hash
* queue.
*
* The lock ordering is:
*
* hash bucket lock -> vnode lock
* hash bucket lock -> freelist lock
*/
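/*
 * Illustrative sketch only (not part of the driver): the lock ordering
 * described above, using simplified stand-in types.  sketch_lock_t and
 * the take/drop helpers are assumptions made for illustration; the
 * real code uses rw_enter()/rw_exit() on the hash bucket lock and
 * mutex_enter()/mutex_exit() on the vnode and freelist locks.
 */
typedef struct sketch_lock {
	int held;
} sketch_lock_t;

static void
sketch_take(sketch_lock_t *l)
{
	l->held = 1;			/* stands in for rw_enter()/mutex_enter() */
}

static void
sketch_drop(sketch_lock_t *l)
{
	l->held = 0;			/* stands in for rw_exit()/mutex_exit() */
}

static void
sketch_move_to_freelist(sketch_lock_t *bucket_lock, sketch_lock_t *freelist_lock)
{
	sketch_take(bucket_lock);	/* hash bucket lock is always first */
	sketch_take(freelist_lock);	/* then the freelist mutex */
	/* ... link the rnode onto the freelist here ... */
	sketch_drop(freelist_lock);
	sketch_drop(bucket_lock);	/* release in reverse order */
}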
static long rnew = 0;
long nrnode = 0;
static int rtablesize;
static int rtablemask;
/*
* Mutex to protect the following variables:
* nfs_major
* nfs_minor
*/
int nfs_major;
int nfs_minor;
/* Do we allow pre-epoch (negative) time values otw? */
/*
* Access cache
*/
static int acachesize;
static int acachemask;
/*
* Client side utilities
*/
/*
* client side statistics
*/
{ "calls", KSTAT_DATA_UINT64 },
{ "badcalls", KSTAT_DATA_UINT64 },
{ "clgets", KSTAT_DATA_UINT64 },
{ "cltoomany", KSTAT_DATA_UINT64 },
#ifdef DEBUG
{ "clalloc", KSTAT_DATA_UINT64 },
{ "noresponse", KSTAT_DATA_UINT64 },
{ "failover", KSTAT_DATA_UINT64 },
{ "remap", KSTAT_DATA_UINT64 },
#endif
};
/*
* The following are statistics that describe the behavior of the system
* as a whole and don't correspond to any one particular zone.
*/
#ifdef DEBUG
static struct clstat_debug {
} clstat_debug = {
{ "nrnode", KSTAT_DATA_UINT64 },
{ "access", KSTAT_DATA_UINT64 },
{ "dirent", KSTAT_DATA_UINT64 },
{ "dirents", KSTAT_DATA_UINT64 },
{ "reclaim", KSTAT_DATA_UINT64 },
{ "clreclaim", KSTAT_DATA_UINT64 },
{ "f_reclaim", KSTAT_DATA_UINT64 },
{ "a_reclaim", KSTAT_DATA_UINT64 },
{ "r_reclaim", KSTAT_DATA_UINT64 },
{ "r_path", KSTAT_DATA_UINT64 },
};
#endif /* DEBUG */
/*
* We keep a global list of per-zone client data, so we can clean up all zones
* if we get low on memory.
*/
/*
* Some servers do not properly update the attributes of the
* directory when changes are made. To allow interoperability
* with these broken servers, the nfs_disable_rddir_cache
* parameter can be set to disable the readdir cache.
*/
int nfs_disable_rddir_cache = 0;
struct chtab **);
static void clreclaim(void *);
static int nfs_feedback(int, int, mntinfo_t *);
failinfo_t *);
static int rtablehash(nfs_fhandle *);
struct vnodeops *,
cred_t *),
int (*)(const void *, const void *), int *, cred_t *,
char *, char *);
static void rp_addhash(rnode_t *);
static void rp_rmhash_locked(rnode_t *);
static void destroy_rnode(rnode_t *);
static void rddir_cache_free(rddir_cache *);
static int nfs_free_data_reclaim(rnode_t *);
static int nfs_active_data_reclaim(rnode_t *);
static int nfs_free_reclaim(void);
static int nfs_active_reclaim(void);
static int nfs_rnode_reclaim(void);
static void nfs_reclaim(void *);
static int failover_safe(failinfo_t *);
static int failover_wait(mntinfo_t *);
static int failover_remap(failinfo_t *);
static int failover_lookup(char *, vnode_t *,
vnode_t **);
static void nfs_free_r_path(rnode_t *);
static void nfs_set_vroot(vnode_t *);
/*
*/
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);
/*
* used in mount policy
*/
/*
* EIO or EINTR are not recoverable errors.
*/
#ifdef DEBUG
#else
#endif
/*
* Common handle get program for NFS, NFS ACL, and NFS AUTH client.
*/
static int
{
int error;
return (EINVAL);
/*
* Find an unused handle or create one
*/
top:
/*
* Find the correct entry in the cache to check for free
* client handles. The search is based on the RPC program
* number, program version number, dev_t for the transport
* device, and the protocol family.
*/
break;
}
/*
* If we didn't find a cache entry for this quadruple, then
* create one. If we don't have one already preallocated,
* then drop the cache lock, create one, and then start over.
* If we did have a preallocated entry, then just add it to
* the front of the list.
*/
newch->ch_timesused = 0;
KM_SLEEP);
goto top;
}
/*
* We found a cache entry, but if it isn't on the front of the
* list, then move it to the front of the list to try to take
* advantage of locality of operations.
*/
}
/*
* If there was a free client handle cached, then remove it
* from the list, init it, and use it.
*/
}
}
ch->ch_timesused++;
return (0);
}
/*
* There weren't any free client handles which fit, so allocate
* a new one and use that.
*/
#ifdef DEBUG
#endif
}
if (error != 0) {
#ifdef DEBUG
#endif
/*
* Warning is unnecessary if error is EINTR.
*/
"clget: couldn't create handle: %m\n");
}
return (error);
}
#ifdef DEBUG
#endif
}
ch->ch_timesused++;
return (0);
}
int
{
}
static int
{
int error;
/*
* Set read buffer size to rsize
* and add room for RPC headers.
*/
if (ci.cl_readsize != 0)
/*
* If soft mount and server is down, just try once;
* that is, do not retransmit.
*/
ci.cl_retrans = 0;
else
/*
* clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
* security flavor, the client tries to establish a security context
* by contacting the server. If the connection is timed out or reset,
* e.g. server reboot, we will try again.
*/
do {
if (error == 0)
break;
/*
* For forced unmount or zone shutdown, bail out, no retry.
*/
break;
}
/* do not retry for softmount */
break;
/* let the caller deal with the failover case */
if (FAILOVER_MOUNT(mi))
break;
return (error);
}
static int
{
int error;
/*
* Set read buffer size to rsize
* and add room for RPC headers.
*/
if (ci.cl_readsize != 0)
/*
* If soft mount and server is down, just try once;
* that is, do not retransmit.
*/
ci.cl_retrans = 0;
else
/*
* clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
* security flavor, the client tries to establish a security context
* by contacting the server. If the connection is timed out or reset,
* e.g. server reboot, we will try again.
*/
do {
if (error == 0)
break;
/*
* For forced unmount or zone shutdown, bail out, no retry.
*/
break;
}
/* do not retry for softmount */
break;
/* let the caller deal with the failover case */
if (FAILOVER_MOUNT(mi))
break;
return (error);
}
static void
{
}
/*
* Timestamp this cache entry so that we know when it was last
* used.
*/
/*
* Add the free client handle to the front of the list.
* This way, the list will be sorted in youngest to oldest
* order.
*/
}
void
{
}
static void
{
#ifdef DEBUG
int n = 0;
#endif
/*
* Need to reclaim some memory, so step through the cache
* looking through the lists for entries which can be freed.
*/
/*
* Here we step through each non-NULL quadruple and start to
* construct the reclaim list pointed to by cp. Note that
* cp will contain all eligible chtab entries. When this traversal
* completes, chtab entries from the last quadruple will be at the
* front of cp and entries from previously inspected quadruples have
* been appended to the rear of cp.
*/
continue;
/*
* Search each list for entries older than
* cl_holdtime seconds. The lists are maintained
* in youngest to oldest order so that when the
* first entry is found which is old enough, then
* all of the rest of the entries on the list will
* be old enough as well.
*/
}
}
}
}
/*
* If cp is empty, then there is nothing to reclaim here.
*/
return;
/*
* Step through the list of entries to free, destroying each client
* handle and kmem_free'ing the memory for each entry.
*/
#ifdef DEBUG
n++;
#endif
}
#ifdef DEBUG
/*
* Update clalloc so that nfsstat shows the current number
* of allocated client handles.
*/
#endif
}
/* ARGSUSED */
static void
{
#ifdef DEBUG
#endif
/*
* The system is low on memory; go through and try to reclaim some from
* every zone on the system.
*/
}
/*
* Minimum time-out values indexed by call type
* These units are in "eighths" of a second to avoid multiplies
*/
static unsigned int minimum_timeo[] = {
6, 7, 10
};
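/*
 * Illustrative sketch only: converting the "eighths of a second" units
 * of minimum_timeo[] above into clock ticks.  Because the unit is a
 * power of two, the scaling is a shift rather than a multiply or
 * divide; hz here is an assumed ticks-per-second parameter standing in
 * for the kernel's hz.
 */
static unsigned int
sketch_eighths_to_ticks(unsigned int eighths, unsigned int hz)
{
	return ((eighths * hz) >> 3);	/* (eighths / 8) seconds in ticks */
}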
/*
* Back off for retransmission timeout, MAXTIMO is in hz of a sec
*/
/*
* Function called when rfscall notices that we have been
* re-transmitting, or when we get a response without retransmissions.
* Return 1 if the transfer size was adjusted down - 0 if no change.
*/
static int
{
int kind;
int r = 0;
if (flag == FEEDBACK_REXMIT1) {
goto done;
r = 1;
}
r = 1;
}
} else if (flag == FEEDBACK_OK) {
if (kind == 0 ||
goto done;
if (kind == 1) {
goto done;
} else if (kind == 2) {
goto done;
}
}
done:
return (r);
}
#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif
int
{
int rpcerror;
if (!rpcerror) {
/*
* See crnetadjust() for comments.
*/
if (*statusp == NFSERR_ACCES &&
#ifdef DEBUG
#endif
#ifdef DEBUG
if (*statusp == NFSERR_ACCES)
#endif
}
} else if (rpc_status == RPC_PROCUNAVAIL) {
rpcerror = 0;
}
return (rpcerror);
}
#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif
int
{
int rpcerror;
int user_informed;
user_informed = 0;
do {
if (!rpcerror) {
if (*statusp == NFS3ERR_JUKEBOX) {
break;
}
if (!user_informed) {
user_informed = 1;
"file temporarily unavailable on the server, retrying...\n");
}
}
/*
* See crnetadjust() for comments.
*/
else if (*statusp == NFS3ERR_ACCES &&
#ifdef DEBUG
#endif
#ifdef DEBUG
if (*statusp == NFS3ERR_ACCES)
#endif
}
}
return (rpcerror);
}
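/*
 * Illustrative sketch only of the jukebox retry pattern above: retry
 * while the server reports the file temporarily unavailable, telling
 * the user exactly once.  sketch_call() and SKETCH_JUKEBOX are
 * stand-ins for the real RPC and NFS3ERR_JUKEBOX; the real code also
 * waits nfs3_jukebox_delay ticks between attempts.
 */
#define	SKETCH_JUKEBOX	10008

static int
sketch_jukebox_retry(int (*sketch_call)(int *), int *statusp)
{
	int error, user_informed = 0;

	do {
		error = sketch_call(statusp);
		if (error != 0 || *statusp != SKETCH_JUKEBOX)
			break;
		if (!user_informed) {
			user_informed = 1;
			/* uprintf("file temporarily unavailable ...") */
		}
		/* delay(nfs3_jukebox_delay) would go here */
	} while (*statusp == SKETCH_JUKEBOX);
	return (error);
}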
mi->mi_readers++; \
}
mi->mi_readers--; \
if (mi->mi_readers == 0) \
}
static int
{
char *msg;
#ifdef DEBUG
char *bufp;
#endif
/*
* In case of forced unmount or zone shutdown, return EIO.
*/
}
/*
* Remember the transfer sizes in case
* nfs_feedback changes them underneath us.
*/
/*
* NFS client failover support
*
* If this rnode is not in sync with the current server (VALID_FH),
* we'd like to do a remap to get in sync. We can be interrupted
* in failover_remap(), and if so we'll bail. Otherwise, we'll
* use the best info we have to try the RPC. Part of that is
* unconditionally updating the filehandle copy kept for V3.
*
* Locking: INC_READERS/DEC_READERS is a poor man's interruptible
* rw_enter(); we're trying to keep the current server from being
* changed on us until we're done with the remapping and have a
* matching client handle.  We don't want to send a filehandle
* to the wrong host.
*/
if (FAILOVER_MOUNT(mi)) {
if (failover_wait(mi)) {
return (EINTR);
}
}
if (fi) {
int remaperr;
if (remaperr != 0) {
#ifdef DEBUG
"rfscall couldn't failover: %m");
#endif
/*
* If failover_remap returns ETIMEDOUT
* and the filesystem is hard mounted
* we have to retry the call with a new
* server.
*/
goto failoverretry;
}
return (remaperr);
}
}
}
}
/* For TSOL, use a new cred which has net_mac_aware flag */
if (!cred_cloned && is_system_labeled()) {
cred_cloned = TRUE;
}
/*
* clget() calls clnt_tli_kinit() which clears the xid, so we
* are guaranteed to reprocess the retry as a new request.
*/
if (FAILOVER_MOUNT(mi)) {
failover_safe(fi)) {
goto failoverretry;
}
}
} else {
}
/*
* If hard mounted fs, retry call forever unless hard error occurs.
*/
do {
status = RPC_FAILED;
break;
}
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM. (Preserving the existing masks).
* Mask out SIGINT if mount option nointr is specified.
*/
/*
* If there is a current signal, then don't bother
* even trying to send out the request because we
* won't be able to block waiting for the response.
* Simply assume RPC_INTR and get on with it.
*/
else {
}
/*
* restore original signal mask
*/
switch (status) {
case RPC_SUCCESS:
break;
case RPC_INTR:
/*
* There is no way to recover from this error,
* even if mount option nointr is specified.
* SIGKILL, for example, cannot be blocked.
*/
break;
case RPC_UDERROR:
/*
* If the NFS server is local (vold) and
* it goes away then we get RPC_UDERROR.
* This is a retryable error, so we would
* loop, so check to see if the specific
* error was ECONNRESET, indicating that
* the target did not exist at all.  If so,
* return with RPC_PROGUNAVAIL and
* ECONNRESET to indicate why.
*/
break;
}
/*FALLTHROUGH*/
default: /* probably RPC_TIMEDOUT */
if (IS_UNRECOVERABLE_RPC(status))
break;
/*
* increment server not responding count
*/
mi->mi_noresponse++;
#ifdef DEBUG
#endif
break;
}
/*
* The call is in progress (over COTS).
* Try the CLNT_CALL again, but don't
* print a noisy error message.
*/
if (status == RPC_INPROGRESS) {
break;
}
if (flags & RFSCALL_SOFT)
break;
/*
* On zone shutdown, just move on.
*/
break;
}
/*
* NFS client failover support
*
* If the current server just failed us, we'll
* start the process of finding a new server.
* After that, we can just retry.
*/
goto failoverretry;
}
if ((status == RPC_CANTSEND) &&
msg = SRV_QFULL_MSG;
else
#ifdef DEBUG
svp->sv_hostname);
#else
#endif
} else
if (*douprintf && nfs_has_ctty()) {
*douprintf = 0;
#ifdef DEBUG
svp->sv_hostname);
#else
#endif
}
/*
* If we are doing dynamic adjustment of transfer size and this
* is a read or write call, and the transfer size changed while
* retransmitting or the feedback routine changed the transfer
* size, then exit rfscall so that the transfer size can be
* adjusted at the vnops level.
*/
/*
* On read or write calls, return
* back to the vnode ops level if
* the transfer size changed.
*/
if (cred_cloned)
return (ENFS_TRYAGAIN);
}
}
} while (tryagain);
if (status != RPC_SUCCESS) {
/*
* Let soft mounts use the timed out message.
*/
if (status == RPC_INPROGRESS)
#ifdef DEBUG
if (nfs_has_ctty()) {
uprintf("NFS%d %s failed for %s\n",
bufp);
}
}
#else
"NFS %s failed for server %s: error %d (%s)\n",
if (nfs_has_ctty()) {
"NFS %s failed for server %s: error %d (%s)\n",
}
}
#endif
/*
* when CLNT_CALL() fails with RPC_AUTHERROR,
* re_errno is set appropriately depending on
* the authentication error
*/
if (status == RPC_VERSMISMATCH ||
}
} else {
/*
* Test the value of mi_down and mi_printed without
* holding the mi_lock mutex. If they are both zero,
* then it is okay to skip the down and printed
* processing. This saves on a mutex_enter and
* mutex_exit pair for a normal, successful RPC.
* This was just complete overhead.
*/
#ifdef DEBUG
#else
svp->sv_hostname);
#endif
} else
}
if (*douprintf == 0) {
#ifdef DEBUG
uprintf("NFS%d server %s ok\n",
#else
#endif
*douprintf = 1;
}
}
if (cred_cloned)
if (rpc_status != NULL)
}
#ifdef DEBUG
static int acl2call_hits = 0;
static int acl2call_misses = 0;
#endif
int
{
int rpcerror;
if (!rpcerror) {
/*
* See comments with crnetadjust().
*/
if (*statusp == NFSERR_ACCES &&
#ifdef DEBUG
#endif
#ifdef DEBUG
if (*statusp == NFSERR_ACCES)
#endif
}
}
return (rpcerror);
}
#ifdef DEBUG
static int acl3call_hits = 0;
static int acl3call_misses = 0;
#endif
int
{
int rpcerror;
int user_informed;
user_informed = 0;
do {
if (!rpcerror) {
if (*statusp == NFS3ERR_JUKEBOX) {
if (!user_informed) {
user_informed = 1;
"file temporarily unavailable on the server, retrying...\n");
}
}
/*
* See crnetadjust() for comments.
*/
else if (*statusp == NFS3ERR_ACCES &&
#ifdef DEBUG
#endif
#ifdef DEBUG
if (*statusp == NFS3ERR_ACCES)
#endif
}
}
return (rpcerror);
}
static int
{
#if 0 /* notyet */
#endif
#ifdef DEBUG
char *bufp;
#endif
#if 0 /* notyet */
#endif
}
#if 0 /* notyet */
/*
* Remember the transfer sizes in case
* nfs_feedback changes them underneath us.
*/
#endif
/*
* NFS client failover support
*
* If this rnode is not in sync with the current server (VALID_FH),
* we'd like to do a remap to get in sync. We can be interrupted
* in failover_remap(), and if so we'll bail. Otherwise, we'll
* use the best info we have to try the RPC. Part of that is
* unconditionally updating the filehandle copy kept for V3.
*
* Locking: INC_READERS/DEC_READERS is a poor man's interruptible
* rw_enter(); we're trying to keep the current server from being
* changed on us until we're done with the remapping and have a
* matching client handle.  We don't want to send a filehandle
* to the wrong host.
*/
if (FAILOVER_MOUNT(mi)) {
if (failover_wait(mi)) {
return (EINTR);
}
}
if (fi) {
int remaperr;
if (remaperr != 0) {
#ifdef DEBUG
"aclcall couldn't failover: %m");
#endif
/*
* If failover_remap returns ETIMEDOUT
* and the filesystem is hard mounted
* we have to retry the call with a new
* server.
*/
goto failoverretry;
}
return (remaperr);
}
}
}
}
/* For TSOL, use a new cred which has net_mac_aware flag */
if (!cred_cloned && is_system_labeled()) {
cred_cloned = TRUE;
}
/*
* acl_clget() calls clnt_tli_kinit() which clears the xid, so we
* are guaranteed to reprocess the retry as a new request.
*/
if (FAILOVER_MOUNT(mi)) {
failover_safe(fi)) {
goto failoverretry;
}
}
if (cred_cloned)
}
} else {
}
/*
* If hard mounted fs, retry call forever unless hard error occurs.
*/
do {
status = RPC_FAILED;
break;
}
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM. (Preserving the existing masks).
* Mask out SIGINT if mount option nointr is specified.
*/
/*
* If there is a current signal, then don't bother
* even trying to send out the request because we
* won't be able to block waiting for the response.
* Simply assume RPC_INTR and get on with it.
*/
else {
}
/*
* restore original signal mask
*/
switch (status) {
case RPC_SUCCESS:
#if 0 /* notyet */
#endif
break;
/*
* Unfortunately, there are servers in the world which
* are not coded correctly. They are not prepared to
* handle RPC requests to the NFS port which are not
* NFS requests. Thus, they may try to process the
* NFS_ACL request as if it were an NFS request. This
* does not work. Generally, an error will be generated
* on the client because it will not be able to decode
* the response from the server. However, it seems
* possible that the server may not be able to decode
* the arguments. Thus, the criteria for deciding
* whether the server supports NFS_ACL or not is whether
* the following RPC errors are returned from CLNT_CALL.
*/
case RPC_CANTDECODERES:
case RPC_PROGUNAVAIL:
case RPC_CANTDECODEARGS:
case RPC_PROGVERSMISMATCH:
break;
/*
* If the server supports NFS_ACL but not the new ops
* for extended attributes, make sure we don't retry.
*/
case RPC_PROCUNAVAIL:
break;
case RPC_INTR:
/*
* There is no way to recover from this error,
* even if mount option nointr is specified.
* SIGKILL, for example, cannot be blocked.
*/
break;
case RPC_UDERROR:
/*
* If the NFS server is local (vold) and
* it goes away then we get RPC_UDERROR.
* This is a retryable error, so we would
* loop, so check to see if the specific
* error was ECONNRESET, indicating that
* the target did not exist at all.  If so,
* return with RPC_PROGUNAVAIL and
* ECONNRESET to indicate why.
*/
break;
}
/*FALLTHROUGH*/
default: /* probably RPC_TIMEDOUT */
if (IS_UNRECOVERABLE_RPC(status))
break;
/*
* increment server not responding count
*/
mi->mi_noresponse++;
#ifdef DEBUG
#endif
break;
}
/*
* The call is in progress (over COTS).
* Try the CLNT_CALL again, but don't
* print a noisy error message.
*/
if (status == RPC_INPROGRESS) {
break;
}
if (flags & RFSCALL_SOFT)
break;
/*
* On zone shutdown, just move on.
*/
break;
}
/*
* NFS client failover support
*
* If the current server just failed us, we'll
* start the process of finding a new server.
* After that, we can just retry.
*/
goto failoverretry;
}
#ifdef DEBUG
"NFS_ACL%d server %s not responding still trying\n",
#else
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
} else
if (*douprintf && nfs_has_ctty()) {
*douprintf = 0;
#ifdef DEBUG
"NFS_ACL%d server %s not responding still trying\n",
#else
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
}
#if 0 /* notyet */
/*
* If we are doing dynamic adjustment of transfer size and this
* is a read or write call, and the transfer size changed while
* retransmitting or the feedback routine changed the transfer
* size, then exit rfscall so that the transfer size can be
* adjusted at the vnops level.
*/
/*
* On read or write calls, return
* back to the vnode ops level if
* the transfer size changed.
*/
if (cred_cloned)
return (ENFS_TRYAGAIN);
}
#endif
}
} while (tryagain);
if (status != RPC_SUCCESS) {
/*
* Let soft mounts use the timed out message.
*/
if (status == RPC_INPROGRESS)
if (status == RPC_CANTDECODERES ||
status == RPC_PROGUNAVAIL ||
status == RPC_PROCUNAVAIL ||
status == RPC_CANTDECODEARGS ||
#ifdef DEBUG
if (nfs_has_ctty()) {
uprintf("NFS_ACL%d %s failed for %s\n",
bufp);
}
}
#else
"NFS %s failed for server %s: error %d (%s)\n",
if (nfs_has_ctty()) {
"NFS %s failed for server %s: error %d (%s)\n",
}
#endif
/*
* when CLNT_CALL() fails with RPC_AUTHERROR,
* re_errno is set appropriately depending on
* the authentication error
*/
if (status == RPC_VERSMISMATCH ||
}
} else {
/*
* Test the value of mi_down and mi_printed without
* holding the mi_lock mutex. If they are both zero,
* then it is okay to skip the down and printed
* processing. This saves on a mutex_enter and
* mutex_exit pair for a normal, successful RPC.
* This was just complete overhead.
*/
#ifdef DEBUG
#else
svp->sv_hostname);
#endif
} else
}
if (*douprintf == 0) {
#ifdef DEBUG
uprintf("NFS_ACL%d server %s ok\n",
#else
#endif
*douprintf = 1;
}
}
if (cred_cloned)
#if 0 /* notyet */
#endif
}
int
{
else
else
else
else
else {
/* check time validity */
return (EOVERFLOW);
}
}
else {
/* check time validity */
return (EOVERFLOW);
}
}
return (0);
}
int
{
else {
}
else {
}
else {
}
else {
}
else {
/* check time validity */
return (EOVERFLOW);
}
}
else {
/* check time validity */
return (EOVERFLOW);
}
}
return (0);
}
void
{
}
void
{
}
int
{
int error;
if (error)
return (error);
/*
* To determine the expected group-id of the created file:
* 1) If the filesystem was not mounted with the Old-BSD-compatible
* GRPID option, and the directory's set-gid bit is clear,
* then use the process's gid.
* 2) Otherwise, set the group-id to the gid of the parent directory.
*/
else
return (0);
}
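/*
 * Illustrative sketch only of the group-id rule above.  The mode bit
 * 02000 is S_ISGID; grpid_mount, dir_mode, dir_gid, and proc_gid are
 * simplified stand-ins for the mount option, the parent directory's
 * attributes, and the process credential.
 */
static unsigned int
sketch_expected_gid(int grpid_mount, unsigned int dir_mode,
    unsigned int dir_gid, unsigned int proc_gid)
{
	if (!grpid_mount && (dir_mode & 02000) == 0)
		return (proc_gid);	/* case 1: the process's gid */
	return (dir_gid);		/* case 2: inherit from the parent */
}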
int
{
int error;
if (error)
return (error);
/*
* Modify the expected mode (om) so that the set-gid bit matches
* that of the parent directory (dvp).
*/
else
return (0);
}
void
{
}
} else {
}
}
}
/*
* Free the resources associated with an rnode.
*/
static void
{
char *contents;
int size;
int error;
/*
* Before freeing anything, wait until all asynchronous
* activity is done on this rnode. This will allow all
* asynchronous read ahead and write behind i/o's to
* finish.
*/
/*
* Flush and invalidate all pages associated with the vnode.
*/
if (vn_has_cached_data(vp)) {
}
}
}
/*
* Free any held credentials and caches which may be associated
* with this rnode.
*/
/*
* Free the held credential.
*/
/*
* Free the access cache entries.
*/
(void) nfs_access_purge_rp(rp);
/*
* Free the readdir cache entries.
*/
if (HAVE_RDDIR_CACHE(rp))
/*
* Free the symbolic link cache.
*/
}
/*
* Free any cached ACL.
*/
/*
* Free any cached pathconf information.
*/
}
/*
* Return a vnode for the given NFS Version 2 file handle.
* If no rnode exists for this fhandle, create one and put it
* into the hash queues. If the rnode for this fhandle
* already exists, return it.
*
* Note: make_rnode() may upgrade the hash bucket lock to exclusive.
*/
vnode_t *
{
int newnode;
int index;
if (!newnode) {
} else {
else
/*
* A translation here seems to be necessary
* because this function can be called
* with `attr' that has come from the wire,
* and been operated on by vattr_to_nattr().
* See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
* ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
* ->makenfsnode().
*/
else
}
} else {
if (newnode) {
}
}
return (vp);
}
/*
* Return a vnode for the given NFS Version 3 file handle.
* If no rnode exists for this fhandle, create one and put it
* into the hash queues. If the rnode for this fhandle
* already exists, return it.
*
* Note: make_rnode() may upgrade the hash bucket lock to exclusive.
*/
vnode_t *
{
int newnode;
int index;
if (newnode) {
}
return (vp);
}
if (!newnode) {
} else {
}
return (vp);
}
vnode_t *
{
int newnode;
int index;
if (newnode) {
}
return (vp);
}
if (!newnode) {
} else {
else
}
return (vp);
}
/*
* Read this comment before making changes to rtablehash()!
* This is a hash function in which seemingly obvious and harmless
* changes can cause escalations costing millions of dollars!
* Know what you are doing.
*
* rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
* algorithm is currently detailed here:
*
*
* Of course, the above link may not be valid by the time you are reading
* this, but suffice it to say that the one-at-a-time algorithm works well in
* almost all cases. If you are changing the algorithm be sure to verify that
* the hash algorithm still provides even distribution in all cases and with
* any server returning filehandles in whatever order (sequential or random).
*/
static int
{
char *key;
}
return (hash & rtablemask);
}
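/*
 * Illustrative sketch only: the published one-at-a-time algorithm
 * named above, over an arbitrary byte buffer.  This is a generic
 * rendering, not a copy of rtablehash()'s exact body; key and len
 * stand in for the filehandle bytes.
 */
static unsigned long
sketch_one_at_a_time(const char *key, unsigned long len)
{
	unsigned long hash = 0, i;

	for (i = 0; i < len; i++) {
		hash += (unsigned char)key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash);			/* caller masks with rtablemask */
}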
static vnode_t *
int (*compar)(const void *, const void *),
{
*newnode = 0;
return (vp);
}
rp = rpfreelist;
goto start;
}
}
goto start;
}
vn_invalid(vp);
/*
* destroy old locks before bzero'ing and
* recreating the locks below.
*/
/*
* Make sure that if rnode is recycled then
* VFS count is decremented properly before
* reuse.
*/
} else {
#ifdef DEBUG
#endif
}
if (FAILOVER_MOUNT(mi)) {
/*
* If replicated servers, stash pathnames
*/
char *s, *p;
#ifdef DEBUG
#endif
for (p = dnm; *p; p++)
*s++ = *p;
*s++ = '/';
for (p = nm; *p; p++)
*s++ = *p;
*s = '\0';
} else {
/* special case for root */
#ifdef DEBUG
#endif
}
}
/*
* There is a race condition if someone else
* alloc's the rnode while no locks are held, so we
* check again and recover if found.
*/
*newnode = 0;
return (vp);
}
rp_addhash(rp);
*newnode = 1;
return (vp);
}
/*
* Callback function to check if the page should be marked as
* modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
*/
int
{
return (1);
}
return (0);
}
static void
{
}
}
}
static void
{
char *path;
if (path) {
#ifdef DEBUG
#endif
}
}
/*
* Put an rnode on the free list.
*
* Rnodes which were allocated above and beyond the normal limit
* are immediately freed.
*/
void
{
/*
* If we have too many rnodes allocated and there are no
* references to this rnode, or if the rnode is no longer
* accessible because it does not reside in the hash queues,
* or if an i/o error occurred while writing to the file,
* then just free it instead of putting it on the rnode
* freelist.
*/
return;
}
}
/*
* Recheck the vnode reference count. We need to
* make sure that another reference has not been
* acquired while we were not holding v_lock. The
* rnode is not in the rnode hash queues, so the
* only way for a reference to have been acquired
* is for a VOP_PUTPAGE because the rnode was marked
* with RDIRTY or for a modified page. This
* reference may have been acquired before our call
* to rinactive. The i/o may have been completed,
* thus allowing rinactive to complete, but the
* reference to the vnode may not have been released
* yet.  In any case, the rnode cannot be destroyed
* until the other references to this vnode have been
* released. The other references will take care of
* either destroying the rnode or placing it on the
* rnode freelist. If there are no other references,
* then the rnode may be safely destroyed.
*/
return;
}
return;
}
/*
* Lock the hash queue and then recheck the reference count
* to ensure that no other threads have acquired a reference
* to indicate that the rnode should not be placed on the
* freelist. If another reference has been acquired, then
* just release this one and let the other thread complete
* the processing of adding this rnode to the freelist.
*/
return;
}
/*
* If there is no cached data or metadata for this file, then
* put the rnode on the front of the freelist so that it will
* be reused before other rnodes which may have cached data or
* metadata associated with them.
*/
if (rpfreelist == NULL) {
rpfreelist = rp;
} else {
if (!vn_has_cached_data(vp) &&
!HAVE_RDDIR_CACHE(rp) &&
rpfreelist = rp;
}
}
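/*
 * Illustrative sketch only of the freelist insertion above: in a
 * circular doubly linked list, splicing a node in just behind the head
 * and then either moving the head pointer (front) or leaving it alone
 * (rear) gives both behaviors with a single splice.  sketch_node_t is
 * an assumed stand-in for rnode_t's r_freef/r_freeb links.
 */
typedef struct sketch_node {
	struct sketch_node *freef;	/* forward link */
	struct sketch_node *freeb;	/* back link */
} sketch_node_t;

static void
sketch_addfree(sketch_node_t **head, sketch_node_t *np, int at_front)
{
	if (*head == NULL) {
		np->freef = np;
		np->freeb = np;
		*head = np;
		return;
	}
	np->freef = *head;		/* splice in just before the head */
	np->freeb = (*head)->freeb;
	(*head)->freeb->freef = np;
	(*head)->freeb = np;
	if (at_front)
		*head = np;		/* front: become the new head */
					/* rear: head stays where it was */
}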
/*
* Remove an rnode from the free list.
*
* The caller must be holding rpfreelist_lock and the rnode
* must be on the freelist.
*/
static void
{
if (rp == rpfreelist) {
if (rp == rpfreelist)
rpfreelist = NULL;
}
}
/*
* Put an rnode in the hash table.
*
* The caller must be holding the exclusive hash queue lock.
*/
static void
{
}
/*
* Remove an rnode from the hash table.
*
* The caller must be holding the hash queue lock.
*/
static void
{
}
/*
* Remove an rnode from the hash table.
*
* The caller must not be holding the hash queue lock.
*/
void
{
}
/*
* Look up an rnode by fhandle.
*
* The caller must be holding the hash queue lock, either shared or exclusive.
*/
static rnode_t *
{
/*
* remove rnode from free list, if necessary.
*/
/*
* If the rnode is on the freelist,
* then remove it and use that reference
* as the new reference. Otherwise,
* need to increment the reference count.
*/
} else {
}
} else
return (rp);
}
}
return (NULL);
}
/*
* Return 1 if there is an active vnode belonging to this vfs in the
* rtable cache.
*
* Several of these checks are done without holding the usual
* locks. This is safe because destroy_rtable(), rp_addfree(),
* etc. will redo the necessary checks before actually destroying
* any rnodes.
*/
int
{
int index;
(vn_has_cached_data(vp) &&
return (1);
}
}
}
}
return (0);
}
/*
* Destroy inactive vnodes from the hash queues which belong to this
* vfs. It is essential that we destroy all inactive vnodes during a
* forced unmount as well as during a normal unmount.
*/
void
{
int index;
/* save the hash pointer before destroying */
} else
}
}
}
/*
* This call to rp_addfree will end up destroying the
* rnode, but in a safe way with the appropriate set
* of checks done.
*/
}
}
/*
* This routine destroys all the resources associated with the rnode
* and then the rnode itself.
*/
static void
{
#ifdef DEBUG
#endif
vn_invalid(vp);
}
/*
* Flush all vnodes in this (or every) vfs.
* Used by nfs_sync and by nfs_unmount.
*/
void
{
int index;
/*
* Check to see whether there is anything to do.
*/
if (num == 0)
return;
/*
* Allocate a slot for all currently active rnodes on the
* supposition that they all may need flushing.
*/
cnt = 0;
/*
* Walk the hash queues looking for rnodes with page
* lists associated with them. Make a list of these
* files.
*/
/*
* Don't bother sync'ing a vp if it
* is part of the virtual swap device or
* if the VFS is read-only
*/
continue;
/*
* If flushing all mounted file systems or
* the vnode belongs to this vfs, has pages
* and is marked as either dirty or mmap'd,
* hold and add this vnode to the list of
* vnodes to flush.
*/
vn_has_cached_data(vp) &&
goto toomany;
}
}
}
}
/*
* Flush and release all of the files on the list.
*/
while (cnt-- > 0) {
}
/*
* Free the space allocated to hold the list.
*/
}
/*
* This probably needs to be larger than or equal to
* log2(sizeof (struct rnode)) due to the way that rnodes are
* allocated.
*/
static int
{
}
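/*
 * Illustrative sketch only: hashing an object by its address after
 * shifting off the low bits, which vary little between same-sized
 * allocations (hence the log2(sizeof (struct rnode)) remark above).
 * The shift of 9 and the extra/mask parameters are assumptions made
 * for illustration.
 */
static int
sketch_ptrhash(const void *obj, unsigned long extra, unsigned long mask)
{
	return ((int)((((unsigned long)obj >> 9) + extra) & mask));
}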
#ifdef DEBUG
static long nfs_access_cache_hits = 0;
static long nfs_access_cache_misses = 0;
#endif
{
return (NFS_ACCESS_UNKNOWN);
#ifdef DEBUG
#endif
else
} else {
#ifdef DEBUG
#endif
}
return (all);
}
}
}
#ifdef DEBUG
#endif
return (NFS_ACCESS_UNKNOWN);
}
void
{
/*
* Allocate now, assuming that an allocation will most likely be
* required. This allows the allocation to happen without
* holding the hash bucket locked.
*/
}
}
return;
}
}
}
#ifdef DEBUG
#endif
}
}
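/*
 * Illustrative sketch only of the "allocate first, lock second"
 * pattern described above, reusing the sketch_lock_t helpers from the
 * sketch near the top of this file: the entry is allocated before the
 * bucket lock is taken and simply freed again if another thread won
 * the race.  malloc/free stand in for kmem_cache_alloc()/
 * kmem_cache_free().
 */
#include <stdlib.h>

typedef struct sketch_entry {
	struct sketch_entry *next;
	int key;
} sketch_entry_t;

static void
sketch_cache_insert(sketch_entry_t **bucket, sketch_lock_t *lock, int key)
{
	sketch_entry_t *ep;
	sketch_entry_t *nep = malloc(sizeof (*nep));	/* before the lock */

	if (nep == NULL)
		return;
	nep->key = key;
	sketch_take(lock);
	for (ep = *bucket; ep != NULL; ep = ep->next) {
		if (ep->key == key) {		/* lost the race */
			sketch_drop(lock);
			free(nep);		/* discard the preallocation */
			return;
		}
	}
	nep->next = *bucket;			/* won the race: link it in */
	*bucket = nep;
	sketch_drop(lock);
}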
int
{
/*
* If there aren't any cached entries, then there is nothing
* to free.
*/
return (0);
/*
* Loop through each entry in the list pointed to in the
* rnode. Remove each of these entries from the hash
* queue that it is on and remove it from the list in
* the rnode.
*/
#ifdef DEBUG
#endif
}
return (1);
}
int
newnum(void)
{
if (newnum == 0)
return (id);
}
char *
newname(void)
{
char *news;
char *s;
const char *p;
s = news;
p = prefix;
while (*p != '\0')
*s++ = *p++;
while (id != 0) {
	*s++ = "0123456789ABCDEF"[id & 0x0f];	/* low nibble first */
	id >>= 4;
}
*s = '\0';
return (news);
}
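/*
 * Illustrative note on newname() above, assuming the digit-emission
 * line as reconstructed in the loop: the low nibble is emitted first,
 * so the id appears in the generated name as reversed hex (e.g. id
 * 0x1A2B yields the suffix "B2A1").  Only uniqueness matters here, so
 * the reversal is harmless.
 */
static void
sketch_revhex(unsigned int id, char *s)
{
	while (id != 0) {
		*s++ = "0123456789ABCDEF"[id & 0x0f];
		id >>= 4;
	}
	*s = '\0';
}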
/*
* Snapshot callback for nfs:0:nfs_client as registered with the kstat
* framework.
*/
static int
{
if (rw == KSTAT_WRITE) {
#ifdef DEBUG
/*
* Currently only the global zone can write to kstats, but we
* add the check just for paranoia.
*/
if (INGLOBALZONE(curproc))
sizeof (clstat_debug));
#endif
} else {
#ifdef DEBUG
/*
* If we're displaying the "global" debug kstat values, we
* display them as-is to all zones since in fact they apply to
* the system as a whole.
*/
sizeof (clstat_debug));
#endif
}
return (0);
}
static void *
{
#ifdef DEBUG
#endif
}
return (nfscl);
}
/*ARGSUSED*/
static void
{
return;
clreclaim_zone(nfscl, 0);
}
}
/*
* Called by endpnt_destructor to make sure the client handles are
* cleaned up before the RPC endpoints. This becomes a no-op if
* clfini_zone (above) is called first. This function is needed
* (rather than relying on clfini_zone to clean up) because the ZSD
* callbacks have no ordering mechanism, so we have no way to ensure
* that clfini_zone is called before endpnt_destructor.
*/
void
{
clreclaim_zone(nfscl, 0);
break;
}
}
}
int
nfs_subrinit(void)
{
int i;
/*
* Allocate and initialize the rnode hash queues
*/
if (nrnode <= 0)
"!setting nrnode to max value of %ld", nrnode_max);
nrnode = nrnode_max;
}
for (i = 0; i < rtablesize; i++) {
}
/*
* Allocate and initialize the access cache
*/
/*
* The initial guess is one access cache entry per rnode, unless
* nacache is set to a non-zero value, in which case it is used
* as the guess at the number of access cache entries.
*/
if (nacache > 0)
else
for (i = 0; i < acachesize; i++) {
}
/*
* Allocate and initialize the client handle cache
*/
/*
* Initialize the list of per-zone client handles (and associated data).
* This needs to be done before we call zone_key_create().
*/
/*
* Initialize the zone_key for per-zone client handle lists.
*/
/*
*/
/*
* Assign unique major number for all nfs mounts
*/
"nfs: init: can't get unique device number");
nfs_major = 0;
}
nfs_minor = 0;
if (nfs3_jukebox_delay == 0)
return (0);
}
void
nfs_subrfini(void)
{
int i;
/*
* Deallocate the rnode hash queues
*/
for (i = 0; i < rtablesize; i++)
/*
* Deallocated the access cache
*/
for (i = 0; i < acachesize; i++)
/*
* Deallocate the client handle cache
*/
/*
*/
(void) zone_key_delete(nfsclnt_zone_key);
}
enum nfsstat
{
switch (error) {
case EOPNOTSUPP:
return (NFSERR_OPNOTSUPP);
case ENAMETOOLONG:
return (NFSERR_NAMETOOLONG);
case ENOTEMPTY:
return (NFSERR_NOTEMPTY);
case EDQUOT:
return (NFSERR_DQUOT);
case ESTALE:
return (NFSERR_STALE);
case EREMOTE:
return (NFSERR_REMOTE);
case ENOSYS:
return (NFSERR_OPNOTSUPP);
case EOVERFLOW:
return (NFSERR_INVAL);
default:
}
/* NOTREACHED */
}
int
{
switch (status) {
case NFSERR_OPNOTSUPP:
return (EOPNOTSUPP);
case NFSERR_NAMETOOLONG:
return (ENAMETOOLONG);
case NFSERR_NOTEMPTY:
return (ENOTEMPTY);
case NFSERR_DQUOT:
return (EDQUOT);
case NFSERR_STALE:
return (ESTALE);
case NFSERR_REMOTE:
return (EREMOTE);
case NFSERR_WFLUSH:
return (EIO);
default:
return ((int)status);
}
/* NOTREACHED */
}
enum nfsstat3
{
#ifdef DEBUG
switch (error) {
case 0:
return (NFS3_OK);
case EPERM:
return (NFS3ERR_PERM);
case ENOENT:
return (NFS3ERR_NOENT);
case EIO:
return (NFS3ERR_IO);
case ENXIO:
return (NFS3ERR_NXIO);
case EACCES:
return (NFS3ERR_ACCES);
case EEXIST:
return (NFS3ERR_EXIST);
case EXDEV:
return (NFS3ERR_XDEV);
case ENODEV:
return (NFS3ERR_NODEV);
case ENOTDIR:
return (NFS3ERR_NOTDIR);
case EISDIR:
return (NFS3ERR_ISDIR);
case EINVAL:
return (NFS3ERR_INVAL);
case EFBIG:
return (NFS3ERR_FBIG);
case ENOSPC:
return (NFS3ERR_NOSPC);
case EROFS:
return (NFS3ERR_ROFS);
case EMLINK:
return (NFS3ERR_MLINK);
case ENAMETOOLONG:
return (NFS3ERR_NAMETOOLONG);
case ENOTEMPTY:
return (NFS3ERR_NOTEMPTY);
case EDQUOT:
return (NFS3ERR_DQUOT);
case ESTALE:
return (NFS3ERR_STALE);
case EREMOTE:
return (NFS3ERR_REMOTE);
case ENOSYS:
case EOPNOTSUPP:
return (NFS3ERR_NOTSUPP);
case EOVERFLOW:
return (NFS3ERR_INVAL);
default:
}
#else
switch (error) {
case ENAMETOOLONG:
return (NFS3ERR_NAMETOOLONG);
case ENOTEMPTY:
return (NFS3ERR_NOTEMPTY);
case EDQUOT:
return (NFS3ERR_DQUOT);
case ESTALE:
return (NFS3ERR_STALE);
case ENOSYS:
case EOPNOTSUPP:
return (NFS3ERR_NOTSUPP);
case EREMOTE:
return (NFS3ERR_REMOTE);
case EOVERFLOW:
return (NFS3ERR_INVAL);
default:
}
#endif
}
int
{
#ifdef DEBUG
switch (status) {
case NFS3_OK:
return (0);
case NFS3ERR_PERM:
return (EPERM);
case NFS3ERR_NOENT:
return (ENOENT);
case NFS3ERR_IO:
return (EIO);
case NFS3ERR_NXIO:
return (ENXIO);
case NFS3ERR_ACCES:
return (EACCES);
case NFS3ERR_EXIST:
return (EEXIST);
case NFS3ERR_XDEV:
return (EXDEV);
case NFS3ERR_NODEV:
return (ENODEV);
case NFS3ERR_NOTDIR:
return (ENOTDIR);
case NFS3ERR_ISDIR:
return (EISDIR);
case NFS3ERR_INVAL:
return (EINVAL);
case NFS3ERR_FBIG:
return (EFBIG);
case NFS3ERR_NOSPC:
return (ENOSPC);
case NFS3ERR_ROFS:
return (EROFS);
case NFS3ERR_MLINK:
return (EMLINK);
case NFS3ERR_NAMETOOLONG:
return (ENAMETOOLONG);
case NFS3ERR_NOTEMPTY:
return (ENOTEMPTY);
case NFS3ERR_DQUOT:
return (EDQUOT);
case NFS3ERR_STALE:
return (ESTALE);
case NFS3ERR_REMOTE:
return (EREMOTE);
case NFS3ERR_BADHANDLE:
return (ESTALE);
case NFS3ERR_NOT_SYNC:
return (EINVAL);
case NFS3ERR_BAD_COOKIE:
return (ENOENT);
case NFS3ERR_NOTSUPP:
return (EOPNOTSUPP);
case NFS3ERR_TOOSMALL:
return (EINVAL);
case NFS3ERR_SERVERFAULT:
return (EIO);
case NFS3ERR_BADTYPE:
return (EINVAL);
case NFS3ERR_JUKEBOX:
return (ENXIO);
default:
return ((int)status);
}
#else
switch (status) {
case NFS3ERR_NAMETOOLONG:
return (ENAMETOOLONG);
case NFS3ERR_NOTEMPTY:
return (ENOTEMPTY);
case NFS3ERR_DQUOT:
return (EDQUOT);
case NFS3ERR_STALE:
case NFS3ERR_BADHANDLE:
return (ESTALE);
case NFS3ERR_NOTSUPP:
return (EOPNOTSUPP);
case NFS3ERR_REMOTE:
return (EREMOTE);
case NFS3ERR_NOT_SYNC:
case NFS3ERR_TOOSMALL:
case NFS3ERR_BADTYPE:
return (EINVAL);
case NFS3ERR_BAD_COOKIE:
return (ENOENT);
case NFS3ERR_SERVERFAULT:
return (EIO);
case NFS3ERR_JUKEBOX:
return (ENXIO);
default:
return ((int)status);
}
#endif
}
{
#ifdef DEBUG
#endif
}
return (rc);
}
static void
{
#ifdef DEBUG
#endif
#ifdef DEBUG
#else
#endif
}
}
void
{
}
void
{
} else
}
#ifdef DEBUG
char *
{
char *rc;
return (rc);
}
void
{
}
#endif
static int
{
char *contents;
int size;
int freed;
/*
* Free any held credentials and caches which
* may be associated with this rnode.
*/
/*
* Free the access cache entries.
*/
if (!HAVE_RDDIR_CACHE(rp) &&
return (freed);
/*
* Free the readdir cache entries
*/
if (HAVE_RDDIR_CACHE(rp))
/*
* Free the symbolic link cache.
*/
}
/*
* Free any cached ACL.
*/
/*
* Free any cached pathconf information.
*/
return (1);
}
static int
{
char *contents;
int size;
int freed;
/*
* Free any held credentials and caches which
* may be associated with this rnode.
*/
return (0);
/*
* Free the access cache entries.
*/
if (!HAVE_RDDIR_CACHE(rp) &&
return (freed);
/*
* Free the readdir cache entries
*/
if (HAVE_RDDIR_CACHE(rp))
/*
* Free the symbolic link cache.
*/
}
/*
* Free any cached ACL.
*/
/*
* Free any cached pathconf information.
*/
return (1);
}
static int
nfs_free_reclaim(void)
{
int freed;
#ifdef DEBUG
#endif
freed = 0;
rp = rpfreelist;
do {
if (nfs_free_data_reclaim(rp))
freed = 1;
}
return (freed);
}
static int
nfs_active_reclaim(void)
{
int freed;
int index;
#ifdef DEBUG
#endif
freed = 0;
if (nfs_active_data_reclaim(rp))
freed = 1;
}
}
return (freed);
}
static int
nfs_rnode_reclaim(void)
{
int freed;
#ifdef DEBUG
#endif
freed = 0;
continue;
}
}
/*
* This call to rp_addfree will end up destroying the
* rnode, but in a safe way with the appropriate set
* of checks done.
*/
}
return (freed);
}
/*ARGSUSED*/
static void
{
#ifdef DEBUG
#endif
if (nfs_free_reclaim())
return;
if (nfs_active_reclaim())
return;
(void) nfs_rnode_reclaim();
}
/*
* NFS client failover support
*
* Routines to copy filehandles
*/
void
{
}
void
{
}
/*
* NFS client failover support
*
* failover_safe() will test various conditions to ensure that
* failover is permitted for this vnode. It will be denied
* if:
* 1) the operation in progress does not support failover (NULL fi)
* 2) there are no available replicas (NULL mi_servers->sv_next)
* 3) any locks are outstanding on this file
*/
static int
{
/*
* Does this op permit failover?
*/
return (0);
/*
* Are there any alternates to failover to?
*/
return (0);
/*
* Disable check; we've forced local locking
*
* if (flk_has_remote_locks(fi->vp))
* return (0);
*/
/*
* If we have no partial path, we can't do anything
*/
return (0);
return (1);
}
/*
* NFS client failover support
*
* failover_newserver() will start a search for a new server,
* preferably by starting an async thread to do the work. If
* someone is already doing this (recognizable by MI_BINDINPROG
* being set), it will simply return and the calling thread
* will queue on the mi_failover_cv condition variable.
*/
static void
{
/*
* Check if someone else is doing this already
*/
return;
}
/*
* Need to hold the vfs struct so that it can't be released
* while the failover thread is selecting a new server.
*/
/*
* Start a thread to do the real searching.
*/
}
/*
* NFS client failover support
*
* failover_thread() will find a new server to replace the one
* currently in use, wake up other threads waiting on this mount
* point, and die. It will start at the head of the server list
* and poll servers until it finds one with an NFS server which is
* registered and responds to a NULL procedure ping.
*
* XXX failover_thread is unsafe within the scope of the
* present model defined for cpr to suspend the system.
* Specifically, over-the-wire calls made by the thread
* are unsafe. The thread needs to be reevaluated in case of
* future updates to the cpr suspend model.
*/
static void
{
int error;
int oncethru = 0;
int index;
char *srvnames;
#ifdef DEBUG
/*
* This is currently only needed to access counters which exist on
* DEBUG kernels, hence we don't want to pay the penalty of the lookup
* on non-DEBUG kernels.
*/
#endif
/*
* It's safe to piggyback on the mi_lock since failover_newserver()
* guarantees that there will be only one failover thread
* per mountinfo at any instant.
*/
"failover_thread");
while (mi->mi_readers) {
}
/*
* Ping the null NFS procedure of every server in
* the list until one responds. We always start
* at the head of the list and always skip the one
* that is current, since it's caused us a problem.
*/
continue;
/*
* If the file system was forcibly umounted
* while trying to do a failover, then just
* give up on the failover. It won't matter
* what the server is.
*/
goto done;
}
if (error)
continue;
if (status == RPC_SUCCESS) {
#ifdef DEBUG
"NFS%d: failing over: selecting original server %s",
#else
"NFS: failing over: selecting original server %s",
svp->sv_hostname);
#endif
} else {
#ifdef DEBUG
"NFS%d: failing over from %s to %s",
svp->sv_hostname);
#else
"NFS: failing over from %s to %s",
svp->sv_hostname);
#endif
}
break;
}
}
if (!oncethru) {
#ifdef DEBUG
"NFS%d servers %s not responding "
#else
"still trying\n", srvnames);
#endif
oncethru = 1;
}
}
}
if (oncethru) {
#ifdef DEBUG
#else
#endif
}
(void) nfs_free_data_reclaim(rp);
rp_addhash(rp);
} else
}
done:
if (oncethru)
mi->mi_failover++;
#ifdef DEBUG
#endif
}
zthread_exit();
/* NOTREACHED */
}
/*
* NFS client failover support
*
* failover_wait() will put the thread to sleep until MI_BINDINPROG
* is cleared, meaning that failover is complete. Called with
* mi_lock mutex held.
*/
static int
{
/*
* If someone else is hunting for a living server,
* sleep until it's done. After our sleep, we may
* be bound to the right server and get off cheaply.
*/
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM. (Preserving the existing masks).
* Mask out SIGINT if mount option nointr is specified.
*/
/*
* restore original signal mask
*/
return (EINTR);
}
/*
* restore original signal mask
*/
}
return (0);
}
/*
* NFS client failover support
*
* failover_remap() will do a partial pathname lookup and find the
* desired vnode on the current server. The interim vnode will be
* discarded after we pilfer the new filehandle.
*
* Side effects:
* - This routine will also update the filehandle in the args structure
* pointed to by the fi->fhp pointer if it is non-NULL.
*/
static int
{
int error;
#ifdef DEBUG
#endif
/*
* Sanity check
*/
return (EINVAL);
/*
* Given the root fh, use the path stored in
* the rnode to find the fh for the new server.
*/
if (error)
return (error);
if (error)
return (error);
/*
* If we found the same rnode, we're done now
*/
/*
* The lookup failed, but the new server may physically be the
* same machine or may share the same disk subsystem.  In that
* case the file handle for a particular file path does not
* change, so the same filehandle lookup will always locate the
* same rnode as the existing one.  All we might need to do is
* update r_server with the current servinfo.
*/
}
return (0);
}
/*
* Try to make it so that no one else will find this
* vnode because it is just a temporary to hold the
* new file handle until that file handle can be
* copied to the original vnode/rnode.
*/
/*
* Some other thread could have raced in here and could
* have done the remap for this particular rnode before
* this thread here. Check for rp->r_server and
* mi->mi_curr_serv and return if they are same.
*/
return (0);
}
/*
* As a heuristic check on the validity of the new
* file, check that the size and type match against
* what we remember from the old version.
*/
"NFS replicas %s and %s: file %s not same.",
return (EINVAL);
}
/*
* snarf the filehandle from the new rnode
* then release it, again while updating the
* hash queues for the rnode.
*/
/*
* Copy the attributes from the new rnode to the old
* rnode. This will help to reduce unnecessary page
* cache flushes.
*/
(void) nfs_free_data_reclaim(rp);
rp_addhash(rp);
}
/*
* Update successful failover remap count
*/
#ifdef DEBUG
#endif
/*
* If we have a copied filehandle to update, do it now.
*/
return (0);
}
/*
* NFS client failover support
*
* We want a simple pathname lookup routine to parse the pieces
* of path in rp->r_path.  We know that the path was created
* as rnodes were made, so we know we have only to deal with
* paths that look like:
*	dir1/dir2/dir3/file
* Any evidence of anything like .., symlinks, and ENOTDIR
* are hard errors, because they mean something in this filesystem
* is different from the one we came from, or has changed under
* us in some way. If this is true, we want the failure.
*
* Extended attributes: if the filesystem is mounted with extended
* attributes enabled (-o xattr), the attribute directory will be
* represented in the r_path as the magic name XATTR_RPATH. So if
* we see that name in the pathname, it must be because this node
* is an extended attribute. Therefore, look it up that way.
*/
static int
{
char *s, *p, *tmppath;
/* Make local copy of path */
s = tmppath;
do {
p = strchr(s, '/');
if (p != NULL)
*p = '\0';
} else {
CRED(), RFSCALL_SOFT);
}
if (p != NULL)
*p++ = '/';
if (error) {
return (error);
}
s = p;
} while (p != NULL);
return (0);
}
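/*
 * Illustrative sketch only of the in-place path walk used above: each
 * '/' is temporarily overwritten with '\0' so the component can be
 * looked up, then restored before moving on.  component() is an
 * assumed stand-in for the per-component lookup RPC.
 */
#include <string.h>

static int
sketch_walk_path(char *tmppath, int (*component)(const char *))
{
	char *s = tmppath, *p;
	int error;

	do {
		p = strchr(s, '/');
		if (p != NULL)
			*p = '\0';		/* terminate this component */
		error = component(s);
		if (p != NULL)
			*p++ = '/';		/* restore the separator */
		if (error)
			return (error);
		s = p;
	} while (p != NULL);
	return (0);
}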
/*
* NFS client failover support
*
* sv_free() frees the malloc'd portion of a "servinfo_t".
*/
void
{
if (svp->sv_secdata)
}
}
}
}
/*
* Can only return non-zero if intr != 0.
*/
int
{
mutex_enter(&l->lock);
/*
* If this is a nested enter, then allow it.  There
* must be as many exits as there were enters.
*/
/* lock is held for writing by current thread */
l->count--;
/*
* While there is a writer active or writers waiting,
* then wait for them to finish up and move on. Then,
* increment the count to indicate that a reader is
* active.
*/
if (intr) {
lwp->lwp_nostop++;
lwp->lwp_nostop--;
mutex_exit(&l->lock);
return (EINTR);
}
lwp->lwp_nostop--;
} else
}
#ifdef DEBUG
"rwlock @ %p\n", l->count, (void *)&l);
#endif
l->count++;
} else {
/*
* While there are readers active or a writer
* active, then wait for all of the readers
* to finish or for the writer to finish.
* Then, set the owner field to curthread and
* decrement count to indicate that a writer
* is active.
*/
l->waiters++;
if (intr) {
lwp->lwp_nostop++;
lwp->lwp_nostop--;
l->waiters--;
cv_broadcast(&l->cv);
mutex_exit(&l->lock);
return (EINTR);
}
lwp->lwp_nostop--;
} else
l->waiters--;
}
l->count--;
}
mutex_exit(&l->lock);
return (0);
}
/*
* If the lock is available, obtain it and return non-zero. If there is
* already a conflicting lock, return 0 immediately.
*/
int
{
mutex_enter(&l->lock);
/*
* If this is a nested enter, then allow it.  There
* must be as many exits as there were enters.
*/
/* lock is held for writing by current thread */
l->count--;
/*
* If there is a writer active or writers waiting, deny the
* lock. Otherwise, bump the count of readers.
*/
mutex_exit(&l->lock);
return (0);
}
l->count++;
} else {
/*
* If there are readers active or a writer active, deny the
* lock. Otherwise, set the owner field to curthread and
* decrement count to indicate that a writer is active.
*/
mutex_exit(&l->lock);
return (0);
}
l->count--;
}
mutex_exit(&l->lock);
return (1);
}
void
{
mutex_enter(&l->lock);
/*
* If this is releasing a writer lock, then increment count to
* indicate that there is one less writer active. If this was
* the last of possibly nested writer locks, then clear the owner
* field as well to indicate that there is no writer active
* and wakeup any possible waiting writers or readers.
*
* If releasing a reader lock, then just decrement count to
* indicate that there is one less reader active. If this was
* the last active reader and there are writer(s) waiting,
* then wake up the first.
*/
l->count++;
if (l->count == 0) {
cv_broadcast(&l->cv);
}
} else {
l->count--;
cv_broadcast(&l->cv);
}
mutex_exit(&l->lock);
}
int
{
return (l->count > 0);
return (l->count < 0);
}
/* ARGSUSED */
void
{
l->count = 0;
l->waiters = 0;
}
void
{
mutex_destroy(&l->lock);
cv_destroy(&l->cv);
}
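/*
 * Illustrative user-space analog (not the kernel code): the count
 * convention used by the lock above.  count > 0 is the number of
 * active readers, count < 0 means a writer holds the lock, and nested
 * writer entries drive the count further negative.  Blocking, signal
 * handling, and the waiters bookkeeping are omitted; this only shows
 * the counting.
 */
typedef struct sketch_rwlock {
	int count;
	void *owner;			/* writing thread, for nesting */
} sketch_rwlock_t;

static int
sketch_tryenter(sketch_rwlock_t *l, int writer, void *self)
{
	if (writer) {
		if (l->count < 0 && l->owner == self) {
			l->count--;	/* nested writer re-entry */
			return (1);
		}
		if (l->count != 0)
			return (0);	/* readers or another writer */
		l->count--;
		l->owner = self;
		return (1);
	}
	if (l->count < 0)
		return (0);		/* writer active */
	l->count++;			/* one more reader */
	return (1);
}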
int
nfs3_rddir_compar(const void *x, const void *y)
{
rddir_cache *a = (rddir_cache *)x;
rddir_cache *b = (rddir_cache *)y;
if (a->nfs3_cookie == b->nfs3_cookie) {
	if (a->buflen == b->buflen)
		return (0);
	if (a->buflen < b->buflen)
		return (-1);
	return (1);
}
if (a->nfs3_cookie < b->nfs3_cookie)
return (-1);
return (1);
}
int
nfs_rddir_compar(const void *x, const void *y)
{
rddir_cache *a = (rddir_cache *)x;
rddir_cache *b = (rddir_cache *)y;
if (a->nfs_cookie == b->nfs_cookie) {
	if (a->buflen == b->buflen)
		return (0);
	if (a->buflen < b->buflen)
		return (-1);
	return (1);
}
if (a->nfs_cookie < b->nfs_cookie)
return (-1);
return (1);
}
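/*
 * Illustrative sketch only: the comparators above are standard
 * three-way (-1/0/1) functions keyed on (cookie, buflen) as
 * reconstructed above, so they can drive any ordered container; here
 * one is exercised with qsort() over a stand-in struct.
 * sketch_rddir_t is an assumption, not the real rddir_cache layout.
 */
#include <stdlib.h>

typedef struct sketch_rddir {
	unsigned long long cookie;
	int buflen;
} sketch_rddir_t;

static int
sketch_rddir_compar(const void *x, const void *y)
{
	const sketch_rddir_t *a = x;
	const sketch_rddir_t *b = y;

	if (a->cookie == b->cookie) {
		if (a->buflen == b->buflen)
			return (0);
		return (a->buflen < b->buflen ? -1 : 1);
	}
	return (a->cookie < b->cookie ? -1 : 1);
}

static void
sketch_sort_rddir(sketch_rddir_t *arr, size_t n)
{
	qsort(arr, n, sizeof (arr[0]), sketch_rddir_compar);
}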
static char *
{
servinfo_t *s;
char *srvnames;
char *namep;
/*
* Calculate the length of the string required to hold all
* of the server names plus either a comma or a null
* character following each individual one.
*/
length = 0;
length += s->sv_hostnamelen;
*namep++ = ',';
}
*--namep = '\0';
return (srvnames);
}
/*
* These two functions are temporary and designed for the upgrade-workaround
* only. They cannot be used for general zone-crossing NFS client support, and
* will be removed shortly.
*
* When the workaround is enabled, all NFS traffic is forced into the global
* zone. These functions are called when the code needs to refer to the state
* of the underlying network connection. They're not called when the function
* needs to refer to the state of the process that invoked the system call.
* (E.g., when checking whether the zone is shutting down during the mount()
* call.)
*/
struct zone *
nfs_zone(void)
{
}
zoneid_t
nfs_zoneid(void)
{
}
/*
* nfs_mount_label_policy:
* Determine whether the mount is allowed according to MAC check,
* by comparing (where appropriate) label of the remote server
* against the label of the zone being mounted into.
*
* Returns:
* 0 : access allowed
* -1 : read-only access allowed (i.e., read-down)
* >0 : error code, such as EACCES
*/
int
{
int addr_type;
void *ipaddr;
int retv;
/*
* Get the zone's label. Each zone on a labeled system has a label.
*/
} else {
retv = 0;
goto out;
}
/*
* Next, get the assigned label of the remote server.
*/
goto out; /* error getting host entry */
goto rel_tpc; /* invalid domain */
goto rel_tpc; /* invalid hosttype */
goto rel_tpc; /* error getting server lbl */
} else { /* UNLABELED */
}
/*
* Now compare labels to complete the MAC check. If the labels
* are equal or if the requestor is in the global zone and has
* NET_MAC_AWARE, then allow read-write access. (Except for
* mounts into the global zone itself; restrict these to
* read-only.)
*
* If the requestor is in some other zone, but its label
* dominates the server's, then allow read-down.
*
* Otherwise, access is denied.
*/
if ((mntzone == global_zone) ||
else
retv = 0; /* access OK */
} else {
}
out:
if (mntzone)
return (retv);
}
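/*
 * Illustrative sketch only of the label comparison described above.
 * labels_equal, dominates, requestor_global, mac_aware, and
 * into_global are boolean stand-ins for blequal(), bldominates(), the
 * global-zone test, NET_MAC_AWARE, and mntzone == global_zone.  Return
 * values follow the convention documented above: 0 read-write, -1
 * read-down (read-only), or an errno such as EACCES.
 */
#include <errno.h>

static int
sketch_label_policy(int labels_equal, int dominates, int requestor_global,
    int mac_aware, int into_global)
{
	if (labels_equal || (requestor_global && mac_aware))
		return (into_global ? -1 : 0);	/* RW, but RO into global */
	if (!requestor_global && dominates)
		return (-1);			/* read-down */
	return (EACCES);			/* access denied */
}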
int
nfs_has_ctty(void)
{
return (rv);
}
/*
* Look in the xattr directory to see if it has any generic user attributes
*/
int
{
char *dbuf;
int eof = 0;
int error;
*valp = 0;
uio.uio_loffset = 0;
return (error);
}
VIEW_READONLY) == 0) {
continue;
}
*valp = 1;
break;
}
return (0);
}
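/*
 * Illustrative sketch only of the scan above: walk a list of entry
 * names, skip "." and "..", and report whether anything else (i.e. a
 * generic user attribute) is present.  The real code walks dirent64
 * records returned by VOP_READDIR and also filters out read-only
 * system attributes.
 */
#include <string.h>

static int
sketch_has_user_entries(const char *names[], int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (strcmp(names[i], ".") == 0 ||
		    strcmp(names[i], "..") == 0)
			continue;
		return (1);	/* found a generic user attribute */
	}
	return (0);
}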