nfs_subr.c revision e8dc3b7db5649f094399356b5f050a719a27ab43
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
/*
* The hash queues for the access to active and cached rnodes
* are organized as doubly linked lists. A reader/writer lock
* for each hash bucket is used to control access and to synchronize
* lookups, additions, and deletions from the hash queue.
*
* The rnode freelist is organized as a doubly linked list with
* a head pointer. Additions and deletions are synchronized via
* a single mutex.
*
* In order to add an rnode to the free list, it must be hashed into
* a hash queue and the exclusive lock for the hash queue must be
* held. If an rnode is not hashed into a hash queue, then it is
* destroyed because it holds no reusable information about the
* file. The exclusive lock for the hash queue must be
* held in order to prevent a lookup in the hash queue from finding
* the rnode and using it and assuming that the rnode is not on the
* freelist. The lookup in the hash queue will have the hash queue
* locked, either exclusive or shared.
*
* The vnode reference count for each rnode is not allowed to drop
* below 1. This prevents external entities, such as the VM
* subsystem, from acquiring references to vnodes already on the
* freelist and then trying to place them back on the freelist
* when their reference is released. This means that when an
* rnode is looked up in the hash queues, either the rnode
* is removed from the freelist and that reference is transferred to
* the new reference or the vnode reference count must be incremented
* accordingly. The mutex for the freelist must be held in order to
* accurately test to see if the rnode is on the freelist or not.
* The hash queue lock might be held shared and it is possible that
* two different threads may race to remove the rnode from the
* freelist. This race can be resolved by holding the mutex for the
* freelist. Please note that the mutex for the freelist does not
* need to be held if the rnode is not on the freelist. It cannot be
* placed on the freelist due to the requirement that the thread
* putting the rnode on the freelist must hold the exclusive lock
* to the hash queue and the thread doing the lookup in the hash
* queue is holding either a shared or exclusive lock to the hash
* queue.
*
* The lock ordering is:
*
* hash bucket lock -> vnode lock
* hash bucket lock -> freelist lock
*/
static rhashq_t *rtable;
static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;
static int rtablesize;
static int rtablemask;
static int hashlen = 4;
static struct kmem_cache *rnode_cache;
/*
* Mutex to protect the following variables:
* nfs_major
* nfs_minor
*/
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;
/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
/*
* Access cache
*/
static acache_hash_t *acache;
static long nacache; /* used strictly to size the number of hash queues */
static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;
/*
* Client side utilities
*/
/*
* client side statistics
*/
static const struct clstat clstat_tmpl = {
{ "calls", KSTAT_DATA_UINT64 },
{ "badcalls", KSTAT_DATA_UINT64 },
{ "clgets", KSTAT_DATA_UINT64 },
{ "cltoomany", KSTAT_DATA_UINT64 },
#ifdef DEBUG
{ "clalloc", KSTAT_DATA_UINT64 },
{ "noresponse", KSTAT_DATA_UINT64 },
{ "failover", KSTAT_DATA_UINT64 },
{ "remap", KSTAT_DATA_UINT64 },
#endif
};
/*
* The following are statistics that describe the behavior of the system as a
* whole and do not correspond to any one particular zone.
*/
#ifdef DEBUG
static struct clstat_debug {
kstat_named_t nrnode; /* number of allocated rnodes */
kstat_named_t access; /* size of access cache */
kstat_named_t dirent; /* size of readdir cache */
kstat_named_t dirents; /* size of readdir buf cache */
kstat_named_t reclaim; /* number of reclaims */
kstat_named_t clreclaim; /* number of cl reclaims */
kstat_named_t f_reclaim; /* number of free reclaims */
kstat_named_t a_reclaim; /* number of active reclaims */
kstat_named_t r_reclaim; /* number of rnode reclaims */
kstat_named_t rpath; /* bytes used to store rpaths */
} clstat_debug = {
{ "nrnode", KSTAT_DATA_UINT64 },
{ "access", KSTAT_DATA_UINT64 },
{ "dirent", KSTAT_DATA_UINT64 },
{ "dirents", KSTAT_DATA_UINT64 },
{ "reclaim", KSTAT_DATA_UINT64 },
{ "clreclaim", KSTAT_DATA_UINT64 },
{ "f_reclaim", KSTAT_DATA_UINT64 },
{ "a_reclaim", KSTAT_DATA_UINT64 },
{ "r_reclaim", KSTAT_DATA_UINT64 },
{ "r_path", KSTAT_DATA_UINT64 },
};
#endif /* DEBUG */
/*
* We keep a global list of per-zone client data, so we can clean up all zones
* if we get low on memory.
*/
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;
static struct kmem_cache *chtab_cache;
/*
* Some servers do not properly update the attributes of the
* directory when changes are made. To allow interoperability
* with these broken servers, the nfs_disable_rddir_cache
* parameter must be set in /etc/system.
*/
int nfs_disable_rddir_cache = 0;
int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
struct chtab **);
void clfree(CLIENT *, struct chtab *);
static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
struct chtab **, struct nfs_clnt *);
static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
struct chtab **, struct nfs_clnt *);
static void clreclaim(void *);
static int nfs_feedback(int, int, mntinfo_t *);
static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
caddr_t, cred_t *, int *, enum clnt_stat *, int,
failinfo_t *);
static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
caddr_t, cred_t *, int *, int, failinfo_t *);
static void rinactive(rnode_t *, cred_t *);
static int rtablehash(nfs_fhandle *);
static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
struct vnodeops *,
int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
cred_t *),
int (*)(const void *, const void *), int *, cred_t *,
char *, char *);
static void rp_rmfree(rnode_t *);
static void rp_addhash(rnode_t *);
static void rp_rmhash_locked(rnode_t *);
static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void destroy_rnode(rnode_t *);
static void rddir_cache_free(rddir_cache *);
static int nfs_free_data_reclaim(rnode_t *);
static int nfs_active_data_reclaim(rnode_t *);
static int nfs_free_reclaim(void);
static int nfs_active_reclaim(void);
static int nfs_rnode_reclaim(void);
static void nfs_reclaim(void *);
static int failover_safe(failinfo_t *);
static void failover_newserver(mntinfo_t *mi);
static void failover_thread(mntinfo_t *mi);
static int failover_wait(mntinfo_t *);
static int failover_remap(failinfo_t *);
static int failover_lookup(char *, vnode_t *,
int (*)(vnode_t *, char *, vnode_t **,
struct pathname *, int, vnode_t *, cred_t *, int),
int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
vnode_t **);
static void nfs_free_r_path(rnode_t *);
static void nfs_set_vroot(vnode_t *);
static char *nfs_getsrvnames(mntinfo_t *, size_t *);
/*
* from rpcsec module (common/rpcsec)
*/
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);
/*
* EIO or EINTR are not recoverable errors.
*/
#define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
/*
* Common handle get program for NFS, NFS ACL, and NFS AUTH client.
*/
static int
clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp, struct nfs_clnt *nfscl)
{
struct chhead *ch, *newch;
struct chhead **plistp;
struct chtab *cp;
int error;
k_sigset_t smask;
if (newcl == NULL || chp == NULL || ci == NULL)
return (EINVAL);
*newcl = NULL;
*chp = NULL;
/*
* Find an unused handle or create one
*/
newch = NULL;
nfscl->nfscl_stat.clgets.value.ui64++;
top:
/*
* Find the correct entry in the cache to check for free
* client handles. The search is based on the RPC program
* number, program version number, dev_t for the transport
* device, and the protocol family.
*/
mutex_enter(&nfscl->nfscl_chtable_lock);
plistp = &nfscl->nfscl_chtable;
for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
if (ch->ch_prog == ci->cl_prog &&
ch->ch_vers == ci->cl_vers &&
ch->ch_dev == svp->sv_knconf->knc_rdev &&
(strcmp(ch->ch_protofmly,
svp->sv_knconf->knc_protofmly) == 0))
break;
plistp = &ch->ch_next;
}
/*
* If we didn't find a cache entry for this quadruple, then
* create one. If we don't have one already preallocated,
* then drop the cache lock, create one, and then start over.
* If we did have a preallocated entry, then just add it to
* the front of the list.
*/
if (ch == NULL) {
if (newch == NULL) {
mutex_exit(&nfscl->nfscl_chtable_lock);
newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
newch->ch_timesused = 0;
newch->ch_prog = ci->cl_prog;
newch->ch_vers = ci->cl_vers;
newch->ch_dev = svp->sv_knconf->knc_rdev;
newch->ch_protofmly = kmem_alloc(
strlen(svp->sv_knconf->knc_protofmly) + 1,
KM_SLEEP);
(void) strcpy(newch->ch_protofmly,
svp->sv_knconf->knc_protofmly);
newch->ch_list = NULL;
goto top;
}
ch = newch;
newch = NULL;
ch->ch_next = nfscl->nfscl_chtable;
nfscl->nfscl_chtable = ch;
/*
* We found a cache entry, but if it isn't on the front of the
* list, then move it to the front of the list to try to take
* advantage of locality of operations.
*/
} else if (ch != nfscl->nfscl_chtable) {
*plistp = ch->ch_next;
ch->ch_next = nfscl->nfscl_chtable;
nfscl->nfscl_chtable = ch;
}
/*
* If there was a free client handle cached, then remove it
* from the list, init it, and use it.
*/
if (ch->ch_list != NULL) {
cp = ch->ch_list;
ch->ch_list = cp->ch_list;
mutex_exit(&nfscl->nfscl_chtable_lock);
if (newch != NULL) {
kmem_free(newch->ch_protofmly,
strlen(newch->ch_protofmly) + 1);
kmem_free(newch, sizeof (*newch));
}
(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
&svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
&cp->ch_client->cl_auth);
if (error || cp->ch_client->cl_auth == NULL) {
CLNT_DESTROY(cp->ch_client);
kmem_cache_free(chtab_cache, cp);
return ((error != 0) ? error : EINTR);
}
ch->ch_timesused++;
*newcl = cp->ch_client;
*chp = cp;
return (0);
}
/*
* There weren't any free client handles which fit, so allocate
* a new one and use that.
*/
#ifdef DEBUG
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
mutex_exit(&nfscl->nfscl_chtable_lock);
nfscl->nfscl_stat.cltoomany.value.ui64++;
if (newch != NULL) {
kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
kmem_free(newch, sizeof (*newch));
}
cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
cp->ch_head = ch;
sigintr(&smask, (int)ci->cl_flags & MI_INT);
error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
sigunintr(&smask);
if (error != 0) {
kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
/*
* Warning is unnecessary if error is EINTR.
*/
if (error != EINTR) {
nfs_cmn_err(error, CE_WARN,
"clget: couldn't create handle: %m\n");
}
return (error);
}
(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
auth_destroy(cp->ch_client->cl_auth);
error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
&cp->ch_client->cl_auth);
if (error || cp->ch_client->cl_auth == NULL) {
CLNT_DESTROY(cp->ch_client);
kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
return ((error != 0) ? error : EINTR);
}
ch->ch_timesused++;
*newcl = cp->ch_client;
ASSERT(cp->ch_client->cl_nosignal == FALSE);
*chp = cp;
return (0);
}
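/*
* Public handle get routine; wrapper around clget_impl() using the
* calling zone's per-zone client data.
*/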
int
clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp)
{
struct nfs_clnt *nfscl;
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
}
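/*
* Get a client handle for the NFS ACL program. For hard mounts,
* retry while the connection times out or is reset, unless the
* mount is being failed over or torn down.
*/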
static int
acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp, struct nfs_clnt *nfscl)
{
clinfo_t ci;
int error;
/*
* Set read buffer size to rsize
* and add room for RPC headers.
*/
ci.cl_readsize = mi->mi_tsize;
if (ci.cl_readsize != 0)
ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
/*
* If this is a soft mount and the server is down, just try once;
* that is, do not retransmit.
*/
if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
ci.cl_retrans = 0;
else
ci.cl_retrans = mi->mi_retrans;
ci.cl_prog = NFS_ACL_PROGRAM;
ci.cl_vers = mi->mi_vers;
ci.cl_flags = mi->mi_flags;
/*
* clget calls sec_clnt_geth() to get an auth handle. For the RPCSEC_GSS
* security flavor, the client tries to establish a security context
* by contacting the server. If the connection times out or is reset,
* e.g. because the server rebooted, we will try again.
*/
do {
error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
if (error == 0)
break;
/*
* For forced unmount or zone shutdown, bail out, no retry.
*/
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
error = EIO;
break;
}
/* do not retry for softmount */
if (!(mi->mi_flags & MI_HARD))
break;
/* let the caller deal with the failover case */
if (FAILOVER_MOUNT(mi))
break;
} while (error == ETIMEDOUT || error == ECONNRESET);
return (error);
}
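/*
* Get a client handle for the NFS program. For hard mounts, retry
* while the connection times out or is reset, unless the mount is
* being failed over or torn down.
*/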
static int
nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp, struct nfs_clnt *nfscl)
{
clinfo_t ci;
int error;
/*
* Set read buffer size to rsize
* and add room for RPC headers.
*/
ci.cl_readsize = mi->mi_tsize;
if (ci.cl_readsize != 0)
ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
/*
* If this is a soft mount and the server is down, just try once;
* that is, do not retransmit.
*/
if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
ci.cl_retrans = 0;
else
ci.cl_retrans = mi->mi_retrans;
ci.cl_prog = mi->mi_prog;
ci.cl_vers = mi->mi_vers;
ci.cl_flags = mi->mi_flags;
/*
* clget calls sec_clnt_geth() to get an auth handle. For the RPCSEC_GSS
* security flavor, the client tries to establish a security context
* by contacting the server. If the connection times out or is reset,
* e.g. because the server rebooted, we will try again.
*/
do {
error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
if (error == 0)
break;
/*
* For forced unmount or zone shutdown, bail out, no retry.
*/
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
error = EIO;
break;
}
/* do not retry for softmount */
if (!(mi->mi_flags & MI_HARD))
break;
/* let the caller deal with the failover case */
if (FAILOVER_MOUNT(mi))
break;
} while (error == ETIMEDOUT || error == ECONNRESET);
return (error);
}
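/*
* Release a client handle: free its auth handle, timestamp it, and
* put it back on the front of its cache list so that the list stays
* ordered youngest to oldest.
*/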
static void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
if (cl->cl_auth != NULL) {
sec_clnt_freeh(cl->cl_auth);
cl->cl_auth = NULL;
}
/*
* Timestamp this cache entry so that we know when it was last
* used.
*/
cp->ch_freed = gethrestime_sec();
/*
* Add the free client handle to the front of the list.
* This way, the list will be sorted in youngest to oldest
* order.
*/
mutex_enter(&nfscl->nfscl_chtable_lock);
cp->ch_list = cp->ch_head->ch_list;
cp->ch_head->ch_list = cp;
mutex_exit(&nfscl->nfscl_chtable_lock);
}
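/*
* Release a client handle obtained via clget() back to the calling
* zone's cache.
*/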
void
clfree(CLIENT *cl, struct chtab *cp)
{
struct nfs_clnt *nfscl;
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
clfree_impl(cl, cp, nfscl);
}
#define CL_HOLDTIME 60 /* time to hold client handles */
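/*
* Reclaim client handles from one zone's cache, destroying any
* handle which has been idle for at least cl_holdtime seconds.
*/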
static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
struct chhead *ch;
struct chtab *cp; /* list of objects that can be reclaimed */
struct chtab *cpe;
struct chtab *cpl;
struct chtab **cpp;
#ifdef DEBUG
int n = 0;
#endif
/*
* Need to reclaim some memory, so step through the cache
* looking through the lists for entries which can be freed.
*/
cp = NULL;
mutex_enter(&nfscl->nfscl_chtable_lock);
/*
* Here we step through each non-NULL quadruple and start to
* construct the reclaim list pointed to by cp. Note that
* cp will contain all eligible chtab entries. When this traversal
* completes, chtab entries from the last quadruple will be at the
* front of cp and entries from previously inspected quadruples have
* been appended to the rear of cp.
*/
for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
if (ch->ch_list == NULL)
continue;
/*
* Search each list for entries older than
* cl_holdtime seconds. The lists are maintained
* in youngest to oldest order so that when the
* first entry is found which is old enough, then
* all of the rest of the entries on the list will
* be old enough as well.
*/
cpl = ch->ch_list;
cpp = &ch->ch_list;
while (cpl != NULL &&
cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
cpp = &cpl->ch_list;
cpl = cpl->ch_list;
}
if (cpl != NULL) {
*cpp = NULL;
if (cp != NULL) {
cpe = cpl;
while (cpe->ch_list != NULL)
cpe = cpe->ch_list;
cpe->ch_list = cp;
}
cp = cpl;
}
}
mutex_exit(&nfscl->nfscl_chtable_lock);
/*
* If cp is empty, then there is nothing to reclaim here.
*/
if (cp == NULL)
return;
/*
* Step through the list of entries to free, destroying each client
* handle and kmem_free'ing the memory for each entry.
*/
while (cp != NULL) {
#ifdef DEBUG
n++;
#endif
CLNT_DESTROY(cp->ch_client);
cpl = cp->ch_list;
kmem_cache_free(chtab_cache, cp);
cp = cpl;
}
#ifdef DEBUG
/*
* Update clalloc so that nfsstat shows the current number
* of allocated client handles.
*/
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}
/* ARGSUSED */
static void
clreclaim(void *all)
{
struct nfs_clnt *nfscl;
#ifdef DEBUG
clstat_debug.clreclaim.value.ui64++;
#endif
/*
* The system is low on memory; go through and try to reclaim some from
* every zone on the system.
*/
mutex_enter(&nfs_clnt_list_lock);
nfscl = list_head(&nfs_clnt_list);
for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
clreclaim_zone(nfscl, CL_HOLDTIME);
mutex_exit(&nfs_clnt_list_lock);
}
/*
* Minimum time-out values indexed by call type.
* These units are in "eighths" of a second to avoid multiplies.
*/
static unsigned int minimum_timeo[] = {
6, 7, 10
};
/*
* Back off for retransmission timeout; MAXTIMO is in units of hz (ticks)
*/
#define MAXTIMO (20*hz)
#define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
#define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
#define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
#define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
/*
* Function called when rfscall notices that we have been
* re-transmitting, or when we get a response without retransmissions.
* Return 1 if the transfer size was adjusted down, 0 if no change.
*/
static int
nfs_feedback(int flag, int which, mntinfo_t *mi)
{
int kind;
int r = 0;
mutex_enter(&mi->mi_lock);
if (flag == FEEDBACK_REXMIT1) {
if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
goto done;
if (mi->mi_curread > MIN_NFS_TSIZE) {
mi->mi_curread /= 2;
if (mi->mi_curread < MIN_NFS_TSIZE)
mi->mi_curread = MIN_NFS_TSIZE;
r = 1;
}
if (mi->mi_curwrite > MIN_NFS_TSIZE) {
mi->mi_curwrite /= 2;
if (mi->mi_curwrite < MIN_NFS_TSIZE)
mi->mi_curwrite = MIN_NFS_TSIZE;
r = 1;
}
} else if (flag == FEEDBACK_OK) {
kind = mi->mi_timer_type[which];
if (kind == 0 ||
mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
goto done;
if (kind == 1) {
if (mi->mi_curread >= mi->mi_tsize)
goto done;
mi->mi_curread += MIN_NFS_TSIZE;
if (mi->mi_curread > mi->mi_tsize/2)
mi->mi_curread = mi->mi_tsize;
} else if (kind == 2) {
if (mi->mi_curwrite >= mi->mi_stsize)
goto done;
mi->mi_curwrite += MIN_NFS_TSIZE;
if (mi->mi_curwrite > mi->mi_stsize/2)
mi->mi_curwrite = mi->mi_stsize;
}
}
done:
mutex_exit(&mi->mi_lock);
return (r);
}
#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif
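/*
* NFS Version 2 RPC wrapper: issue the call via rfscall() and, on
* NFSERR_ACCES, retry once with an adjusted credential (see
* crnetadjust()). RPC_PROCUNAVAIL from the server is mapped to
* NFSERR_OPNOTSUPP.
*/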
int
rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
enum nfsstat *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
enum clnt_stat rpc_status;
ASSERT(statusp != NULL);
rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, &rpc_status, flags, fi);
if (!rpcerror) {
/*
* See crnetadjust() for comments.
*/
if (*statusp == NFSERR_ACCES &&
(cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
rfs2call_hits++;
#endif
rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
resp, cr, douprintf, NULL, flags, fi);
crfree(cr);
#ifdef DEBUG
if (*statusp == NFSERR_ACCES)
rfs2call_misses++;
#endif
}
} else if (rpc_status == RPC_PROCUNAVAIL) {
*statusp = NFSERR_OPNOTSUPP;
rpcerror = 0;
}
return (rpcerror);
}
#define NFS3_JUKEBOX_DELAY 10 * hz
static clock_t nfs3_jukebox_delay = 0;
#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif
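/*
* NFS Version 3 RPC wrapper: issue the call via rfscall(), retrying
* while the server returns NFS3ERR_JUKEBOX and, on NFS3ERR_ACCES,
* retrying once with an adjusted credential (see crnetadjust()).
*/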
int
rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
nfsstat3 *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
int user_informed;
user_informed = 0;
do {
rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, NULL, flags, fi);
if (!rpcerror) {
cred_t *crr;
if (*statusp == NFS3ERR_JUKEBOX) {
if (ttoproc(curthread) == &p0) {
rpcerror = EAGAIN;
break;
}
if (!user_informed) {
user_informed = 1;
uprintf(
"file temporarily unavailable on the server, retrying...\n");
}
delay(nfs3_jukebox_delay);
}
/*
* See crnetadjust() for comments.
*/
else if (*statusp == NFS3ERR_ACCES &&
(crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
rfs3call_hits++;
#endif
rpcerror = rfscall(mi, which, xdrargs, argsp,
xdrres, resp, crr, douprintf,
NULL, flags, fi);
crfree(crr);
#ifdef DEBUG
if (*statusp == NFS3ERR_ACCES)
rfs3call_misses++;
#endif
}
}
} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
return (rpcerror);
}
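/*
* Failover helpers: VALID_FH checks that the rnode still refers to
* the mount's current server; INC_READERS/DEC_READERS count threads
* using the current server information, with DEC_READERS waking any
* waiter once the count drops to zero.
*/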
#define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
#define INC_READERS(mi) { \
mi->mi_readers++; \
}
#define DEC_READERS(mi) { \
mi->mi_readers--; \
if (mi->mi_readers == 0) \
cv_broadcast(&mi->mi_failover_cv); \
}
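/*
* Common RPC engine for the NFS client. Handles failover remapping,
* client handle acquisition, retransmission with backoff for hard
* mounts, dynamic transfer size feedback, and the "not responding"/
* "ok" console messages.
*/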
static int
rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
{
CLIENT *client;
struct chtab *ch;
enum clnt_stat status;
struct rpc_err rpcerr;
struct timeval wait;
int timeo; /* in units of hz */
int my_rsize, my_wsize;
bool_t tryagain;
k_sigset_t smask;
servinfo_t *svp;
struct nfs_clnt *nfscl;
zoneid_t zoneid = getzoneid();
#ifdef DEBUG
char *bufp;
#endif
TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
"rfscall_start:which %d mi %p", which, mi);
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
nfscl->nfscl_stat.calls.value.ui64++;
mi->mi_reqs[which].value.ui64++;
rpcerr.re_status = RPC_SUCCESS;
/*
* In case of forced unmount or zone shutdown, return EIO.
*/
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
return (rpcerr.re_errno);
}
/*
* Remember the transfer sizes in case
* nfs_feedback changes them underneath us.
*/
my_rsize = mi->mi_curread;
my_wsize = mi->mi_curwrite;
/*
* NFS client failover support
*
* If this rnode is not in sync with the current server (VALID_FH),
* we'd like to do a remap to get in sync. We can be interrupted
* in failover_remap(), and if so we'll bail. Otherwise, we'll
* use the best info we have to try the RPC. Part of that is
* unconditionally updating the filehandle copy kept for V3.
*
* Locking: INC_READERS/DEC_READERS is a poor man's interruptible
* rw_enter(); we're trying to keep the current server from being
* changed on us until we're done with the remapping and have a
* matching client handle. We don't want to send a filehandle
* to the wrong host.
*/
failoverretry:
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
if (failover_wait(mi)) {
mutex_exit(&mi->mi_lock);
return (EINTR);
}
}
INC_READERS(mi);
mutex_exit(&mi->mi_lock);
if (fi) {
if (!VALID_FH(fi) &&
!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
int remaperr;
svp = mi->mi_curr_serv;
remaperr = failover_remap(fi);
if (remaperr != 0) {
#ifdef DEBUG
if (remaperr != EINTR)
nfs_cmn_err(remaperr, CE_WARN,
"rfscall couldn't failover: %m");
#endif
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
/*
* If failover_remap returns ETIMEDOUT
* and the filesystem is hard mounted
* we have to retry the call with a new
* server.
*/
if ((mi->mi_flags & MI_HARD) &&
IS_RECOVERABLE_ERROR(remaperr)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
rpcerr.re_status = RPC_SUCCESS;
goto failoverretry;
}
rpcerr.re_errno = remaperr;
return (remaperr);
}
}
if (fi->fhp && fi->copyproc)
(*fi->copyproc)(fi->fhp, fi->vp);
}
}
/*
* clget() calls clnt_tli_kinit() which clears the xid, so we
* are guaranteed to reprocess the retry as a new request.
*/
svp = mi->mi_curr_serv;
rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
if ((rpcerr.re_errno == ETIMEDOUT ||
rpcerr.re_errno == ECONNRESET) &&
failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
goto failoverretry;
}
}
if (rpcerr.re_errno != 0)
return (rpcerr.re_errno);
if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
timeo = (mi->mi_timeo * hz) / 10;
} else {
mutex_enter(&mi->mi_lock);
timeo = CLNT_SETTIMERS(client,
&(mi->mi_timers[mi->mi_timer_type[which]]),
&(mi->mi_timers[NFS_CALLTYPES]),
(minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
(void (*)())NULL, (caddr_t)mi, 0);
mutex_exit(&mi->mi_lock);
}
/*
* If hard mounted fs, retry call forever unless hard error occurs.
*/
do {
tryagain = FALSE;
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
status = RPC_FAILED;
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
TICK_TO_TIMEVAL(timeo, &wait);
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM (preserving the existing masks).
* Mask out SIGINT if the nointr mount option is specified.
*/
sigintr(&smask, (int)mi->mi_flags & MI_INT);
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = TRUE;
/*
* If there is a current signal, then don't bother
* even trying to send out the request because we
* won't be able to block waiting for the response.
* Simply assume RPC_INTR and get on with it.
*/
if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
status = RPC_INTR;
else {
status = CLNT_CALL(client, which, xdrargs, argsp,
xdrres, resp, wait);
}
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = FALSE;
/*
* restore original signal mask
*/
sigunintr(&smask);
switch (status) {
case RPC_SUCCESS:
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize))
(void) nfs_feedback(FEEDBACK_OK, which, mi);
break;
case RPC_INTR:
/*
* There is no way to recover from this error,
* even if mount option nointr is specified.
* SIGKILL, for example, cannot be blocked.
*/
rpcerr.re_status = RPC_INTR;
rpcerr.re_errno = EINTR;
break;
case RPC_UDERROR:
/*
* If the NFS server is local (vold) and
* it goes away, then we get RPC_UDERROR.
* This is a retryable error, so we would
* normally loop; check whether the specific
* error was ECONNRESET, indicating that the
* target did not exist at all. If so,
* return with RPC_PROGUNAVAIL and
* ECONNRESET to indicate why.
*/
CLNT_GETERR(client, &rpcerr);
if (rpcerr.re_errno == ECONNRESET) {
rpcerr.re_status = RPC_PROGUNAVAIL;
rpcerr.re_errno = ECONNRESET;
break;
}
/*FALLTHROUGH*/
default: /* probably RPC_TIMEDOUT */
if (IS_UNRECOVERABLE_RPC(status))
break;
/*
* increment server not responding count
*/
mutex_enter(&mi->mi_lock);
mi->mi_noresponse++;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
nfscl->nfscl_stat.noresponse.value.ui64++;
#endif
if (!(mi->mi_flags & MI_HARD)) {
if (!(mi->mi_flags & MI_SEMISOFT) ||
(mi->mi_ss_call_type[which] == 0))
break;
}
/*
* The call is in progress (over COTS).
* Try the CLNT_CALL again, but don't
* print a noisy error message.
*/
if (status == RPC_INPROGRESS) {
tryagain = TRUE;
break;
}
if (flags & RFSCALL_SOFT)
break;
/*
* On zone shutdown, just move on.
*/
if (zone_status_get(curproc->p_zone) >=
ZONE_IS_SHUTTING_DOWN) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
/*
* NFS client failover support
*
* If the current server just failed us, we'll
* start the process of finding a new server.
* After that, we can just retry.
*/
if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
clfree_impl(client, ch, nfscl);
goto failoverretry;
}
tryagain = TRUE;
timeo = backoff(timeo);
mutex_enter(&mi->mi_lock);
if (!(mi->mi_flags & MI_PRINTED)) {
mi->mi_flags |= MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
zprintf(zoneid,
"NFS%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
zprintf(zoneid,
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
if (*douprintf && curproc->p_sessp->s_vp != NULL) {
*douprintf = 0;
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
uprintf(
"NFS%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
uprintf(
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
}
/*
* If we are doing dynamic adjustment of the
* transfer size, this is a read or write call,
* and either the transfer size changed while
* retransmitting or the feedback routine
* changed the transfer size, then exit
* rfscall so that the transfer size can be
* adjusted at the vnops level.
*/
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize ||
nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
/*
* On read or write calls, return
* back to the vnode ops level if
* the transfer size changed.
*/
clfree_impl(client, ch, nfscl);
return (ENFS_TRYAGAIN);
}
}
} while (tryagain);
if (status != RPC_SUCCESS) {
/*
* Let soft mounts use the timed out message.
*/
if (status == RPC_INPROGRESS)
status = RPC_TIMEDOUT;
nfscl->nfscl_stat.badcalls.value.ui64++;
if (status != RPC_INTR) {
mutex_enter(&mi->mi_lock);
mi->mi_flags |= MI_DOWN;
mutex_exit(&mi->mi_lock);
CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
bufp = clnt_sperror(client, svp->sv_hostname);
zprintf(zoneid, "NFS%d %s failed for %s\n",
mi->mi_vers, mi->mi_rfsnames[which], bufp);
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT)) {
uprintf("NFS%d %s failed for %s\n",
mi->mi_vers, mi->mi_rfsnames[which],
bufp);
}
}
kmem_free(bufp, MAXPATHLEN);
#else
zprintf(zoneid,
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_rfsnames[which], svp->sv_hostname,
status, clnt_sperrno(status));
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT)) {
uprintf(
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_rfsnames[which],
svp->sv_hostname, status,
clnt_sperrno(status));
}
}
#endif
/*
* when CLNT_CALL() fails with RPC_AUTHERROR,
* re_errno is set appropriately depending on
* the authentication error
*/
if (status == RPC_VERSMISMATCH ||
status == RPC_PROGVERSMISMATCH)
rpcerr.re_errno = EIO;
}
} else {
/*
* Test the value of mi_down and mi_printed without
* holding the mi_lock mutex. If they are both zero,
* then it is okay to skip the down and printed
* processing. This saves a mutex_enter and
* mutex_exit pair for a normal, successful RPC,
* which would otherwise be pure overhead.
*/
if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~MI_DOWN;
if (mi->mi_flags & MI_PRINTED) {
mi->mi_flags &= ~MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
zprintf(zoneid, "NFS%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
zprintf(zoneid, "NFS server %s ok\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
}
if (*douprintf == 0) {
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
uprintf("NFS%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
uprintf("NFS server %s ok\n", svp->sv_hostname);
#endif
*douprintf = 1;
}
}
clfree_impl(client, ch, nfscl);
ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
if (rpc_status != NULL)
*rpc_status = rpcerr.re_status;
TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
rpcerr.re_errno);
return (rpcerr.re_errno);
}
#ifdef DEBUG
static int acl2call_hits = 0;
static int acl2call_misses = 0;
#endif
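/*
* NFS ACL Version 2 RPC wrapper: issue the call via aclcall() and,
* on NFSERR_ACCES, retry once with an adjusted credential (see
* crnetadjust()).
*/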
int
acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
enum nfsstat *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, flags, fi);
if (!rpcerror) {
/*
* See comments with crnetadjust().
*/
if (*statusp == NFSERR_ACCES &&
(cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
acl2call_hits++;
#endif
rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
resp, cr, douprintf, flags, fi);
crfree(cr);
#ifdef DEBUG
if (*statusp == NFSERR_ACCES)
acl2call_misses++;
#endif
}
}
return (rpcerror);
}
#ifdef DEBUG
static int acl3call_hits = 0;
static int acl3call_misses = 0;
#endif
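/*
* NFS ACL Version 3 RPC wrapper: issue the call via aclcall(),
* retrying while the server returns NFS3ERR_JUKEBOX and, on
* NFS3ERR_ACCES, retrying once with an adjusted credential.
*/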
int
acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
nfsstat3 *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
int user_informed;
user_informed = 0;
do {
rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, flags, fi);
if (!rpcerror) {
cred_t *crr;
if (*statusp == NFS3ERR_JUKEBOX) {
if (!user_informed) {
user_informed = 1;
uprintf(
"file temporarily unavailable on the server, retrying...\n");
}
delay(nfs3_jukebox_delay);
}
/*
* See crnetadjust() for comments.
*/
else if (*statusp == NFS3ERR_ACCES &&
(crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
acl3call_hits++;
#endif
rpcerror = aclcall(mi, which, xdrargs, argsp,
xdrres, resp, crr, douprintf, flags, fi);
crfree(crr);
#ifdef DEBUG
if (*statusp == NFS3ERR_ACCES)
acl3call_misses++;
#endif
}
}
} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
return (rpcerror);
}
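/*
* Common RPC engine for the NFS ACL protocol, closely paralleling
* rfscall(). It additionally clears MI_ACL/MI_EXTATTR when the
* server's errors indicate that it does not support the NFS_ACL
* program or the extended attribute operations.
*/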
static int
aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
int flags, failinfo_t *fi)
{
CLIENT *client;
struct chtab *ch;
enum clnt_stat status;
struct rpc_err rpcerr;
struct timeval wait;
int timeo; /* in units of hz */
#if 0 /* notyet */
int my_rsize, my_wsize;
#endif
bool_t tryagain;
k_sigset_t smask;
servinfo_t *svp;
struct nfs_clnt *nfscl;
zoneid_t zoneid = getzoneid();
#ifdef DEBUG
char *bufp;
#endif
#if 0 /* notyet */
TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
"rfscall_start:which %d mi %p", which, mi);
#endif
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
nfscl->nfscl_stat.calls.value.ui64++;
mi->mi_aclreqs[which].value.ui64++;
rpcerr.re_status = RPC_SUCCESS;
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
return (rpcerr.re_errno);
}
#if 0 /* notyet */
/*
* Remember the transfer sizes in case
* nfs_feedback changes them underneath us.
*/
my_rsize = mi->mi_curread;
my_wsize = mi->mi_curwrite;
#endif
/*
* NFS client failover support
*
* If this rnode is not in sync with the current server (VALID_FH),
* we'd like to do a remap to get in sync. We can be interrupted
* in failover_remap(), and if so we'll bail. Otherwise, we'll
* use the best info we have to try the RPC. Part of that is
* unconditionally updating the filehandle copy kept for V3.
*
* Locking: INC_READERS/DEC_READERS is a poor man's interruptible
* rw_enter(); we're trying to keep the current server from being
* changed on us until we're done with the remapping and have a
* matching client handle. We don't want to send a filehandle
* to the wrong host.
*/
failoverretry:
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
if (failover_wait(mi)) {
mutex_exit(&mi->mi_lock);
return (EINTR);
}
}
INC_READERS(mi);
mutex_exit(&mi->mi_lock);
if (fi) {
if (!VALID_FH(fi) &&
!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
int remaperr;
svp = mi->mi_curr_serv;
remaperr = failover_remap(fi);
if (remaperr != 0) {
#ifdef DEBUG
if (remaperr != EINTR)
nfs_cmn_err(remaperr, CE_WARN,
"aclcall couldn't failover: %m");
#endif
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
/*
* If failover_remap returns ETIMEDOUT
* and the filesystem is hard mounted
* we have to retry the call with a new
* server.
*/
if ((mi->mi_flags & MI_HARD) &&
IS_RECOVERABLE_ERROR(remaperr)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
rpcerr.re_status = RPC_SUCCESS;
goto failoverretry;
}
return (remaperr);
}
}
if (fi->fhp && fi->copyproc)
(*fi->copyproc)(fi->fhp, fi->vp);
}
}
/*
* acl_clget() calls clnt_tli_kinit() which clears the xid, so we
* are guaranteed to reprocess the retry as a new request.
*/
svp = mi->mi_curr_serv;
rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
if ((rpcerr.re_errno == ETIMEDOUT ||
rpcerr.re_errno == ECONNRESET) &&
failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
goto failoverretry;
}
}
if (rpcerr.re_errno != 0)
return (rpcerr.re_errno);
if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
timeo = (mi->mi_timeo * hz) / 10;
} else {
mutex_enter(&mi->mi_lock);
timeo = CLNT_SETTIMERS(client,
&(mi->mi_timers[mi->mi_acl_timer_type[which]]),
&(mi->mi_timers[NFS_CALLTYPES]),
(minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
(void (*)()) 0, (caddr_t)mi, 0);
mutex_exit(&mi->mi_lock);
}
/*
* If hard mounted fs, retry call forever unless hard error occurs.
*/
do {
tryagain = FALSE;
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
status = RPC_FAILED;
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
TICK_TO_TIMEVAL(timeo, &wait);
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM (preserving the existing masks).
* Mask out SIGINT if the nointr mount option is specified.
*/
sigintr(&smask, (int)mi->mi_flags & MI_INT);
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = TRUE;
/*
* If there is a current signal, then don't bother
* even trying to send out the request because we
* won't be able to block waiting for the response.
* Simply assume RPC_INTR and get on with it.
*/
if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
status = RPC_INTR;
else {
status = CLNT_CALL(client, which, xdrargs, argsp,
xdrres, resp, wait);
}
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = FALSE;
/*
* restore original signal mask
*/
sigunintr(&smask);
switch (status) {
case RPC_SUCCESS:
#if 0 /* notyet */
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize))
(void) nfs_feedback(FEEDBACK_OK, which, mi);
#endif
break;
/*
* Unfortunately, there are servers in the world which
* are not coded correctly. They are not prepared to
* handle RPC requests to the NFS port which are not
* NFS requests. Thus, they may try to process the
* NFS_ACL request as if it were an NFS request. This
* does not work. Generally, an error will be generated
* on the client because it will not be able to decode
* the response from the server. However, it seems
* possible that the server may not be able to decode
* the arguments. Thus, the criteria for deciding
* whether the server supports NFS_ACL or not is whether
* the following RPC errors are returned from CLNT_CALL.
*/
case RPC_CANTDECODERES:
case RPC_PROGUNAVAIL:
case RPC_CANTDECODEARGS:
case RPC_PROGVERSMISMATCH:
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
mutex_exit(&mi->mi_lock);
break;
/*
* If the server supports NFS_ACL but not the new ops
* for extended attributes, make sure we don't retry.
*/
case RPC_PROCUNAVAIL:
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~MI_EXTATTR;
mutex_exit(&mi->mi_lock);
break;
case RPC_INTR:
/*
* There is no way to recover from this error,
* even if mount option nointr is specified.
* SIGKILL, for example, cannot be blocked.
*/
rpcerr.re_status = RPC_INTR;
rpcerr.re_errno = EINTR;
break;
case RPC_UDERROR:
/*
* If the NFS server is local (vold) and
* it goes away, then we get RPC_UDERROR.
* This is a retryable error, so we would
* normally loop; check whether the specific
* error was ECONNRESET, indicating that the
* target did not exist at all. If so,
* return with RPC_PROGUNAVAIL and
* ECONNRESET to indicate why.
*/
CLNT_GETERR(client, &rpcerr);
if (rpcerr.re_errno == ECONNRESET) {
rpcerr.re_status = RPC_PROGUNAVAIL;
rpcerr.re_errno = ECONNRESET;
break;
}
/*FALLTHROUGH*/
default: /* probably RPC_TIMEDOUT */
if (IS_UNRECOVERABLE_RPC(status))
break;
/*
* increment server not responding count
*/
mutex_enter(&mi->mi_lock);
mi->mi_noresponse++;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
nfscl->nfscl_stat.noresponse.value.ui64++;
#endif
if (!(mi->mi_flags & MI_HARD)) {
if (!(mi->mi_flags & MI_SEMISOFT) ||
(mi->mi_acl_ss_call_type[which] == 0))
break;
}
/*
* The call is in progress (over COTS).
* Try the CLNT_CALL again, but don't
* print a noisy error message.
*/
if (status == RPC_INPROGRESS) {
tryagain = TRUE;
break;
}
if (flags & RFSCALL_SOFT)
break;
/*
* On zone shutdown, just move on.
*/
if (zone_status_get(curproc->p_zone) >=
ZONE_IS_SHUTTING_DOWN) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
/*
* NFS client failover support
*
* If the current server just failed us, we'll
* start the process of finding a new server.
* After that, we can just retry.
*/
if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
clfree_impl(client, ch, nfscl);
goto failoverretry;
}
tryagain = TRUE;
timeo = backoff(timeo);
mutex_enter(&mi->mi_lock);
if (!(mi->mi_flags & MI_PRINTED)) {
mi->mi_flags |= MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
zprintf(zoneid,
"NFS_ACL%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
zprintf(zoneid,
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
if (*douprintf && curproc->p_sessp->s_vp != NULL) {
*douprintf = 0;
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
uprintf(
"NFS_ACL%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
uprintf(
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
}
#if 0 /* notyet */
/*
* If we are doing dynamic adjustment of the
* transfer size, this is a read or write call,
* and either the transfer size changed while
* retransmitting or the feedback routine
* changed the transfer size, then exit
* rfscall so that the transfer size can be
* adjusted at the vnops level.
*/
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_acl_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize ||
nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
/*
* On read or write calls, return
* back to the vnode ops level if
* the transfer size changed.
*/
clfree_impl(client, ch, nfscl);
return (ENFS_TRYAGAIN);
}
#endif
}
} while (tryagain);
if (status != RPC_SUCCESS) {
/*
* Let soft mounts use the timed out message.
*/
if (status == RPC_INPROGRESS)
status = RPC_TIMEDOUT;
nfscl->nfscl_stat.badcalls.value.ui64++;
if (status == RPC_CANTDECODERES ||
status == RPC_PROGUNAVAIL ||
status == RPC_PROCUNAVAIL ||
status == RPC_CANTDECODEARGS ||
status == RPC_PROGVERSMISMATCH)
CLNT_GETERR(client, &rpcerr);
else if (status != RPC_INTR) {
mutex_enter(&mi->mi_lock);
mi->mi_flags |= MI_DOWN;
mutex_exit(&mi->mi_lock);
CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
bufp = clnt_sperror(client, svp->sv_hostname);
zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
mi->mi_vers, mi->mi_aclnames[which], bufp);
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT)) {
uprintf("NFS_ACL%d %s failed for %s\n",
mi->mi_vers, mi->mi_aclnames[which],
bufp);
}
}
kmem_free(bufp, MAXPATHLEN);
#else
zprintf(zoneid,
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_aclnames[which], svp->sv_hostname,
status, clnt_sperrno(status));
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT))
uprintf(
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_aclnames[which],
svp->sv_hostname, status,
clnt_sperrno(status));
}
#endif
/*
* when CLNT_CALL() fails with RPC_AUTHERROR,
* re_errno is set appropriately depending on
* the authentication error
*/
if (status == RPC_VERSMISMATCH ||
status == RPC_PROGVERSMISMATCH)
rpcerr.re_errno = EIO;
}
} else {
/*
* Test the value of mi_down and mi_printed without
* holding the mi_lock mutex. If they are both zero,
* then it is okay to skip the down and printed
* processing. This saves a mutex_enter and
* mutex_exit pair for a normal, successful RPC,
* which would otherwise be pure overhead.
*/
if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~MI_DOWN;
if (mi->mi_flags & MI_PRINTED) {
mi->mi_flags &= ~MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
zprintf(zoneid, "NFS_ACL%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
zprintf(zoneid, "NFS server %s ok\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
}
if (*douprintf == 0) {
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
uprintf("NFS_ACL%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
uprintf("NFS server %s ok\n", svp->sv_hostname);
#endif
*douprintf = 1;
}
}
clfree_impl(client, ch, nfscl);
ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
#if 0 /* notyet */
TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
rpcerr.re_errno);
#endif
return (rpcerr.re_errno);
}
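/*
* Convert a vattr into an NFS Version 2 sattr. Fields not present
* in the attribute mask are set to -1 to mean "do not change".
* Returns EOVERFLOW if a timestamp cannot be represented over the
* wire.
*/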
int
vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
{
uint_t mask = vap->va_mask;
if (!(mask & AT_MODE))
sa->sa_mode = (uint32_t)-1;
else
sa->sa_mode = vap->va_mode;
if (!(mask & AT_UID))
sa->sa_uid = (uint32_t)-1;
else
sa->sa_uid = (uint32_t)vap->va_uid;
if (!(mask & AT_GID))
sa->sa_gid = (uint32_t)-1;
else
sa->sa_gid = (uint32_t)vap->va_gid;
if (!(mask & AT_SIZE))
sa->sa_size = (uint32_t)-1;
else
sa->sa_size = (uint32_t)vap->va_size;
if (!(mask & AT_ATIME))
sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
return (EOVERFLOW);
}
sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
}
if (!(mask & AT_MTIME))
sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
return (EOVERFLOW);
}
sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
}
return (0);
}
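/*
* Convert a vattr into an NFS Version 3 sattr3. Fields not present
* in the attribute mask are marked "do not set"/"do not change".
* Returns EOVERFLOW if a timestamp cannot be represented over the
* wire.
*/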
int
vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
{
uint_t mask = vap->va_mask;
if (!(mask & AT_MODE))
sa->mode.set_it = FALSE;
else {
sa->mode.set_it = TRUE;
sa->mode.mode = (mode3)vap->va_mode;
}
if (!(mask & AT_UID))
sa->uid.set_it = FALSE;
else {
sa->uid.set_it = TRUE;
sa->uid.uid = (uid3)vap->va_uid;
}
if (!(mask & AT_GID))
sa->gid.set_it = FALSE;
else {
sa->gid.set_it = TRUE;
sa->gid.gid = (gid3)vap->va_gid;
}
if (!(mask & AT_SIZE))
sa->size.set_it = FALSE;
else {
sa->size.set_it = TRUE;
sa->size.size = (size3)vap->va_size;
}
if (!(mask & AT_ATIME))
sa->atime.set_it = DONT_CHANGE;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
return (EOVERFLOW);
}
sa->atime.set_it = SET_TO_CLIENT_TIME;
sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
}
if (!(mask & AT_MTIME))
sa->mtime.set_it = DONT_CHANGE;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
return (EOVERFLOW);
}
sa->mtime.set_it = SET_TO_CLIENT_TIME;
sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
}
return (0);
}
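/*
* Initialize NFS Version 2 directory operation arguments: the
* parent directory's filehandle plus the component name.
*/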
void
setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
{
da->da_fhandle = VTOFH(dvp);
da->da_name = nm;
da->da_flags = 0;
}
void
setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
{
da->dirp = VTOFH3(dvp);
da->name = nm;
}
int
setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
{
int error;
rnode_t *rp;
struct vattr va;
va.va_mask = AT_MODE | AT_GID;
error = VOP_GETATTR(dvp, &va, 0, cr);
if (error)
return (error);
/*
* To determine the expected group-id of the created file:
* 1) If the filesystem was not mounted with the Old-BSD-compatible
* GRPID option, and the directory's set-gid bit is clear,
* then use the process's gid.
* 2) Otherwise, set the group-id to the gid of the parent directory.
*/
rp = VTOR(dvp);
mutex_enter(&rp->r_statelock);
if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
*gidp = crgetgid(cr);
else
*gidp = va.va_gid;
mutex_exit(&rp->r_statelock);
return (0);
}
int
setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
{
int error;
struct vattr va;
va.va_mask = AT_MODE;
error = VOP_GETATTR(dvp, &va, 0, cr);
if (error)
return (error);
/*
* Modify the expected mode (om) so that the set-gid bit matches
* that of the parent directory (dvp).
*/
if (va.va_mode & VSGID)
*omp |= VSGID;
else
*omp &= ~VSGID;
return (0);
}
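/*
* Set or clear VSWAPLIKE on the vnode: a regular file with the
* sticky bit set but the owner execute bit clear is treated as
* swap-like; otherwise the flag is cleared.
*/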
void
nfs_setswaplike(vnode_t *vp, vattr_t *vap)
{
if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
if (!(vp->v_flag & VSWAPLIKE)) {
mutex_enter(&vp->v_lock);
vp->v_flag |= VSWAPLIKE;
mutex_exit(&vp->v_lock);
}
} else {
if (vp->v_flag & VSWAPLIKE) {
mutex_enter(&vp->v_lock);
vp->v_flag &= ~VSWAPLIKE;
mutex_exit(&vp->v_lock);
}
}
}
/*
* Free the resources associated with an rnode.
*/
static void
rinactive(rnode_t *rp, cred_t *cr)
{
vnode_t *vp;
cred_t *cred;
char *contents;
int size;
vsecattr_t *vsp;
int error;
nfs3_pathconf_info *info;
/*
* Before freeing anything, wait until all asynchronous
* activity is done on this rnode. This will allow all
* asynchronous read ahead and write behind i/o's to
* finish.
*/
mutex_enter(&rp->r_statelock);
while (rp->r_count > 0)
cv_wait(&rp->r_cv, &rp->r_statelock);
mutex_exit(&rp->r_statelock);
/*
* Flush and invalidate all pages associated with the vnode.
*/
vp = RTOV(rp);
if (vn_has_cached_data(vp)) {
ASSERT(vp->v_type != VCHR);
if ((rp->r_flags & RDIRTY) && !rp->r_error) {
error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
if (error && (error == ENOSPC || error == EDQUOT)) {
mutex_enter(&rp->r_statelock);
if (!rp->r_error)
rp->r_error = error;
mutex_exit(&rp->r_statelock);
}
}
nfs_invalidate_pages(vp, (u_offset_t)0, cr);
}
/*
* Free any held credentials and caches which may be associated
* with this rnode.
*/
mutex_enter(&rp->r_statelock);
cred = rp->r_cred;
rp->r_cred = NULL;
contents = rp->r_symlink.contents;
size = rp->r_symlink.size;
rp->r_symlink.contents = NULL;
vsp = rp->r_secattr;
rp->r_secattr = NULL;
info = rp->r_pathconf;
rp->r_pathconf = NULL;
mutex_exit(&rp->r_statelock);
/*
* Free the held credential.
*/
if (cred != NULL)
crfree(cred);
/*
* Free the access cache entries.
*/
(void) nfs_access_purge_rp(rp);
/*
* Free the readdir cache entries.
*/
if (HAVE_RDDIR_CACHE(rp))
nfs_purge_rddir_cache(vp);
/*
* Free the symbolic link cache.
*/
if (contents != NULL) {
kmem_free((void *)contents, size);
}
/*
* Free any cached ACL.
*/
if (vsp != NULL)
nfs_acl_free(vsp);
/*
* Free any cached pathconf information.
*/
if (info != NULL)
kmem_free(info, sizeof (*info));
}
/*
* Return a vnode for the given NFS Version 2 file handle.
* If no rnode exists for this fhandle, create one and put it
* into the hash queues. If the rnode for this fhandle
* already exists, return it.
*
* Note: make_rnode() may upgrade the hash bucket lock to exclusive.
*/
vnode_t *
makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
hrtime_t t, cred_t *cr, char *dnm, char *nm)
{
int newnode;
int index;
vnode_t *vp;
nfs_fhandle nfh;
vattr_t va;
nfh.fh_len = NFS_FHSIZE;
bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
index = rtablehash(&nfh);
rw_enter(&rtable[index].r_lock, RW_READER);
vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
if (attr != NULL) {
if (!newnode) {
rw_exit(&rtable[index].r_lock);
(void) nfs_cache_fattr(vp, attr, &va, t, cr);
} else {
if (attr->na_type < NFNON || attr->na_type > NFSOC)
vp->v_type = VBAD;
else
vp->v_type = n2v_type(attr);
/*
* A translation here seems to be necessary
* because this function can be called
* with `attr' that has come from the wire,
* and been operated on by vattr_to_nattr().
* See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
* ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
* ->makenfsnode().
*/
if ((attr->na_rdev & 0xffff0000) == 0)
vp->v_rdev = nfsv2_expdev(attr->na_rdev);
else
vp->v_rdev = expldev(n2v_rdev(attr));
nfs_attrcache(vp, attr, t);
rw_exit(&rtable[index].r_lock);
}
} else {
if (newnode) {
PURGE_ATTRCACHE(vp);
}
rw_exit(&rtable[index].r_lock);
}
return (vp);
}
/*
* Return a vnode for the given NFS Version 3 file handle.
* If no rnode exists for this fhandle, create one and put it
* into the hash queues. If the rnode for this fhandle
* already exists, return it.
*
* Note: make_rnode() may upgrade the hash bucket lock to exclusive.
*/
vnode_t *
makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
cred_t *cr, char *dnm, char *nm)
{
int newnode;
int index;
vnode_t *vp;
index = rtablehash((nfs_fhandle *)fh);
rw_enter(&rtable[index].r_lock, RW_READER);
vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
dnm, nm);
if (vap == NULL) {
if (newnode) {
PURGE_ATTRCACHE(vp);
}
rw_exit(&rtable[index].r_lock);
return (vp);
}
if (!newnode) {
rw_exit(&rtable[index].r_lock);
nfs_attr_cache(vp, vap, t, cr);
} else {
rnode_t *rp = VTOR(vp);
vp->v_type = vap->va_type;
vp->v_rdev = vap->va_rdev;
mutex_enter(&rp->r_statelock);
if (rp->r_mtime <= t)
nfs_attrcache_va(vp, vap);
mutex_exit(&rp->r_statelock);
rw_exit(&rtable[index].r_lock);
}
return (vp);
}
vnode_t *
makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
cred_t *cr, char *dnm, char *nm)
{
int newnode;
int index;
vnode_t *vp;
vattr_t va;
index = rtablehash((nfs_fhandle *)fh);
rw_enter(&rtable[index].r_lock, RW_READER);
vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
dnm, nm);
if (attr == NULL) {
if (newnode) {
PURGE_ATTRCACHE(vp);
}
rw_exit(&rtable[index].r_lock);
return (vp);
}
if (!newnode) {
rw_exit(&rtable[index].r_lock);
(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
} else {
if (attr->type < NF3REG || attr->type > NF3FIFO)
vp->v_type = VBAD;
else
vp->v_type = nf3_to_vt[attr->type];
vp->v_rdev = makedevice(attr->rdev.specdata1,
attr->rdev.specdata2);
nfs3_attrcache(vp, attr, t);
rw_exit(&rtable[index].r_lock);
}
return (vp);
}
/*
* Read this comment before making changes to rtablehash()!
* This is a hash function in which seemingly obvious and harmless
* changes can cause escalations costing millions of dollars!
* Know what you are doing.
*
* rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
* algorithm is currently detailed here:
*
* http://burtleburtle.net/bob/hash/doobs.html
*
* Of course, the above link may not be valid by the time you are reading
* this, but suffice it to say that the one-at-a-time algorithm works well in
* almost all cases. If you are changing the algorithm be sure to verify that
* the hash algorithm still provides even distribution in all cases and with
* any server returning filehandles in whatever order (sequential or random).
*/
static int
rtablehash(nfs_fhandle *fh)
{
ulong_t hash, len, i;
char *key;
key = fh->fh_buf;
len = (ulong_t)fh->fh_len;
for (hash = 0, i = 0; i < len; i++) {
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return (hash & rtablemask);
}
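/*
 * Return the vnode for the rnode identified by the given file handle,
 * either by finding an existing rnode in the hash bucket or by
 * constructing a new one (recycling one from the freelist when the
 * rnode limit has been reached).
 *
 * The caller must hold the bucket lock as a reader. On return the
 * bucket lock is still held: as a reader when an existing rnode was
 * found, or as a writer when a new rnode was hashed in. *newnode is
 * set to indicate which case occurred.
 */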
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
struct vnodeops *vops,
int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
int (*compar)(const void *, const void *),
int *newnode, cred_t *cr, char *dnm, char *nm)
{
rnode_t *rp;
rnode_t *trp;
vnode_t *vp;
mntinfo_t *mi;
ASSERT(RW_READ_HELD(&rhtp->r_lock));
mi = VFTOMI(vfsp);
start:
if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
vp = RTOV(rp);
nfs_set_vroot(vp);
*newnode = 0;
return (vp);
}
rw_exit(&rhtp->r_lock);
mutex_enter(&rpfreelist_lock);
if (rpfreelist != NULL && rnew >= nrnode) {
rp = rpfreelist;
rp_rmfree(rp);
mutex_exit(&rpfreelist_lock);
vp = RTOV(rp);
if (rp->r_flags & RHASHED) {
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&rp->r_hashq->r_lock);
rw_enter(&rhtp->r_lock, RW_READER);
goto start;
}
mutex_exit(&vp->v_lock);
rp_rmhash_locked(rp);
rw_exit(&rp->r_hashq->r_lock);
}
rinactive(rp, cr);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_enter(&rhtp->r_lock, RW_READER);
goto start;
}
mutex_exit(&vp->v_lock);
vn_invalid(vp);
/*
* Destroy the old locks before bzero'ing the rnode
* and recreating the locks below.
*/
nfs_rw_destroy(&rp->r_rwlock);
nfs_rw_destroy(&rp->r_lkserlock);
mutex_destroy(&rp->r_statelock);
cv_destroy(&rp->r_cv);
cv_destroy(&rp->r_commit.c_cv);
nfs_free_r_path(rp);
avl_destroy(&rp->r_dir);
/*
* Make sure that if the rnode is recycled, the
* VFS reference count is decremented properly
* before reuse.
*/
VFS_RELE(vp->v_vfsp);
vn_reinit(vp);
} else {
vnode_t *new_vp;
mutex_exit(&rpfreelist_lock);
rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
new_vp = vn_alloc(KM_SLEEP);
atomic_add_long((ulong_t *)&rnew, 1);
#ifdef DEBUG
clstat_debug.nrnode.value.ui64++;
#endif
vp = new_vp;
}
bzero(rp, sizeof (*rp));
rp->r_vnode = vp;
nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
rp->r_fh.fh_len = fh->fh_len;
bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
rp->r_server = mi->mi_curr_serv;
if (FAILOVER_MOUNT(mi)) {
/*
* If there are replicated servers, stash the pathnames
*/
if (dnm != NULL && nm != NULL) {
char *s, *p;
uint_t len;
len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
clstat_debug.rpath.value.ui64 += len;
#endif
s = rp->r_path;
for (p = dnm; *p; p++)
*s++ = *p;
*s++ = '/';
for (p = nm; *p; p++)
*s++ = *p;
*s = '\0';
} else {
/* special case for root */
rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
clstat_debug.rpath.value.ui64 += 2;
#endif
*rp->r_path = '.';
*(rp->r_path + 1) = '\0';
}
}
VFS_HOLD(vfsp);
rp->r_putapage = putapage;
rp->r_hashq = rhtp;
rp->r_flags = RREADDIRPLUS;
avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
offsetof(rddir_cache, tree));
vn_setops(vp, vops);
vp->v_data = (caddr_t)rp;
vp->v_vfsp = vfsp;
vp->v_type = VNON;
nfs_set_vroot(vp);
/*
* There is a race window in which someone else may have
* allocated an rnode for this file handle while no locks
* were held, so check again and recover if one is found.
*/
rw_enter(&rhtp->r_lock, RW_WRITER);
if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
vp = RTOV(trp);
nfs_set_vroot(vp);
*newnode = 0;
rw_exit(&rhtp->r_lock);
rp_addfree(rp, cr);
rw_enter(&rhtp->r_lock, RW_READER);
return (vp);
}
rp_addhash(rp);
*newnode = 1;
return (vp);
}
static void
nfs_set_vroot(vnode_t *vp)
{
rnode_t *rp;
nfs_fhandle *rootfh;
rp = VTOR(vp);
rootfh = &rp->r_server->sv_fhandle;
if (rootfh->fh_len == rp->r_fh.fh_len &&
bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
if (!(vp->v_flag & VROOT)) {
mutex_enter(&vp->v_lock);
vp->v_flag |= VROOT;
mutex_exit(&vp->v_lock);
}
}
}
static void
nfs_free_r_path(rnode_t *rp)
{
char *path;
size_t len;
path = rp->r_path;
if (path) {
rp->r_path = NULL;
len = strlen(path) + 1;
kmem_free(path, len);
#ifdef DEBUG
clstat_debug.rpath.value.ui64 -= len;
#endif
}
}
/*
* Put an rnode on the free list.
*
* Rnodes which were allocated above and beyond the normal limit
* are immediately freed.
*/
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
vnode_t *vp;
struct vfs *vfsp;
vp = RTOV(rp);
ASSERT(vp->v_count >= 1);
ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
/*
* If we have too many rnodes allocated and there are no
* references to this rnode, or if the rnode is no longer
* accessible because it does not reside in the hash queues,
* or if an i/o error occurred while writing to the file,
* then just free it instead of putting it on the rnode
* freelist.
*/
vfsp = vp->v_vfsp;
if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
(vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
if (rp->r_flags & RHASHED) {
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&rp->r_hashq->r_lock);
return;
}
mutex_exit(&vp->v_lock);
rp_rmhash_locked(rp);
rw_exit(&rp->r_hashq->r_lock);
}
rinactive(rp, cr);
/*
* Recheck the vnode reference count. We need to
* make sure that another reference has not been
* acquired while we were not holding v_lock. The
* rnode is not in the rnode hash queues, so the
* only way for a reference to have been acquired
* is for a VOP_PUTPAGE because the rnode was marked
* with RDIRTY or for a modified page. This
* reference may have been acquired before our call
* to rinactive. The i/o may have been completed,
* thus allowing rinactive to complete, but the
* reference to the vnode may not have been released
* yet. In any case, the rnode can not be destroyed
* until the other references to this vnode have been
* released. The other references will take care of
* either destroying the rnode or placing it on the
* rnode freelist. If there are no other references,
* then the rnode may be safely destroyed.
*/
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
return;
}
mutex_exit(&vp->v_lock);
destroy_rnode(rp);
return;
}
/*
* Lock the hash queue and then recheck the reference count
* to ensure that no other threads have acquired a reference
* to indicate that the rnode should not be placed on the
* freelist. If another reference has been acquired, then
* just release this one and let the other thread complete
* the processing of adding this rnode to the freelist.
*/
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&rp->r_hashq->r_lock);
return;
}
mutex_exit(&vp->v_lock);
/*
* If there is no cached data or metadata for this file, then
* put the rnode on the front of the freelist so that it will
* be reused before other rnodes which may have cached data or
* metadata associated with them.
*/
mutex_enter(&rpfreelist_lock);
if (rpfreelist == NULL) {
rp->r_freef = rp;
rp->r_freeb = rp;
rpfreelist = rp;
} else {
rp->r_freef = rpfreelist;
rp->r_freeb = rpfreelist->r_freeb;
rpfreelist->r_freeb->r_freef = rp;
rpfreelist->r_freeb = rp;
if (!vn_has_cached_data(vp) &&
!HAVE_RDDIR_CACHE(rp) &&
rp->r_symlink.contents == NULL &&
rp->r_secattr == NULL &&
rp->r_pathconf == NULL)
rpfreelist = rp;
}
mutex_exit(&rpfreelist_lock);
rw_exit(&rp->r_hashq->r_lock);
}
/*
* Remove an rnode from the free list.
*
* The caller must be holding rpfreelist_lock and the rnode
* must be on the freelist.
*/
static void
rp_rmfree(rnode_t *rp)
{
ASSERT(MUTEX_HELD(&rpfreelist_lock));
ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
if (rp == rpfreelist) {
rpfreelist = rp->r_freef;
if (rp == rpfreelist)
rpfreelist = NULL;
}
rp->r_freeb->r_freef = rp->r_freef;
rp->r_freef->r_freeb = rp->r_freeb;
rp->r_freef = rp->r_freeb = NULL;
}
/*
* Put an rnode in the hash table.
*
* The caller must be holding the exclusive hash queue lock.
*/
static void
rp_addhash(rnode_t *rp)
{
ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
ASSERT(!(rp->r_flags & RHASHED));
rp->r_hashf = rp->r_hashq->r_hashf;
rp->r_hashq->r_hashf = rp;
rp->r_hashb = (rnode_t *)rp->r_hashq;
rp->r_hashf->r_hashb = rp;
mutex_enter(&rp->r_statelock);
rp->r_flags |= RHASHED;
mutex_exit(&rp->r_statelock);
}
/*
* Remove an rnode from the hash table.
*
* The caller must be holding the hash queue lock.
*/
static void
rp_rmhash_locked(rnode_t *rp)
{
ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
ASSERT(rp->r_flags & RHASHED);
rp->r_hashb->r_hashf = rp->r_hashf;
rp->r_hashf->r_hashb = rp->r_hashb;
mutex_enter(&rp->r_statelock);
rp->r_flags &= ~RHASHED;
mutex_exit(&rp->r_statelock);
}
/*
* Remove an rnode from the hash table.
*
* The caller must not be holding the hash queue lock.
*/
void
rp_rmhash(rnode_t *rp)
{
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
rp_rmhash_locked(rp);
rw_exit(&rp->r_hashq->r_lock);
}
/*
* Look up an rnode by file handle.
*
* The caller must be holding the hash queue lock, either shared or exclusive.
*/
static rnode_t *
rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
{
rnode_t *rp;
vnode_t *vp;
ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
vp = RTOV(rp);
if (vp->v_vfsp == vfsp &&
rp->r_fh.fh_len == fh->fh_len &&
bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
/*
* remove rnode from free list, if necessary.
*/
if (rp->r_freef != NULL) {
mutex_enter(&rpfreelist_lock);
/*
* If the rnode is on the freelist,
* then remove it and use that reference
* as the new reference. Otherwise,
* need to increment the reference count.
*/
if (rp->r_freef != NULL) {
rp_rmfree(rp);
mutex_exit(&rpfreelist_lock);
} else {
mutex_exit(&rpfreelist_lock);
VN_HOLD(vp);
}
} else
VN_HOLD(vp);
return (rp);
}
}
return (NULL);
}
/*
* Return 1 if there is an active vnode belonging to this vfs in the
* rtable cache.
*
* Several of these checks are done without holding the usual
* locks. This is safe because destroy_rtable(), rp_addfree(),
* etc. will redo the necessary checks before actually destroying
* any rnodes.
*/
int
check_rtable(struct vfs *vfsp)
{
int index;
rnode_t *rp;
vnode_t *vp;
for (index = 0; index < rtablesize; index++) {
rw_enter(&rtable[index].r_lock, RW_READER);
for (rp = rtable[index].r_hashf;
rp != (rnode_t *)(&rtable[index]);
rp = rp->r_hashf) {
vp = RTOV(rp);
if (vp->v_vfsp == vfsp) {
if (rp->r_freef == NULL ||
(vn_has_cached_data(vp) &&
(rp->r_flags & RDIRTY)) ||
rp->r_count > 0) {
rw_exit(&rtable[index].r_lock);
return (1);
}
}
}
rw_exit(&rtable[index].r_lock);
}
return (0);
}
/*
* Destroy inactive vnodes from the hash queues which belong to this
* vfs. It is essential that we destroy all inactive vnodes during a
* forced unmount as well as during a normal unmount.
*/
void
destroy_rtable(struct vfs *vfsp, cred_t *cr)
{
int index;
rnode_t *rp;
rnode_t *rlist;
rnode_t *r_hashf;
vnode_t *vp;
rlist = NULL;
for (index = 0; index < rtablesize; index++) {
rw_enter(&rtable[index].r_lock, RW_WRITER);
for (rp = rtable[index].r_hashf;
rp != (rnode_t *)(&rtable[index]);
rp = r_hashf) {
/* save the hash pointer before destroying */
r_hashf = rp->r_hashf;
vp = RTOV(rp);
if (vp->v_vfsp == vfsp) {
mutex_enter(&rpfreelist_lock);
if (rp->r_freef != NULL) {
rp_rmfree(rp);
mutex_exit(&rpfreelist_lock);
rp_rmhash_locked(rp);
rp->r_hashf = rlist;
rlist = rp;
} else
mutex_exit(&rpfreelist_lock);
}
}
rw_exit(&rtable[index].r_lock);
}
for (rp = rlist; rp != NULL; rp = rlist) {
rlist = rp->r_hashf;
/*
* This call to rp_addfree will end up destroying the
* rnode, but in a safe way with the appropriate set
* of checks done.
*/
rp_addfree(rp, cr);
}
}
/*
* This routine destroys all the resources associated with the rnode
* and then the rnode itself.
*/
static void
destroy_rnode(rnode_t *rp)
{
vnode_t *vp;
vfs_t *vfsp;
vp = RTOV(rp);
vfsp = vp->v_vfsp;
ASSERT(vp->v_count == 1);
ASSERT(rp->r_count == 0);
ASSERT(rp->r_lmpl == NULL);
ASSERT(rp->r_mapcnt == 0);
ASSERT(!(rp->r_flags & RHASHED));
ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
atomic_add_long((ulong_t *)&rnew, -1);
#ifdef DEBUG
clstat_debug.nrnode.value.ui64--;
#endif
nfs_rw_destroy(&rp->r_rwlock);
nfs_rw_destroy(&rp->r_lkserlock);
mutex_destroy(&rp->r_statelock);
cv_destroy(&rp->r_cv);
cv_destroy(&rp->r_commit.c_cv);
if (rp->r_flags & RDELMAPLIST)
list_destroy(&rp->r_indelmap);
nfs_free_r_path(rp);
avl_destroy(&rp->r_dir);
vn_invalid(vp);
vn_free(vp);
kmem_cache_free(rnode_cache, rp);
VFS_RELE(vfsp);
}
/*
* Flush all vnodes in this (or every) vfs.
* Used by nfs_sync and by nfs_unmount.
*/
void
rflush(struct vfs *vfsp, cred_t *cr)
{
int index;
rnode_t *rp;
vnode_t *vp, **vplist;
long num, cnt;
/*
* Check to see whether there is anything to do.
*/
num = rnew;
if (num == 0)
return;
/*
* Allocate a slot for all currently active rnodes on the
* supposition that they all may need flushing.
*/
vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
cnt = 0;
/*
* Walk the hash queues looking for rnodes with page
* lists associated with them. Make a list of these
* files.
*/
for (index = 0; index < rtablesize; index++) {
rw_enter(&rtable[index].r_lock, RW_READER);
for (rp = rtable[index].r_hashf;
rp != (rnode_t *)(&rtable[index]);
rp = rp->r_hashf) {
vp = RTOV(rp);
/*
* Don't bother sync'ing a vp if it
* is part of a virtual swap device or
* if the VFS is read-only.
*/
if (IS_SWAPVP(vp) || vn_is_readonly(vp))
continue;
/*
* If flushing all mounted file systems or
* the vnode belongs to this vfs, has pages
* and is marked as either dirty or mmap'd,
* hold and add this vnode to the list of
* vnodes to flush.
*/
if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
vn_has_cached_data(vp) &&
((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
VN_HOLD(vp);
vplist[cnt++] = vp;
if (cnt == num) {
rw_exit(&rtable[index].r_lock);
goto toomany;
}
}
}
rw_exit(&rtable[index].r_lock);
}
toomany:
/*
* Flush and release all of the files on the list.
*/
while (cnt-- > 0) {
vp = vplist[cnt];
(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
VN_RELE(vp);
}
/*
* Free the space allocated to hold the list.
*/
kmem_free(vplist, num * sizeof (*vplist));
}
/*
* This probably needs to be larger than or equal to
* log2(sizeof (struct rnode)) due to the way that rnodes are
* allocated.
*/
#define ACACHE_SHIFT_BITS 9
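/*
 * Hash an (rnode, cred) pair into the access cache. The rnode address
 * is shifted to discard the low-order bits (which carry little
 * variation because of how rnodes are allocated) and then combined
 * with the caller's uid, so that different users of the same file
 * usually land in different buckets.
 */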
static int
acachehash(rnode_t *rp, cred_t *cr)
{
return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
acachemask);
}
#ifdef DEBUG
static long nfs_access_cache_hits = 0;
static long nfs_access_cache_misses = 0;
#endif
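/*
 * Look in the access cache for a cached answer to an access check for
 * this rnode and credential. If every requested access bit is known,
 * return NFS_ACCESS_ALLOWED or NFS_ACCESS_DENIED accordingly;
 * otherwise return NFS_ACCESS_UNKNOWN so that the caller goes over
 * the wire. The cache is also bypassed if the attribute cache for
 * the vnode is not currently valid.
 */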
nfs_access_type_t
nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
{
vnode_t *vp;
acache_t *ap;
acache_hash_t *hp;
nfs_access_type_t all;
vp = RTOV(rp);
if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
return (NFS_ACCESS_UNKNOWN);
if (rp->r_acache != NULL) {
hp = &acache[acachehash(rp, cr)];
rw_enter(&hp->lock, RW_READER);
ap = hp->next;
while (ap != (acache_t *)hp) {
if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
if ((ap->known & acc) == acc) {
#ifdef DEBUG
nfs_access_cache_hits++;
#endif
if ((ap->allowed & acc) == acc)
all = NFS_ACCESS_ALLOWED;
else
all = NFS_ACCESS_DENIED;
} else {
#ifdef DEBUG
nfs_access_cache_misses++;
#endif
all = NFS_ACCESS_UNKNOWN;
}
rw_exit(&hp->lock);
return (all);
}
ap = ap->next;
}
rw_exit(&hp->lock);
}
#ifdef DEBUG
nfs_access_cache_misses++;
#endif
return (NFS_ACCESS_UNKNOWN);
}
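/*
 * Cache the results of an over-the-wire access check for this rnode
 * and credential. An existing entry for the (rnode, cred) pair is
 * updated in place; otherwise a new entry, allocated before taking
 * the hash bucket lock, is linked onto both the hash bucket and the
 * rnode's r_acache list.
 */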
void
nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
{
acache_t *ap;
acache_t *nap;
acache_hash_t *hp;
hp = &acache[acachehash(rp, cr)];
/*
* Allocate now on the assumption that an allocation will most
* likely be required. This allows the allocation to happen
* without holding the hash bucket lock.
*/
nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
if (nap != NULL) {
nap->known = acc;
nap->allowed = resacc;
nap->rnode = rp;
crhold(cr);
nap->cred = cr;
nap->hashq = hp;
}
rw_enter(&hp->lock, RW_WRITER);
if (rp->r_acache != NULL) {
ap = hp->next;
while (ap != (acache_t *)hp) {
if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
ap->known |= acc;
ap->allowed &= ~acc;
ap->allowed |= resacc;
rw_exit(&hp->lock);
if (nap != NULL) {
crfree(nap->cred);
kmem_cache_free(acache_cache, nap);
}
return;
}
ap = ap->next;
}
}
if (nap != NULL) {
#ifdef DEBUG
clstat_debug.access.value.ui64++;
#endif
nap->next = hp->next;
hp->next = nap;
nap->next->prev = nap;
nap->prev = (acache_t *)hp;
mutex_enter(&rp->r_statelock);
nap->list = rp->r_acache;
rp->r_acache = nap;
mutex_exit(&rp->r_statelock);
}
rw_exit(&hp->lock);
}
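/*
 * Free all of the access cache entries belonging to this rnode.
 * Returns 1 if any entries were freed and 0 if there was nothing
 * to do.
 */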
int
nfs_access_purge_rp(rnode_t *rp)
{
acache_t *ap;
acache_t *tmpap;
acache_t *rplist;
/*
* If there aren't any cached entries, then there is nothing
* to free.
*/
if (rp->r_acache == NULL)
return (0);
mutex_enter(&rp->r_statelock);
rplist = rp->r_acache;
rp->r_acache = NULL;
mutex_exit(&rp->r_statelock);
/*
* Loop through each entry in the list pointed to by the
* rnode. Remove each of these entries from the hash
* queue that it is on and remove it from the list in
* the rnode.
*/
for (ap = rplist; ap != NULL; ap = tmpap) {
rw_enter(&ap->hashq->lock, RW_WRITER);
ap->prev->next = ap->next;
ap->next->prev = ap->prev;
rw_exit(&ap->hashq->lock);
tmpap = ap->list;
crfree(ap->cred);
kmem_cache_free(acache_cache, ap);
#ifdef DEBUG
clstat_debug.access.value.ui64--;
#endif
}
return (1);
}
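/*
 * Generation of ".nfs" temporary names. newnum() hands out ids
 * seeded from the clock; newname() formats one as ".nfs" followed
 * by hexadecimal digits. These names are used when a file must be
 * renamed out of the way (for example, when it is removed while
 * still in use).
 */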
static const char prefix[] = ".nfs";
static kmutex_t newnum_lock;
int
newnum(void)
{
static uint_t newnum = 0;
uint_t id;
mutex_enter(&newnum_lock);
if (newnum == 0)
newnum = gethrestime_sec() & 0xffff;
id = newnum++;
mutex_exit(&newnum_lock);
return (id);
}
char *
newname(void)
{
char *news;
char *s;
const char *p;
uint_t id;
id = newnum();
news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
s = news;
p = prefix;
while (*p != '\0')
*s++ = *p++;
while (id != 0) {
*s++ = "0123456789ABCDEF"[id & 0x0f];
id >>= 4;
}
*s = '\0';
return (news);
}
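/*
 * Minimal decimal string-to-integer conversion. The caller must
 * supply a string consisting only of decimal digits; there is no
 * sign, whitespace, or overflow handling.
 */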
int
nfs_atoi(char *cp)
{
int n;
n = 0;
while (*cp != '\0') {
n = n * 10 + (*cp - '0');
cp++;
}
return (n);
}
/*
* Snapshot callback for nfs:0:nfs_client as registered with the kstat
* framework.
*/
static int
cl_snapshot(kstat_t *ksp, void *buf, int rw)
{
ksp->ks_snaptime = gethrtime();
if (rw == KSTAT_WRITE) {
bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
#ifdef DEBUG
/*
* Currently only the global zone can write to kstats, but we
* add the check just for paranoia.
*/
if (INGLOBALZONE(curproc))
bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
sizeof (clstat_debug));
#endif
} else {
bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
#ifdef DEBUG
/*
* If we're displaying the "global" debug kstat values, we
* display them as-is to all zones since in fact they apply to
* the system as a whole.
*/
bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
sizeof (clstat_debug));
#endif
}
return (0);
}
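/*
 * Zone key constructor: allocate the per-zone NFS client state,
 * create the per-zone "nfs_client" kstat, and link the state onto
 * the global nfs_clnt_list.
 */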
static void *
clinit_zone(zoneid_t zoneid)
{
kstat_t *nfs_client_kstat;
struct nfs_clnt *nfscl;
uint_t ndata;
nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
nfscl->nfscl_chtable = NULL;
nfscl->nfscl_zoneid = zoneid;
bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
#ifdef DEBUG
ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
#endif
if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
"misc", KSTAT_TYPE_NAMED, ndata,
KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
nfs_client_kstat->ks_snapshot = cl_snapshot;
kstat_install(nfs_client_kstat);
}
mutex_enter(&nfs_clnt_list_lock);
list_insert_head(&nfs_clnt_list, nfscl);
mutex_exit(&nfs_clnt_list_lock);
return (nfscl);
}
/*ARGSUSED*/
static void
clfini_zone(zoneid_t zoneid, void *arg)
{
struct nfs_clnt *nfscl = arg;
chhead_t *chp, *next;
if (nfscl == NULL)
return;
mutex_enter(&nfs_clnt_list_lock);
list_remove(&nfs_clnt_list, nfscl);
mutex_exit(&nfs_clnt_list_lock);
clreclaim_zone(nfscl, 0);
for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
ASSERT(chp->ch_list == NULL);
kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
next = chp->ch_next;
kmem_free(chp, sizeof (*chp));
}
kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
mutex_destroy(&nfscl->nfscl_chtable_lock);
kmem_free(nfscl, sizeof (*nfscl));
}
/*
* Called by endpnt_destructor to make sure the client handles are
* cleaned up before the RPC endpoints. This becomes a no-op if
* clfini_zone (above) is called first. This function is needed
* (rather than relying on clfini_zone to clean up) because the ZSD
* callbacks have no ordering mechanism, so we have no way to ensure
* that clfini_zone is called before endpnt_destructor.
*/
void
clcleanup_zone(zoneid_t zoneid)
{
struct nfs_clnt *nfscl;
mutex_enter(&nfs_clnt_list_lock);
nfscl = list_head(&nfs_clnt_list);
for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
if (nfscl->nfscl_zoneid == zoneid) {
clreclaim_zone(nfscl, 0);
break;
}
}
mutex_exit(&nfs_clnt_list_lock);
}
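/*
 * One-time initialization of the client subsystem: size and allocate
 * the rnode hash table and the access cache, create the kmem caches,
 * register the per-zone client handle key, initialize the global
 * locks, and reserve a device major number for NFS mounts.
 */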
int
nfs_subrinit(void)
{
int i;
ulong_t nrnode_max;
/*
* Allocate and initialize the rnode hash queues
*/
if (nrnode <= 0)
nrnode = ncsize;
nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
zcmn_err(GLOBAL_ZONEID, CE_NOTE,
"setting nrnode to max value of %ld", nrnode_max);
nrnode = nrnode_max;
}
rtablesize = 1 << highbit(nrnode / hashlen);
rtablemask = rtablesize - 1;
rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
for (i = 0; i < rtablesize; i++) {
rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
}
rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
/*
* Allocate and initialize the access cache
*/
/*
* The initial guess is one access cache entry per rnode, unless
* nacache is set to a non-zero value, in which case it is used
* as the guess at the number of access cache entries.
*/
if (nacache > 0)
acachesize = 1 << highbit(nacache / hashlen);
else
acachesize = rtablesize;
acachemask = acachesize - 1;
acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
for (i = 0; i < acachesize; i++) {
acache[i].next = (acache_t *)&acache[i];
acache[i].prev = (acache_t *)&acache[i];
rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
}
acache_cache = kmem_cache_create("nfs_access_cache",
sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
/*
* Allocate and initialize the client handle cache
*/
chtab_cache = kmem_cache_create("client_handle_cache",
sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL,
NULL, 0);
/*
* Initialize the list of per-zone client handles (and associated data).
* This needs to be done before we call zone_key_create().
*/
list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
offsetof(struct nfs_clnt, nfscl_node));
/*
* Initialize the zone_key for per-zone client handle lists.
*/
zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
/*
* Initialize the various mutexes and reader/writer locks
*/
mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
/*
* Assign unique major number for all nfs mounts
*/
if ((nfs_major = getudev()) == -1) {
zcmn_err(GLOBAL_ZONEID, CE_WARN,
"nfs: init: can't get unique device number");
nfs_major = 0;
}
nfs_minor = 0;
if (nfs3_jukebox_delay == 0)
nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
return (0);
}
void
nfs_subrfini(void)
{
int i;
/*
* Deallocate the rnode hash queues
*/
kmem_cache_destroy(rnode_cache);
for (i = 0; i < rtablesize; i++)
rw_destroy(&rtable[i].r_lock);
kmem_free(rtable, rtablesize * sizeof (*rtable));
/*
* Deallocate the access cache
*/
kmem_cache_destroy(acache_cache);
for (i = 0; i < acachesize; i++)
rw_destroy(&acache[i].lock);
kmem_free(acache, acachesize * sizeof (*acache));
/*
* Deallocate the client handle cache
*/
kmem_cache_destroy(chtab_cache);
/*
* Destroy the various mutexes and reader/writer locks
*/
mutex_destroy(&rpfreelist_lock);
mutex_destroy(&newnum_lock);
mutex_destroy(&nfs_minor_lock);
(void) zone_key_delete(nfsclnt_zone_key);
}
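/*
 * Translate between local errno values and NFS Version 2 status
 * codes. Only the values which differ need explicit cases; all
 * other values pass through unchanged because the remaining errno
 * and nfsstat values coincide.
 */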
enum nfsstat
puterrno(int error)
{
switch (error) {
case EOPNOTSUPP:
return (NFSERR_OPNOTSUPP);
case ENAMETOOLONG:
return (NFSERR_NAMETOOLONG);
case ENOTEMPTY:
return (NFSERR_NOTEMPTY);
case EDQUOT:
return (NFSERR_DQUOT);
case ESTALE:
return (NFSERR_STALE);
case EREMOTE:
return (NFSERR_REMOTE);
case ENOSYS:
return (NFSERR_OPNOTSUPP);
case EOVERFLOW:
return (NFSERR_INVAL);
default:
return ((enum nfsstat)error);
}
/* NOTREACHED */
}
int
geterrno(enum nfsstat status)
{
switch (status) {
case NFSERR_OPNOTSUPP:
return (EOPNOTSUPP);
case NFSERR_NAMETOOLONG:
return (ENAMETOOLONG);
case NFSERR_NOTEMPTY:
return (ENOTEMPTY);
case NFSERR_DQUOT:
return (EDQUOT);
case NFSERR_STALE:
return (ESTALE);
case NFSERR_REMOTE:
return (EREMOTE);
case NFSERR_WFLUSH:
return (EIO);
default:
return ((int)status);
}
/* NOTREACHED */
}
enum nfsstat3
puterrno3(int error)
{
#ifdef DEBUG
switch (error) {
case 0:
return (NFS3_OK);
case EPERM:
return (NFS3ERR_PERM);
case ENOENT:
return (NFS3ERR_NOENT);
case EIO:
return (NFS3ERR_IO);
case ENXIO:
return (NFS3ERR_NXIO);
case EACCES:
return (NFS3ERR_ACCES);
case EEXIST:
return (NFS3ERR_EXIST);
case EXDEV:
return (NFS3ERR_XDEV);
case ENODEV:
return (NFS3ERR_NODEV);
case ENOTDIR:
return (NFS3ERR_NOTDIR);
case EISDIR:
return (NFS3ERR_ISDIR);
case EINVAL:
return (NFS3ERR_INVAL);
case EFBIG:
return (NFS3ERR_FBIG);
case ENOSPC:
return (NFS3ERR_NOSPC);
case EROFS:
return (NFS3ERR_ROFS);
case EMLINK:
return (NFS3ERR_MLINK);
case ENAMETOOLONG:
return (NFS3ERR_NAMETOOLONG);
case ENOTEMPTY:
return (NFS3ERR_NOTEMPTY);
case EDQUOT:
return (NFS3ERR_DQUOT);
case ESTALE:
return (NFS3ERR_STALE);
case EREMOTE:
return (NFS3ERR_REMOTE);
case EOPNOTSUPP:
return (NFS3ERR_NOTSUPP);
case EOVERFLOW:
return (NFS3ERR_INVAL);
default:
zcmn_err(getzoneid(), CE_WARN,
"puterrno3: got error %d", error);
return ((enum nfsstat3)error);
}
#else
switch (error) {
case ENAMETOOLONG:
return (NFS3ERR_NAMETOOLONG);
case ENOTEMPTY:
return (NFS3ERR_NOTEMPTY);
case EDQUOT:
return (NFS3ERR_DQUOT);
case ESTALE:
return (NFS3ERR_STALE);
case EOPNOTSUPP:
return (NFS3ERR_NOTSUPP);
case EREMOTE:
return (NFS3ERR_REMOTE);
case EOVERFLOW:
return (NFS3ERR_INVAL);
default:
return ((enum nfsstat3)error);
}
#endif
}
int
geterrno3(enum nfsstat3 status)
{
#ifdef DEBUG
switch (status) {
case NFS3_OK:
return (0);
case NFS3ERR_PERM:
return (EPERM);
case NFS3ERR_NOENT:
return (ENOENT);
case NFS3ERR_IO:
return (EIO);
case NFS3ERR_NXIO:
return (ENXIO);
case NFS3ERR_ACCES:
return (EACCES);
case NFS3ERR_EXIST:
return (EEXIST);
case NFS3ERR_XDEV:
return (EXDEV);
case NFS3ERR_NODEV:
return (ENODEV);
case NFS3ERR_NOTDIR:
return (ENOTDIR);
case NFS3ERR_ISDIR:
return (EISDIR);
case NFS3ERR_INVAL:
return (EINVAL);
case NFS3ERR_FBIG:
return (EFBIG);
case NFS3ERR_NOSPC:
return (ENOSPC);
case NFS3ERR_ROFS:
return (EROFS);
case NFS3ERR_MLINK:
return (EMLINK);
case NFS3ERR_NAMETOOLONG:
return (ENAMETOOLONG);
case NFS3ERR_NOTEMPTY:
return (ENOTEMPTY);
case NFS3ERR_DQUOT:
return (EDQUOT);
case NFS3ERR_STALE:
return (ESTALE);
case NFS3ERR_REMOTE:
return (EREMOTE);
case NFS3ERR_BADHANDLE:
return (ESTALE);
case NFS3ERR_NOT_SYNC:
return (EINVAL);
case NFS3ERR_BAD_COOKIE:
return (ENOENT);
case NFS3ERR_NOTSUPP:
return (EOPNOTSUPP);
case NFS3ERR_TOOSMALL:
return (EINVAL);
case NFS3ERR_SERVERFAULT:
return (EIO);
case NFS3ERR_BADTYPE:
return (EINVAL);
case NFS3ERR_JUKEBOX:
return (ENXIO);
default:
zcmn_err(getzoneid(), CE_WARN,
"geterrno3: got status %d", status);
return ((int)status);
}
#else
switch (status) {
case NFS3ERR_NAMETOOLONG:
return (ENAMETOOLONG);
case NFS3ERR_NOTEMPTY:
return (ENOTEMPTY);
case NFS3ERR_DQUOT:
return (EDQUOT);
case NFS3ERR_STALE:
case NFS3ERR_BADHANDLE:
return (ESTALE);
case NFS3ERR_NOTSUPP:
return (EOPNOTSUPP);
case NFS3ERR_REMOTE:
return (EREMOTE);
case NFS3ERR_NOT_SYNC:
case NFS3ERR_TOOSMALL:
case NFS3ERR_BADTYPE:
return (EINVAL);
case NFS3ERR_BAD_COOKIE:
return (ENOENT);
case NFS3ERR_SERVERFAULT:
return (EIO);
case NFS3ERR_JUKEBOX:
return (ENXIO);
default:
return ((int)status);
}
#endif
}
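/*
 * Reference-counted readdir cache entries. rddir_cache_alloc()
 * creates an entry with a reference count of one;
 * rddir_cache_hold() and rddir_cache_rele() manage additional
 * references, and the entry (along with its buffer) is freed when
 * the last reference is released.
 */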
rddir_cache *
rddir_cache_alloc(int flags)
{
rddir_cache *rc;
rc = kmem_alloc(sizeof (*rc), flags);
if (rc != NULL) {
rc->entries = NULL;
rc->flags = RDDIR;
cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
rc->count = 1;
#ifdef DEBUG
atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
#endif
}
return (rc);
}
static void
rddir_cache_free(rddir_cache *rc)
{
#ifdef DEBUG
atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
#endif
if (rc->entries != NULL) {
#ifdef DEBUG
rddir_cache_buf_free(rc->entries, rc->buflen);
#else
kmem_free(rc->entries, rc->buflen);
#endif
}
cv_destroy(&rc->cv);
mutex_destroy(&rc->lock);
kmem_free(rc, sizeof (*rc));
}
void
rddir_cache_hold(rddir_cache *rc)
{
mutex_enter(&rc->lock);
rc->count++;
mutex_exit(&rc->lock);
}
void
rddir_cache_rele(rddir_cache *rc)
{
mutex_enter(&rc->lock);
ASSERT(rc->count > 0);
if (--rc->count == 0) {
mutex_exit(&rc->lock);
rddir_cache_free(rc);
} else
mutex_exit(&rc->lock);
}
#ifdef DEBUG
char *
rddir_cache_buf_alloc(size_t size, int flags)
{
char *rc;
rc = kmem_alloc(size, flags);
if (rc != NULL)
atomic_add_64(&clstat_debug.dirents.value.ui64, size);
return (rc);
}
void
rddir_cache_buf_free(void *addr, size_t size)
{
atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
kmem_free(addr, size);
}
#endif
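/*
 * Free the cached data hanging off of an rnode: credentials, symlink
 * contents, cached ACLs, cached pathconf information, and the access
 * and readdir caches. Used by the kmem reclaim path and by failover
 * when a file handle is remapped. Returns non-zero if anything was
 * freed.
 */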
static int
nfs_free_data_reclaim(rnode_t *rp)
{
char *contents;
int size;
vsecattr_t *vsp;
nfs3_pathconf_info *info;
int freed;
cred_t *cred;
/*
* Free any held credentials and caches which
* may be associated with this rnode.
*/
mutex_enter(&rp->r_statelock);
cred = rp->r_cred;
rp->r_cred = NULL;
contents = rp->r_symlink.contents;
size = rp->r_symlink.size;
rp->r_symlink.contents = NULL;
vsp = rp->r_secattr;
rp->r_secattr = NULL;
info = rp->r_pathconf;
rp->r_pathconf = NULL;
mutex_exit(&rp->r_statelock);
if (cred != NULL)
crfree(cred);
/*
* Free the access cache entries.
*/
freed = nfs_access_purge_rp(rp);
if (!HAVE_RDDIR_CACHE(rp) &&
contents == NULL &&
vsp == NULL &&
info == NULL)
return (freed);
/*
* Free the readdir cache entries
*/
if (HAVE_RDDIR_CACHE(rp))
nfs_purge_rddir_cache(RTOV(rp));
/*
* Free the symbolic link cache.
*/
if (contents != NULL) {
kmem_free((void *)contents, size);
}
/*
* Free any cached ACL.
*/
if (vsp != NULL)
nfs_acl_free(vsp);
/*
* Free any cached pathconf information.
*/
if (info != NULL)
kmem_free(info, sizeof (*info));
return (1);
}
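/*
 * Like nfs_free_data_reclaim(), but for rnodes which may be actively
 * in use: it uses mutex_tryenter() so that the reclaim path never
 * blocks on r_statelock, and it leaves r_cred alone.
 */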
static int
nfs_active_data_reclaim(rnode_t *rp)
{
char *contents;
int size;
vsecattr_t *vsp;
nfs3_pathconf_info *info;
int freed;
/*
* Free any held credentials and caches which
* may be associated with this rnode.
*/
if (!mutex_tryenter(&rp->r_statelock))
return (0);
contents = rp->r_symlink.contents;
size = rp->r_symlink.size;
rp->r_symlink.contents = NULL;
vsp = rp->r_secattr;
rp->r_secattr = NULL;
info = rp->r_pathconf;
rp->r_pathconf = NULL;
mutex_exit(&rp->r_statelock);
/*
* Free the access cache entries.
*/
freed = nfs_access_purge_rp(rp);
if (!HAVE_RDDIR_CACHE(rp) &&
contents == NULL &&
vsp == NULL &&
info == NULL)
return (freed);
/*
* Free the readdir cache entries
*/
if (HAVE_RDDIR_CACHE(rp))
nfs_purge_rddir_cache(RTOV(rp));
/*
* Free the symbolic link cache.
*/
if (contents != NULL) {
kmem_free((void *)contents, size);
}
/*
* Free any cached ACL.
*/
if (vsp != NULL)
nfs_acl_free(vsp);
/*
* Free any cached pathconf information.
*/
if (info != NULL)
kmem_free(info, sizeof (*info));
return (1);
}
static int
nfs_free_reclaim(void)
{
int freed;
rnode_t *rp;
#ifdef DEBUG
clstat_debug.f_reclaim.value.ui64++;
#endif
freed = 0;
mutex_enter(&rpfreelist_lock);
rp = rpfreelist;
if (rp != NULL) {
do {
if (nfs_free_data_reclaim(rp))
freed = 1;
} while ((rp = rp->r_freef) != rpfreelist);
}
mutex_exit(&rpfreelist_lock);
return (freed);
}
static int
nfs_active_reclaim(void)
{
int freed;
int index;
rnode_t *rp;
#ifdef DEBUG
clstat_debug.a_reclaim.value.ui64++;
#endif
freed = 0;
for (index = 0; index < rtablesize; index++) {
rw_enter(&rtable[index].r_lock, RW_READER);
for (rp = rtable[index].r_hashf;
rp != (rnode_t *)(&rtable[index]);
rp = rp->r_hashf) {
if (nfs_active_data_reclaim(rp))
freed = 1;
}
rw_exit(&rtable[index].r_lock);
}
return (freed);
}
static int
nfs_rnode_reclaim(void)
{
int freed;
rnode_t *rp;
vnode_t *vp;
#ifdef DEBUG
clstat_debug.r_reclaim.value.ui64++;
#endif
freed = 0;
mutex_enter(&rpfreelist_lock);
while ((rp = rpfreelist) != NULL) {
rp_rmfree(rp);
mutex_exit(&rpfreelist_lock);
if (rp->r_flags & RHASHED) {
vp = RTOV(rp);
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&rp->r_hashq->r_lock);
mutex_enter(&rpfreelist_lock);
continue;
}
mutex_exit(&vp->v_lock);
rp_rmhash_locked(rp);
rw_exit(&rp->r_hashq->r_lock);
}
/*
* This call to rp_addfree will end up destroying the
* rnode, but in a safe way with the appropriate set
* of checks done.
*/
rp_addfree(rp, CRED());
mutex_enter(&rpfreelist_lock);
}
mutex_exit(&rpfreelist_lock);
return (freed);
}
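/*
 * Kmem cache reclaim callback for the rnode cache. First try to free
 * cached data from rnodes on the freelist, then from active rnodes,
 * and finally recycle entire rnodes off of the freelist.
 */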
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{
#ifdef DEBUG
clstat_debug.reclaim.value.ui64++;
#endif
if (nfs_free_reclaim())
return;
if (nfs_active_reclaim())
return;
(void) nfs_rnode_reclaim();
}
/*
* NFS client failover support
*
* Routines to copy filehandles
*/
void
nfscopyfh(caddr_t fhp, vnode_t *vp)
{
fhandle_t *dest = (fhandle_t *)fhp;
if (dest != NULL)
*dest = *VTOFH(vp);
}
void
nfs3copyfh(caddr_t fhp, vnode_t *vp)
{
nfs_fh3 *dest = (nfs_fh3 *)fhp;
if (dest != NULL)
*dest = *VTOFH3(vp);
}
/*
* NFS client failover support
*
* failover_safe() will test various conditions to ensure that
* failover is permitted for this vnode. It will be denied
* if:
* 1) the operation in progress does not support failover (NULL fi)
* 2) there are no available replicas (NULL mi_servers->sv_next)
* 3) any locks are outstanding on this file
*/
static int
failover_safe(failinfo_t *fi)
{
/*
* Does this op permit failover?
*/
if (fi == NULL || fi->vp == NULL)
return (0);
/*
* Are there any alternates to failover to?
*/
if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
return (0);
/*
* Disable check; we've forced local locking
*
* if (flk_has_remote_locks(fi->vp))
* return (0);
*/
/*
* If we have no partial path, we can't do anything
*/
if (VTOR(fi->vp)->r_path == NULL)
return (0);
return (1);
}
#include <sys/thread.h>
/*
* NFS client failover support
*
* failover_newserver() will start a search for a new server,
* preferably by starting an async thread to do the work. If
* someone is already doing this (recognizable by MI_BINDINPROG
* being set), it will simply return and the calling thread
* will queue on the mi_failover_cv condition variable.
*/
static void
failover_newserver(mntinfo_t *mi)
{
/*
* Check if someone else is doing this already
*/
mutex_enter(&mi->mi_lock);
if (mi->mi_flags & MI_BINDINPROG) {
mutex_exit(&mi->mi_lock);
return;
}
mi->mi_flags |= MI_BINDINPROG;
/*
* Need to hold the vfs struct so that it can't be released
* while the failover thread is selecting a new server.
*/
VFS_HOLD(mi->mi_vfsp);
/*
* Start a thread to do the real searching.
*/
(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
mutex_exit(&mi->mi_lock);
}
/*
* NFS client failover support
*
* failover_thread() will find a new server to replace the one
* currently in use, wake up other threads waiting on this mount
* point, and die. It will start at the head of the server list
* and poll servers until it finds one with an NFS server which is
* registered and responds to a NULL procedure ping.
*
* XXX failover_thread is unsafe within the scope of the
* present model defined for cpr to suspend the system.
* Specifically, over-the-wire calls made by the thread
* are unsafe. The thread needs to be reevaluated in case of
* future updates to the cpr suspend model.
*/
static void
failover_thread(mntinfo_t *mi)
{
servinfo_t *svp = NULL;
CLIENT *cl;
enum clnt_stat status;
struct timeval tv;
int error;
int oncethru = 0;
callb_cpr_t cprinfo;
rnode_t *rp;
int index;
char *srvnames;
size_t srvnames_len;
struct nfs_clnt *nfscl = NULL;
zoneid_t zoneid = getzoneid();
#ifdef DEBUG
/*
* This is currently only needed to access counters which exist on
* DEBUG kernels, hence we don't want to pay the penalty of the lookup
* on non-DEBUG kernels.
*/
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
#endif
/*
* It's safe to piggyback on the mi_lock since the failover_newserver()
* code guarantees that there will be only one failover thread
* per mntinfo at any given time.
*/
CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
"failover_thread");
mutex_enter(&mi->mi_lock);
while (mi->mi_readers) {
CALLB_CPR_SAFE_BEGIN(&cprinfo);
cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
}
mutex_exit(&mi->mi_lock);
tv.tv_sec = 2;
tv.tv_usec = 0;
/*
* Ping the null NFS procedure of every server in
* the list until one responds. We always start
* at the head of the list and always skip the one
* that is current, since it's caused us a problem.
*/
while (svp == NULL) {
for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
if (!oncethru && svp == mi->mi_curr_serv)
continue;
/*
* If the file system was forcibly umounted
* while trying to do a failover, then just
* give up on the failover. It won't matter
* what the server is.
*/
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
svp = NULL;
goto done;
}
error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
if (error)
continue;
if (!(mi->mi_flags & MI_INT))
cl->cl_nosignal = TRUE;
status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
xdr_void, NULL, tv);
if (!(mi->mi_flags & MI_INT))
cl->cl_nosignal = FALSE;
AUTH_DESTROY(cl->cl_auth);
CLNT_DESTROY(cl);
if (status == RPC_SUCCESS) {
if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
zcmn_err(zoneid, CE_NOTE,
"NFS%d: failing over: selecting original server %s",
mi->mi_vers, svp->sv_hostname);
#else
zcmn_err(zoneid, CE_NOTE,
"NFS: failing over: selecting original server %s",
svp->sv_hostname);
#endif
} else {
#ifdef DEBUG
zcmn_err(zoneid, CE_NOTE,
"NFS%d: failing over from %s to %s",
mi->mi_vers,
mi->mi_curr_serv->sv_hostname,
svp->sv_hostname);
#else
zcmn_err(zoneid, CE_NOTE,
"NFS: failing over from %s to %s",
mi->mi_curr_serv->sv_hostname,
svp->sv_hostname);
#endif
}
break;
}
}
if (svp == NULL) {
if (!oncethru) {
srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
zprintf(zoneid,
"NFS%d servers %s not responding "
"still trying\n", mi->mi_vers, srvnames);
#else
zprintf(zoneid, "NFS servers %s not responding "
"still trying\n", srvnames);
#endif
oncethru = 1;
}
mutex_enter(&mi->mi_lock);
CALLB_CPR_SAFE_BEGIN(&cprinfo);
mutex_exit(&mi->mi_lock);
delay(hz);
mutex_enter(&mi->mi_lock);
CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
mutex_exit(&mi->mi_lock);
}
}
if (oncethru) {
#ifdef DEBUG
zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
}
if (svp != mi->mi_curr_serv) {
(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
rw_enter(&rtable[index].r_lock, RW_WRITER);
rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
mi->mi_vfsp);
if (rp != NULL) {
if (rp->r_flags & RHASHED)
rp_rmhash_locked(rp);
rw_exit(&rtable[index].r_lock);
rp->r_server = svp;
rp->r_fh = svp->sv_fhandle;
(void) nfs_free_data_reclaim(rp);
index = rtablehash(&rp->r_fh);
rp->r_hashq = &rtable[index];
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
vn_exists(RTOV(rp));
rp_addhash(rp);
rw_exit(&rp->r_hashq->r_lock);
VN_RELE(RTOV(rp));
} else
rw_exit(&rtable[index].r_lock);
}
done:
if (oncethru)
kmem_free(srvnames, srvnames_len);
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~MI_BINDINPROG;
if (svp != NULL) {
mi->mi_curr_serv = svp;
mi->mi_failover++;
#ifdef DEBUG
nfscl->nfscl_stat.failover.value.ui64++;
#endif
}
cv_broadcast(&mi->mi_failover_cv);
CALLB_CPR_EXIT(&cprinfo);
VFS_RELE(mi->mi_vfsp);
zthread_exit();
/* NOTREACHED */
}
/*
* NFS client failover support
*
* failover_wait() will put the thread to sleep until MI_BINDINPROG
* is cleared, meaning that failover is complete. Called with
* mi_lock mutex held.
*/
static int
failover_wait(mntinfo_t *mi)
{
k_sigset_t smask;
/*
* If someone else is hunting for a living server,
* sleep until it's done. After our sleep, we may
* be bound to the right server and get off cheaply.
*/
while (mi->mi_flags & MI_BINDINPROG) {
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM. (Preserving the existing masks).
* Mask out SIGINT if mount option nointr is specified.
*/
sigintr(&smask, (int)mi->mi_flags & MI_INT);
if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
/*
* restore original signal mask
*/
sigunintr(&smask);
return (EINTR);
}
/*
* restore original signal mask
*/
sigunintr(&smask);
}
return (0);
}
/*
* NFS client failover support
*
* failover_remap() will do a partial pathname lookup and find the
* desired vnode on the current server. The interim vnode will be
* discarded after we pilfer the new filehandle.
*
* Side effects:
* - This routine will also update the filehandle in the args structure
* pointed to by the fi->fhp pointer if it is non-NULL.
*/
static int
failover_remap(failinfo_t *fi)
{
vnode_t *vp, *nvp, *rootvp;
rnode_t *rp, *nrp;
mntinfo_t *mi;
int error;
int index;
#ifdef DEBUG
struct nfs_clnt *nfscl;
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
#endif
/*
* Sanity check
*/
if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
return (EINVAL);
vp = fi->vp;
rp = VTOR(vp);
mi = VTOMI(vp);
if (!(vp->v_flag & VROOT)) {
/*
* Given the root fh, use the path stored in
* the rnode to find the fh for the new server.
*/
error = VFS_ROOT(mi->mi_vfsp, &rootvp);
if (error)
return (error);
error = failover_lookup(rp->r_path, rootvp,
fi->lookupproc, fi->xattrdirproc, &nvp);
VN_RELE(rootvp);
if (error)
return (error);
/*
* If we found the same rnode, we're done now
*/
if (nvp == vp) {
/*
* Failover found that the new server may physically be
* the same machine, or may share the same disk subsystem.
* In this case the file handle for a given file path does
* not change, so the lookup will always locate the same
* rnode as the existing one. All we might need to do is
* update r_server with the current servinfo.
*/
if (!VALID_FH(fi)) {
rp->r_server = mi->mi_curr_serv;
}
VN_RELE(nvp);
return (0);
}
/*
* Try to make it so that no one else will find this
* vnode because it is just a temporary vnode used to hold
* the new file handle until that file handle can be
* copied to the original vnode/rnode.
*/
nrp = VTOR(nvp);
mutex_enter(&mi->mi_remap_lock);
/*
* Some other thread could have raced in here and could
* have done the remap for this particular rnode before
* this thread here. Check for rp->r_server and
* mi->mi_curr_serv and return if they are same.
*/
if (VALID_FH(fi)) {
mutex_exit(&mi->mi_remap_lock);
VN_RELE(nvp);
return (0);
}
if (nrp->r_flags & RHASHED)
rp_rmhash(nrp);
/*
* As a heuristic check on the validity of the new
* file, check that the size and type match against
* what we remember from the old version.
*/
if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
mutex_exit(&mi->mi_remap_lock);
zcmn_err(mi->mi_zone->zone_id, CE_WARN,
"NFS replicas %s and %s: file %s not same.",
rp->r_server->sv_hostname,
nrp->r_server->sv_hostname, rp->r_path);
VN_RELE(nvp);
return (EINVAL);
}
/*
* Snarf the file handle from the new rnode, then
* release it, updating the hash queue membership
* of the original rnode along the way.
*/
if (rp->r_flags & RHASHED)
rp_rmhash(rp);
rp->r_server = mi->mi_curr_serv;
rp->r_fh = nrp->r_fh;
rp->r_hashq = nrp->r_hashq;
/*
* Copy the attributes from the new rnode to the old
* rnode. This will help to reduce unnecessary page
* cache flushes.
*/
rp->r_attr = nrp->r_attr;
rp->r_attrtime = nrp->r_attrtime;
rp->r_mtime = nrp->r_mtime;
(void) nfs_free_data_reclaim(rp);
nfs_setswaplike(vp, &rp->r_attr);
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
rp_addhash(rp);
rw_exit(&rp->r_hashq->r_lock);
mutex_exit(&mi->mi_remap_lock);
VN_RELE(nvp);
}
/*
* Update successful failover remap count
*/
mutex_enter(&mi->mi_lock);
mi->mi_remap++;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
nfscl->nfscl_stat.remap.value.ui64++;
#endif
/*
* If we have a copied filehandle to update, do it now.
*/
if (fi->fhp != NULL && fi->copyproc != NULL)
(*fi->copyproc)(fi->fhp, vp);
return (0);
}
/*
* NFS client failover support
*
* We want a simple pathname lookup routine to parse the pieces
* of path in rp->r_path. We know that the path was created
* as rnodes were made, so we know we have only to deal with
* paths that look like:
* dir1/dir2/dir3/file
* Any evidence of anything like .., symlinks, and ENOTDIR
* are hard errors, because they mean something in this filesystem
* is different from the one we came from, or has changed under
* us in some way. If this is true, we want the failure.
*
* Extended attributes: if the filesystem is mounted with extended
* attributes enabled (-o xattr), the attribute directory will be
* represented in the r_path as the magic name XATTR_RPATH. So if
* we see that name in the pathname, it must be because this node
* is an extended attribute. Therefore, look it up that way.
*/
static int
failover_lookup(char *path, vnode_t *root,
int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
vnode_t *, cred_t *, int),
int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
vnode_t **new)
{
vnode_t *dvp, *nvp;
int error = EINVAL;
char *s, *p, *tmppath;
size_t len;
mntinfo_t *mi;
bool_t xattr;
/* Make local copy of path */
len = strlen(path) + 1;
tmppath = kmem_alloc(len, KM_SLEEP);
(void) strcpy(tmppath, path);
s = tmppath;
dvp = root;
VN_HOLD(dvp);
mi = VTOMI(root);
xattr = mi->mi_flags & MI_EXTATTR;
do {
p = strchr(s, '/');
if (p != NULL)
*p = '\0';
if (xattr && strcmp(s, XATTR_RPATH) == 0) {
error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
RFSCALL_SOFT);
} else {
error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
CRED(), RFSCALL_SOFT);
}
if (p != NULL)
*p++ = '/';
if (error) {
VN_RELE(dvp);
kmem_free(tmppath, len);
return (error);
}
s = p;
VN_RELE(dvp);
dvp = nvp;
} while (p != NULL);
if (nvp != NULL && new != NULL)
*new = nvp;
kmem_free(tmppath, len);
return (0);
}
/*
* NFS client failover support
*
* sv_free() frees the dynamically allocated portions of a "servinfo_t" list.
*/
void
sv_free(servinfo_t *svp)
{
servinfo_t *next;
struct knetconfig *knconf;
while (svp != NULL) {
next = svp->sv_next;
if (svp->sv_secdata)
sec_clnt_freeinfo(svp->sv_secdata);
if (svp->sv_hostname && svp->sv_hostnamelen > 0)
kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
knconf = svp->sv_knconf;
if (knconf != NULL) {
if (knconf->knc_protofmly != NULL)
kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
if (knconf->knc_proto != NULL)
kmem_free(knconf->knc_proto, KNC_STRSIZE);
kmem_free(knconf, sizeof (*knconf));
}
knconf = svp->sv_origknconf;
if (knconf != NULL) {
if (knconf->knc_protofmly != NULL)
kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
if (knconf->knc_proto != NULL)
kmem_free(knconf->knc_proto, KNC_STRSIZE);
kmem_free(knconf, sizeof (*knconf));
}
if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
mutex_destroy(&svp->sv_lock);
kmem_free(svp, sizeof (*svp));
svp = next;
}
}
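/*
 * nfs_rwlock_t primitives.
 *
 * The count field tracks the lock state: a positive count is the
 * number of active readers, a negative count is the recursion depth
 * of the thread which holds the lock for writing (recorded in the
 * owner field), and the waiters field counts threads waiting to
 * acquire the lock for writing. A thread which already owns the
 * lock for writing may re-enter it for either reading or writing.
 *
 * A typical caller pattern looks something like the sketch below.
 * This is illustrative only; the particular lock (r_rwlock) and the
 * interruptibility flag are assumptions, not taken from this file:
 *
 *	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, intr))
 *		return (EINTR);
 *	... access the protected state ...
 *	nfs_rw_exit(&rp->r_rwlock);
 */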
/*
* Only can return non-zero if intr != 0.
*/
int
nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
{
mutex_enter(&l->lock);
/*
* If this is a nested enter, then allow it. There
* must be as many exits as there were enters, though.
*/
if (l->owner == curthread) {
/* lock is held for writing by current thread */
ASSERT(rw == RW_READER || rw == RW_WRITER);
l->count--;
} else if (rw == RW_READER) {
/*
* While there is a writer active or writers waiting,
* then wait for them to finish up and move on. Then,
* increment the count to indicate that a reader is
* active.
*/
while (l->count < 0 || l->waiters > 0) {
if (intr) {
klwp_t *lwp = ttolwp(curthread);
if (lwp != NULL)
lwp->lwp_nostop++;
if (!cv_wait_sig(&l->cv, &l->lock)) {
if (lwp != NULL)
lwp->lwp_nostop--;
mutex_exit(&l->lock);
return (EINTR);
}
if (lwp != NULL)
lwp->lwp_nostop--;
} else
cv_wait(&l->cv, &l->lock);
}
ASSERT(l->count < INT_MAX);
#ifdef DEBUG
if ((l->count % 10000) == 9999)
cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
"rwlock @ %p\n", l->count, (void *)l);
#endif
l->count++;
} else {
ASSERT(rw == RW_WRITER);
/*
* While there are readers active or a writer
* active, then wait for all of the readers
* to finish or for the writer to finish.
* Then, set the owner field to curthread and
* decrement count to indicate that a writer
* is active.
*/
while (l->count > 0 || l->owner != NULL) {
l->waiters++;
if (intr) {
klwp_t *lwp = ttolwp(curthread);
if (lwp != NULL)
lwp->lwp_nostop++;
if (!cv_wait_sig(&l->cv, &l->lock)) {
if (lwp != NULL)
lwp->lwp_nostop--;
l->waiters--;
cv_broadcast(&l->cv);
mutex_exit(&l->lock);
return (EINTR);
}
if (lwp != NULL)
lwp->lwp_nostop--;
} else
cv_wait(&l->cv, &l->lock);
l->waiters--;
}
l->owner = curthread;
l->count--;
}
mutex_exit(&l->lock);
return (0);
}
/*
* If the lock is available, obtain it and return non-zero. If there is
* already a conflicting lock, return 0 immediately.
*/
int
nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
{
mutex_enter(&l->lock);
/*
* If this is a nested enter, then allow it. There
* must be as many exits as there were enters, though.
*/
if (l->owner == curthread) {
/* lock is held for writing by current thread */
ASSERT(rw == RW_READER || rw == RW_WRITER);
l->count--;
} else if (rw == RW_READER) {
/*
* If there is a writer active or writers waiting, deny the
* lock. Otherwise, bump the count of readers.
*/
if (l->count < 0 || l->waiters > 0) {
mutex_exit(&l->lock);
return (0);
}
l->count++;
} else {
ASSERT(rw == RW_WRITER);
/*
* If there are readers active or a writer active, deny the
* lock. Otherwise, set the owner field to curthread and
* decrement count to indicate that a writer is active.
*/
if (l->count > 0 || l->owner != NULL) {
mutex_exit(&l->lock);
return (0);
}
l->owner = curthread;
l->count--;
}
mutex_exit(&l->lock);
return (1);
}
void
nfs_rw_exit(nfs_rwlock_t *l)
{
mutex_enter(&l->lock);
/*
* If this is releasing a writer lock, then increment count to
* indicate that there is one less writer active. If this was
* the last of possibly nested writer locks, then clear the owner
* field as well to indicate that there is no writer active
* and wakeup any possible waiting writers or readers.
*
* If releasing a reader lock, then just decrement count to
* indicate that there is one less reader active. If this was
* the last active reader and there are writer(s) waiting,
* then wake up the first.
*/
if (l->owner != NULL) {
ASSERT(l->owner == curthread);
l->count++;
if (l->count == 0) {
l->owner = NULL;
cv_broadcast(&l->cv);
}
} else {
ASSERT(l->count > 0);
l->count--;
if (l->count == 0 && l->waiters > 0)
cv_broadcast(&l->cv);
}
mutex_exit(&l->lock);
}
int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{
if (rw == RW_READER)
return (l->count > 0);
ASSERT(rw == RW_WRITER);
return (l->count < 0);
}
/* ARGSUSED */
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{
l->count = 0;
l->waiters = 0;
l->owner = NULL;
mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
}
void
nfs_rw_destroy(nfs_rwlock_t *l)
{
mutex_destroy(&l->lock);
cv_destroy(&l->cv);
}
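/*
 * AVL comparison routines for the readdir cache, ordering entries by
 * directory cookie and then by buffer length when the cookies match
 * (NFS Version 3 and Version 2 variants, respectively).
 */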
int
nfs3_rddir_compar(const void *x, const void *y)
{
rddir_cache *a = (rddir_cache *)x;
rddir_cache *b = (rddir_cache *)y;
if (a->nfs3_cookie == b->nfs3_cookie) {
if (a->buflen == b->buflen)
return (0);
if (a->buflen < b->buflen)
return (-1);
return (1);
}
if (a->nfs3_cookie < b->nfs3_cookie)
return (-1);
return (1);
}
int
nfs_rddir_compar(const void *x, const void *y)
{
rddir_cache *a = (rddir_cache *)x;
rddir_cache *b = (rddir_cache *)y;
if (a->nfs_cookie == b->nfs_cookie) {
if (a->buflen == b->buflen)
return (0);
if (a->buflen < b->buflen)
return (-1);
return (1);
}
if (a->nfs_cookie < b->nfs_cookie)
return (-1);
return (1);
}
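/*
 * Build a single comma-separated string naming every server for this
 * mount, for use in console messages. The length of the allocation
 * is returned through *len; the caller is responsible for freeing
 * the string.
 */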
static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
servinfo_t *s;
char *srvnames;
char *namep;
size_t length;
/*
* Calculate the length of the string required to hold all
* of the server names plus either a comma or a null
* character following each individual one.
*/
length = 0;
for (s = mi->mi_servers; s != NULL; s = s->sv_next)
length += s->sv_hostnamelen;
srvnames = kmem_alloc(length, KM_SLEEP);
namep = srvnames;
for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
(void) strcpy(namep, s->sv_hostname);
namep += s->sv_hostnamelen - 1;
*namep++ = ',';
}
*--namep = '\0';
*len = length;
return (srvnames);
}
/*
* These two functions are temporary and designed for the upgrade-workaround
* only. They cannot be used for general zone-crossing NFS client support, and
* will be removed shortly.
*
* When the workaround is enabled, all NFS traffic is forced into the global
* zone. These functions are called when the code needs to refer to the state
* of the underlying network connection. They're not called when the function
* needs to refer to the state of the process that invoked the system call.
* (E.g., when checking whether the zone is shutting down during the mount()
* call.)
*/
struct zone *
nfs_zone(void)
{
return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
}
zoneid_t
nfs_zoneid(void)
{
return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
}