nfs4_rnode.c revision 2937862b1ec2370b30761b9dd687c49b4d43e60f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All Rights Reserved
*/
#include <sys/pathname.h>
#include <rpc/rpcsec_gss.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs4_clnt.h>
/*
 * The hash queues for the access to active and cached rnodes are
 * organized as doubly linked lists.  A reader/writer lock for each
 * hash bucket is used to control access and to synchronize lookups,
 * additions, and deletions from the hash queue.
*
* The rnode freelist is organized as a doubly linked list with
* a head pointer. Additions and deletions are synchronized via
* a single mutex.
*
* In order to add an rnode to the free list, it must be hashed into
* a hash queue and the exclusive lock to the hash queue be held.
* If an rnode is not hashed into a hash queue, then it is destroyed
* because it represents no valuable information that can be reused
* about the file. The exclusive lock to the hash queue must be
* held in order to prevent a lookup in the hash queue from finding
* the rnode and using it and assuming that the rnode is not on the
* freelist. The lookup in the hash queue will have the hash queue
* locked, either exclusive or shared.
*
* The vnode reference count for each rnode is not allowed to drop
* below 1. This prevents external entities, such as the VM
* subsystem, from acquiring references to vnodes already on the
* freelist and then trying to place them back on the freelist
 * when their reference is released. This means that when an
* rnode is looked up in the hash queues, then either the rnode
* is removed from the freelist and that reference is transferred to
* the new reference or the vnode reference count must be incremented
* accordingly. The mutex for the freelist must be held in order to
* accurately test to see if the rnode is on the freelist or not.
* The hash queue lock might be held shared and it is possible that
* two different threads may race to remove the rnode from the
* freelist. This race can be resolved by holding the mutex for the
* freelist. Please note that the mutex for the freelist does not
* need to be held if the rnode is not on the freelist. It can not be
* placed on the freelist due to the requirement that the thread
* putting the rnode on the freelist must hold the exclusive lock
* to the hash queue and the thread doing the lookup in the hash
* queue is holding either a shared or exclusive lock to the hash
* queue.
*
* The lock ordering is:
*
* hash bucket lock -> vnode lock
* hash bucket lock -> freelist lock -> r_statelock
*/
static kmutex_t rp4freelist_lock;
static long rnode4_new = 0;
int rtable4size;
static int rtable4mask;
static struct kmem_cache *rnode4_cache;
static int rnode4_hashlen = 4;
struct vnodeops *,
cred_t *),
int *, cred_t *);
static void rp4_rmfree(rnode4_t *);
int nfs4_free_data_reclaim(rnode4_t *);
static int nfs4_active_data_reclaim(rnode4_t *);
static int nfs4_free_reclaim(void);
static int nfs4_active_reclaim(void);
static int nfs4_rnode_reclaim(void);
static void nfs4_reclaim(void *);
static void uninit_rnode4(rnode4_t *);
static void destroy_rnode4(rnode4_t *);
#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
#endif
/*
* If the vnode has pages, run the list and check for any that are
* still dangling. We call this routine before putting an rnode on
* the free list.
*/
static int
{
do {
return (1);
}
}
return (0);
}
/*
* Flush any pages left on this rnode.
*/
static void
{
int error;
/*
* Before freeing anything, wait until all asynchronous
* activity is done on this rnode. This will allow all
* asynchronous read ahead and write behind i/o's to
* finish.
*/
/*
* Flush and invalidate all pages associated with the vnode.
*/
if (nfs4_has_pages(vp)) {
}
}
}
}
/*
* Free the resources associated with an rnode.
*/
static void
{
char *contents;
int size;
/*
* Free any held caches which may be
* associated with this rnode.
*/
/*
* Free the access cache entries.
*/
(void) nfs4_access_purge_rp(rp);
/*
* Free the readdir cache entries.
*/
/*
* Free the symbolic link cache.
*/
}
/*
* Free any cached ACL.
*/
/*
* Release the cached xattr_dir
*/
}
/*
* We have seen a case that the fh passed in is for "." which
* should be a VROOT node, however, the fh is different from the
* root fh stored in the mntinfo4_t. The invalid fh might be
* from a misbehaved server and will panic the client system at
* a later time. To avoid the panic, we drop the bad fh, use
* the root fh from mntinfo4_t, and print an error message
* for attention.
*/
int *wasbad)
{
char *s;
*wasbad = 0;
#ifdef DEBUG
"Server %s returns a different "
"root filehandle for the path %s:",
/* print the bad fh */
/* print mi_rootfh */
#endif
/* use mi_rootfh instead; fh will be rele by the caller */
*wasbad = 1;
}
kmem_free(s, MAXNAMELEN);
return (fh);
}
void
{
int is_stub;
/*
* Don't add to attrcache if time overflow, but
* no need to check because either attr is null or the time
* values in it were processed by nfs4_time_ntov(), which checks
* for time overflows.
*/
if (attr) {
if (!newnode) {
#ifdef DEBUG
"makenfs4node: type (%d) doesn't "
"match type of found node at %p (%d)",
}
#endif
} else {
/*
* Turn this object into a "stub" object if we
* crossed an underlying server fs boundary.
* To make this check, during mount we save the
* fsid of the server object being mounted.
* Here we compare this object's server fsid
* with the fsid we saved at mount. If they
* are different, we crossed server fs boundary.
*
* The stub type is set (or not) at rnode
* creation time and it never changes for life
* of the rnode.
*
* The stub type is also set during RO failover,
* nfs4_remap_file().
*
* This stub will be for a mirror-mount.
*
* We don't bother with taking r_state_lock to
* set the stub type because this is a new rnode
* and we're holding the hash bucket r_lock RW_WRITER.
* No other thread could have obtained access
* to this rnode.
*/
is_stub = 0;
if (garp->n4g_fsid_valid) {
RW_READER, 0);
is_stub = 1;
}
if (is_stub)
else
/* Can not cache partial attr */
else
}
} else {
if (newnode) {
}
}
}
/*
* Find or create an rnode based primarily on filehandle. To be
* used when dvp (vnode for parent directory) is not available;
* otherwise, makenfs4node() should be used.
*
* The nfs4_fname_t argument *npp is consumed and nulled out.
*/
vnode_t *
{
int newnode = 0;
int index;
if (newnode) {
} else {
/*
* It is possible that due to a server
* side rename fnames have changed.
* update the fname here.
*/
} else {
}
}
return (vp);
}
/*
* Find or create a vnode for the given filehandle, filesystem, parent, and
* name. The reference to nm is consumed, so the caller must first do an
* fn_hold() if it wants to continue using nm after this call.
*/
vnode_t *
{
int newnode;
int index;
int had_badfh = 0;
/*
* Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
*/
}
/* if getting a bad file handle, do not cache the attributes. */
if (had_badfh) {
return (vp);
}
return (vp);
}
/*
* Hash on address of filehandle object.
* XXX totally untuned.
*/
int
{
}
/*
* Find or create the vnode for the given filehandle and filesystem.
* *newnode is set to zero if the vnode already existed; non-zero if it had
* to be created.
*
* Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
*/
static vnode_t *
{
mntinfo4_t *mi;
*newnode = 0;
return (vp);
}
rp = rp4freelist;
rp4_rmfree(rp);
goto start;
}
}
goto start;
}
vn_invalid(vp);
/*
* destroy old locks before bzero'ing and
* recreating the locks below.
*/
/*
* Make sure that if rnode is recycled then
* VFS count is decremented properly before
* reuse.
*/
} else {
#ifdef DEBUG
#endif
}
rp->created_v4 = 0;
/*
* There is a race condition if someone else
* alloc's the rnode while no locks are held, so we
* check again and recover if found.
*/
*newnode = 0;
return (vp);
}
*newnode = 1;
return (vp);
}
static void
{
}
/*
* Destroy the rddir cache first since we need to grab the r_statelock.
*/
}
/*
* Put an rnode on the free list.
*
* Rnodes which were allocated above and beyond the normal limit
* are immediately freed.
*/
void
{
/*
* If we have too many rnodes allocated and there are no
* references to this rnode, or if the rnode is no longer
 * accessible because it does not reside in the hash queues,
* or if an i/o error occurred while writing to the file,
* then just free it instead of putting it on the rnode
* freelist.
*/
#ifdef DEBUG
(nfs4_rnode_nofreelist != 0) ||
#endif
return;
}
}
/*
* Make sure we don't have a delegation on this rnode
* before destroying it.
*/
(void) nfs4delegreturn(rp,
}
/*
* Recheck the vnode reference count. We need to
* make sure that another reference has not been
* acquired while we were not holding v_lock. The
* rnode is not in the rnode hash queues; one
* way for a reference to have been acquired
* is for a VOP_PUTPAGE because the rnode was marked
* with R4DIRTY or for a modified page. This
* reference may have been acquired before our call
* to r4inactive. The i/o may have been completed,
* thus allowing r4inactive to complete, but the
* reference to the vnode may not have been released
* yet. In any case, the rnode can not be destroyed
* until the other references to this vnode have been
* released. The other references will take care of
* either destroying the rnode or placing it on the
* rnode freelist. If there are no other references,
* then the rnode may be safely destroyed.
*/
return;
}
return;
}
/*
* Lock the hash queue and then recheck the reference count
* to ensure that no other threads have acquired a reference
* to indicate that the rnode should not be placed on the
* freelist. If another reference has been acquired, then
* just release this one and let the other thread complete
* the processing of adding this rnode to the freelist.
*/
return;
}
/*
* Make sure we don't put an rnode with a delegation
* on the free list.
*/
(void) nfs4delegreturn(rp,
goto again;
}
/*
* Now that we have the hash queue lock, and we know there
 * are no more references on the vnode, check to make
* sure there aren't any open streams still on the rnode.
* If so, drop the hash queue lock, remove the open streams,
* and recheck the v_count.
*/
else
goto again;
}
/*
* Before we put it on the freelist, make sure there are no pages.
* If there are, flush and commit of all of the dirty and
* uncommitted pages, assuming the file system isn't read only.
*/
goto again;
}
/*
* Before we put it on the freelist, make sure there is no
* active xattr directory cached, the freelist will not
* have its entries r4inactive'd if there is still an active
* rnode, thus nothing in the freelist can hold another
* rnode active.
*/
/*
* If there is no cached data or metadata for this file, then
* put the rnode on the front of the freelist so that it will
* be reused before other rnodes which may have cached data or
* metadata associated with them.
*/
if (rp4freelist == NULL) {
rp4freelist = rp;
} else {
rp4freelist = rp;
}
if (xattr)
}
/*
* Remove an rnode from the free list.
*
* The caller must be holding rp4freelist_lock and the rnode
* must be on the freelist.
*/
static void
{
if (rp == rp4freelist) {
if (rp == rp4freelist)
rp4freelist = NULL;
}
}
/*
* Put a rnode in the hash table.
*
* The caller must be holding the exclusive hash queue lock
*/
void
{
#ifdef DEBUG
#endif
}
/*
* Remove a rnode from the hash table.
*
* The caller must be holding the hash queue lock.
*/
void
{
}
/*
* Remove a rnode from the hash table.
*
* The caller must not be holding the hash queue lock.
*/
void
{
}
/*
* Lookup a rnode by fhandle. Ignores rnodes that had failed recovery.
* Returns NULL if no match. If an rnode is returned, the reference count
* on the master vnode is incremented.
*
* The caller must be holding the hash queue lock, either shared or exclusive.
*/
rnode4_t *
{
continue;
}
#ifdef DEBUG
#endif
/*
* If the rnode is on the freelist,
* then remove it and use that reference
* as the new reference. Otherwise,
* need to increment the reference count.
*/
rp4_rmfree(rp);
} else {
}
} else
/*
* if root vnode, set v_flag to indicate that
*/
}
}
return (rp);
}
}
return (NULL);
}
/*
* Lookup an rnode by fhandle. Just a wrapper for r4find()
* that assumes the caller hasn't already got the lock
* on the hash bucket.
*/
rnode4_t *
{
int index;
return (rp);
}
/*
 * Return >0 if there is an active vnode belonging to this vfs in the
* rtable4 cache.
*
* Several of these checks are done without holding the usual
* locks. This is safe because destroy_rtable(), rp_addfree(),
* etc. will redo the necessary checks before actually destroying
* any rnodes.
*/
int
{
int busy = NFSV4_RTABLE4_OK;
int index;
} else if (nfs4_has_pages(vp) &&
}
if (busy != NFSV4_RTABLE4_OK) {
#ifdef DEBUG
char *path;
#endif
return (busy);
}
}
}
}
return (busy);
}
/*
* Destroy inactive vnodes from the hash queues which
* belong to this vfs. All of the vnodes should be inactive.
* It is essential that we destroy all rnodes in case of
* forced unmount as well as in normal unmount case.
*/
void
{
int index;
/* save the hash pointer before destroying */
rp4_rmfree(rp);
} else
}
}
}
/*
* This call to rp4_addfree will end up destroying the
* rnode, but in a safe way with the appropriate set
* of checks done.
*/
}
}
/*
* This routine destroys all the resources of an rnode
* and finally the rnode itself.
*/
static void
{
#ifdef DEBUG
#endif
vn_invalid(vp);
}
/*
* Invalidate the attributes on all rnodes forcing the next getattr
* to go over the wire. Used to flush stale uid and gid mappings.
* Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
*/
void
{
int index;
/*
* Walk the hash queues looking for rnodes.
*/
continue;
continue;
/*
* Expire the attributes by resetting the change
* and attr timeout.
*/
}
}
}
/*
* Flush all vnodes in this (or every) vfs.
* Used by nfs_sync and by nfs_unmount.
*/
void
{
int index;
/*
* Check to see whether there is anything to do.
*/
num = rnode4_new;
if (num == 0)
return;
/*
* Allocate a slot for all currently active rnodes on the
* supposition that they all may need flushing.
*/
cnt = 0;
/*
* Walk the hash queues looking for rnodes with page
* lists associated with them. Make a list of these
* files.
*/
/*
* Don't bother sync'ing a vp if it
* is part of virtual swap device or
* if VFS is read-only
*/
continue;
/*
* If flushing all mounted file systems or
* the vnode belongs to this vfs, has pages
* and is marked as either dirty or mmap'd,
* hold and add this vnode to the list of
* vnodes to flush.
*/
nfs4_has_pages(vp) &&
goto toomany;
}
}
}
}
/*
* Flush and release all of the files on the list.
*/
while (cnt-- > 0) {
}
/*
* Free the space allocated to hold the list.
*/
}
int
{
char *contents;
int size;
int freed;
/*
* Free any held caches which may
* be associated with this rnode.
*/
/*
* Free the access cache entries.
*/
return (freed);
/*
* Free the readdir cache entries, incompletely if we can't block.
*/
/*
* Free the symbolic link cache.
*/
}
/*
* Free any cached ACL.
*/
/*
* Release the xattr directory vnode
*/
return (1);
}
static int
{
char *contents;
int size;
int freed;
/*
* Free any held credentials and caches which
* may be associated with this rnode.
*/
return (0);
/*
* To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
* on the same r_hashq queue. We are not mandated to free all caches.
* VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
* rnode 'rp' is freed or put on the free list.
*/
}
/*
* Free the access cache entries.
*/
return (freed);
/*
* Free the symbolic link cache.
*/
}
/*
* Free any cached ACL.
*/
/*
* Release the xattr directory vnode
*/
return (1);
}
static int
nfs4_free_reclaim(void)
{
int freed;
#ifdef DEBUG
#endif
freed = 0;
rp = rp4freelist;
do {
if (nfs4_free_data_reclaim(rp))
freed = 1;
}
return (freed);
}
static int
nfs4_active_reclaim(void)
{
int freed;
int index;
#ifdef DEBUG
#endif
freed = 0;
if (nfs4_active_data_reclaim(rp))
freed = 1;
}
}
return (freed);
}
static int
nfs4_rnode_reclaim(void)
{
int freed;
#ifdef DEBUG
#endif
freed = 0;
rp4_rmfree(rp);
continue;
}
}
/*
* This call to rp_addfree will end up destroying the
* rnode, but in a safe way with the appropriate set
* of checks done.
*/
}
return (freed);
}
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
#endif
if (nfs4_free_reclaim())
return;
if (nfs4_active_reclaim())
return;
(void) nfs4_rnode_reclaim();
}
/*
* Returns the clientid4 to use for the given mntinfo4. Note that the
* clientid can change if the caller drops mi_recovlock.
*/
{
/* this locks down sp if it is found */
}
return (clientid);
}
/*
* Return the current lease time for the server associated with the given
* file. Note that the lease time could change immediately after this
* call.
*/
{
/* this locks down sp if it is found */
}
return (1); /* 1 second */
}
return (lease_time);
}
/*
* Return a list with information about all the known open instances for
* a filesystem. The caller must call r4releopenlist() when done with the
* list.
*
* We are safe at looking at os_valid and os_pending_close across dropping
* the 'os_sync_lock' to count up the number of open streams and then
* allocate memory for the osp list due to:
* -Looking at os_pending_close is safe since this routine is
* only called via recovery, and os_pending_close can only be set via
* a non-recovery operation (which are all blocked when recovery
* is active).
*
* -Examining os_valid is safe since non-recovery operations, which
* could potentially switch os_valid to 0, are blocked (via
* nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
* (which means we are the only recovery thread potentially acting
* on this open stream).
*/
{
int numosp;
int index;
int hold_vnode;
reopenlist = NULL;
continue;
hold_vnode = 0;
/* Count the number of valid open_streams of the file */
numosp = 0;
numosp++;
}
/* Fill in the valid open streams per vp */
if (numosp > 0) {
int j;
hold_vnode = 1;
/*
* Add a new open instance to the list
*/
KM_SLEEP);
reopenlist = rep;
KM_SLEEP);
j = 0;
!osp->os_pending_close) {
osp->os_ref_count++;
j++;
}
}
/*
* Assuming valid osp(s) stays valid between
* the time obtaining j and numosp.
*/
}
/* do this here to keep v_lock > r_os_lock */
if (hold_vnode)
/*
* If this rnode holds a delegation,
* but if there are no valid open streams,
* then just discard the delegation
* without doing delegreturn.
*/
if (numosp > 0)
}
/* Save the delegation type for use outside the lock */
/*
* If we have a delegation then get rid of it.
* We've set rp->r_deleg_needs_recovery so we have
* enough information to recover.
*/
if (dtype != OPEN_DELEGATE_NONE) {
}
}
}
return (reopenlist);
}
/*
* Release the list of open instance references.
*/
void
{
int i;
}
}
int
nfs4_rnode_init(void)
{
int i;
/*
* Compute the size of the rnode4 hash table
*/
if (nrnode <= 0)
"setting nrnode to max value of %ld", nrnode4_max);
}
/*
* Allocate and initialize the hash buckets
*/
for (i = 0; i < rtable4size; i++) {
}
return (0);
}
int
nfs4_rnode_fini(void)
{
int i;
/*
* Deallocate the rnode hash queues
*/
for (i = 0; i < rtable4size; i++)
return (0);
}
/*
* Return non-zero if the given filehandle refers to the root filehandle
* for the given rnode.
*/
static int
{
int isroot;
isroot = 0;
isroot = 1;
return (isroot);
}
/*
* The r4_stub_* routines assume that the rnode is newly activated, and
* that the caller either holds the hash bucket r_lock for this rnode as
* RW_WRITER, or holds r_statelock.
*/
static void
{
/*
* Safely switch this vnode to the trigger vnodeops.
*
* Currently, we don't ever switch a trigger vnode back to using
* "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
* a new v4 object is not a trigger, and it will already have the
* correct v4 vnodeops by default. So, no "else" case required here.
*/
if (type != NFS4_STUB_NONE)
}
void
{
}
void
{
}
#ifdef DEBUG
/*
* Look in the rnode table for other rnodes that have the same filehandle.
* Assume the lock is held for the hash chain of checkrp
*/
static void
{
int index;
if (!r4_check_for_dups)
return;
continue;
continue;
}
}
}
}
#endif /* DEBUG */