lofs_subr.c revision 1a5e258f5471356ca102c7176637cdce45bac147
269473047d747f7815af570197e4ef7322d3632cEvan Yan * CDDL HEADER START
269473047d747f7815af570197e4ef7322d3632cEvan Yan * The contents of this file are subject to the terms of the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Common Development and Distribution License (the "License").
269473047d747f7815af570197e4ef7322d3632cEvan Yan * You may not use this file except in compliance with the License.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
269473047d747f7815af570197e4ef7322d3632cEvan Yan * See the License for the specific language governing permissions
269473047d747f7815af570197e4ef7322d3632cEvan Yan * and limitations under the License.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * When distributing Covered Code, include this CDDL HEADER in each
269473047d747f7815af570197e4ef7322d3632cEvan Yan * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * If applicable, add the following below this CDDL HEADER, with the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * fields enclosed by brackets "[]" replaced with your own identifying
269473047d747f7815af570197e4ef7322d3632cEvan Yan * information: Portions Copyright [yyyy] [name of copyright owner]
269473047d747f7815af570197e4ef7322d3632cEvan Yan * CDDL HEADER END
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Use is subject to license terms.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * The idea behind composition-based stacked filesystems is to add a
269473047d747f7815af570197e4ef7322d3632cEvan Yan * vnode to the stack of vnodes for each mount. These vnodes have their
269473047d747f7815af570197e4ef7322d3632cEvan Yan * own set of mount options and filesystem-specific functions, so they
269473047d747f7815af570197e4ef7322d3632cEvan Yan * can modify data or operations before they are passed along. Such a
269473047d747f7815af570197e4ef7322d3632cEvan Yan * filesystem must maintain a mapping from the underlying vnodes to its
269473047d747f7815af570197e4ef7322d3632cEvan Yan * interposing vnodes.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * In lofs, this mapping is implemented by a hashtable. Each bucket
269473047d747f7815af570197e4ef7322d3632cEvan Yan * contains a count of the number of nodes currently contained, the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * chain of vnodes, and a lock to protect the list of vnodes. The
269473047d747f7815af570197e4ef7322d3632cEvan Yan * hashtable dynamically grows if the number of vnodes in the table as a
269473047d747f7815af570197e4ef7322d3632cEvan Yan * whole exceeds the size of the table left-shifted by
269473047d747f7815af570197e4ef7322d3632cEvan Yan * lo_resize_threshold. In order to minimize lock contention, there is
269473047d747f7815af570197e4ef7322d3632cEvan Yan * no global lock protecting the hashtable, hence obtaining the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * per-bucket locks consists of a dance to make sure we've actually
269473047d747f7815af570197e4ef7322d3632cEvan Yan * locked the correct bucket. Acquiring a bucket lock doesn't involve
269473047d747f7815af570197e4ef7322d3632cEvan Yan * locking the hashtable itself, so we refrain from freeing old
269473047d747f7815af570197e4ef7322d3632cEvan Yan * hashtables, and store them in a linked list of retired hashtables;
269473047d747f7815af570197e4ef7322d3632cEvan Yan * the list is freed when the filesystem is unmounted.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Due to the hashing algorithm, the size of the hash table needs to be a
269473047d747f7815af570197e4ef7322d3632cEvan Yan * power of 2.
269473047d747f7815af570197e4ef7322d3632cEvan Yan#define ltablehash(vp, tblsz) ((((intptr_t)(vp))>>10) & ((tblsz)-1))
269473047d747f7815af570197e4ef7322d3632cEvan Yan * The following macros can only be safely used when the desired bucket
269473047d747f7815af570197e4ef7322d3632cEvan Yan * is already locked.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * The lock in the hashtable associated with the given vnode.
269473047d747f7815af570197e4ef7322d3632cEvan Yan (&(li)->li_hashtable[ltablehash((vp), (li)->li_htsize)].lh_lock)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * The bucket in the hashtable that the given vnode hashes to.
269473047d747f7815af570197e4ef7322d3632cEvan Yan ((li)->li_hashtable[ltablehash((vp), (li)->li_htsize)].lh_chain)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Number of elements currently in the bucket that the vnode hashes to.
269473047d747f7815af570197e4ef7322d3632cEvan Yan ((li)->li_hashtable[ltablehash((vp), (li)->li_htsize)].lh_count)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Grab/Drop the lock for the bucket this vnode hashes to.
269473047d747f7815af570197e4ef7322d3632cEvan Yan#define TABLE_LOCK_ENTER(vp, li) table_lock_enter(vp, li)
269473047d747f7815af570197e4ef7322d3632cEvan Yanstatic lnode_t *lfind(struct vnode *, struct loinfo *);
269473047d747f7815af570197e4ef7322d3632cEvan Yanstatic struct vfs *makelfsnode(struct vfs *, struct loinfo *);
269473047d747f7815af570197e4ef7322d3632cEvan Yanstatic struct lfsnode *lfsfind(struct vfs *, struct loinfo *);
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Since the hashtable itself isn't protected by a lock, obtaining a
269473047d747f7815af570197e4ef7322d3632cEvan Yan * per-bucket lock proceeds as follows:
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (a) li->li_htlock protects li->li_hashtable, li->li_htsize, and
269473047d747f7815af570197e4ef7322d3632cEvan Yan * li->li_retired.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (b) Per-bucket locks (lh_lock) protect the contents of the bucket.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (c) Locking order for resizing the hashtable is li_htlock then lh_lock.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * To grab the bucket lock we:
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (1) Stash away the htsize and the pointer to the hashtable to make
269473047d747f7815af570197e4ef7322d3632cEvan Yan * sure neither changes while we're using them.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (2) lgrow() updates the pointer to the hashtable before it updates
269473047d747f7815af570197e4ef7322d3632cEvan Yan * the size: the worst case scenario is that we have the wrong size (but
269473047d747f7815af570197e4ef7322d3632cEvan Yan * the correct table), so we hash to the wrong bucket, grab the wrong
269473047d747f7815af570197e4ef7322d3632cEvan Yan * lock, and then realize that things have changed, rewind and start
269473047d747f7815af570197e4ef7322d3632cEvan Yan * again. If both the size and the table changed since we loaded them,
269473047d747f7815af570197e4ef7322d3632cEvan Yan * we'll realize that too and restart.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (3) The protocol for growing the hashtable involves holding *all* the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * locks in the table, hence the unlocking code (TABLE_LOCK_EXIT())
269473047d747f7815af570197e4ef7322d3632cEvan Yan * doesn't need to do any dances, since neither the table nor the size
269473047d747f7815af570197e4ef7322d3632cEvan Yan * can change while any bucket lock is held.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (4) If the hashtable is growing (by thread t1) while another thread
269473047d747f7815af570197e4ef7322d3632cEvan Yan * (t2) is trying to grab a bucket lock, t2 might have a stale reference
269473047d747f7815af570197e4ef7322d3632cEvan Yan * to li->li_htsize:
269473047d747f7815af570197e4ef7322d3632cEvan Yan * - t1 grabs all locks in lgrow()
269473047d747f7815af570197e4ef7322d3632cEvan Yan * - t2 loads li->li_htsize and li->li_hashtable
269473047d747f7815af570197e4ef7322d3632cEvan Yan * - t1 changes li->li_hashtable
269473047d747f7815af570197e4ef7322d3632cEvan Yan * - t2 loads from an offset in the "stale" hashtable and tries to grab
269473047d747f7815af570197e4ef7322d3632cEvan Yan * the relevant mutex.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * If t1 had free'd the stale hashtable, t2 would be in trouble. Hence,
269473047d747f7815af570197e4ef7322d3632cEvan Yan * stale hashtables are not freed but stored in a list of "retired"
269473047d747f7815af570197e4ef7322d3632cEvan Yan * hashtables, which is emptied when the filesystem is unmounted.
269473047d747f7815af570197e4ef7322d3632cEvan Yan if (li->li_hashtable == chain && li->li_htsize == htsize)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Initialize the cache.
269473047d747f7815af570197e4ef7322d3632cEvan Yan lnode_cache = kmem_cache_create("lnode_cache", sizeof (lnode_t),
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Initialize a (struct loinfo), and initialize the hashtable to have
269473047d747f7815af570197e4ef7322d3632cEvan Yan * htsize buckets.
269473047d747f7815af570197e4ef7322d3632cEvan Yan li->li_hashtable = kmem_zalloc(htsize * sizeof (*li->li_hashtable),
269473047d747f7815af570197e4ef7322d3632cEvan Yan mutex_init(&li->li_lfslock, NULL, MUTEX_DEFAULT, NULL);
269473047d747f7815af570197e4ef7322d3632cEvan Yan mutex_init(&li->li_htlock, NULL, MUTEX_DEFAULT, NULL);
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Destroy a (struct loinfo)
269473047d747f7815af570197e4ef7322d3632cEvan Yan for (i = 0; i < htsize; i++)
269473047d747f7815af570197e4ef7322d3632cEvan Yan kmem_free(table, htsize * sizeof (*li->li_hashtable));
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Free the retired hashtables.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Return a looped back vnode for the given vnode.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * If no lnode exists for this vnode, create one and put it
269473047d747f7815af570197e4ef7322d3632cEvan Yan * in a table hashed by vnode. If the lnode for
269473047d747f7815af570197e4ef7322d3632cEvan Yan * this vnode is already in the table, return it (ref count is
269473047d747f7815af570197e4ef7322d3632cEvan Yan * incremented by lfind). The lnode will be flushed from the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * table when lo_inactive calls freelonode. The creation of
269473047d747f7815af570197e4ef7322d3632cEvan Yan * a new lnode can be forced via the LOF_FORCE flag even if
269473047d747f7815af570197e4ef7322d3632cEvan Yan * the vnode exists in the table. This is used in the creation
269473047d747f7815af570197e4ef7322d3632cEvan Yan * of a terminating lnode when looping is detected. A unique
269473047d747f7815af570197e4ef7322d3632cEvan Yan * lnode is required for the correct evaluation of the current
269473047d747f7815af570197e4ef7322d3632cEvan Yan * working directory.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * NOTE: vp is assumed to be a held vnode.
269473047d747f7815af570197e4ef7322d3632cEvan Yanmakelonode(struct vnode *vp, struct loinfo *li, int flag)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Optimistically assume that we won't need to sleep.
269473047d747f7815af570197e4ef7322d3632cEvan Yan /* The lnode allocation may have succeeded, save it */
269473047d747f7815af570197e4ef7322d3632cEvan Yan VN_SET_VFS_TYPE_DEV(nvp, vfsp, vp->v_type, vp->v_rdev);
269473047d747f7815af570197e4ef7322d3632cEvan Yan nvp->v_flag |= (vp->v_flag & (VNOMOUNT|VNOMAP|VDIROPEN));
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Get/Make vfs structure for given real vfs
269473047d747f7815af570197e4ef7322d3632cEvan Yanstatic struct vfs *
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Don't grab any locks for the fast (common) case.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Even though the lfsnode is strictly speaking a private
269473047d747f7815af570197e4ef7322d3632cEvan Yan * implementation detail of lofs, it should behave as a regular
269473047d747f7815af570197e4ef7322d3632cEvan Yan * vfs_t for the benefit of the rest of the kernel.
269473047d747f7815af570197e4ef7322d3632cEvan Yan lfs->lfs_vfs.vfs_fstype = li->li_mountvfs->vfs_fstype;
269473047d747f7815af570197e4ef7322d3632cEvan Yan ((vfsp->vfs_flag | li->li_mflag) & ~li->li_dflag) &
269473047d747f7815af570197e4ef7322d3632cEvan Yan /* Leave a reference to the mountpoint */
269473047d747f7815af570197e4ef7322d3632cEvan Yan * We use 1 instead of 0 as the value to associate with
269473047d747f7815af570197e4ef7322d3632cEvan Yan * an idle lfs_vfs. This is to prevent VFS_RELE() from
269473047d747f7815af570197e4ef7322d3632cEvan Yan * trying to kmem_free() our lfs_t (which is the wrong size).
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Free lfs node since no longer in use
269473047d747f7815af570197e4ef7322d3632cEvan Yan for (this = li->li_lfs; this != NULL; this = this->lfs_next) {
269473047d747f7815af570197e4ef7322d3632cEvan Yan /*NOTREACHED*/
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Find lfs given real vfs and mount instance(li)
269473047d747f7815af570197e4ef7322d3632cEvan Yanstatic struct lfsnode *
269473047d747f7815af570197e4ef7322d3632cEvan Yan * We need to handle the case where a UFS filesystem was forced
269473047d747f7815af570197e4ef7322d3632cEvan Yan * unmounted and then a subsequent mount got the same vfs
269473047d747f7815af570197e4ef7322d3632cEvan Yan * structure. If the new mount lies in the lofs hierarchy, then
269473047d747f7815af570197e4ef7322d3632cEvan Yan * this will confuse lofs, because the original vfsp (of the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * forced unmounted filesystem) is still around. We check for
269473047d747f7815af570197e4ef7322d3632cEvan Yan * this condition here.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * If we find a cache vfsp hit, then we check to see if the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * cached filesystem was forced unmounted. Skip all such
269473047d747f7815af570197e4ef7322d3632cEvan Yan * entries. This should be safe to do since no
269473047d747f7815af570197e4ef7322d3632cEvan Yan * makelonode()->makelfsnode()->lfsfind() calls should be
269473047d747f7815af570197e4ef7322d3632cEvan Yan * generated for such force-unmounted filesystems (because (ufs)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * lookup would've returned an error).
269473047d747f7815af570197e4ef7322d3632cEvan Yan for (lfs = li->li_lfs; lfs != NULL; lfs = lfs->lfs_next) {
269473047d747f7815af570197e4ef7322d3632cEvan Yan if (realvp->v_vfsp == NULL || realvp->v_type == VBAD)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Find real vfs given loopback vfs
269473047d747f7815af570197e4ef7322d3632cEvan Yanlo_realvfs(struct vfs *vfsp, struct vnode **realrootvpp)
269473047d747f7815af570197e4ef7322d3632cEvan Yan for (lfs = li->li_lfs; lfs != NULL; lfs = lfs->lfs_next) {
269473047d747f7815af570197e4ef7322d3632cEvan Yan /*NOTREACHED*/
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Lnode lookup stuff.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * These routines maintain a table of lnodes hashed by vp so
269473047d747f7815af570197e4ef7322d3632cEvan Yan * that the lnode for a vp can be found if it already exists.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * NB: A lofs shadow vnode causes exactly one VN_HOLD() on the
269473047d747f7815af570197e4ef7322d3632cEvan Yan * underlying vnode.
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Retire old hashtables.
269473047d747f7815af570197e4ef7322d3632cEvan Yanlretire(struct loinfo *li, struct lobucket *table, uint_t size)
269473047d747f7815af570197e4ef7322d3632cEvan Yan * Grow the hashtable.
uint_t i;
for (i = 0; i < oldsize; i++)
for (i = 0; i < oldsize; i++)
for (i = 0; i < oldsize; i++) {
for (i = 0; i < oldsize; i++) {
#ifdef LODEBUG
#ifdef LODEBUG
#ifdef LODEBUG
static lnode_t *
return (lt);
return (NULL);
#ifdef LODEBUG
static int lofsdebug;
#ifdef LODEBUG
int level;
char *str;