ufs_lockfs.c revision 13237b7e1e5bd293e466307b2e06f8e0e2321a0a
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/pathname.h>
/* error lock status */
#define UN_ERRLCK (-1)
#define SET_ERRLCK 1
#define RE_ERRLCK 2
#define NO_ERRLCK 0
/*
* Index to be used in TSD for storing lockfs data
*/
typedef struct _ulockfs_info {
struct _ulockfs_info *next;
/*
* Check in TSD whether we are already doing any VOP on this filesystem
*/
{ \
ulockfs_info_t *_curr; \
\
found = 1; \
break; \
} \
} \
}
/*
* Get the lockfs data from TSD so that lockfs handles the recursive VOP
* properly
*/
{ \
ulockfs_info_t *_curr; \
\
break; \
} \
} \
\
}
/*
* Validate lockfs request
*/
static int
{
int error = 0;
/*
* no input flags defined
*/
goto errout;
}
/*
* check key
*/
if (!LOCKFS_IS_ULOCK(ul_lockfsp))
goto errout;
}
return (error);
}
/*
* ufs_checkaccton
* check if accounting is turned on on this fs
*
* Returns EDEADLK when acct_fs_in_use(vp) is nonzero, and 0 otherwise.
*/
int
{
if (acct_fs_in_use(vp))
return (EDEADLK);
return (0);
}
/*
* ufs_checkswapon
* check if local swapping is to a file on this fs
*/
int
{
return (EDEADLK);
}
return (0);
}
/*
* ufs_freeze
* pend future accesses for current lock and desired lock
*/
void
{
/*
* set to new lock type
*/
}
/*
* All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
* starting ufs_quiesce() protocol and decrement it only when a file system no
* longer has to be in quiescent state. This allows ufs_pageio() to detect
* that another thread wants to quiesce a file system. See more comments in
* ufs_pageio().
*/
ulong_t ufs_quiesce_pend = 0;
/*
* ufs_quiesce
* wait for outstanding accesses to finish
*/
int
{
int error = 0;
/*
* Set a softlock to suspend future ufs_vnops so that
* this lockfs request will not be starved
*/
/* check if there is any outstanding ufs vnodeops calls */
/*
* use timed version of cv_wait_sig() to make sure we don't
* miss a wake up call from ufs_pageio() when it doesn't use
* ul_lock.
*
* when a fallocate thread comes in, the only way it returns
* from this function is if there are no other vnode operations
* going on (remember fallocate threads are tracked using
* ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
* hasn't already grabbed the fs write lock.
*/
goto out;
}
goto out;
}
}
out:
/*
* unlock the soft lock
*/
return (error);
}
/*
* ufs_flush_inode
*/
int
{
int error;
int saverror = 0;
/*
* wrong file system; keep looking
*/
return (0);
/*
* asynchronously push all the dirty pages
*/
/*
* wait for io and discard all mappings
*/
}
return (saverror);
}
/*
* ufs_flush
* Flush everything that is currently dirty; this includes invalidating
* any mappings.
*/
int
{
int error;
int saverror = 0;
int tdontblock = 0;
/*
* purge dnlc
*/
(void) dnlc_purge_vfsp(vfsp, 0);
/*
* drain the delete and idle threads
*/
ufs_delete_drain(vfsp, 0, 0);
/*
* flush and invalidate quota records
*/
/*
* flush w/invalidate the inodes for vfsp
*/
/*
* synchronously flush superblock and summary info
*/
}
/*
* flush w/invalidate block device pages and buf cache
*/
/*
* drain the delete and idle threads again
*/
ufs_delete_drain(vfsp, 0, 0);
/*
* play with the clean flag
*/
if (saverror == 0)
/*
* Flush any outstanding transactions and roll the log
* only if we are supposed to do, i.e. LDL_NOROLL not set.
* We can not simply check for fs_ronly here since fsck also may
* use this code to roll the log on a read-only filesystem, e.g.
* root during early stages of boot, if other than a sanity check is
* done, it will clear LDL_NOROLL before.
* In addition we assert that the deltamap does not contain any deltas
* in case LDL_NOROLL is set since this is not supposed to happen.
*/
if (TRANS_ISTRANS(ufsvfsp)) {
} else {
/*
* Do not set T_DONTBLOCK if there is a
* transaction opened by caller.
*/
tdontblock = 1;
else
if (!error) {
}
if (tdontblock == 0)
}
}
return (saverror);
}
/*
* ufs_thaw_wlock
* special processing when thawing down to wlock
*
* Per the in-body comments: returns 0 for an inode on a different
* file system, EBUSY when pages are mlocked (the wlock fails), and
* 0 otherwise.
*/
static int
{
/*
* wrong file system; keep looking
*/
return (0);
/*
* iupdat refuses to clear flags if the fs is read only. The fs
* may become read/write during the lock and we wouldn't want
* these inodes being written to disk. So clear the flags.
* NOTE(review): the middle of this sentence was missing from the
* text; the wording above is a reconstruction -- confirm.
*/
/*
* pages are mlocked -- fail wlock
*/
return (EBUSY);
return (0);
}
/*
* ufs_thaw_hlock
* special processing when thawing down to hlock or elock
*/
static int
{
/*
* wrong file system; keep looking
*/
return (0);
/*
* blow away all pages - even if they are mlocked
*/
do {
return (0);
}
/*
* ufs_thaw
* thaw file system lock down to current value
*/
int
{
int error = 0;
/*
* if wlock or hlock or elock
*/
ULOCKFS_IS_ELOCK(ulp)) {
/*
* don't keep access times
* don't free deleted files
* if superblock writes are allowed, limit them to me for now
*/
/*
* wait for writes for deleted files and superblock updates
*/
/*
* now make sure the quota file is up-to-date
* expensive; but effective
*/
/*
* no one can write the superblock
*/
/*
*/
if (ULOCKFS_IS_WLOCK(ulp)) {
if (error)
goto errout;
if (error)
goto errout;
if (error)
goto errout;
}
error = 0;
(void) ufs_scan_inodes(0, ufs_thaw_hlock,
}
} else {
/*
* okay to keep access times
* okay to free deleted files
* okay to write the superblock
*/
/*
* flush in case deleted files are in memory
*/
if (noidel) {
goto errout;
}
}
return (error);
}
/*
* ufs_reconcile_fs
* reconcile incore superblock with ondisk superblock
*/
int
{
int needs_unlock;
char finished_fsclean;
/*
* get the on-disk copy of the superblock
*/
return (EIO);
}
/* error locks may only unlock after the fs has been made consistent */
return (EAGAIN);
}
/* repair not yet started? */
return (EBUSY);
}
}
/*
* if superblock has changed too much, abort
*/
return (EACCES);
}
return (EACCES);
}
/*
* get new summary info
*/
return (EIO);
}
/*
* release old summary info and update in-memory superblock
*/
/*
* update fields allowed to change
*/
}
/* XXX What to do about sparecon? */
/* XXX need to copy volume label */
/*
* ondisk clean flag overrides inmemory clean flag iff == FSBAD
* or if error-locked and ondisk is now clean
*/
if (needs_unlock)
else
}
if (needs_unlock)
return (0);
}
/*
* ufs_reconcile_inode
* reconcile ondisk inode with incore inode
*/
static int
{
int i;
int ndaddr;
int niaddr;
int error = 0;
/*
* not an inode we care about
*/
return (0);
/*
* Inode reconciliation fails: we made the filesystem quiescent
* and we did a ufs_flush() before calling ufs_reconcile_inode()
* and thus the inode should not have been changed in between.
* Any discrepancies indicate a logic error and a pretty
* significant run-state inconsistency we should complain about.
*/
return (EINVAL);
}
/*
* get the dinode
*/
return (EIO);
}
/*
* handle Sun's implementation of EFT
*/
/*
* some fields are not allowed to change
*/
goto out;
}
/*
* and some are allowed to change
*/
ndaddr = 1;
niaddr = 0;
} else {
}
for (i = 0; i < ndaddr; ++i)
for (i = 0; i < niaddr; ++i)
out:
return (error);
}
/*
* ufs_reconcile
* reconcile ondisk superblock/inodes with any incore
*/
static int
{
int error = 0;
/*
* get rid of as much inmemory data as possible
*/
/*
* reconcile the superblock and inodes
*/
return (error);
return (error);
/*
* allocation blocks may be incorrect; get rid of them
*/
return (error);
}
/*
* File system locking
*/
int
{
}
/* kernel-internal interface, also used by fix-on-panic */
int
int from_user,
int from_log)
{
int error;
int poll_events = POLLPRI;
extern struct pollhead ufs_pollhd;
int signal = 0;
/* check valid lock type */
return (EINVAL);
return (EIO);
return (EIO);
/* take the lock and check again */
return (EIO);
}
/*
* We need to check for this before we grab the ul_lock to avoid
* deadlocks with the accounting framework.
*/
return (EDEADLK);
}
}
/*
* Suspend both the reclaim thread and the delete thread.
* This must be done outside the lockfs locking protocol.
*/
/*
* Quit if there is another lockfs request in progress
* that is waiting for existing ufs_vnops to complete.
*/
if (ULOCKFS_IS_BUSY(ulp)) {
goto errexit;
}
/* cannot unlock or downgrade a hard-lock */
if (ULOCKFS_IS_HLOCK(ulp)) {
goto errexit;
}
/* an error lock may be unlocked or relocked, only */
if (ULOCKFS_IS_ELOCK(ulp)) {
goto errexit;
}
}
/*
* a read-only error lock may only be upgraded to an
* error lock or hard lock
*/
if (ULOCKFS_IS_ROELOCK(ulp)) {
goto errexit;
}
}
/*
* until read-only error locks are fully implemented
* just return EINVAL
*/
if (LOCKFS_IS_ROELOCK(lockfsp)) {
goto errexit;
}
/*
* an error lock may only be applied if the file system is
* unlocked or already error locked.
* (this is to prevent the case where a fs gets changed out from
* underneath a fs that is locked for backup,
* that is, name/delete/write-locked.)
*/
!ULOCKFS_IS_ROELOCK(ulp)) &&
goto errexit;
}
/* get and validate the input lockfs request */
goto errexit;
/*
* save current ulockfs struct
*/
/*
* Freeze the file system (pend future accesses)
*/
/*
* Set locking in progress because ufs_quiesce may free the
* ul_lock mutex.
*/
/* update the ioctl copy */
/*
* We need to unset FWLOCK status before we call ufs_quiesce
* so that the thread doesn't get suspended. We do this only if
* this (fallocate) thread requested an unlock operation.
*/
if (!ULOCKFS_IS_WLOCK(ulp))
}
/*
* Quiesce (wait for outstanding accesses to finish)
*/
/*
* Interrupted due to signal. There could still be
* pending vnops.
*/
signal = 1;
/*
* We do broadcast because lock-status
* could be reverted to old status.
*/
goto errout;
}
/*
* If the fallocate thread requested a write fs lock operation
* then we set fwlock status in the ulp.
*/
if (ULOCKFS_IS_WLOCK(ulp))
}
/*
* save error lock status to pass down to reconciliation
* routines and for later cleanup
*/
int needs_unlock;
int needs_sbwrite;
poll_events |= POLLERR;
if (needs_unlock)
/* disable delayed i/o */
needs_sbwrite = 0;
if (errlck == SET_ERRLCK) {
needs_sbwrite = 1;
}
if (needs_unlock)
if (needs_sbwrite) {
if (needs_unlock)
if (needs_unlock)
}
}
/*
* reconcile superblock and inodes if was wlocked
*/
goto errout;
/*
* in case the fs grew; reset the metadata map for logging tests
*/
}
/*
* At least everything *currently* dirty goes out.
*/
goto errout;
/*
* thaw file system and wakeup pended processes
*/
goto errout;
/*
* reset modified flag if not already write locked
*/
if (!LOCKFS_IS_WLOCK(&lfs))
/*
* idle the lock struct
*/
/* update the ioctl copy */
/*
* free current comment
*/
}
/* do error lock cleanup */
/* don't allow error lock from user to invoke panic */
poll_events |= POLLERR;
/*
* Allow both the delete thread and the reclaim thread to
* continue.
*/
return (0);
/*
* Lock failed. Reset the old lock in ufsvfs if not hard locked.
*/
}
/*
* Don't call ufs_thaw() when there's a signal during
* ufs quiesce operation as it can lead to deadlock
* with getpage.
*/
if (signal == 0)
/*
* Allow both the delete thread and the reclaim thread to
* continue.
*/
return (error);
}
/*
* fiolfss
* return the current file system locking state info
*/
int
{
return (EINVAL);
/* file system has been forcibly unmounted */
return (EIO);
if (ULOCKFS_IS_HLOCK(ulp)) {
return (0);
}
if (ULOCKFS_IS_MOD(ulp))
return (0);
}
/*
* ufs_check_lockfs
* check whether a ufs_vnops conflicts with the file system lock
*/
int
{
return (EAGAIN);
}
/*
* In the case of an onerr umount of the fs, threads could
* have blocked before coming into ufs_check_lockfs and
* need to check for the special case of ELOCK and
* vfs_dontblock being set, which would indicate that the fs
* is on its way out and will not return, therefore making
* EIO the appropriate response.
*/
if (ULOCKFS_IS_HLOCK(ulp) ||
return (EIO);
/*
* wait for lock status to change
*/
} else {
return (EINTR);
}
}
if (mask & ULOCKFS_FWLOCK) {
} else {
}
return (0);
}
/*
* Check whether we came across the handcrafted lockfs protocol path. We can't
* simply check for T_DONTBLOCK here as one would assume since this can also
* falsely catch recursive VOP's going to a different filesystem, instead we
* check if we already hold the ulockfs->ul_lock mutex.
*/
static int
{
}
/*
* ufs_lockfs_begin - start the lockfs locking protocol
*/
int
{
int error;
int rec_vop;
/*
* file system has been forcibly unmounted
*/
return (EIO);
/*
* Do lockfs protocol
*/
/*
* Detect recursive VOP call or handcrafted internal lockfs protocol
* path and bail out in that case.
*/
return (0);
} else {
if (ulockfs_info_free == NULL) {
if ((ulockfs_info_temp = (ulockfs_info_t *)
kmem_zalloc(sizeof (ulockfs_info_t),
KM_NOSLEEP)) == NULL) {
return (ENOMEM);
}
}
}
/*
* First time VOP call
*
* Increment the ctr irrespective of the lockfs state. If the lockfs
* state is not ULOCKFS_ULOCK, we can decrement it later. However,
* before incrementing we need to check if there is a pending quiesce
* request because if we have a continuous stream of ufs_lockfs_begin
* requests pounding on a few CPUs then the ufs_quiesce thread might
* never see the value of zero for ctr - a livelock kind of scenario.
*/
if (!ULOCKFS_IS_SLOCK(ulp)) {
}
/*
* If the lockfs state (indicated by ul_fs_lock) is not just
* ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
* where there is a check with an appropriate mask to selectively allow
* operations permitted for that kind of lockfs state.
*
* Even these selective operations should not be allowed to go through
* if a lockfs request is in progress because that could result in inode
* modifications during a quiesce and could hence result in inode
* reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
* so make use of ufs_quiesce_pend to disallow vnode operations when a
* quiesce is in progress.
*/
if (op_cnt_incremented)
if (error) {
if (ulockfs_info_free == NULL)
sizeof (ulockfs_info_t));
return (error);
}
} else {
/*
* This is the common case of a file system in an unlocked state.
*
* If a file system is unlocked, we would expect the ctr to have
* been incremented by now. But this will not be true when a
* quiesce is winding up - SLOCK was set when we checked before
* incrementing the ctr, but by the time we checked for
* ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
* to take ul_lock and go through the slow path in this uncommon
* case.
*/
if (op_cnt_incremented == 0) {
if (error) {
if (ulockfs_info_free == NULL)
sizeof (ulockfs_info_t));
return (error);
}
if (mask & ULOCKFS_FWLOCK)
} else if (mask & ULOCKFS_FWLOCK) {
}
}
if (ulockfs_info_free != NULL) {
if (mask & ULOCKFS_FWLOCK)
} else {
if (mask & ULOCKFS_FWLOCK)
ASSERT(ufs_lockfs_key != 0);
}
return (0);
}
/*
* Check whether we are returning from the top level VOP.
*/
static int
{
int result = 1;
result = 0;
break;
}
}
return (result);
}
/*
* ufs_lockfs_end - terminate the lockfs locking protocol
*/
void
{
/*
* end-of-VOP protocol
*/
return;
/*
* If we're called from a first level VOP, we have to have a
* valid ulockfs record in the TSD.
*/
/*
* Invalidate the ulockfs record.
*/
/* fallocate thread */
/* Clear the thread's fallocate state */
}
} else { /* normal thread */
}
}
/*
* ufs_lockfs_trybegin - try to start the lockfs locking protocol without
* blocking.
*/
int
{
int error = 0;
int rec_vop;
/*
* file system has been forcibly unmounted
*/
return (EIO);
/*
* Do lockfs protocol
*/
/*
* Detect recursive VOP call or handcrafted internal lockfs protocol
* path and bail out in that case.
*/
return (0);
} else {
if (ulockfs_info_free == NULL) {
if ((ulockfs_info_temp = (ulockfs_info_t *)
kmem_zalloc(sizeof (ulockfs_info_t),
KM_NOSLEEP)) == NULL) {
return (ENOMEM);
}
}
}
/*
* First time VOP call
*
* Increment the ctr irrespective of the lockfs state. If the lockfs
* state is not ULOCKFS_ULOCK, we can decrement it later. However,
* before incrementing we need to check if there is a pending quiesce
* request because if we have a continuous stream of ufs_lockfs_begin
* requests pounding on a few CPUs then the ufs_quiesce thread might
* never see the value of zero for ctr - a livelock kind of scenario.
*/
if (!ULOCKFS_IS_SLOCK(ulp)) {
}
/*
* Non-blocking version of ufs_check_lockfs() code.
*
* If the file system is not hard locked or error locked
* and if ulp->ul_fs_lock allows this operation, increment
* the appropriate counter and proceed (e.g., if the file
* system is delete locked, an mmap can still go through).
*/
if (op_cnt_incremented)
if (ULOCKFS_IS_HLOCK(ulp) ||
if (error) {
if (ulockfs_info_free == NULL)
sizeof (ulockfs_info_t));
return (error);
}
if (mask & ULOCKFS_FWLOCK)
} else {
/*
* This is the common case of a file system in an unlocked state.
*
* If a file system is unlocked, we would expect the ctr to have
* been incremented by now. But this will not be true when a
* quiesce is winding up - SLOCK was set when we checked before
* incrementing the ctr, but by the time we checked for
* ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
* ul_lock and go through the non-blocking version of
* ufs_check_lockfs() code.
*/
if (op_cnt_incremented == 0) {
if (ULOCKFS_IS_HLOCK(ulp) ||
if (error) {
if (ulockfs_info_free == NULL)
sizeof (ulockfs_info_t));
return (error);
}
if (mask & ULOCKFS_FWLOCK)
} else if (mask & ULOCKFS_FWLOCK) {
}
}
if (ulockfs_info_free != NULL) {
if (mask & ULOCKFS_FWLOCK)
} else {
if (mask & ULOCKFS_FWLOCK)
ASSERT(ufs_lockfs_key != 0);
}
return (0);
}
/*
* specialized version of ufs_lockfs_begin() called by ufs_getpage().
*/
int
int read_access,
{
int error;
int rec_vop;
/*
* file system has been forcibly unmounted
*/
return (EIO);
/*
* Do lockfs protocol
*/
/*
* Detect recursive VOP call or handcrafted internal lockfs protocol
* path and bail out in that case.
*/
return (0);
} else {
if (ulockfs_info_free == NULL) {
if ((ulockfs_info_temp = (ulockfs_info_t *)
kmem_zalloc(sizeof (ulockfs_info_t),
KM_NOSLEEP)) == NULL) {
return (ENOMEM);
}
}
}
/*
* First time VOP call
*/
} else if (protp && read_access) {
/*
* Restrict the mapping to readonly.
* Writes to this mapping will cause
* another fault which will then
* be suspended if fs is write locked
*/
*protp &= ~PROT_WRITE;
} else
/*
* will sleep if this fs is locked against this VOP
*/
if (error) {
if (ulockfs_info_free == NULL)
sizeof (ulockfs_info_t));
return (error);
}
}
if (ulockfs_info_free != NULL) {
} else {
ASSERT(ufs_lockfs_key != 0);
}
return (0);
}
/*
* ufs_lockfs_tsd_destructor
* TSD destructor for the lockfs data kept in thread-specific data.
* NOTE(review): head is presumably the head of the exiting thread's
* ulockfs_info_t list -- confirm against the tsd_create() call site.
*/
void
ufs_lockfs_tsd_destructor(void *head)
{
/*
* The TSD destructor is called when the thread exits
* (via thread_exit()). By that time the thread must have
* cleaned up all VOPs via ufs_lockfs_end(), so no valid
* ulockfs record may exist while the thread is exiting.
*/
}
}