zfs_vfsops.c revision c9030f6c93613fe30ee0c16f92b96da7816ac052
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/sysmacros.h>
#include <sys/pathname.h>
#include <sys/vfs_opreg.h>
#include <sys/zfs_znode.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_deleg.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/bootconf.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
#include "zfs_comutil.h"
int zfsfstype;
static kmutex_t zfs_dev_mtx;
extern int sys_shutdown;
static const fs_operation_def_t zfs_vfsops_template[] = {
};
static const fs_operation_def_t zfs_vfsops_eio_template[] = {
};
/*
* We need to keep a count of active fs's.
* This is necessary to prevent our module
* from being unloaded after a umount -f.
*/
static uint32_t zfs_active_fs_count = 0;
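/*
 * Illustrative sketch (not part of the original file): how a busy count
 * like zfs_active_fs_count is typically maintained.  The helper names
 * below are hypothetical; the real code bumps the counter when a
 * filesystem is wired up and drops it when the zfsvfs is freed, and
 * zfs_busy() (near the end of this file) reports whether any mounts
 * remain so the module can refuse to unload.
 */
#include <sys/atomic.h>		/* atomic_inc_32(), atomic_dec_32() */

static void
example_fs_activate(void)
{
	atomic_inc_32(&zfs_active_fs_count);	/* one more live mount */
}

static void
example_fs_deactivate(void)
{
	atomic_dec_32(&zfs_active_fs_count);	/* a mount has gone away */
}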
/*
* MO_DEFAULT is not used since the default value is determined
* by the equivalent property.
*/
};
static mntopts_t zfs_mntopts = {
};
/*ARGSUSED*/
int
{
/*
* Data integrity is job one. We don't want a compromised kernel
* writing to the storage pool, so we never sync during panic.
*/
if (panicstr)
return (0);
/*
* SYNC_ATTR is used by fsflush() to force old filesystems like UFS
* to sync metadata, which they would otherwise cache indefinitely.
* Semantically, the only requirement is that the sync be initiated.
* The DMU syncs out txgs frequently, so there's nothing to do.
*/
return (0);
/*
* Sync a specific filesystem.
*/
dsl_pool_t *dp;
/*
* If the system is shutting down, then skip any
* filesystems which may exist on a suspended pool.
*/
return (0);
}
} else {
/*
* Sync all ZFS filesystems. This is what happens when you
* run sync(1M). Unlike other filesystems, ZFS honors the
* request by waiting for all pools to commit all dirty data.
*/
}
return (0);
}
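/*
 * Illustrative sketch (not part of the original file): the shape of the
 * per-filesystem vs. all-filesystems branch described above.  Locking
 * macros (ZFS_ENTER/ZFS_EXIT) and most error handling are elided, so
 * treat this as an assumption about structure, not the exact original
 * body.
 */
static int
example_sync_sketch(vfs_t *vfsp)
{
	if (vfsp != NULL) {
		/* Sync a specific filesystem. */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		dsl_pool_t *dp = dmu_objset_pool(zfsvfs->z_os);

		/*
		 * If the system is shutting down, skip filesystems that
		 * may live on a suspended pool.
		 */
		if (sys_shutdown && spa_suspended(dp->dp_spa))
			return (0);

		/* Push outstanding intent-log records to stable storage. */
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, 0);
	} else {
		/* sync(1M): wait for every pool to commit its dirty data. */
		spa_sync_allpools();
	}
	return (0);
}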
static int
{
do {
do {
/*
* If we're still using the real major number, we can only
* use part of its minor number space. If we're using a
* getudev()'ed major number, we can use all of its minors.
*/
else
zfs_minor = 0;
} else {
zfs_minor++;
}
/*
* We are using all ~262,000 minor numbers for the
* current major number. Create a new major number.
*/
"zfs_mount: Can't get unique major "
"device number.");
return (-1);
}
zfs_minor = 0;
} else {
break;
}
/* CONSTANTCONDITION */
} while (1);
return (0);
}
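/*
 * Illustrative sketch (not part of the original file): the wrap-around
 * scan described by the comments above.  alloc_new_major() and
 * device_in_use() are hypothetical stand-ins for getudev() and the
 * "is this dev_t already mounted" check, and EXAMPLE_MINOR_LIMIT plays
 * the role of the ~262,000-entry 32-bit minor space.
 */
#include <stdint.h>

#define	EXAMPLE_MINOR_LIMIT	262143u

extern uint32_t alloc_new_major(void);			/* hypothetical */
extern int device_in_use(uint32_t, uint32_t);		/* hypothetical */

static uint32_t example_major, example_minor;

static int
example_pick_unique_device(uint32_t *majorp, uint32_t *minorp)
{
	do {
		if (++example_minor >= EXAMPLE_MINOR_LIMIT) {
			/* Minor space exhausted: switch to a fresh major. */
			example_major = alloc_new_major();
			if (example_major == 0)
				return (-1);	/* no majors left */
			example_minor = 0;
		}
	} while (device_in_use(example_major, example_minor));

	*majorp = example_major;
	*minorp = example_minor;
	return (0);
}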
static void
{
} else {
}
}
static void
{
/* XXX locking on vfs_flag? */
} else {
/* XXX locking on vfs_flag? */
}
}
static void
{
if (newval < SPA_MINBLOCKSIZE ||
}
static void
{
if (newval) {
/* XXX locking on vfs_flag? */
} else {
/* XXX locking on vfs_flag? */
}
}
static void
{
} else {
}
}
static void
{
} else {
}
}
static void
{
} else {
}
}
/*
* The nbmand mount option can only be changed at mount time.
* We can't allow it to be toggled on live file systems, or incorrect
* behavior may be seen from CIFS clients.
*
* This property isn't registered via dsl_prop_register(), but this callback
* will be called when a file system is first mounted.
*/
static void
{
} else {
}
}
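/*
 * Illustrative sketch (not part of the original file): what an nbmand
 * callback of the kind described above could look like, toggling the
 * vfs mount options to match the property value.  Assumes the
 * vfs_setmntopt()/vfs_clearmntopt() interfaces and the standard
 * MNTOPT_NBMAND/MNTOPT_NONBMAND option names.
 */
static void
example_nbmand_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == 0) {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
	} else {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
	}
}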
static void
{
}
static void
{
}
static void
{
}
static void
{
}
static int
{
int error = 0;
/*
* The act of registering our callbacks will destroy any mount
* options we may have. In order to enable temporary overrides
* of mount options, we stash away the current values and
* restore them after we register the callbacks.
*/
}
do_devices = B_TRUE;
} else {
do_devices = B_TRUE;
do_devices = B_TRUE;
}
}
}
}
}
}
/*
* nbmand is a special property. It can only be changed at
* mount time.
*
* This is weird, but it is documented to only be changeable
* at mount time.
*/
} else {
char osname[MAXNAMELEN];
NULL)) {
return (error);
}
}
/*
* Register property callbacks.
*
* It would probably be fine to just check for i/o error from
* the first prop_register(), but I guess I like to go
* overboard...
*/
zfsvfs);
if (error)
goto unregister;
/*
* Invoke our callbacks to restore temporary mount options.
*/
if (do_readonly)
if (do_setuid)
if (do_exec)
if (do_devices)
if (do_xattr)
if (do_atime)
return (0);
/*
* We may attempt to unregister some callbacks that are not
* registered, but this is OK; it will simply return ENOMSG,
* which we will ignore.
*/
return (error);
}
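/*
 * Illustrative sketch (not part of the original file): the stash /
 * register / restore pattern described above, shown for a single
 * property (atime).  Assumes dsl_prop_register(ds, propname, cb, arg)
 * and an atime_changed_cb() callback as registered above; handling of
 * the remaining properties and of the unregister path is elided.
 */
static int
example_register_atime(zfsvfs_t *zfsvfs, vfs_t *vfsp, dsl_dataset_t *ds)
{
	boolean_t do_atime = B_FALSE, atime = B_FALSE;
	int error;

	/* Stash any temporary override given as a mount option. */
	if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	}

	/* Registering the callback re-applies the dataset's own value... */
	error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
	    atime_changed_cb, zfsvfs);
	if (error)
		return (error);

	/* ...so re-assert the temporary override afterwards. */
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);
	return (0);
}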
static int
{
/*
* Is it a valid type of object to track?
*/
/*
* If we have a NULL data pointer
* then assume the IDs aren't changing and
* return EEXIST to the DMU to let it know to
* use the same IDs.
*/
if (bonustype == DMU_OT_ZNODE) {
} else {
int hdrsize;
/*
* This should only happen for newly created
* files that haven't had the znode data filled
* in yet.
*/
*userp = 0;
*groupp = 0;
return (0);
}
} else {
}
if (swap) {
}
}
return (0);
}
static void
{
const char *domain;
if (domain)
else
domainbuf[0] = '\0';
}
static uint64_t
{
switch (type) {
case ZFS_PROP_USERUSED:
return (DMU_USERUSED_OBJECT);
case ZFS_PROP_GROUPUSED:
return (DMU_GROUPUSED_OBJECT);
case ZFS_PROP_USERQUOTA:
return (zfsvfs->z_userquota_obj);
case ZFS_PROP_GROUPQUOTA:
return (zfsvfs->z_groupquota_obj);
}
return (0);
}
int
{
int error;
if (obj == 0) {
*bufsizep = 0;
return (0);
}
zap_cursor_advance(&zc)) {
*bufsizep)
break;
buf++;
}
error = 0;
return (error);
}
/*
* buf must be big enough (e.g., 32 bytes)
*/
static int
{
int domainid = 0;
if (domainid == -1)
}
return (0);
}
int
{
char buf[32];
int err;
*valp = 0;
if (obj == 0)
return (0);
if (err)
return (err);
err = 0;
return (err);
}
int
{
char buf[32];
int err;
if (err)
return (err);
if (*objp == 0) {
}
if (fuid_dirtied)
if (err) {
return (err);
}
if (*objp == 0) {
DMU_OT_NONE, 0, tx);
}
if (quota == 0) {
err = 0;
} else {
}
if (fuid_dirtied)
return (err);
}
{
char buf[32];
int err;
return (B_FALSE);
if (err != 0)
return (B_FALSE);
if (err != 0)
return (B_FALSE);
}
{
return (B_FALSE);
}
int
{
int i, error;
/*
* We claim to always be readonly so we can open snapshots;
* other ZPL code will prevent us from writing to snapshots.
*/
if (error) {
return (error);
}
/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
* and just bzero up to z_hold_mtx[].
*/
if (error) {
goto out;
(void) printf("Can't mount a version %lld file system "
"on a version %lld pool\n. Pool must be upgraded to mount "
goto out;
}
goto out;
goto out;
goto out;
/*
* Fold case on file systems that are always or sometimes case
* insensitive.
*/
/* should either have both of these objects or none */
&sa_obj);
if (error)
return (error);
} else {
/*
* Pre-SA version file systems should never touch
* either the attribute registration or layout objects.
*/
sa_obj = 0;
}
&zfsvfs->z_attr_table);
if (error)
goto out;
if (error)
goto out;
&zfsvfs->z_unlinkedobj);
if (error)
goto out;
goto out;
goto out;
&zfsvfs->z_fuid_obj);
goto out;
&zfsvfs->z_shares_dir);
goto out;
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
return (0);
out:
return (error);
}
static int
{
int error;
if (error)
return (error);
/*
* Set the objset user_ptr to track its zfsvfs.
*/
/*
* If we are not mounting (ie: online recv), then we don't
* have to worry about replaying the log as we blocked all
* operations out since we closed the ZIL.
*/
if (mounting) {
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
if (readonly != 0)
else
/*
* Parse and replay the intent log.
*
* Because of ziltest, this must be done after
* zfs_unlinked_drain(). (Further note: ziltest
* doesn't use readonly mounts, where
* zfs_unlinked_drain() isn't called.) This is because
* ziltest causes spa_sync() to think it's committed,
* but actually it is not, so the intent log contains
* many txg's worth of changes.
*
* In particular, if object N is in the unlinked set in
* the last txg to actually sync, then it could be
* actually freed in a later txg and then reallocated
* in a yet later txg. This would write a "create
* object N" record to the intent log. Normally, this
* would be fine because the spa_sync() would have
* written out the fact that object N is free, before
* we could write the "create object N" intent log
* record.
*
* But when we are in ziltest mode, we advance the "open
* txg" without actually spa_sync()-ing the changes to
* disk. So we would see that object N is still
* allocated and in the unlinked set, and there is an
* intent log record saying to allocate it.
*/
if (zil_replay_disable) {
} else {
}
}
}
return (0);
}
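/*
 * Illustrative sketch (not part of the original file): the replay
 * decision described above.  The read-only bit is dropped for the
 * duration of replay and then restored; zil_replay_disable is the
 * global tunable that turns replay into destruction of the stale log.
 * This omits zfs_unlinked_drain() and the ziltest ordering concerns
 * covered by the comment above.
 */
static void
example_replay_sketch(zfsvfs_t *zfsvfs)
{
	uint64_t readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;

	zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;	/* allow replay writes */
	if (zil_replay_disable)
		zil_destroy(zfsvfs->z_log, B_FALSE);
	else
		zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
	zfsvfs->z_vfs->vfs_flag |= readonly;	/* restore original state */
}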
void
{
int i;
/*
* This is a barrier to prevent the filesystem from going away in
* zfs_znode_move() until we can safely ensure that the filesystem is
* not unmounted. We consider the filesystem valid before the barrier
* and invalid after the barrier.
*/
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
}
static void
{
if (zfsvfs->z_use_fuids) {
} else {
}
}
}
static int
{
int error = 0;
if (error)
return (error);
/* Initialize the generic filesystem structure. */
vfsp->vfs_bcount = 0;
goto out;
}
NULL))
goto out;
/*
* The fsid is 64 bits, composed of an 8-bit fs type, which
* separates our fsid from any other filesystem types, and a
* 56-bit objset unique ID. The objset unique ID is unique to
* all objsets open on this system, provided by unique_create().
* The 8-bit fs type must be put in the low bits of fsid[1]
* because that's where other Solaris filesystems put it.
*/
zfsfstype & 0xFF;
/*
* Set features for file system.
*/
}
goto out;
} else {
}
out:
if (error) {
} else {
}
return (error);
}
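/*
 * Illustrative sketch (not part of the original file): one way to pack
 * a 56-bit unique ID and an 8-bit filesystem type into the two 32-bit
 * words of an fsid, keeping the type in the low bits of val[1] as the
 * fsid comment above requires.  The helper name is hypothetical.
 */
#include <stdint.h>

static void
example_pack_fsid(uint64_t unique56, uint8_t fstype, uint32_t val[2])
{
	/* Low 32 bits of the unique ID. */
	val[0] = (uint32_t)unique56;
	/* Upper 24 bits of the unique ID, with the fs type in the low byte. */
	val[1] = (uint32_t)((unique56 >> 32) << 8) | fstype;
}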
void
{
struct dsl_dataset *ds;
/*
* Unregister properties.
*/
if (!dmu_objset_is_snapshot(os)) {
zfsvfs) == 0);
zfsvfs) == 0);
zfsvfs) == 0);
zfsvfs) == 0);
zfsvfs) == 0);
zfsvfs) == 0);
zfsvfs) == 0);
zfsvfs) == 0);
zfsvfs) == 0);
acl_inherit_changed_cb, zfsvfs) == 0);
vscan_changed_cb, zfsvfs) == 0);
}
}
/*
* Convert a decimal digit string to a uint64_t integer.
*/
static int
{
while (*str) {
}
return (0);
}
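/*
 * Illustrative sketch (not part of the original file): a minimal
 * decimal-string-to-uint64_t conversion of the kind described above.
 * Overflow checking is omitted, and EINVAL for non-digit input is an
 * assumption about the error convention.
 */
#include <stdint.h>
#include <errno.h>

static int
example_str_to_uint64(const char *str, uint64_t *valp)
{
	uint64_t val = 0;

	if (*str == '\0')
		return (EINVAL);	/* an empty string is not a number */

	while (*str) {
		if (*str < '0' || *str > '9')
			return (EINVAL);
		val = val * 10 + (uint64_t)(*str - '0');
		str++;
	}
	*valp = val;
	return (0);
}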
/*
* The boot path passed from the boot loader is in the form of
* "rootpool-name/root-filesystem-object-number'. Convert this
* string to a dataset name: "rootpool-name/root-filesystem-name".
*/
static int
{
char *slashp;
int error;
/* if no '/', just return the pool name */
return (0);
}
/* if not a number, just return the root dataset name */
return (0);
}
*slashp = '\0';
*slashp = '/';
return (error);
}
/*
* Check that the hex label string is appropriate for the dataset being
* mounted into the global_zone proper.
*
* Return an error if the hex label string is not default or
* admin_low/admin_high. For admin_low labels, the corresponding
* dataset must be readonly.
*/
int
{
return (0);
return (0);
/* must be readonly */
}
}
/*
* Determine whether the mount is allowed according to MAC check,
* by comparing (where appropriate) the label of the dataset against
* the label of the zone being mounted into. If the dataset has
* no label, create one.
*
* Returns 0 if access allowed, error otherwise (e.g. EACCES)
*/
static int
{
char ds_hexsl[MAXNAMELEN];
/*
* Start by getting the dataset label if it exists.
*/
if (error)
/*
* If labeling is NOT enabled, then disallow the mount of datasets
* which have a non-default label already. No other label checks
* are needed.
*/
if (!is_system_labeled()) {
return (0);
}
/*
* Get the label of the mountpoint. If mounting into the global
* zone (i.e. mountpoint is not within an active zone and the
* zoned property is off), the label must be default or
* admin_low/admin_high only; no other checks are needed.
*/
if (!zoned)
else
/*
* This is the case of a zone dataset being mounted
* initially, before the zone has been fully created;
* allow this mount into global zone.
*/
return (0);
}
/*
* The dataset doesn't have a real label, so fabricate one.
*/
ZPROP_SRC_LOCAL, str) == 0)
retv = 0;
/*
* Now compare labels to complete the MAC check. If the
* labels are equal then allow access. If the mountpoint
* label dominates the dataset label, allow readonly access.
* Otherwise, access is denied.
*/
retv = 0;
retv = 0;
}
}
return (retv);
}
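/*
 * Illustrative sketch (not part of the original file): the final label
 * comparison described in the comment above.  mnt_sl and ds_sl stand
 * for the mountpoint and dataset sensitivity labels; this assumes the
 * Trusted Extensions predicates blequal() and bldominates() and the
 * usual vfs_setmntopt() interface.
 */
static int
example_label_compare(m_label_t *mnt_sl, m_label_t *ds_sl, vfs_t *vfsp)
{
	if (blequal(mnt_sl, ds_sl))
		return (0);		/* equal labels: full access */

	if (bldominates(mnt_sl, ds_sl)) {
		/* Dominating mountpoint label: allow read-only access. */
		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
		return (0);
	}

	return (EACCES);		/* otherwise deny */
}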
static int
{
int error = 0;
static int zfsrootdone = 0;
char *zfs_bootfs;
char *zfs_devid;
/*
* The filesystem that we mount as root is defined in the
* boot property "zfs-bootfs" with a format of
*/
if (zfsrootdone++)
/*
* The process of doing a spa_load will require the clock to be
* set, and this happens before we could do anything better (for
* example, look at the timestamp on an uberblock), so just set
* it to -1.
*/
clkset(-1);
"bootfs name");
}
if (zfs_devid)
if (error) {
error);
return (error);
}
error);
return (error);
}
return (error);
goto out;
}
goto out;
}
/*
* Leave rootvp held. The root file system is never unmounted.
*/
out:
return (error);
} else if (why == ROOT_REMOUNT) {
/* refresh mount options */
return (zfs_register_callbacks(vfsp));
} else if (why == ROOT_UNMOUNT) {
return (0);
}
/*
* if "why" is equal to anything else other than ROOT_INIT,
* ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
*/
}
/*ARGSUSED*/
static int
{
char *osname;
int error = 0;
int canwrite;
}
/*
* ZFS does not support passing unparsed data in via MS_DATA.
* Users should use the MS_OPTIONSTR interface; this means
* that all option parsing is already done and the options struct
* can be interrogated.
*/
/*
* Get the objset name (the "special" mount argument).
*/
return (error);
/*
* Check for mount privilege?
*
* If we don't have privilege then see if
* we have local permission to allow it
*/
if (error) {
/*
* Make sure user is the owner of the mount point
* or has sufficient privileges.
*/
goto out;
}
goto out;
}
} else {
goto out;
}
}
/*
* Refuse to mount a filesystem if we are in a local zone and the
* dataset is not visible.
*/
if (!INGLOBALZONE(curproc) &&
goto out;
}
if (error)
goto out;
/*
* When doing a remount, we simply refresh our temporary properties
* according to those options set in the current VFS options.
*/
/* refresh mount options */
goto out;
}
/*
* Add an extra VFS_HOLD on our parent vfs so that it can't
* disappear due to a forced unmount.
*/
out:
return (error);
}
static int
{
/*
* The underlying storage pool actually uses multiple block sizes.
* We report the fragsize as the smallest block size we support,
* and we report our blocksize as the filesystem's maximum blocksize.
*/
/*
* The following report "total" blocks of various kinds in the
* file system, but reported in terms of f_frsize - the
* "fragment" size.
*/
/*
* statvfs() should really be called statufs(), because it assumes
* static metadata. ZFS doesn't preallocate files, so the best
* we can do is report the max that could possibly fit in f_files,
* and that minus the number actually used in f_ffree.
* For f_ffree, report the smaller of the number of objects available
* and the number of blocks (each object will take at least a block).
*/
/*
* We're a zfs filesystem.
*/
/*
* We have all of 32 characters to stuff a string here.
*/
return (0);
}
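/*
 * Illustrative sketch (not part of the original file): the statvfs
 * arithmetic described above, using 512-byte "fragments"
 * (SPA_MINBLOCKSHIFT) and byte/object counts as they would come from
 * dmu_objset_space().  The field names mirror struct statvfs64; the
 * helper and the local struct are hypothetical.
 */
#include <stdint.h>

#define	EX_MINBLOCKSHIFT	9	/* SPA_MINBLOCKSHIFT: 512-byte frags */

struct example_statvfs {
	uint64_t f_frsize;	/* fragment size */
	uint64_t f_bsize;	/* preferred block size */
	uint64_t f_blocks;	/* total frags */
	uint64_t f_bfree;	/* free frags */
	uint64_t f_bavail;	/* free frags available to non-root */
	uint64_t f_files;	/* max possible file count */
	uint64_t f_ffree;	/* "free" file slots */
};

static void
example_fill_statvfs(uint64_t refdbytes, uint64_t availbytes,
    uint64_t usedobjs, uint64_t availobjs, uint64_t max_blksz,
    struct example_statvfs *sp)
{
	sp->f_frsize = 1ULL << EX_MINBLOCKSHIFT;
	sp->f_bsize = max_blksz;

	/* Totals are reported in units of f_frsize. */
	sp->f_blocks = (refdbytes + availbytes) >> EX_MINBLOCKSHIFT;
	sp->f_bfree = availbytes >> EX_MINBLOCKSHIFT;
	sp->f_bavail = sp->f_bfree;	/* no root reservation */

	/* Each object needs at least one block, so cap by free blocks. */
	sp->f_ffree = (availobjs < sp->f_bfree) ? availobjs : sp->f_bfree;
	sp->f_files = sp->f_ffree + usedobjs;
}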
static int
{
int error;
if (error == 0)
return (error);
}
/*
* Teardown the zfsvfs::z_os.
*
* Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
* and 'z_teardown_inactive_lock' held.
*/
static int
{
if (!unmounting) {
/*
* We purge the parent filesystem's vfsp as the parent
* filesystem and all of its snapshots have their vnode's
* v_vfsp set to the parent filesystem's vfsp. Note,
* 'z_parent' is self-referential for non-snapshots.
*/
}
/*
* Close the zil. NB: Can't close the zil while zfs_inactive
* threads are blocked as zil_close can call zfs_inactive.
*/
}
/*
* If we are not unmounting (ie: online recv) and someone already
* unmounted this file system while we were doing the switcheroo,
* or a reopen of z_os failed then just bail out now.
*/
}
/*
* At this point there are no vops active, and any new vops will
* fail with EIO since we have z_teardown_lock for writer (only
* relevant for forced unmount).
*
* Release all holds on dbufs.
*/
}
/*
* If we are unmounting, set the unmounted flag and let new vops
* unblock. zfs_inactive will have the unmounted behavior, and all
* other vops will fail with EIO.
*/
if (unmounting) {
}
/*
* z_os will be NULL if there was an error in attempting to reopen
* zfsvfs, so just return as the properties had already been
* unregistered and cached data had been evicted before.
*/
return (0);
/*
* Unregister properties.
*/
/*
* Evict cached data
*/
return (0);
}
/*ARGSUSED*/
static int
{
int ret;
if (ret) {
return (ret);
}
/*
* We purge the parent filesystem's vfsp as the parent filesystem
* and all of its snapshots have their vnode's v_vfsp set to the
* parent filesystem's vfsp. Note, 'z_parent' is
* self-referential for non-snapshots.
*/
/*
* Unmount any snapshots mounted under .zfs before unmounting the
* dataset itself.
*/
return (ret);
}
/*
* Check the number of active vnodes in the file system.
* Our count is maintained in the vfs structure, but the
* number is off by 1 to indicate a hold on the vfs
* structure itself.
*
* The '.zfs' directory maintains a reference of its
* own, and any active references underneath are
* reflected in the vnode count.
*/
} else {
}
}
/*
* z_os will be NULL if there was an error in
* attempting to reopen zfsvfs.
*/
/*
* Unset the objset user_ptr.
*/
/*
* Finally release the objset
*/
}
/*
* We can now safely destroy the '.zfs' directory node.
*/
return (0);
}
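/*
 * Illustrative sketch (not part of the original file): the vnode-count
 * check described by the comment earlier in this function, for a
 * non-forced unmount.  vfs_count and v_count are the usual vfs/vnode
 * reference counts; the thresholds reflect the extra holds described
 * there (one on the vfs itself, plus one taken by the '.zfs'
 * directory when it exists).
 */
static int
example_busy_check(vfs_t *vfsp, vnode_t *ctldir)
{
	if (ctldir == NULL) {
		if (vfsp->vfs_count > 1)
			return (EBUSY);
	} else {
		if (vfsp->vfs_count > 2 || ctldir->v_count > 1)
			return (EBUSY);
	}
	return (0);
}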
static int
{
int i, err;
if (err)
}
} else {
}
/* A zero fid_gen means we are in the .zfs control directories */
if (fid_gen == 0 &&
if (object == ZFSCTL_INO_SNAPDIR) {
} else {
}
return (0);
}
return (err);
}
sizeof (uint64_t));
if (zp_gen == 0)
zp_gen = 1;
}
return (0);
}
/*
* Block out VOPs and close zfsvfs_t::z_os
*
* Note, if successful, then we return with the 'z_teardown_lock' and
* 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
* dataset and objset intact so that they can be atomically handed off during
* a subsequent rollback or recv operation and the resume thereafter.
*/
int
{
int error;
return (error);
return (0);
}
/*
* Rebuild SA and release VOPs. Note that ownership of the underlying dataset
* is an invariant across any of the operations that can be performed while the
* filesystem was suspended. Whether it succeeded or failed, the preconditions
* are the same: the relevant objset and associated dataset are owned by
* zfsvfs, held, and long held on entry.
*/
int
{
int err;
/*
* We already own this, so just hold and rele it to update the
* objset_t, as the one we had before may have been evicted.
*/
/*
* Make sure version hasn't changed
*/
if (err)
goto bail;
goto bail;
goto bail;
/*
* Attempt to re-establish all the active znodes with
* their dbufs. If a zfs_rezget() fails, then we'll let
* any potential callers discover that via ZFS_ENTER_VERIFY_VP
* when they try to use their znode.
*/
(void) zfs_rezget(zp);
}
bail:
/* release the VOPs */
if (err) {
/*
* Since we couldn't set up the SA framework, try to force
* unmount this file system.
*/
}
return (err);
}
static void
{
/*
* If this is a snapshot, we have an extra VFS_HOLD on our parent
* from zfs_mount(). Release it here. If we came through
* zfs_mountroot() instead, we didn't grab an extra hold, so
* skip the VFS_RELE for rootvfs.
*/
}
/*
* VFS_INIT() initialization. Note that there is no VFS_FINI(),
* so we can't safely do any non-idempotent initialization here.
* Leave that to zfs_init() and zfs_fini(), which are called
* from the module's _init() and _fini() entry points.
*/
/*ARGSUSED*/
static int
{
int error;
/*
* Set up vfsops and vnodeops tables.
*/
if (error != 0) {
}
if (error) {
(void) vfs_freevfsops_by_type(zfsfstype);
return (error);
}
/*
* Unique major number for all zfs mounts.
* If we run out of 32-bit minors, we'll getudev() another major.
*/
return (0);
}
void
zfs_init(void)
{
/*
* Initialize .zfs directory structures
*/
zfsctl_init();
/*
* Initialize znode cache, vnode ops, etc...
*/
}
void
zfs_fini(void)
{
zfsctl_fini();
}
int
zfs_busy(void)
{
return (zfs_active_fs_count != 0);
}
int
{
int error;
if (zfs_spa_version_map(newvers) >
}
if (error) {
return (error);
}
if (error) {
return (error);
}
DMU_OT_NONE, 0, tx);
}
return (0);
}
/*
* Read a property stored within the master node.
*/
int
{
const char *pname;
/*
* Look up the file system's value for the property. For the
* version property, we look up a slightly different string.
*/
if (prop == ZFS_PROP_VERSION)
else
/* No value set, use the default value */
switch (prop) {
case ZFS_PROP_VERSION:
*value = ZPL_VERSION;
break;
case ZFS_PROP_NORMALIZE:
case ZFS_PROP_UTF8ONLY:
*value = 0;
break;
case ZFS_PROP_CASE:
break;
default:
return (error);
}
error = 0;
}
return (error);
}
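/*
 * Illustrative sketch (not part of the original file): the
 * lookup-with-default pattern described above.  The master node is a
 * ZAP object, so a per-filesystem property value is fetched with
 * zap_lookup(); ENOENT falls back to a supplied default, mirroring the
 * switch on ZPL defaults above.  The helper name and the
 * one-8-byte-integer encoding are assumptions.
 */
static int
example_lookup_zpl_prop(objset_t *os, const char *pname, uint64_t defval,
    uint64_t *value)
{
	int error;

	/* Properties in the master node are stored as single 8-byte values. */
	error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
	if (error == ENOENT) {
		*value = defval;	/* nothing stored: use the default */
		error = 0;
	}
	return (error);
}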
};
struct modlfs zfs_modlfs = {
};