vfs.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/user.h>
#include <sys/fstyp.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vfs.h>
#include <sys/fem.h>
#include <sys/mntent.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/statfs.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/rwstlock.h>
#include <sys/dnlc.h>
#include <sys/file.h>
#include <sys/time.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/buf.h>
#include <sys/swap.h>
#include <sys/debug.h>
#include <sys/vnode.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/bootconf.h>
#include <sys/dumphdr.h>
#include <sys/dc_ki.h>
#include <sys/poll.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/zone.h>
#include <sys/policy.h>
#include <sys/ctfs.h>
#include <sys/objfs.h>
#include <sys/console.h>
#include <sys/reboot.h>
#include <vm/page.h>
#include <fs/fs_subr.h>
static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
static void vfs_setmntopt_nolock(mntopts_t *, const char *,
const char *, int, int);
static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
static void vfs_freemnttab(struct vfs *);
static void vfs_freeopt(mntopt_t *);
static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
static void vfs_createopttbl_extend(mntopts_t *, const char *,
const mntopts_t *);
static char **vfs_copycancelopt_extend(char **const, int);
static void vfs_freecancelopt(char **);
static char *getrootfs(void);
static int getmacpath(dev_info_t *, void *);
struct ipmnt {
struct ipmnt *mip_next;
dev_t mip_dev;
struct vfs *mip_vfsp;
};
static kmutex_t vfs_miplist_mutex;
static struct ipmnt *vfs_miplist = NULL;
static struct ipmnt *vfs_miplist_end = NULL;
/*
* VFS global data.
*/
vnode_t *rootdir; /* pointer to root inode vnode. */
vnode_t *devicesdir; /* pointer to inode of devices root */
char *server_rootpath; /* root path for diskless clients */
char *server_hostname; /* hostname of diskless server */
static struct vfs root;
static struct vfs devices;
struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */
int vfshsz = 512; /* # of heads/locks in vfs hash arrays */
/* must be power of 2! */
timespec_t vfs_mnttab_ctime; /* mnttab created time */
timespec_t vfs_mnttab_mtime; /* mnttab last modified time */
char *vfs_dummyfstype = "\0";
struct pollhead vfs_pollhd; /* for mnttab pollers */
/*
* Table for generic options recognized in the VFS layer and acted
* on at this level before parsing file system specific options.
* The nosuid option is stronger than any of the devices and setuid
* options, so those are canceled when nosuid is seen.
*
* All options which are added here need to be added to the
* list of standard options in usr/src/cmd/fs.d/fslib.c as well.
*/
/*
* VFS Mount options table
*/
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
static const mntopt_t mntopts[] = {
/*
* option name cancel options default arg flags
*/
{ MNTOPT_REMOUNT, NULL, NULL,
MO_NODISPLAY, (void *)0 },
{ MNTOPT_RO, ro_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_RW, rw_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_SUID, suid_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_NOSUID, nosuid_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_DEVICES, devices_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_NODEVICES, nodevices_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_SETUID, setuid_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_NBMAND, nbmand_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_EXEC, exec_cancel, NULL, 0,
(void *)0 },
{ MNTOPT_NOEXEC, noexec_cancel, NULL, 0,
(void *)0 },
};
const mntopts_t vfs_mntopts = {
sizeof (mntopts) / sizeof (mntopt_t),
(mntopt_t *)&mntopts[0]
};
/*
* File system operation dispatch functions.
*/
int
fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
}
int
fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
{
return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
}
int
fsop_root(vfs_t *vfsp, vnode_t **vpp)
{
refstr_t *mntpt;
int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
/*
* Make sure this root has a path. With lofs, it is possible to have
* a NULL mountpoint.
*/
if (vfs_vnode_path && ret == 0 && vfsp->vfs_mntpt != NULL &&
vn_path(*vpp) == NULL) {
mntpt = vfs_getmntpoint(vfsp);
vn_setpath_str(*vpp, refstr_value(mntpt),
strlen(refstr_value(mntpt)));
refstr_rele(mntpt);
}
return (ret);
}
int
fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
{
return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
}
int
fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
{
return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
}
int
fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
{
return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
}
int
fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
{
return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
}
void
fsop_freefs(vfs_t *vfsp)
{
(*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
}
int
fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
{
return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
}
int
fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
{
ASSERT((fstype >= 0) && (fstype < nfstype));
if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
else
return (ENOTSUP);
}
/*
* File system initialization. vfs_setfsops() must be called from a file
* system's init routine.
*/
static int
fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
int *unused_ops)
{
static const fs_operation_trans_def_t vfs_ops_table[] = {
VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
fs_nosys, fs_nosys,
VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
fs_nosys, fs_nosys,
VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
fs_nosys, fs_nosys,
VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
fs_nosys, fs_nosys,
VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
(fs_generic_func_p) fs_sync,
(fs_generic_func_p) fs_sync, /* No errors allowed */
VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
fs_nosys, fs_nosys,
VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
fs_nosys, fs_nosys,
VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
(fs_generic_func_p)fs_freevfs,
(fs_generic_func_p)fs_freevfs, /* Shouldn't fail */
VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
(fs_generic_func_p)fs_nosys,
(fs_generic_func_p)fs_nosys,
NULL, 0, NULL, NULL
};
return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
}
int
vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
{
int error;
int unused_ops;
/* Verify that fstype refers to a loaded fs (and not fsid 0). */
if ((fstype <= 0) || (fstype >= nfstype))
return (EINVAL);
if (!ALLOCATED_VFSSW(&vfssw[fstype]))
return (EINVAL);
/* Set up the operations vector. */
error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
if (error != 0)
return (error);
vfssw[fstype].vsw_flag |= VSW_INSTALLED;
if (actual != NULL)
*actual = &vfssw[fstype].vsw_vfsops;
#if DEBUG
if (unused_ops != 0)
cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
"but not used", vfssw[fstype].vsw_name, unused_ops);
#endif
return (0);
}
int
vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
{
int error;
int unused_ops;
*actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
error = fs_copyfsops(template, *actual, &unused_ops);
if (error != 0) {
kmem_free(*actual, sizeof (vfsops_t));
*actual = NULL;
return (error);
}
return (0);
}
/*
* Free a vfsops structure created as a result of vfs_makefsops().
* NOTE: For a vfsops structure initialized by vfs_setfsops(), use
* vfs_freevfsops_by_type().
*/
void
vfs_freevfsops(vfsops_t *vfsops)
{
kmem_free(vfsops, sizeof (vfsops_t));
}
/*
* Since the vfsops structure is part of the vfssw table and wasn't
* really allocated, we're not really freeing anything. We keep
* the name for consistency with vfs_freevfsops(). We do, however,
* need to take care of a little bookkeeping.
* NOTE: For a vfsops structure created by vfs_setfsops(), use
* vfs_freevfsops_by_type().
*/
int
vfs_freevfsops_by_type(int fstype)
{
/* Verify that fstype refers to a loaded fs (and not fsid 0). */
if ((fstype <= 0) || (fstype >= nfstype))
return (EINVAL);
WLOCK_VFSSW();
if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
WUNLOCK_VFSSW();
return (EINVAL);
}
vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
WUNLOCK_VFSSW();
return (0);
}
/* Support routines used to reference vfs_op */
/* Set the operations vector for a vfs */
void
vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
{
vfsops_t *op;
ASSERT(vfsp != NULL);
ASSERT(vfsops != NULL);
op = vfsp->vfs_op;
membar_consumer();
if (vfsp->vfs_femhead == NULL &&
casptr(&vfsp->vfs_op, op, vfsops) == op) {
return;
}
fsem_setvfsops(vfsp, vfsops);
}
/* Retrieve the operations vector for a vfs */
vfsops_t *
vfs_getops(vfs_t *vfsp)
{
vfsops_t *op;
ASSERT(vfsp != NULL);
op = vfsp->vfs_op;
membar_consumer();
if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
return (op);
} else {
return (fsem_getvfsops(vfsp));
}
}
/*
* Returns non-zero (1) if the vfsops matches that of the vfs.
* Returns zero (0) if not.
*/
int
vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
{
return (vfs_getops(vfsp) == vfsops);
}
/*
* Returns non-zero (1) if the file system has installed a non-default,
* non-error vfs_sync routine. Returns zero (0) otherwise.
*/
int
vfs_can_sync(vfs_t *vfsp)
{
/* vfs_sync() routine is not the default/error function */
return (vfs_getops(vfsp)->vfs_sync != fs_sync);
}
/*
* Initialize a vfs structure.
*/
void
vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
{
vfsp->vfs_count = 0;
vfsp->vfs_next = vfsp;
vfsp->vfs_prev = vfsp;
vfsp->vfs_zone_next = vfsp;
vfsp->vfs_zone_prev = vfsp;
vfsp->vfs_flag = 0;
vfsp->vfs_data = (data);
vfsp->vfs_resource = NULL;
vfsp->vfs_mntpt = NULL;
vfsp->vfs_mntopts.mo_count = 0;
vfsp->vfs_mntopts.mo_list = NULL;
vfsp->vfs_femhead = NULL;
vfsp->vfs_zone = NULL;
vfs_setops((vfsp), (op));
sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
}
/*
* VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
* fstatvfs, and sysfs moved to common/syscall.
*/
/*
* Update every mounted file system. We call the vfs_sync operation of
* each file system type, passing it a NULL vfsp to indicate that all
* mounted file systems of that type should be updated.
*/
void
vfs_sync(int flag)
{
struct vfssw *vswp;
RLOCK_VFSSW();
for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
vfs_refvfssw(vswp);
RUNLOCK_VFSSW();
(void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
CRED());
vfs_unrefvfssw(vswp);
RLOCK_VFSSW();
}
}
RUNLOCK_VFSSW();
}
void
sync(void)
{
vfs_sync(0);
}
/*
* External routines.
*/
krwlock_t vfssw_lock; /* lock accesses to vfssw */
/*
* Lock for accessing the vfs linked list. Initialized in vfs_mountroot(),
* but otherwise should be accessed only via vfs_list_lock() and
* vfs_list_unlock(). Also used to protect the timestamp for mods to the list.
*/
static krwlock_t vfslist;
/*
* Mount devfs on /devices. This is done right after root is mounted
* to provide device access support for the system
*/
static void
vfs_mountdevices(void)
{
struct vfssw *vsw;
struct vnode *mvp;
struct mounta mounta = { /* fake mounta for devfs_mount() */
NULL,
NULL,
MS_SYSSPACE,
NULL,
NULL,
0,
NULL,
0
};
/*
* _init devfs module to fill in the vfssw
*/
if (modload("fs", "devfs") == -1)
cmn_err(CE_PANIC, "Cannot _init devfs module\n");
/*
* Hold vfs
*/
RLOCK_VFSSW();
vsw = vfs_getvfsswbyname("devfs");
VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
VFS_HOLD(&devices);
/*
* Locate mount point
*/
if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
cmn_err(CE_PANIC, "Cannot find /devices\n");
/*
* Perform the mount of /devices
*/
if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
cmn_err(CE_PANIC, "Cannot mount /devices\n");
RUNLOCK_VFSSW();
/*
* Set appropriate members and add to vfs list for mnttab display
*/
vfs_setresource(&devices, "/devices");
vfs_setmntpoint(&devices, "/devices");
/*
* Hold the root of /devices so it won't go away
*/
if (VFS_ROOT(&devices, &devicesdir))
cmn_err(CE_PANIC, "vfs_mountdevices: not devices root");
VN_HOLD(devicesdir);
if (vfs_lock(&devices) != 0) {
cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
return;
}
if (vn_vfswlock(mvp) != 0) {
vfs_unlock(&devices);
cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
return;
}
vfs_add(mvp, &devices, 0);
vn_vfsunlock(mvp);
vfs_unlock(&devices);
}
/*
* Mount required filesystem. This is done right after root is mounted.
*/
static void
vfs_mountfs(char *module, char *spec, char *path)
{
struct vnode *mvp;
struct mounta mounta;
vfs_t *vfsp;
mounta.flags = MS_SYSSPACE | MS_DATA;
mounta.fstype = module;
mounta.spec = spec;
mounta.dir = path;
if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
cmn_err(CE_WARN, "Cannot find %s\n", path);
return;
}
if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
cmn_err(CE_WARN, "Cannot mount %s\n", path);
else
VFS_RELE(vfsp);
VN_RELE(mvp);
}
/*
* vfs_mountroot is called by main() to mount the root filesystem.
*/
void
vfs_mountroot(void)
{
struct vnode *rvp = NULL;
char *path;
size_t plen;
rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
/*
* Alloc the vfs hash bucket array and locks
*/
rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
/*
* Call machine-dependent routine "rootconf" to choose a root
* file system type.
*/
if (rootconf())
cmn_err(CE_PANIC, "vfs_mountroot: cannot mount root");
/*
* Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir
* to point to it. These are used by lookuppn() so that it
* knows where to start from ('/' or '.').
*/
vfs_setmntpoint(rootvfs, "/");
if (VFS_ROOT(rootvfs, &rootdir))
cmn_err(CE_PANIC, "vfs_mountroot: no root vnode");
u.u_cdir = rootdir;
VN_HOLD(u.u_cdir);
u.u_rdir = NULL;
/*
* Setup the global zone's rootvp, now that it exists.
*/
global_zone->zone_rootvp = rootdir;
VN_HOLD(global_zone->zone_rootvp);
/*
* Notify the module code that it can begin using the
* root filesystem instead of the boot program's services.
*/
modrootloaded = 1;
/*
* Set up mnttab information for root
*/
vfs_setresource(rootvfs, rootfs.bo_name);
/*
* Notify cluster software that the root filesystem is available.
*/
clboot_mountroot();
/*
* Mount /devices, /system/contract, /etc/mnttab, /etc/svc/volatile,
* /system/object, and /proc.
*/
vfs_mountdevices();
vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
vfs_mountfs("proc", "/proc", "/proc");
vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
#ifdef __sparc
/*
* This bit of magic can go away when we convert sparc to
* the new boot architecture based on ramdisk.
*
* Booting off a mirrored root volume:
* At this point, we have booted and mounted root on a
* single component of the mirror. Complete the boot
* by configuring SVM and converting the root to the
* dev_t of the mirrored root device. This dev_t conversion
* only works because the underlying device doesn't change.
*/
if (root_is_svm) {
if (svm_rootconf()) {
cmn_err(CE_PANIC, "vfs_mountroot: cannot remount root");
}
/*
* mnttab should reflect the new root device
*/
vfs_lock_wait(rootvfs);
vfs_setresource(rootvfs, rootfs.bo_name);
vfs_unlock(rootvfs);
}
#endif /* __sparc */
/*
* Look up the root device via devfs so that a dv_node is
* created for it. The vnode is never VN_RELE()ed.
* We allocate more than MAXPATHLEN so that the
* buffer passed to i_ddi_prompath_to_devfspath() is
* exactly MAXPATHLEN (the function expects a buffer
* of that length).
*/
plen = strlen("/devices");
path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
(void) strcpy(path, "/devices");
if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
!= DDI_SUCCESS ||
lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
/* NUL terminate in case "path" has garbage */
path[plen + MAXPATHLEN - 1] = '\0';
#ifdef DEBUG
cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
#endif
}
kmem_free(path, plen + MAXPATHLEN);
}
/*
* Common mount code. Called from the system call entry point, from autofs,
* and from pxfs.
*
* Takes the effective file system type, mount arguments, the mount point
* vnode, flags specifying whether the mount is a remount and whether it
* should be entered into the vfs list, and credentials. Fills in its vfspp
* parameter with the mounted file system instance's vfs.
*
* Note that the effective file system type is specified as a string. It may
* be null, in which case it's determined from the mount arguments, and may
* differ from the type specified in the mount arguments; this is a hook to
* allow interposition when instantiating file system instances.
*
* The caller is responsible for releasing its own hold on the mount point
* vp (this routine does its own hold when necessary).
* Also note that for remounts, the mount point vp should be the vnode for
* the root of the file system rather than the vnode that the file system
* is mounted on top of.
*/
int
domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
struct vfs **vfspp)
{
struct vfssw *vswp;
vfsops_t *vfsops;
struct vfs *vfsp;
struct vnode *bvp;
dev_t bdev = 0;
mntopts_t mnt_mntopts;
int error = 0;
int copyout_error = 0;
int ovflags;
char *opts = uap->optptr;
char *inargs = opts;
int optlen = uap->optlen;
int remount;
int rdonly;
int nbmand = 0;
int delmip = 0;
int addmip = 0;
int splice = ((uap->flags & MS_NOSPLICE) == 0);
int fromspace = (uap->flags & MS_SYSSPACE) ?
UIO_SYSSPACE : UIO_USERSPACE;
char *resource = NULL, *mountpt = NULL;
refstr_t *oldresource, *oldmntpt;
struct pathname pn, rpn;
/*
* The v_flag value for the mount point vp is permanently set
* to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
* for mount point locking.
*/
mutex_enter(&vp->v_lock);
vp->v_flag |= VVFSLOCK;
mutex_exit(&vp->v_lock);
mnt_mntopts.mo_count = 0;
/*
* Find the ops vector to use to invoke the file system-specific mount
* method. If the fsname argument is non-NULL, use it directly.
* Otherwise, dig the file system type information out of the mount
* arguments.
*
* A side effect is to hold the vfssw entry.
*
* Mount arguments can be specified in several ways, which are
* distinguished by flag bit settings. The preferred way is to set
* MS_OPTIONSTR, indicating an 8 argument mount with the file system
* type supplied as a character string and the last two arguments
* being a pointer to a character buffer and the size of the buffer.
* On entry, the buffer holds a null terminated list of options; on
* return, the string is the list of options the file system
* recognized. If MS_DATA is set arguments five and six point to a
* block of binary data which the file system interprets.
* A further wrinkle is that some callers don't set MS_FSS and MS_DATA
* consistently with these conventions. To handle them, we check to
* see whether the pointer to the file system name has a numeric value
* less than 256. If so, we treat it as an index.
*/
if (fsname != NULL) {
if ((vswp = vfs_getvfssw(fsname)) == NULL) {
return (EINVAL);
}
} else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
size_t n;
uint_t fstype;
char name[FSTYPSZ];
if ((fstype = (uintptr_t)uap->fstype) < 256) {
RLOCK_VFSSW();
if (fstype == 0 || fstype >= nfstype ||
!ALLOCATED_VFSSW(&vfssw[fstype])) {
RUNLOCK_VFSSW();
return (EINVAL);
}
(void) strcpy(name, vfssw[fstype].vsw_name);
RUNLOCK_VFSSW();
if ((vswp = vfs_getvfssw(name)) == NULL)
return (EINVAL);
} else {
/*
* Handle either kernel or user address space.
*/
if (uap->flags & MS_SYSSPACE) {
error = copystr(uap->fstype, name,
FSTYPSZ, &n);
} else {
error = copyinstr(uap->fstype, name,
FSTYPSZ, &n);
}
if (error) {
if (error == ENAMETOOLONG)
return (EINVAL);
return (error);
}
if ((vswp = vfs_getvfssw(name)) == NULL)
return (EINVAL);
}
} else {
if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
return (EINVAL);
}
if (!VFS_INSTALLED(vswp))
return (EINVAL);
vfsops = &vswp->vsw_vfsops;
vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
/*
* Fetch mount options and parse them for generic vfs options
*/
if (uap->flags & MS_OPTIONSTR) {
/*
* Limit the buffer size
*/
if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
error = EINVAL;
goto errout;
}
if ((uap->flags & MS_SYSSPACE) == 0) {
inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
inargs[0] = '\0';
if (optlen) {
error = copyinstr(opts, inargs, (size_t)optlen,
NULL);
if (error) {
goto errout;
}
}
}
vfs_parsemntopts(&mnt_mntopts, inargs, 0);
}
/*
* Flag bits override the options string.
*/
if (uap->flags & MS_REMOUNT)
vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
if (uap->flags & MS_RDONLY)
vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
if (uap->flags & MS_NOSUID)
vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
/*
* Check if this is a remount; must be set in the option string and
* the file system must support a remount option.
*/
if (remount = vfs_optionisset_nolock(&mnt_mntopts,
MNTOPT_REMOUNT, NULL)) {
if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
error = ENOTSUP;
goto errout;
}
uap->flags |= MS_REMOUNT;
}
/*
* uap->flags and vfs_optionisset() should agree.
*/
if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
uap->flags |= MS_RDONLY;
}
if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
uap->flags |= MS_NOSUID;
}
nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
ASSERT(splice || !remount);
/*
* If we are splicing the fs into the namespace,
* perform mount point checks.
*
* We want to resolve the path for the mount point to eliminate
* '.' and ".." and symlinks in mount points; we can't do the
* same for the resource string, since it would turn
* "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do
* this before grabbing vn_vfswlock(), because otherwise we
* would deadlock with lookuppn().
*/
if (splice) {
ASSERT(vp->v_count > 0);
/*
* Pick up mount point and device from appropriate space.
*/
if (pn_get(uap->spec, fromspace, &pn) == 0) {
resource = kmem_alloc(pn.pn_pathlen + 1,
KM_SLEEP);
(void) strcpy(resource, pn.pn_path);
pn_free(&pn);
}
/*
* Do a lookupname prior to taking the
* writelock. Mark this as completed if
* successful for later cleanup and addition to
* the mount in progress table.
*/
if ((uap->flags & MS_GLOBAL) == 0 &&
lookupname(uap->spec, fromspace,
FOLLOW, NULL, &bvp) == 0) {
addmip = 1;
}
if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
pathname_t *pnp;
if (*pn.pn_path != '/') {
error = EINVAL;
pn_free(&pn);
goto errout;
}
pn_alloc(&rpn);
/*
* Kludge to prevent autofs from deadlocking with
* itself when it calls domount().
*
* If autofs is calling, it is because it is doing
* (autofs) mounts in the process of an NFS mount. A
* lookuppn() here would cause us to block waiting for
* said NFS mount to complete, which can't since this
* is the thread that was supposed to doing it.
*/
if (fromspace == UIO_USERSPACE) {
if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
NULL)) == 0) {
pnp = &rpn;
} else {
/*
* The file disappeared or otherwise
* became inaccessible since we opened
* it; might as well fail the mount
* since the mount point is no longer
* accessible.
*/
pn_free(&rpn);
pn_free(&pn);
goto errout;
}
} else {
pnp = &pn;
}
mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
(void) strcpy(mountpt, pnp->pn_path);
/*
* If the addition of the zone's rootpath
* would push us over a total path length
* of MAXPATHLEN, we fail the mount with
* ENAMETOOLONG, which is what we would have
* gotten if we were trying to perform the same
* mount in the global zone.
*
* strlen() doesn't count the trailing
* '\0', but zone_rootpathlen counts both a
* trailing '/' and the terminating '\0'.
*/
if ((curproc->p_zone->zone_rootpathlen - 1 +
strlen(mountpt)) > MAXPATHLEN ||
(resource != NULL &&
(curproc->p_zone->zone_rootpathlen - 1 +
strlen(resource)) > MAXPATHLEN)) {
error = ENAMETOOLONG;
}
pn_free(&rpn);
pn_free(&pn);
}
if (error)
goto errout;
/*
* Prevent path name resolution from proceeding past
* the mount point.
*/
if (vn_vfswlock(vp) != 0) {
error = EBUSY;
goto errout;
}
/*
* Verify that it's legitimate to establish a mount on
* the prospective mount point.
*/
if (vn_mountedvfs(vp) != NULL) {
/*
* The mount point lock was obtained after some
* other thread raced through and established a mount.
*/
vn_vfsunlock(vp);
error = EBUSY;
goto errout;
}
if (vp->v_flag & VNOMOUNT) {
vn_vfsunlock(vp);
error = EINVAL;
goto errout;
}
}
if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
uap->dataptr = NULL;
uap->datalen = 0;
}
/*
* If this is a remount, we don't want to create a new VFS.
* Instead, we pass the existing one with a remount flag.
*/
if (remount) {
/*
* Confirm that the mount point is the root vnode of the
* file system that is being remounted.
* This can happen if the user specifies a different
* mount point directory pathname in the (re)mount command.
*
* Code below can only be reached if splice is true, so it's
* safe to do vn_vfsunlock() here.
*/
if ((vp->v_flag & VROOT) == 0) {
vn_vfsunlock(vp);
error = ENOENT;
goto errout;
}
/*
* Disallow making file systems read-only unless file system
* explicitly allows it in its vfssw. Ignore other flags.
*/
if (rdonly && vn_is_readonly(vp) == 0 &&
(vswp->vsw_flag & VSW_CANRWRO) == 0) {
vn_vfsunlock(vp);
error = EINVAL;
goto errout;
}
/*
* Changing the NBMAND setting on remounts is permitted
* but logged since it can lead to unexpected behavior.
* We also counsel against using it for / and /usr.
*/
if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
(!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
cmn_err(CE_WARN, "domount: nbmand turned %s via "
"remounting %s", nbmand ? "on" : "off",
refstr_value(vp->v_vfsp->vfs_mntpt));
}
vfsp = vp->v_vfsp;
ovflags = vfsp->vfs_flag;
vfsp->vfs_flag |= VFS_REMOUNT;
vfsp->vfs_flag &= ~VFS_RDONLY;
} else {
vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP);
VFS_INIT(vfsp, vfsops, NULL);
}
VFS_HOLD(vfsp);
/*
* The vfs_reflock is not used anymore the code below explicitly
* holds it preventing others accesing it directly.
*/
if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
!(vfsp->vfs_flag & VFS_REMOUNT))
cmn_err(CE_WARN,
"mount type %s couldn't get vfs_reflock\n", vswp->vsw_name);
/*
* Lock the vfs. If this is a remount we want to avoid spurious umount
* failures that happen as a side-effect of fsflush() and other mount
* and unmount operations that might be going on simultaneously and
* may have locked the vfs currently. To not return EBUSY immediately
* here we use vfs_lock_wait() instead vfs_lock() for the remount case.
*/
if (!remount) {
if (error = vfs_lock(vfsp)) {
vfsp->vfs_flag = ovflags;
if (splice)
vn_vfsunlock(vp);
kmem_free(vfsp, sizeof (struct vfs));
goto errout;
}
} else {
vfs_lock_wait(vfsp);
}
/*
* Add device to mount in progress table, global mounts require special
* handling. It is possible that we have already done the lookupname
* on a spliced, non-global fs. If so, we don't want to do it again
* since we cannot do a lookupname after taking the
* wlock above. This case is for a non-spliced, non-global filesystem.
*/
if (!addmip) {
if ((uap->flags & MS_GLOBAL) == 0 &&
lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
addmip = 1;
}
}
if (addmip) {
bdev = bvp->v_rdev;
VN_RELE(bvp);
vfs_addmip(bdev, vfsp);
addmip = 0;
delmip = 1;
}
/*
* Invalidate cached entry for the mount point.
*/
if (splice)
dnlc_purge_vp(vp);
/*
* If have an option string but the filesystem doesn't supply a
* prototype options table, create a table with the global
* options and sufficient room to accept all the options in the
* string. Then parse the passed in option string
* accepting all the options in the string. This gives us an
* option table with all the proper cancel properties for the
* global options.
*
* Filesystems that supply a prototype options table are handled
* earlier in this function.
*/
if (uap->flags & MS_OPTIONSTR) {
if (!(vswp->vsw_flag & VSW_HASPROTO)) {
mntopts_t tmp_mntopts;
tmp_mntopts.mo_count = 0;
vfs_createopttbl_extend(&tmp_mntopts, inargs,
&mnt_mntopts);
vfs_parsemntopts(&tmp_mntopts, inargs, 1);
vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
vfs_freeopttbl(&tmp_mntopts);
}
}
/*
* Serialize with zone creations.
*/
mount_in_progress();
/*
* Instantiate (or reinstantiate) the file system. If appropriate,
* splice it into the file system name space.
*
* We want VFS_MOUNT() to be able to override the vfs_resource
* string if necessary (ie, mntfs), and also for a remount to
* change the same (necessary when remounting '/' during boot).
* So we set up vfs_mntpt and vfs_resource to what we think they
* should be, then hand off control to VFS_MOUNT() which can
* override this.
*
* For safety's sake, when changing vfs_resource or vfs_mntpt of
* a vfs which is on the vfs list (i.e. during a remount), we must
* never set those fields to NULL. Several bits of code make
* assumptions that the fields are always valid.
*/
vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
if (remount) {
if ((oldresource = vfsp->vfs_resource) != NULL)
refstr_hold(oldresource);
if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
refstr_hold(oldmntpt);
}
vfs_setresource(vfsp, resource);
vfs_setmntpoint(vfsp, mountpt);
error = VFS_MOUNT(vfsp, vp, uap, credp);
if (uap->flags & MS_RDONLY)
vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
if (uap->flags & MS_NOSUID)
vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
if (uap->flags & MS_GLOBAL)
vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
if (error) {
if (remount) {
/* put back pre-remount options */
vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
vfs_setmntpoint(vfsp, refstr_value(oldmntpt));
if (oldmntpt)
refstr_rele(oldmntpt);
vfs_setresource(vfsp, refstr_value(oldresource));
if (oldresource)
refstr_rele(oldresource);
vfsp->vfs_flag = ovflags;
vfs_unlock(vfsp);
VFS_RELE(vfsp);
} else {
vfs_unlock(vfsp);
vfs_freemnttab(vfsp);
kmem_free(vfsp, sizeof (struct vfs));
}
} else {
/*
* Set the mount time to now
*/
vfsp->vfs_mtime = ddi_get_time();
if (remount) {
vfsp->vfs_flag &= ~VFS_REMOUNT;
if (oldresource)
refstr_rele(oldresource);
if (oldmntpt)
refstr_rele(oldmntpt);
} else if (splice) {
/*
* Link vfsp into the name space at the mount
* point. Vfs_add() is responsible for
* holding the mount point which will be
* released when vfs_remove() is called.
*/
vfs_add(vp, vfsp, uap->flags);
} else {
/*
* Hold the reference to file system which is
* not linked into the name space.
*/
vfsp->vfs_zone = NULL;
VFS_HOLD(vfsp);
vfsp->vfs_vnodecovered = NULL;
}
/*
* Set flags for global options encountered
*/
if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
vfsp->vfs_flag |= VFS_RDONLY;
else
vfsp->vfs_flag &= ~VFS_RDONLY;
if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
} else {
if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
vfsp->vfs_flag |= VFS_NODEVICES;
else
vfsp->vfs_flag &= ~VFS_NODEVICES;
if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
vfsp->vfs_flag |= VFS_NOSETUID;
else
vfsp->vfs_flag &= ~VFS_NOSETUID;
}
if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
vfsp->vfs_flag |= VFS_NBMAND;
else
vfsp->vfs_flag &= ~VFS_NBMAND;
if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
vfsp->vfs_flag |= VFS_XATTR;
else
vfsp->vfs_flag &= ~VFS_XATTR;
if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
vfsp->vfs_flag |= VFS_NOEXEC;
else
vfsp->vfs_flag &= ~VFS_NOEXEC;
/*
* Now construct the output option string of options
* we recognized.
*/
if (uap->flags & MS_OPTIONSTR) {
vfs_list_read_lock();
copyout_error = vfs_buildoptionstr(
&vfsp->vfs_mntopts, inargs, optlen);
vfs_list_unlock();
if (copyout_error == 0 &&
(uap->flags & MS_SYSSPACE) == 0) {
copyout_error = copyoutstr(inargs, opts,
optlen, NULL);
}
}
vfs_unlock(vfsp);
}
mount_completed();
if (splice)
vn_vfsunlock(vp);
/*
* Return vfsp to caller.
*/
if ((error == 0) && (copyout_error == 0)) {
*vfspp = vfsp;
}
errout:
vfs_freeopttbl(&mnt_mntopts);
if (resource != NULL)
kmem_free(resource, strlen(resource) + 1);
if (mountpt != NULL)
kmem_free(mountpt, strlen(mountpt) + 1);
/*
* It is possible we errored prior to adding to mount in progress
* table. Must free vnode we acquired with successful lookupname.
*/
if (addmip)
VN_RELE(bvp);
if (delmip)
vfs_delmip(vfsp);
ASSERT(vswp != NULL);
vfs_unrefvfssw(vswp);
if (inargs != opts)
kmem_free(inargs, MAX_MNTOPT_STR);
if (copyout_error) {
VFS_RELE(vfsp);
error = copyout_error;
}
return (error);
}
static void
vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath)
{
size_t len;
refstr_t *ref;
zone_t *zone = curproc->p_zone;
char *sp;
int have_list_lock = 0;
ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
/*
* New path must be less than MAXPATHLEN because mntfs
* will only display up to MAXPATHLEN bytes. This is currently
* safe, because domount() uses pn_get(), and other callers
* similarly cap the size to fewer than MAXPATHLEN bytes.
*/
ASSERT(strlen(newpath) < MAXPATHLEN);
/* mntfs requires consistency while vfs list lock is held */
if (VFS_ON_LIST(vfsp)) {
have_list_lock = 1;
vfs_list_lock();
}
if (*refp != NULL)
refstr_rele(*refp);
/* Do we need to modify the path? */
if (zone == global_zone || *newpath != '/') {
ref = refstr_alloc(newpath);
goto out;
}
/*
* Truncate the trailing '/' in the zoneroot, and merge
* in the zone's rootpath with the "newpath" (resource
* or mountpoint) passed in.
*
* The size of the required buffer is thus the size of
* the buffer required for the passed-in newpath
* (strlen(newpath) + 1), plus the size of the buffer
* required to hold zone_rootpath (zone_rootpathlen)
* minus one for one of the now-superfluous NUL
* terminations, minus one for the trailing '/'.
*
* That gives us:
*
* (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
*
* Which is what we have below.
*/
len = strlen(newpath) + zone->zone_rootpathlen - 1;
sp = kmem_alloc(len, KM_SLEEP);
/*
* Copy everything including the trailing slash, which
* we then overwrite with the NUL character.
*/
(void) strcpy(sp, zone->zone_rootpath);
sp[zone->zone_rootpathlen - 2] = '\0';
(void) strcat(sp, newpath);
ref = refstr_alloc(sp);
kmem_free(sp, len);
out:
*refp = ref;
if (have_list_lock) {
vfs_mnttab_modtimeupd();
vfs_list_unlock();
}
}
/*
* Record a mounted resource name in a vfs structure.
* If vfsp is already mounted, caller must hold the vfs lock.
*/
void
vfs_setresource(struct vfs *vfsp, const char *resource)
{
if (resource == NULL || resource[0] == '\0')
resource = VFS_NORESOURCE;
vfs_setpath(vfsp, &vfsp->vfs_resource, resource);
}
/*
* Record a mount point name in a vfs structure.
* If vfsp is already mounted, caller must hold the vfs lock.
*/
void
vfs_setmntpoint(struct vfs *vfsp, const char *mntpt)
{
if (mntpt == NULL || mntpt[0] == '\0')
mntpt = VFS_NOMNTPT;
vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt);
}
/* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
refstr_t *
vfs_getresource(const struct vfs *vfsp)
{
refstr_t *resource;
vfs_list_read_lock();
resource = vfsp->vfs_resource;
refstr_hold(resource);
vfs_list_unlock();
return (resource);
}
/* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
refstr_t *
vfs_getmntpoint(const struct vfs *vfsp)
{
refstr_t *mntpt;
vfs_list_read_lock();
mntpt = vfsp->vfs_mntpt;
refstr_hold(mntpt);
vfs_list_unlock();
return (mntpt);
}
/*
* Create an empty options table with enough empty slots to hold all
* The options in the options string passed as an argument.
* Potentially prepend another options table.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect mops.
*/
static void
vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
const mntopts_t *mtmpl)
{
const char *s = opts;
uint_t count;
if (opts == NULL || *opts == '\0') {
count = 0;
} else {
count = 1;
/*
* Count number of options in the string
*/
for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
count++;
s++;
}
}
vfs_copyopttbl_extend(mtmpl, mops, count);
}
/*
* Create an empty options table with enough empty slots to hold all
* The options in the options string passed as an argument.
*
* This function is *not* for general use by filesystems.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect mops.
*/
void
vfs_createopttbl(mntopts_t *mops, const char *opts)
{
vfs_createopttbl_extend(mops, opts, NULL);
}
/*
* Swap two mount options tables
*/
static void
vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
{
uint_t tmpcnt;
mntopt_t *tmplist;
tmpcnt = optbl2->mo_count;
tmplist = optbl2->mo_list;
optbl2->mo_count = optbl1->mo_count;
optbl2->mo_list = optbl1->mo_list;
optbl1->mo_count = tmpcnt;
optbl1->mo_list = tmplist;
}
static void
vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
{
vfs_list_lock();
vfs_swapopttbl_nolock(optbl1, optbl2);
vfs_mnttab_modtimeupd();
vfs_list_unlock();
}
static char **
vfs_copycancelopt_extend(char **const moc, int extend)
{
int i = 0;
int j;
char **result;
if (moc != NULL) {
for (; moc[i] != NULL; i++)
/* count number of options to cancel */;
}
if (i + extend == 0)
return (NULL);
result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
for (j = 0; j < i; j++) {
result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
(void) strcpy(result[j], moc[j]);
}
for (; j <= i + extend; j++)
result[j] = NULL;
return (result);
}
static void
vfs_copyopt(const mntopt_t *s, mntopt_t *d)
{
char *sp, *dp;
d->mo_flags = s->mo_flags;
d->mo_data = s->mo_data;
sp = s->mo_name;
if (sp != NULL) {
dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
(void) strcpy(dp, sp);
d->mo_name = dp;
} else {
d->mo_name = NULL; /* should never happen */
}
d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
sp = s->mo_arg;
if (sp != NULL) {
dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
(void) strcpy(dp, sp);
d->mo_arg = dp;
} else {
d->mo_arg = NULL;
}
}
/*
* Copy a mount options table, possibly allocating some spare
* slots at the end. It is permissible to copy_extend the NULL table.
*/
static void
vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
{
uint_t i, count;
mntopt_t *motbl;
/*
* Clear out any existing stuff in the options table being initialized
*/
vfs_freeopttbl(dmo);
count = (smo == NULL) ? 0 : smo->mo_count;
if ((count + extra) == 0) /* nothing to do */
return;
dmo->mo_count = count + extra;
motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
dmo->mo_list = motbl;
for (i = 0; i < count; i++) {
vfs_copyopt(&smo->mo_list[i], &motbl[i]);
}
for (i = count; i < count + extra; i++) {
motbl[i].mo_flags = MO_EMPTY;
}
}
/*
* Copy a mount options table.
*
* This function is *not* for general use by filesystems.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect smo and dmo.
*/
void
vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
{
vfs_copyopttbl_extend(smo, dmo, 0);
}
static char **
vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
{
int c1 = 0;
int c2 = 0;
char **result;
char **sp1, **sp2, **dp;
/*
* First we count both lists of cancel options.
* If either is NULL or has no elements, we return a copy of
* the other.
*/
if (mop1->mo_cancel != NULL) {
for (; mop1->mo_cancel[c1] != NULL; c1++)
/* count cancel options in mop1 */;
}
if (c1 == 0)
return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
if (mop2->mo_cancel != NULL) {
for (; mop2->mo_cancel[c2] != NULL; c2++)
/* count cancel options in mop2 */;
}
result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
if (c2 == 0)
return (result);
/*
* When we get here, we've got two sets of cancel options;
* we need to merge the two sets. We know that the result
* array has "c1+c2+1" entries and in the end we might shrink
* it.
* Result now has a copy of the c1 entries from mop1; we'll
* now lookup all the entries of mop2 in mop1 and copy it if
* it is unique.
* This operation is O(n^2) but it's only called once per
* filesystem per duplicate option. This is a situation
* which doesn't arise with the filesystems in ON and
* n is generally 1.
*/
dp = &result[c1];
for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
if (strcmp(*sp1, *sp2) == 0)
break;
}
if (*sp1 == NULL) {
/*
* Option *sp2 not found in mop1, so copy it.
* The calls to vfs_copycancelopt_extend()
* guarantee that there's enough room.
*/
*dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
(void) strcpy(*dp++, *sp2);
}
}
if (dp != &result[c1+c2]) {
size_t bytes = (dp - result + 1) * sizeof (char *);
char **nres = kmem_alloc(bytes, KM_SLEEP);
bcopy(result, nres, bytes);
kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
result = nres;
}
return (result);
}
/*
* Merge two mount option tables (outer and inner) into one. This is very
* similar to "merging" global variables and automatic variables in C.
*
* This isn't (and doesn't have to be) fast.
*
* This function is *not* for general use by filesystems.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect omo, imo & dmo.
*/
void
vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
{
uint_t i, count;
mntopt_t *mop, *motbl;
uint_t freeidx;
/*
* First determine how much space we need to allocate.
*/
count = omo->mo_count;
for (i = 0; i < imo->mo_count; i++) {
if (imo->mo_list[i].mo_flags & MO_EMPTY)
continue;
if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
count++;
}
ASSERT(count >= omo->mo_count &&
count <= omo->mo_count + imo->mo_count);
motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
for (i = 0; i < omo->mo_count; i++)
vfs_copyopt(&omo->mo_list[i], &motbl[i]);
freeidx = omo->mo_count;
for (i = 0; i < imo->mo_count; i++) {
if (imo->mo_list[i].mo_flags & MO_EMPTY)
continue;
if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
char **newcanp;
uint_t index = mop - omo->mo_list;
newcanp = vfs_mergecancelopts(mop, &motbl[index]);
vfs_freeopt(&motbl[index]);
vfs_copyopt(&imo->mo_list[i], &motbl[index]);
vfs_freecancelopt(motbl[index].mo_cancel);
motbl[index].mo_cancel = newcanp;
} else {
/*
* If it's a new option, just copy it over to the first
* free location.
*/
vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
}
}
dmo->mo_count = count;
dmo->mo_list = motbl;
}
/*
* Functions to set and clear mount options in a mount options table.
*/
/*
* Clear a mount option, if it exists.
*
* The update_mnttab arg indicates whether mops is part of a vfs that is on
* the vfs list.
*/
static void
vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
{
struct mntopt *mop;
uint_t i, count;
ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
count = mops->mo_count;
for (i = 0; i < count; i++) {
mop = &mops->mo_list[i];
if (mop->mo_flags & MO_EMPTY)
continue;
if (strcmp(opt, mop->mo_name))
continue;
mop->mo_flags &= ~MO_SET;
if (mop->mo_arg != NULL) {
kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
}
mop->mo_arg = NULL;
if (update_mnttab)
vfs_mnttab_modtimeupd();
break;
}
}
void
vfs_clearmntopt(struct vfs *vfsp, const char *opt)
{
int gotlock = 0;
if (VFS_ON_LIST(vfsp)) {
gotlock = 1;
vfs_list_lock();
}
vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
if (gotlock)
vfs_list_unlock();
}
/*
* Set a mount option on. If it's not found in the table, it's silently
* ignored. If the option has MO_IGNORE set, it is still set unless the
* VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag
* bits can be used to toggle the MO_NODISPLAY bit for the option.
* If the VFS_CREATEOPT flag bit is set then the first option slot with
* MO_EMPTY set is created as the option passed in.
*
* The update_mnttab arg indicates whether mops is part of a vfs that is on
* the vfs list.
*/
static void
vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
const char *arg, int flags, int update_mnttab)
{
mntopt_t *mop;
uint_t i, count;
char *sp;
ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
if (flags & VFS_CREATEOPT) {
if (vfs_hasopt(mops, opt) != NULL) {
flags &= ~VFS_CREATEOPT;
}
}
count = mops->mo_count;
for (i = 0; i < count; i++) {
mop = &mops->mo_list[i];
if (mop->mo_flags & MO_EMPTY) {
if ((flags & VFS_CREATEOPT) == 0)
continue;
sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
(void) strcpy(sp, opt);
mop->mo_name = sp;
if (arg != NULL)
mop->mo_flags = MO_HASVALUE;
else
mop->mo_flags = 0;
} else if (strcmp(opt, mop->mo_name)) {
continue;
}
if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
break;
if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
(void) strcpy(sp, arg);
} else {
sp = NULL;
}
if (mop->mo_arg != NULL)
kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
mop->mo_arg = sp;
if (flags & VFS_DISPLAY)
mop->mo_flags &= ~MO_NODISPLAY;
if (flags & VFS_NODISPLAY)
mop->mo_flags |= MO_NODISPLAY;
mop->mo_flags |= MO_SET;
if (mop->mo_cancel != NULL) {
char **cp;
for (cp = mop->mo_cancel; *cp != NULL; cp++)
vfs_clearmntopt_nolock(mops, *cp, 0);
}
if (update_mnttab)
vfs_mnttab_modtimeupd();
break;
}
}
void
vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
{
int gotlock = 0;
if (VFS_ON_LIST(vfsp)) {
gotlock = 1;
vfs_list_lock();
}
vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
if (gotlock)
vfs_list_unlock();
}
/*
* Add a "tag" option to a mounted file system's options list.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect mops.
*/
static mntopt_t *
vfs_addtag(mntopts_t *mops, const char *tag)
{
uint_t count;
mntopt_t *mop, *motbl;
count = mops->mo_count + 1;
motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
if (mops->mo_count) {
size_t len = (count - 1) * sizeof (mntopt_t);
bcopy(mops->mo_list, motbl, len);
kmem_free(mops->mo_list, len);
}
mops->mo_count = count;
mops->mo_list = motbl;
mop = &motbl[count - 1];
mop->mo_flags = MO_TAG;
mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
(void) strcpy(mop->mo_name, tag);
return (mop);
}
/*
* Allow users to set arbitrary "tags" in a vfs's mount options.
* Broader use within the kernel is discouraged.
*/
int
vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
cred_t *cr)
{
vfs_t *vfsp;
mntopts_t *mops;
mntopt_t *mop;
int found = 0;
dev_t dev = makedevice(major, minor);
int err = 0;
char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
/*
* Find the desired mounted file system
*/
vfs_list_lock();
vfsp = rootvfs;
do {
if (vfsp->vfs_dev == dev &&
strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
found = 1;
break;
}
vfsp = vfsp->vfs_next;
} while (vfsp != rootvfs);
if (!found) {
err = EINVAL;
goto out;
}
err = secpolicy_fs_config(cr, vfsp);
if (err != 0)
goto out;
mops = &vfsp->vfs_mntopts;
/*
* Add tag if it doesn't already exist
*/
if ((mop = vfs_hasopt(mops, tag)) == NULL) {
int len;
(void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
len = strlen(buf);
if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
err = ENAMETOOLONG;
goto out;
}
mop = vfs_addtag(mops, tag);
}
if ((mop->mo_flags & MO_TAG) == 0) {
err = EINVAL;
goto out;
}
vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
out:
vfs_list_unlock();
kmem_free(buf, MAX_MNTOPT_STR);
return (err);
}
/*
* Allow users to remove arbitrary "tags" in a vfs's mount options.
* Broader use within the kernel is discouraged.
*/
int
vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
cred_t *cr)
{
vfs_t *vfsp;
mntopt_t *mop;
int found = 0;
dev_t dev = makedevice(major, minor);
int err = 0;
/*
* Find the desired mounted file system
*/
vfs_list_lock();
vfsp = rootvfs;
do {
if (vfsp->vfs_dev == dev &&
strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
found = 1;
break;
}
vfsp = vfsp->vfs_next;
} while (vfsp != rootvfs);
if (!found) {
err = EINVAL;
goto out;
}
err = secpolicy_fs_config(cr, vfsp);
if (err != 0)
goto out;
if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
err = EINVAL;
goto out;
}
if ((mop->mo_flags & MO_TAG) == 0) {
err = EINVAL;
goto out;
}
vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
out:
vfs_list_unlock();
return (err);
}
/*
* Function to parse an option string and fill in a mount options table.
* Unknown options are silently ignored. The input option string is modified
* by replacing separators with nulls. If the create flag is set, options
* not found in the table are just added on the fly. The table must have
* an option slot marked MO_EMPTY to add an option on the fly.
*
* This function is *not* for general use by filesystems.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect mops..
*/
void
vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
{
char *s = osp, *p, *nextop, *valp, *cp, *ep;
int setflg = VFS_NOFORCEOPT;
if (osp == NULL)
return;
while (*s != '\0') {
p = strchr(s, ','); /* find next option */
if (p == NULL) {
cp = NULL;
p = s + strlen(s);
} else {
cp = p; /* save location of comma */
*p++ = '\0'; /* mark end and point to next option */
}
nextop = p;
p = strchr(s, '='); /* look for value */
if (p == NULL) {
valp = NULL; /* no value supplied */
} else {
ep = p; /* save location of equals */
*p++ = '\0'; /* end option and point to value */
valp = p;
}
/*
* set option into options table
*/
if (create)
setflg |= VFS_CREATEOPT;
vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
if (cp != NULL)
*cp = ','; /* restore the comma */
if (valp != NULL)
*ep = '='; /* restore the equals */
s = nextop;
}
}
/*
* Function to inquire if an option exists in a mount options table.
* Returns a pointer to the option if it exists, else NULL.
*
* This function is *not* for general use by filesystems.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect mops.
*/
struct mntopt *
vfs_hasopt(const mntopts_t *mops, const char *opt)
{
struct mntopt *mop;
uint_t i, count;
count = mops->mo_count;
for (i = 0; i < count; i++) {
mop = &mops->mo_list[i];
if (mop->mo_flags & MO_EMPTY)
continue;
if (strcmp(opt, mop->mo_name) == 0)
return (mop);
}
return (NULL);
}
/*
* Function to inquire if an option is set in a mount options table.
* Returns non-zero if set and fills in the arg pointer with a pointer to
* the argument string or NULL if there is no argument string.
*/
static int
vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
{
struct mntopt *mop;
uint_t i, count;
count = mops->mo_count;
for (i = 0; i < count; i++) {
mop = &mops->mo_list[i];
if (mop->mo_flags & MO_EMPTY)
continue;
if (strcmp(opt, mop->mo_name))
continue;
if ((mop->mo_flags & MO_SET) == 0)
return (0);
if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
*argp = mop->mo_arg;
return (1);
}
return (0);
}
int
vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
{
int ret;
vfs_list_read_lock();
ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
vfs_list_unlock();
return (ret);
}
/*
* Construct a comma separated string of the options set in the given
* mount table, return the string in the given buffer. Return non-zero if
* the buffer would overflow.
*
* This function is *not* for general use by filesystems.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect mp.
*/
int
vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
{
char *cp;
uint_t i;
buf[0] = '\0';
cp = buf;
for (i = 0; i < mp->mo_count; i++) {
struct mntopt *mop;
mop = &mp->mo_list[i];
if (mop->mo_flags & MO_SET) {
int optlen, comma = 0;
if (buf[0] != '\0')
comma = 1;
optlen = strlen(mop->mo_name);
if (strlen(buf) + comma + optlen + 1 > len)
goto err;
if (comma)
*cp++ = ',';
(void) strcpy(cp, mop->mo_name);
cp += optlen;
/*
* Append option value if there is one
*/
if (mop->mo_arg != NULL) {
int arglen;
arglen = strlen(mop->mo_arg);
if (strlen(buf) + arglen + 2 > len)
goto err;
*cp++ = '=';
(void) strcpy(cp, mop->mo_arg);
cp += arglen;
}
}
}
return (0);
err:
return (EOVERFLOW);
}
static void
vfs_freecancelopt(char **moc)
{
if (moc != NULL) {
int ccnt = 0;
char **cp;
for (cp = moc; *cp != NULL; cp++) {
kmem_free(*cp, strlen(*cp) + 1);
ccnt++;
}
kmem_free(moc, (ccnt + 1) * sizeof (char *));
}
}
static void
vfs_freeopt(mntopt_t *mop)
{
if (mop->mo_name != NULL)
kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
vfs_freecancelopt(mop->mo_cancel);
if (mop->mo_arg != NULL)
kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
}
/*
* Free a mount options table
*
* This function is *not* for general use by filesystems.
*
* Note: caller is responsible for locking the vfs list, if needed,
* to protect mp.
*/
void
vfs_freeopttbl(mntopts_t *mp)
{
uint_t i, count;
count = mp->mo_count;
for (i = 0; i < count; i++) {
vfs_freeopt(&mp->mo_list[i]);
}
if (count) {
kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
mp->mo_count = 0;
mp->mo_list = NULL;
}
}
/*
* Free any mnttab information recorded in the vfs struct.
* The vfs must not be on the vfs list.
*/
static void
vfs_freemnttab(struct vfs *vfsp)
{
ASSERT(!VFS_ON_LIST(vfsp));
/*
* Free device and mount point information
*/
if (vfsp->vfs_mntpt != NULL) {
refstr_rele(vfsp->vfs_mntpt);
vfsp->vfs_mntpt = NULL;
}
if (vfsp->vfs_resource != NULL) {
refstr_rele(vfsp->vfs_resource);
vfsp->vfs_resource = NULL;
}
/*
* Now free mount options information
*/
vfs_freeopttbl(&vfsp->vfs_mntopts);
}
/*
* Return the last mnttab modification time
*/
void
vfs_mnttab_modtime(timespec_t *ts)
{
ASSERT(RW_LOCK_HELD(&vfslist));
*ts = vfs_mnttab_mtime;
}
/*
* See if mnttab is changed
*/
void
vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
{
int changed;
*phpp = (struct pollhead *)NULL;
/*
* Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
* Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
* to not grab the vfs list lock because tv_sec is monotonically
* increasing.
*/
changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
(old->tv_sec != vfs_mnttab_mtime.tv_sec);
if (!changed) {
*phpp = &vfs_pollhd;
}
}
/*
* Update the mnttab modification time and wake up any waiters for
* mnttab changes
*/
void
vfs_mnttab_modtimeupd()
{
hrtime_t oldhrt, newhrt;
ASSERT(RW_WRITE_HELD(&vfslist));
oldhrt = ts2hrt(&vfs_mnttab_mtime);
gethrestime(&vfs_mnttab_mtime);
newhrt = ts2hrt(&vfs_mnttab_mtime);
if (oldhrt == (hrtime_t)0)
vfs_mnttab_ctime = vfs_mnttab_mtime;
/*
* Attempt to provide unique mtime (like uniqtime but not).
*/
if (newhrt == oldhrt) {
newhrt++;
hrt2ts(newhrt, &vfs_mnttab_mtime);
}
pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
}
int
dounmount(struct vfs *vfsp, int flag, cred_t *cr)
{
vnode_t *coveredvp;
int error;
/*
* Get covered vnode. This will be NULL if the vfs is not linked
* into the file system name space (i.e., domount() with MNT_NOSPICE).
*/
coveredvp = vfsp->vfs_vnodecovered;
ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
/*
* Purge all dnlc entries for this vfs.
*/
(void) dnlc_purge_vfsp(vfsp, 0);
/* For forcible umount, skip VFS_SYNC() since it may hang */
if ((flag & MS_FORCE) == 0)
(void) VFS_SYNC(vfsp, 0, cr);
/*
* Lock the vfs to maintain fs status quo during unmount. This
* has to be done after the sync because ufs_update tries to acquire
* the vfs_reflock.
*/
vfs_lock_wait(vfsp);
if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
vfs_unlock(vfsp);
if (coveredvp != NULL)
vn_vfsunlock(coveredvp);
} else if (coveredvp != NULL) {
/*
* vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
* when it frees vfsp so we do a VN_HOLD() so we can
* continue to use coveredvp afterwards.
*/
VN_HOLD(coveredvp);
vfs_remove(vfsp);
vn_vfsunlock(coveredvp);
VN_RELE(coveredvp);
} else {
/*
* Release the reference to vfs that is not linked
* into the name space.
*/
vfs_unlock(vfsp);
VFS_RELE(vfsp);
}
return (error);
}
/*
* Vfs_unmountall() is called by uadmin() to unmount all
* mounted file systems (except the root file system) during shutdown.
* It follows the existing locking protocol when traversing the vfs list
* to sync and unmount vfses. Even though there should be no
* other thread running while the system is shutting down, it is prudent
* to still follow the locking protocol.
*/
void
vfs_unmountall(void)
{
struct vfs *vfsp;
struct vfs *prev_vfsp = NULL;
int error;
/*
* Toss all dnlc entries now so that the per-vfs sync
* and unmount operations don't have to slog through
* a bunch of uninteresting vnodes over and over again.
*/
dnlc_purge();
vfs_list_lock();
for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
prev_vfsp = vfsp->vfs_prev;
if (vfs_lock(vfsp) != 0)
continue;
error = vn_vfswlock(vfsp->vfs_vnodecovered);
vfs_unlock(vfsp);
if (error)
continue;
vfs_list_unlock();
(void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
(void) dounmount(vfsp, 0, CRED());
/*
* Since we dropped the vfslist lock above we must
* verify that next_vfsp still exists, else start over.
*/
vfs_list_lock();
for (vfsp = rootvfs->vfs_prev;
vfsp != rootvfs; vfsp = vfsp->vfs_prev)
if (vfsp == prev_vfsp)
break;
if (vfsp == rootvfs && prev_vfsp != rootvfs)
prev_vfsp = rootvfs->vfs_prev;
}
vfs_list_unlock();
}
/*
* Called to add an entry to the end of the vfs mount in progress list
*/
void
vfs_addmip(dev_t dev, struct vfs *vfsp)
{
struct ipmnt *mipp;
mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
mipp->mip_next = NULL;
mipp->mip_dev = dev;
mipp->mip_vfsp = vfsp;
mutex_enter(&vfs_miplist_mutex);
if (vfs_miplist_end != NULL)
vfs_miplist_end->mip_next = mipp;
else
vfs_miplist = mipp;
vfs_miplist_end = mipp;
mutex_exit(&vfs_miplist_mutex);
}
/*
* Called to remove an entry from the mount in progress list
* Either because the mount completed or it failed.
*/
void
vfs_delmip(struct vfs *vfsp)
{
struct ipmnt *mipp, *mipprev;
mutex_enter(&vfs_miplist_mutex);
mipprev = NULL;
for (mipp = vfs_miplist;
mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
mipprev = mipp;
}
if (mipp == NULL)
return; /* shouldn't happen */
if (mipp == vfs_miplist_end)
vfs_miplist_end = mipprev;
if (mipprev == NULL)
vfs_miplist = mipp->mip_next;
else
mipprev->mip_next = mipp->mip_next;
mutex_exit(&vfs_miplist_mutex);
kmem_free(mipp, sizeof (struct ipmnt));
}
/*
* vfs_add is called by a specific filesystem's mount routine to add
* the new vfs into the vfs list/hash and to cover the mounted-on vnode.
* The vfs should already have been locked by the caller.
*
* coveredvp is NULL if this is the root.
*/
void
vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
{
int newflag;
ASSERT(vfs_lock_held(vfsp));
VFS_HOLD(vfsp);
newflag = vfsp->vfs_flag;
if (mflag & MS_RDONLY)
newflag |= VFS_RDONLY;
else
newflag &= ~VFS_RDONLY;
if (mflag & MS_NOSUID)
newflag |= (VFS_NOSETUID|VFS_NODEVICES);
else
newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
if (mflag & MS_NOMNTTAB)
newflag |= VFS_NOMNTTAB;
else
newflag &= ~VFS_NOMNTTAB;
if (coveredvp != NULL) {
ASSERT(vn_vfswlock_held(coveredvp));
coveredvp->v_vfsmountedhere = vfsp;
VN_HOLD(coveredvp);
}
vfsp->vfs_vnodecovered = coveredvp;
vfsp->vfs_flag = newflag;
vfs_list_add(vfsp);
}
/*
* Remove a vfs from the vfs list, null out the pointer from the
* covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
* from the vfs to the covered vnode (vfs_vnodecovered). Release the
* reference to the vfs and to the covered vnode.
*
* Called from dounmount after it's confirmed with the file system
* that the unmount is legal.
*/
void
vfs_remove(struct vfs *vfsp)
{
vnode_t *vp;
ASSERT(vfs_lock_held(vfsp));
/*
* Can't unmount root. Should never happen because fs will
* be busy.
*/
if (vfsp == rootvfs)
cmn_err(CE_PANIC, "vfs_remove: unmounting root");
vfs_list_remove(vfsp);
/*
* Unhook from the file system name space.
*/
vp = vfsp->vfs_vnodecovered;
ASSERT(vn_vfswlock_held(vp));
vp->v_vfsmountedhere = NULL;
vfsp->vfs_vnodecovered = NULL;
VN_RELE(vp);
/*
* Release lock and wakeup anybody waiting.
*/
vfs_unlock(vfsp);
VFS_RELE(vfsp);
}
/*
* Lock a filesystem to prevent access to it while mounting,
* unmounting and syncing. Return EBUSY immediately if lock
* can't be acquired.
*/
int
vfs_lock(vfs_t *vfsp)
{
vn_vfslocks_entry_t *vpvfsentry;
vpvfsentry = vn_vfslocks_getlock(vfsp);
if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
return (0);
vn_vfslocks_rele(vpvfsentry);
return (EBUSY);
}
int
vfs_rlock(vfs_t *vfsp)
{
vn_vfslocks_entry_t *vpvfsentry;
vpvfsentry = vn_vfslocks_getlock(vfsp);
if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
return (0);
vn_vfslocks_rele(vpvfsentry);
return (EBUSY);
}
void
vfs_lock_wait(vfs_t *vfsp)
{
vn_vfslocks_entry_t *vpvfsentry;
vpvfsentry = vn_vfslocks_getlock(vfsp);
rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
}
void
vfs_rlock_wait(vfs_t *vfsp)
{
vn_vfslocks_entry_t *vpvfsentry;
vpvfsentry = vn_vfslocks_getlock(vfsp);
rwst_enter(&vpvfsentry->ve_lock, RW_READER);
}
/*
* Unlock a locked filesystem.
*/
void
vfs_unlock(vfs_t *vfsp)
{
vn_vfslocks_entry_t *vpvfsentry;
/*
* vfs_unlock will mimic sema_v behaviour to fix 4748018.
* And these changes should remain for the patch changes as it is.
*/
if (panicstr)
return;
/*
* ve_refcount needs to be dropped twice here.
* 1. To release refernce after a call to vfs_locks_getlock()
* 2. To release the reference from the locking routines like
* vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
*/
vpvfsentry = vn_vfslocks_getlock(vfsp);
vn_vfslocks_rele(vpvfsentry);
rwst_exit(&vpvfsentry->ve_lock);
vn_vfslocks_rele(vpvfsentry);
}
/*
* Utility routine that allows a filesystem to construct its
* fsid in "the usual way" - by munging some underlying dev_t and
* the filesystem type number into the 64-bit fsid. Note that
* this implicitly relies on dev_t persistence to make filesystem
* id's persistent.
*
* There's nothing to prevent an individual fs from constructing its
* fsid in a different way, and indeed they should.
*
* Since we want fsids to be 32-bit quantities (so that they can be
* exported identically by either 32-bit or 64-bit APIs, as well as
* the fact that fsid's are "known" to NFS), we compress the device
* number given down to 32-bits, and panic if that isn't possible.
*/
void
vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
{
if (!cmpldev((dev32_t *)&fsi->val[0], dev))
panic("device number too big for fsid!");
fsi->val[1] = val;
}
int
vfs_lock_held(vfs_t *vfsp)
{
int held;
vn_vfslocks_entry_t *vpvfsentry;
/*
* vfs_lock_held will mimic sema_held behaviour
* if panicstr is set. And these changes should remain
* for the patch changes as it is.
*/
if (panicstr)
return (1);
vpvfsentry = vn_vfslocks_getlock(vfsp);
held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
vn_vfslocks_rele(vpvfsentry);
return (held);
}
struct _kthread *
vfs_lock_owner(vfs_t *vfsp)
{
struct _kthread *owner;
vn_vfslocks_entry_t *vpvfsentry;
/*
* vfs_wlock_held will mimic sema_held behaviour
* if panicstr is set. And these changes should remain
* for the patch changes as it is.
*/
if (panicstr)
return (NULL);
vpvfsentry = vn_vfslocks_getlock(vfsp);
owner = rwst_owner(&vpvfsentry->ve_lock);
vn_vfslocks_rele(vpvfsentry);
return (owner);
}
/*
* vfs list locking.
*
* Rather than manipulate the vfslist lock directly, we abstract into lock
* and unlock routines to allow the locking implementation to be changed for
* clustering.
*
* Whenever the vfs list is modified through its hash links, the overall list
* lock must be obtained before locking the relevant hash bucket. But to see
* whether a given vfs is on the list, it suffices to obtain the lock for the
* hash bucket without getting the overall list lock. (See getvfs() below.)
*/
void
vfs_list_lock()
{
rw_enter(&vfslist, RW_WRITER);
}
void
vfs_list_read_lock()
{
rw_enter(&vfslist, RW_READER);
}
void
vfs_list_unlock()
{
rw_exit(&vfslist);
}
/*
* Low level worker routines for adding entries to and removing entries from
* the vfs list.
*/
static void
vfs_hash_add(struct vfs *vfsp, int insert_at_head)
{
int vhno;
struct vfs **hp;
dev_t dev;
ASSERT(RW_WRITE_HELD(&vfslist));
dev = expldev(vfsp->vfs_fsid.val[0]);
vhno = VFSHASH(getmajor(dev), getminor(dev));
mutex_enter(&rvfs_list[vhno].rvfs_lock);
/*
* Link into the hash table, inserting it at the end, so that LOFS
* with the same fsid as UFS (or other) file systems will not hide the
* UFS.
*/
if (insert_at_head) {
vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
rvfs_list[vhno].rvfs_head = vfsp;
} else {
for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
hp = &(*hp)->vfs_hash)
continue;
/*
* hp now contains the address of the pointer to update
* to effect the insertion.
*/
vfsp->vfs_hash = NULL;
*hp = vfsp;
}
rvfs_list[vhno].rvfs_len++;
mutex_exit(&rvfs_list[vhno].rvfs_lock);
}
static void
vfs_hash_remove(struct vfs *vfsp)
{
int vhno;
struct vfs *tvfsp;
dev_t dev;
ASSERT(RW_WRITE_HELD(&vfslist));
dev = expldev(vfsp->vfs_fsid.val[0]);
vhno = VFSHASH(getmajor(dev), getminor(dev));
mutex_enter(&rvfs_list[vhno].rvfs_lock);
/*
* Remove from hash.
*/
if (rvfs_list[vhno].rvfs_head == vfsp) {
rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
rvfs_list[vhno].rvfs_len--;
goto foundit;
}
for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
tvfsp = tvfsp->vfs_hash) {
if (tvfsp->vfs_hash == vfsp) {
tvfsp->vfs_hash = vfsp->vfs_hash;
rvfs_list[vhno].rvfs_len--;
goto foundit;
}
}
cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
foundit:
mutex_exit(&rvfs_list[vhno].rvfs_lock);
}
void
vfs_list_add(struct vfs *vfsp)
{
zone_t *zone;
/*
* The zone that owns the mount is the one that performed the mount.
* Note that this isn't necessarily the same as the zone mounted into.
* The corresponding zone_rele() will be done when the vfs_t is
* being free'd.
*/
vfsp->vfs_zone = curproc->p_zone;
zone_hold(vfsp->vfs_zone);
/*
* Find the zone mounted into, and put this mount on its vfs list.
*/
zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
ASSERT(zone != NULL);
/*
* Special casing for the root vfs. This structure is allocated
* statically and hooked onto rootvfs at link time. During the
* vfs_mountroot call at system startup time, the root file system's
* VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
* as argument. The code below must detect and handle this special
* case. The only apparent justification for this special casing is
* to ensure that the root file system appears at the head of the
* list.
*
* XXX: I'm assuming that it's ok to do normal list locking when
* adding the entry for the root file system (this used to be
* done with no locks held).
*/
vfs_list_lock();
/*
* Link into the vfs list proper.
*/
if (vfsp == &root) {
/*
* Assert: This vfs is already on the list as its first entry.
* Thus, there's nothing to do.
*/
ASSERT(rootvfs == vfsp);
/*
* Add it to the head of the global zone's vfslist.
*/
ASSERT(zone == global_zone);
ASSERT(zone->zone_vfslist == NULL);
zone->zone_vfslist = vfsp;
} else {
/*
* Link to end of list using vfs_prev (as rootvfs is now a
* doubly linked circular list) so list is in mount order for
* mnttab use.
*/
rootvfs->vfs_prev->vfs_next = vfsp;
vfsp->vfs_prev = rootvfs->vfs_prev;
rootvfs->vfs_prev = vfsp;
vfsp->vfs_next = rootvfs;
/*
* Do it again for the zone-private list (which may be NULL).
*/
if (zone->zone_vfslist == NULL) {
ASSERT(zone != global_zone);
zone->zone_vfslist = vfsp;
} else {
zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
zone->zone_vfslist->vfs_zone_prev = vfsp;
vfsp->vfs_zone_next = zone->zone_vfslist;
}
}
/*
* Link into the hash table, inserting it at the end, so that LOFS
* with the same fsid as UFS (or other) file systems will not hide
* the UFS.
*/
vfs_hash_add(vfsp, 0);
/*
* update the mnttab modification time
*/
vfs_mnttab_modtimeupd();
vfs_list_unlock();
zone_rele(zone);
}
void
vfs_list_remove(struct vfs *vfsp)
{
zone_t *zone;
zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
ASSERT(zone != NULL);
/*
* Callers are responsible for preventing attempts to unmount the
* root.
*/
ASSERT(vfsp != rootvfs);
vfs_list_lock();
/*
* Remove from hash.
*/
vfs_hash_remove(vfsp);
/*
* Remove from vfs list.
*/
vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
vfsp->vfs_next = vfsp->vfs_prev = NULL;
/*
* Remove from zone-specific vfs list.
*/
if (zone->zone_vfslist == vfsp)
zone->zone_vfslist = vfsp->vfs_zone_next;
if (vfsp->vfs_zone_next == vfsp) {
ASSERT(vfsp->vfs_zone_prev == vfsp);
ASSERT(zone->zone_vfslist == vfsp);
zone->zone_vfslist = NULL;
}
vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
/*
* update the mnttab modification time
*/
vfs_mnttab_modtimeupd();
vfs_list_unlock();
zone_rele(zone);
}
struct vfs *
getvfs(fsid_t *fsid)
{
struct vfs *vfsp;
int val0 = fsid->val[0];
int val1 = fsid->val[1];
dev_t dev = expldev(val0);
int vhno = VFSHASH(getmajor(dev), getminor(dev));
kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
mutex_enter(hmp);
for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
if (vfsp->vfs_fsid.val[0] == val0 &&
vfsp->vfs_fsid.val[1] == val1) {
VFS_HOLD(vfsp);
mutex_exit(hmp);
return (vfsp);
}
}
mutex_exit(hmp);
return (NULL);
}
/*
* Search the vfs mount in progress list for a specified device/vfs entry.
* Returns 0 if the first entry in the list that the device matches has the
* given vfs pointer as well. If the device matches but a different vfs
* pointer is encountered in the list before the given vfs pointer then
* a 1 is returned.
*/
int
vfs_devmounting(dev_t dev, struct vfs *vfsp)
{
int retval = 0;
struct ipmnt *mipp;
mutex_enter(&vfs_miplist_mutex);
for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
if (mipp->mip_dev == dev) {
if (mipp->mip_vfsp != vfsp)
retval = 1;
break;
}
}
mutex_exit(&vfs_miplist_mutex);
return (retval);
}
/*
* Search the vfs list for a specified device. Returns 1, if entry is found
* or 0 if no suitable entry is found.
*/
int
vfs_devismounted(dev_t dev)
{
struct vfs *vfsp;
int found;
vfs_list_read_lock();
vfsp = rootvfs;
found = 0;
do {
if (vfsp->vfs_dev == dev) {
found = 1;
break;
}
vfsp = vfsp->vfs_next;
} while (vfsp != rootvfs);
vfs_list_unlock();
return (found);
}
/*
* Search the vfs list for a specified device. Returns a pointer to it
* or NULL if no suitable entry is found. The caller of this routine
* is responsible for releasing the returned vfs pointer.
*/
struct vfs *
vfs_dev2vfsp(dev_t dev)
{
struct vfs *vfsp;
int found;
vfs_list_read_lock();
vfsp = rootvfs;
found = 0;
do {
/*
* The following could be made more efficient by making
* the entire loop use vfs_zone_next if the call is from
* a zone. The only callers, however, ustat(2) and
* umount2(2), don't seem to justify the added
* complexity at present.
*/
if (vfsp->vfs_dev == dev &&
ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
curproc->p_zone)) {
VFS_HOLD(vfsp);
found = 1;
break;
}
vfsp = vfsp->vfs_next;
} while (vfsp != rootvfs);
vfs_list_unlock();
return (found ? vfsp: NULL);
}
/*
* Search the vfs list for a specified mntpoint. Returns a pointer to it
* or NULL if no suitable entry is found. The caller of this routine
* is responsible for releasing the returned vfs pointer.
*
* Note that if multiple mntpoints match, the last one matching is
* returned in an attempt to return the "top" mount when overlay
* mounts are covering the same mount point. This is accomplished by starting
* at the end of the list and working our way backwards, stopping at the first
* matching mount.
*/
struct vfs *
vfs_mntpoint2vfsp(const char *mp)
{
struct vfs *vfsp;
struct vfs *retvfsp = NULL;
zone_t *zone = curproc->p_zone;
struct vfs *list;
vfs_list_read_lock();
if (getzoneid() == GLOBAL_ZONEID) {
/*
* The global zone may see filesystems in any zone.
*/
vfsp = rootvfs->vfs_prev;
do {
if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
retvfsp = vfsp;
break;
}
vfsp = vfsp->vfs_prev;
} while (vfsp != rootvfs->vfs_prev);
} else if ((list = zone->zone_vfslist) != NULL) {
const char *mntpt;
vfsp = list->vfs_zone_prev;
do {
mntpt = refstr_value(vfsp->vfs_mntpt);
mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
if (strcmp(mntpt, mp) == 0) {
retvfsp = vfsp;
break;
}
vfsp = vfsp->vfs_zone_prev;
} while (vfsp != list->vfs_zone_prev);
}
if (retvfsp)
VFS_HOLD(retvfsp);
vfs_list_unlock();
return (retvfsp);
}
/*
* Search the vfs list for a specified vfsops.
* if vfs entry is found then return 1, else 0.
*/
int
vfs_opsinuse(vfsops_t *ops)
{
struct vfs *vfsp;
int found;
vfs_list_read_lock();
vfsp = rootvfs;
found = 0;
do {
if (vfs_getops(vfsp) == ops) {
found = 1;
break;
}
vfsp = vfsp->vfs_next;
} while (vfsp != rootvfs);
vfs_list_unlock();
return (found);
}
/*
* Allocate an entry in vfssw for a file system type
*/
struct vfssw *
allocate_vfssw(char *type)
{
struct vfssw *vswp;
if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
/*
* The vfssw table uses the empty string to identify an
* available entry; we cannot add any type which has
* a leading NUL. The string length is limited to
* the size of the st_fstype array in struct stat.
*/
return (NULL);
}
ASSERT(VFSSW_WRITE_LOCKED());
for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
if (!ALLOCATED_VFSSW(vswp)) {
vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
(void) strcpy(vswp->vsw_name, type);
ASSERT(vswp->vsw_count == 0);
vswp->vsw_count = 1;
mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
return (vswp);
}
return (NULL);
}
/*
* Impose additional layer of translation between vfstype names
* and module names in the filesystem.
*/
static char *
vfs_to_modname(char *vfstype)
{
if (strcmp(vfstype, "proc") == 0) {
vfstype = "procfs";
} else if (strcmp(vfstype, "fd") == 0) {
vfstype = "fdfs";
} else if (strncmp(vfstype, "nfs", 3) == 0) {
vfstype = "nfs";
}
return (vfstype);
}
/*
* Find a vfssw entry given a file system type name.
* Try to autoload the filesystem if it's not found.
* If it's installed, return the vfssw locked to prevent unloading.
*/
struct vfssw *
vfs_getvfssw(char *type)
{
struct vfssw *vswp;
char *modname;
RLOCK_VFSSW();
vswp = vfs_getvfsswbyname(type);
modname = vfs_to_modname(type);
if (rootdir == NULL) {
/*
* If we haven't yet loaded the root file system, then our
* _init won't be called until later. Allocate vfssw entry,
* because mod_installfs won't be called.
*/
if (vswp == NULL) {
RUNLOCK_VFSSW();
WLOCK_VFSSW();
if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
if ((vswp = allocate_vfssw(type)) == NULL) {
WUNLOCK_VFSSW();
return (NULL);
}
}
WUNLOCK_VFSSW();
RLOCK_VFSSW();
}
if (!VFS_INSTALLED(vswp)) {
RUNLOCK_VFSSW();
(void) modloadonly("fs", modname);
} else
RUNLOCK_VFSSW();
return (vswp);
}
/*
* Try to load the filesystem. Before calling modload(), we drop
* our lock on the VFS switch table, and pick it up after the
* module is loaded. However, there is a potential race: the
* module could be unloaded after the call to modload() completes
* but before we pick up the lock and drive on. Therefore,
* we keep reloading the module until we've loaded the module
* _and_ we have the lock on the VFS switch table.
*/
while (vswp == NULL || !VFS_INSTALLED(vswp)) {
RUNLOCK_VFSSW();
if (modload("fs", modname) == -1)
return (NULL);
RLOCK_VFSSW();
if (vswp == NULL)
if ((vswp = vfs_getvfsswbyname(type)) == NULL)
break;
}
RUNLOCK_VFSSW();
return (vswp);
}
/*
* Find a vfssw entry given a file system type name.
*/
struct vfssw *
vfs_getvfsswbyname(char *type)
{
struct vfssw *vswp;
ASSERT(VFSSW_LOCKED());
if (type == NULL || *type == '\0')
return (NULL);
for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
if (strcmp(type, vswp->vsw_name) == 0) {
vfs_refvfssw(vswp);
return (vswp);
}
}
return (NULL);
}
/*
* Find a vfssw entry given a set of vfsops.
*/
struct vfssw *
vfs_getvfsswbyvfsops(vfsops_t *vfsops)
{
struct vfssw *vswp;
RLOCK_VFSSW();
for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
vfs_refvfssw(vswp);
RUNLOCK_VFSSW();
return (vswp);
}
}
RUNLOCK_VFSSW();
return (NULL);
}
/*
* Reference a vfssw entry.
*/
void
vfs_refvfssw(struct vfssw *vswp)
{
mutex_enter(&vswp->vsw_lock);
vswp->vsw_count++;
mutex_exit(&vswp->vsw_lock);
}
/*
* Unreference a vfssw entry.
*/
void
vfs_unrefvfssw(struct vfssw *vswp)
{
mutex_enter(&vswp->vsw_lock);
vswp->vsw_count--;
mutex_exit(&vswp->vsw_lock);
}
int sync_timeout = 30; /* timeout for syncing a page during panic */
int sync_timeleft; /* portion of sync_timeout remaining */
static int sync_retries = 20; /* number of retries when not making progress */
static int sync_triesleft; /* portion of sync_retries remaining */
static pgcnt_t old_pgcnt, new_pgcnt;
static int new_bufcnt, old_bufcnt;
/*
* Sync all of the mounted filesystems, and then wait for the actual i/o to
* complete. We wait by counting the number of dirty pages and buffers,
* pushing them out using bio_busy() and page_busy(), and then counting again.
* This routine is used during both the uadmin A_SHUTDOWN code as well as
* the SYNC phase of the panic code (see comments in panic.c). It should only
* be used after some higher-level mechanism has quiesced the system so that
* new writes are not being initiated while we are waiting for completion.
*
* To ensure finite running time, our algorithm uses two timeout mechanisms:
* sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
* sync_triesleft (a progress counter used by the vfs_syncall() loop below).
* Together these ensure that syncing completes if our i/o paths are stuck.
* The counters are declared above so they can be found easily in the debugger.
*
* The sync_timeleft counter is reset by bio_busy() and page_busy() using the
* vfs_syncprogress() subroutine whenever we make progress through the lists of
* pages and buffers. It is decremented and expired by the deadman() cyclic.
* When vfs_syncall() decides it is done, we disable the deadman() counter by
* setting sync_timeleft to zero. This timer guards against vfs_syncall()
* deadlocking or hanging inside of a broken filesystem or driver routine.
*
* The sync_triesleft counter is updated by vfs_syncall() itself. If we make
* sync_retries consecutive calls to bio_busy() and page_busy() without
* decreasing either the number of dirty buffers or dirty pages below the
* lowest count we have seen so far, we give up and return from vfs_syncall().
*
* Each loop iteration ends with a call to delay() one second to allow time for
* i/o completion and to permit the user time to read our progress messages.
*/
void
vfs_syncall(void)
{
if (rootdir == NULL && !modrootloaded)
return; /* panic during boot - no filesystems yet */
printf("syncing file systems...");
vfs_syncprogress();
sync();
vfs_syncprogress();
sync_triesleft = sync_retries;
old_bufcnt = new_bufcnt = INT_MAX;
old_pgcnt = new_pgcnt = ULONG_MAX;
while (sync_triesleft > 0) {
old_bufcnt = MIN(old_bufcnt, new_bufcnt);
old_pgcnt = MIN(old_pgcnt, new_pgcnt);
new_bufcnt = bio_busy(B_TRUE);
new_pgcnt = page_busy(B_TRUE);
vfs_syncprogress();
if (new_bufcnt == 0 && new_pgcnt == 0)
break;
if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
sync_triesleft = sync_retries;
else
sync_triesleft--;
if (new_bufcnt)
printf(" [%d]", new_bufcnt);
if (new_pgcnt)
printf(" %lu", new_pgcnt);
delay(hz);
}
if (new_bufcnt != 0 || new_pgcnt != 0)
printf(" done (not all i/o completed)\n");
else
printf(" done\n");
sync_timeleft = 0;
delay(hz);
}
/*
* If we are in the middle of the sync phase of panic, reset sync_timeleft to
* sync_timeout to indicate that we are making progress and the deadman()
* omnipresent cyclic should not yet time us out. Note that it is safe to
* store to sync_timeleft here since the deadman() is firing at high-level
* on top of us. If we are racing with the deadman(), either the deadman()
* will decrement the old value and then we will reset it, or we will
* reset it and then the deadman() will immediately decrement it. In either
* case, correct behavior results.
*/
void
vfs_syncprogress(void)
{
if (panicstr)
sync_timeleft = sync_timeout;
}
/*
* Map VFS flags to statvfs flags. These shouldn't really be separate
* flags at all.
*/
uint_t
vf_to_stf(uint_t vf)
{
uint_t stf = 0;
if (vf & VFS_RDONLY)
stf |= ST_RDONLY;
if (vf & VFS_NOSETUID)
stf |= ST_NOSUID;
if (vf & VFS_NOTRUNC)
stf |= ST_NOTRUNC;
return (stf);
}
/*
* Use old-style function prototype for vfsstray() so
* that we can use it anywhere in the vfsops structure.
*/
int vfsstray();
/*
* Entries for (illegal) fstype 0.
*/
/* ARGSUSED */
int
vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
cmn_err(CE_PANIC, "stray vfs operation");
return (0);
}
vfsops_t vfs_strayops = {
vfsstray,
vfsstray,
vfsstray,
vfsstray,
vfsstray_sync,
vfsstray,
vfsstray,
vfsstray
};
/*
* Entries for (illegal) fstype 0.
*/
int
vfsstray(void)
{
cmn_err(CE_PANIC, "stray vfs operation");
return (0);
}
/*
* Support for dealing with forced UFS unmount and its interaction with
* LOFS. Could be used by any filesystem.
* See bug 1203132.
*/
int
vfs_EIO(void)
{
return (EIO);
}
/*
* We've gotta define the op for sync separately, since the compiler gets
* confused if we mix and match ANSI and normal style prototypes when
* a "short" argument is present and spits out a warning.
*/
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
return (EIO);
}
vfs_t EIO_vfs;
vfsops_t *EIO_vfsops;
/*
* Called from startup() to initialize all loaded vfs's
*/
void
vfsinit(void)
{
struct vfssw *vswp;
int error;
static const fs_operation_def_t EIO_vfsops_template[] = {
VFSNAME_MOUNT, vfs_EIO,
VFSNAME_UNMOUNT, vfs_EIO,
VFSNAME_ROOT, vfs_EIO,
VFSNAME_STATVFS, vfs_EIO,
VFSNAME_SYNC, (fs_generic_func_p) vfs_EIO_sync,
VFSNAME_VGET, vfs_EIO,
VFSNAME_MOUNTROOT, vfs_EIO,
VFSNAME_FREEVFS, vfs_EIO,
VFSNAME_VNSTATE, vfs_EIO,
NULL, NULL
};
/* Initialize the vnode cache (file systems may use it during init). */
vn_create_cache();
/* Setup event monitor framework */
fem_init();
/* Initialize the dummy stray file system type. */
vfssw[0].vsw_vfsops = vfs_strayops;
/* Initialize the dummy EIO file system. */
error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
if (error != 0) {
cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
/* Shouldn't happen, but not bad enough to panic */
}
VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
/*
* Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
* on this vfs can immediately notice it's invalid.
*/
EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
/*
* Call the init routines of non-loadable filesystems only.
* Filesystems which are loaded as separate modules will be
* initialized by the module loading code instead.
*/
for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
RLOCK_VFSSW();
if (vswp->vsw_init != NULL)
(*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
RUNLOCK_VFSSW();
}
}
/*
* Increments the vfs reference count by one atomically.
*/
void
vfs_hold(vfs_t *vfsp)
{
atomic_add_32(&vfsp->vfs_count, 1);
ASSERT(vfsp->vfs_count != 0);
}
/*
* Decrements the vfs reference count by one atomically. When
* vfs reference count becomes zero, it calls the file system
* specific vfs_freevfs() to free up the resources.
*/
void
vfs_rele(vfs_t *vfsp)
{
ASSERT(vfsp->vfs_count != 0);
if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) {
VFS_FREEVFS(vfsp);
if (vfsp->vfs_zone)
zone_rele(vfsp->vfs_zone);
vfs_freemnttab(vfsp);
sema_destroy(&vfsp->vfs_reflock);
kmem_free(vfsp, sizeof (*vfsp));
}
}
/*
* Generic operations vector support.
*
* This is used to build operations vectors for both the vfs and vnode.
* It's normally called only when a file system is loaded.
*
* There are many possible algorithms for this, including the following:
*
* (1) scan the list of known operations; for each, see if the file system
* includes an entry for it, and fill it in as appropriate.
*
* (2) set up defaults for all known operations. scan the list of ops
* supplied by the file system; for each which is both supplied and
* known, fill it in.
*
* (3) sort the lists of known ops & supplied ops; scan the list, filling
* in entries as we go.
*
* we choose (1) for simplicity, and because performance isn't critical here.
* note that (2) could be sped up using a precomputed hash table on known ops.
* (3) could be faster than either, but only if the lists were very large or
* supplied in sorted order.
*
*/
int
fs_build_vector(void *vector, int *unused_ops,
const fs_operation_trans_def_t *translation,
const fs_operation_def_t *operations)
{
int i, num_trans, num_ops, used;
/* Count the number of translations and the number of supplied */
/* operations. */
{
const fs_operation_trans_def_t *p;
for (num_trans = 0, p = translation;
p->name != NULL;
num_trans++, p++)
;
}
{
const fs_operation_def_t *p;
for (num_ops = 0, p = operations;
p->name != NULL;
num_ops++, p++)
;
}
/* Walk through each operation known to our caller. There will be */
/* one entry in the supplied "translation table" for each. */
used = 0;
for (i = 0; i < num_trans; i++) {
int j, found;
char *curname;
fs_generic_func_p result;
fs_generic_func_p *location;
curname = translation[i].name;
/* Look for a matching operation in the list supplied by the */
/* file system. */
found = 0;
for (j = 0; j < num_ops; j++) {
if (strcmp(operations[j].name, curname) == 0) {
used++;
found = 1;
break;
}
}
/* If the file system is using a "placeholder" for default */
/* or error functions, grab the appropriate function out of */
/* the translation table. If the file system didn't supply */
/* this operation at all, use the default function. */
if (found) {
result = operations[j].func;
if (result == fs_default) {
result = translation[i].defaultFunc;
} else if (result == fs_error) {
result = translation[i].errorFunc;
} else if (result == NULL) {
/* Null values are PROHIBITED */
return (EINVAL);
}
} else {
result = translation[i].defaultFunc;
}
/* Now store the function into the operations vector. */
location = (fs_generic_func_p *)
(((char *)vector) + translation[i].offset);
*location = result;
}
*unused_ops = num_ops - used;
return (0);
}
/* Placeholder functions, should never be called. */
int
fs_error(void)
{
cmn_err(CE_PANIC, "fs_error called");
return (0);
}
int
fs_default(void)
{
cmn_err(CE_PANIC, "fs_default called");
return (0);
}
#ifdef __sparc
/*
* Part of the implementation of booting off a mirrored root
* involves a change of dev_t for the root device. To
* accomplish this, first remove the existing hash table
* entry for the root device, convert to the new dev_t,
* then re-insert in the hash table at the head of the list.
*/
void
vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
{
vfs_list_lock();
vfs_hash_remove(vfsp);
vfsp->vfs_dev = ndev;
vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
vfs_hash_add(vfsp, 1);
vfs_list_unlock();
}
#else /* x86 NEWBOOT */
int
rootconf()
{
int error;
struct vfssw *vsw;
extern void pm_init();
char *fstyp;
fstyp = getrootfs();
if (error = clboot_rootconf())
return (error);
if (modload("fs", fstyp) == -1)
cmn_err(CE_PANIC, "Cannot _init %s module\n", fstyp);
RLOCK_VFSSW();
vsw = vfs_getvfsswbyname(fstyp);
RUNLOCK_VFSSW();
VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
VFS_HOLD(rootvfs);
/* always mount readonly first */
rootvfs->vfs_flag |= VFS_RDONLY;
pm_init();
if (netboot)
(void) strplumb();
error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
vfs_unrefvfssw(vsw);
rootdev = rootvfs->vfs_dev;
if (error)
cmn_err(CE_PANIC, "cannot mount root path %s", svm_bootpath);
return (error);
}
/*
* XXX this is called by nfs only and should probably be removed
* If booted with ASKNAME, prompt on the console for a filesystem
* name and return it.
*/
void
getfsname(char *askfor, char *name, size_t namelen)
{
if (boothowto & RB_ASKNAME) {
printf("%s name: ", askfor);
console_gets(name, namelen);
}
}
/*
* If server_path exists, then we are booting a diskless
* client. Otherwise, we default to ufs. Zfs should perhaps be
* another property.
*/
static char *
getrootfs(void)
{
extern char *strplumb_get_netdev_path(void);
char *propstr = NULL;
/* check fstype property; it should be nfsdyn for diskless */
if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
DDI_PROP_DONTPASS, "fstype", &propstr)
== DDI_SUCCESS) {
(void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
ddi_prop_free(propstr);
}
if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0)
return (rootfs.bo_fstype);
++netboot;
/* check if path to network interface is specified in bootpath */
if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
DDI_PROP_DONTPASS, "bootpath", &propstr)
== DDI_SUCCESS) {
(void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
ddi_prop_free(propstr);
} else {
/* attempt to determine netdev_path via boot_mac address */
netdev_path = strplumb_get_netdev_path();
if (netdev_path == NULL)
cmn_err(CE_PANIC,
"Cannot find boot network interface\n");
(void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
}
return ("nfs");
}
#endif