mntvnops.c revision 5010b7f7d31dbab7bc89cb216aeb080b56ed2e5d
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
#include <sys/sysmacros.h>
#include <sys/vfs_opreg.h>
/*
* Inode number associated with the mnttab node.
* NOTE(review): presumably this is the value reported as the node id by the
* getattr code -- confirm against the AT_NODEID handling.
*/
#define MNTROOTINO 2
/*
* Declared here directly rather than through a header.
* NOTE(review): presumably invoked after a successful read to advertise the
* read to the VFS layer -- confirm at the call site.
*/
extern void vfs_mnttab_readop(void);
/*
* Design of kernel mnttab accounting.
*
* mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
* the mounted resources: the read-only file /etc/mnttab, and a collection of
* ioctl() commands. Most of these interfaces are public and are described in
* mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT,
* MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
* family of functions, allowing them to support white space in mount names.
*
* A significant feature of mntfs is that it provides a file descriptor with a
* snapshot once it begins to consume mnttab data. Thus, as the process
* continues to consume data, its view of the in-kernel mnttab does not change
* even if resources are mounted or unmounted. The intent is to ensure that
* processes are guaranteed to read self-consistent data even as the system
* changes.
*
* The snapshot is implemented by a "database", unique to each zone, that
* comprises a linked list of mntelem_ts. The database is identified by
* zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains
* the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is
* marked with its time of "birth", i.e. creation. An element is "killed", and
* marked with its time of death, when it is found to be out of date, e.g. when
* the corresponding resource has been unmounted.
*
* When a process performs the first read() or ioctl() for a file descriptor for
* /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure
* that an element exists for each currently mounted resource. Following this,
* the current time is written into a snapshot structure, a mntsnap_t, embedded
* in the descriptor's mntnode_t.
*
* mntfs is able to enumerate the /etc/mnttab entries corresponding to a
* particular file descriptor by searching the database for entries that were
* born before the appropriate snapshot and that either are still alive or died
* after the snapshot was created. Consumers use the iterator function
* mntfs_get_next_elem() to identify the next suitable element in the database.
*
* Each snapshot has a hold on its corresponding database elements, effected by
* a per-element reference count. At last close(), a snapshot is destroyed in
* mntfs_freesnap() by releasing all of its holds; an element is destroyed if
* its reference count becomes zero. Therefore the database never exists unless
* there is at least one active consumer of /etc/mnttab.
*
* getmntent(3C) et al. "do not open, close or rewind the file." This implies
* that getmntent() and read() must be able to operate without interaction on
* the same file descriptor; this is accomplished by the use of separate
* mntsnap_ts for both read() and ioctl().
*
* mntfs observes the following lock-ordering:
*
* mnp->mnt_contents -> vfslist -> zonep->zone_mntfs_db_lock
*
* NOTE: The following variable enables the generation of the "dev=xxx"
* in the option string for a mounted file system. Really this should
* be gotten rid of altogether, but for the sake of backwards compatibility
* we had to leave it in. It is defined as a 32-bit device number. This
* means that when 64-bit device numbers are in use, if either the major or
* minor part of the device number will not fit in a 16 bit quantity, the
* "dev=" will be set to NODEV (0x7fffffff). See PSARC 1999/566 and
* 1999/131 for details. The cmpldev() function used to generate the 32-bit
* device number handles this check and assigns the proper value.
*/
extern void vfs_mono_time(timespec_t *);
/*
* Determine whether a field within a line from /etc/mnttab contains actual
* content or simply the marker string "-". This never applies to the time,
* therefore the delimiter must be a tab.
*/
static int
{
}
static int
{
}
/* Identify which, if either, of two supplied timespec structs is newer. */
static int
{
return (MNTFS_NEITHER);
return (MNTFS_SECOND);
} else {
return (MNTFS_FIRST);
}
}
static int
{
int i, size = 0;
continue;
if (size)
size++; /* space for comma */
/*
* count option value if there is one
*/
}
}
}
/*
* Add space for "zone=<zone_name>" if required.
*/
if (size)
size++; /* space for comma */
}
if (mntfs_enabledev) {
if (size != 0)
size++; /* space for comma */
}
if (size == 0)
return (size);
}
static int
{
int i, optinbuf = 0;
continue;
if (optinbuf)
*buf++ = ',';
else
optinbuf = 1;
/*
* print option value if there is one
*/
}
}
}
if (optinbuf)
*buf++ = ',';
else
optinbuf = 1;
}
if (mntfs_enabledev) {
if (optinbuf++)
*buf++ = ',';
}
if (!optinbuf) {
}
}
void
{
tabp->mnt_special = 0;
if (resource[0] != '/') {
/*
* Use the mount point as the resource.
*/
} else {
}
} else {
}
/*
* We know the mount point is visible from within the zone,
* otherwise it wouldn't be on the zone's vfs list.
*/
} else {
}
*cp++ = '\t';
}
static size_t
{
} else {
}
if (resource[0] != '/') {
/*
* Same as the zone's view of the mount point.
*/
} else {
}
} else {
}
return (size);
}
/* Destroy the resources associated with a snapshot element. */
static void
{
}
/*
* Return 1 if the given snapshot is in the range of the given element; return
* 0 otherwise.
*/
static int
{
/*
* If a snapshot is in range of an element then the snapshot must have
* been created after the birth of the element, and either the element
* is still alive or it died after the snapshot was created.
*/
(MNTFS_ELEM_IS_ALIVE(elemp) ||
return (1);
else
return (0);
}
/*
* Return the next valid database element, after the one provided, for a given
* snapshot; return NULL if none exists. The caller must hold the zone's
* database lock as a reader before calling this function.
*/
static mntelem_t *
{
do {
} while (elemp &&
return (elemp);
}
/*
* This function frees the resources associated with a mntsnap_t. It walks
* through the database, decrementing the reference count of any element that
* satisfies the snapshot. If the reference count of an element becomes zero
* then it is removed from the database.
*/
static void
{
size_t number_decremented = 0;
/* Ignore an uninitialised snapshot. */
if (snapp->mnts_nmnts == 0)
return;
/* Drop the holds on any matching database elements. */
} else {
}
}
/* Clear the snapshot data. */
}
/* Insert the new database element newp after the existing element prevp. */
static void
{
}
/* Create and return a copy of a given database element. */
static mntelem_t *
{
return (copyp);
}
/*
* Compare two database elements and determine whether or not the vfs_t payload
* data of each are the same. Return 1 if so and 0 otherwise.
*/
static int
{
if (a->mnte_hidden == b->mnte_hidden &&
a->mnte_text_size == b->mnte_text_size &&
return (1);
else
return (0);
}
/*
* mntfs_snapshot() updates the database, creating it if necessary, so that it
* accurately reflects the state of the in-kernel mnttab. It also increments
* the reference count on all database elements that correspond to currently-
* mounted resources. Finally, it initialises the appropriate snapshot
* structure.
*
* Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs,
* when it is inserted into the in-kernel mnttab. This time stamp is copied into
* the corresponding database element when it is created, allowing the element
* and the vfs_t to be identified as a pair. It is possible that some file
* systems may make unadvertised changes to, for example, a resource's mount
* options. Therefore, in order to determine whether a database element is an
* up-to-date representation of a given vfs_t, it is compared with a temporary
* element generated for this purpose. Although less efficient, this is safer
* than implementing an mtime for a vfs_t.
*
* Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These
* are considered invisible unless the user has already set the MNT_SHOWHIDDEN
* flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl.
*/
static void
{
int order;
size_t total_text_size = 0;
size_t normal_text_size = 0;
int insert_before;
/*
* If this snapshot already exists then we must have been asked to
* rewind the file, i.e. discard the snapshot and create a new one in
* its place. In this case we first see if the in-kernel mnttab has
* advertised a change; if not then we simply reinitialise the metadata.
*/
if (snapp->mnts_nmnts) {
/*
* An unchanged mtime is no guarantee that the
* in-kernel mnttab is unchanged; for example, a
* concurrent remount may be between calls to
* vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd().
* It follows that the database may have changed, and
* in particular that some elements in this snapshot
* may have been killed by another call to
* mntfs_snapshot(). It is therefore not merely
* unnecessary to update the snapshot's time but in
* fact dangerous; it needs to be left alone.
*/
return;
} else {
}
}
/*
* Create a temporary database element. For each vfs_t, the temporary
* element will be populated with the corresponding text. If the vfs_t
* does not have a corresponding element within the database, or if
* there is such an element but it is stale, a copy of the temporary
* element is inserted into the database at the appropriate location.
*/
/* Find the first and last vfs_t for the given zone. */
if (is_global_zone) {
} else {
/*
* If there isn't already a vfs_t for root then we create a
* dummy which will be used as the head of the list (which will
* therefore no longer be circular).
*/
zonep->zone_rootpath) != 0) {
/*
* The zone's vfs_ts will have mount points relative to
* the zone's root path. The vfs_t for the zone's
* root file system would therefore have a mount point
* equal to the zone's root path. Since the zone's root
* path isn't a mount point, we copy the vfs_t of the
* zone's root vnode, and provide it with a fake mount
* and resource. However, if the zone's root is a
* zfs dataset, use the dataset name as the resource.
*
* Note that by cloning another vfs_t we also acquire
* its high-resolution ctime. This might appear to
* violate the requirement that the ctimes in the list
* of vfs_ts are unique and monotonically increasing;
* this is not the case. The dummy vfs_t appears in only
* a non-global zone's vfs_t list, where the cloned
* vfs_t would not ordinarily be visible; the ctimes are
* therefore unique. The zone's root path must be
* available before the zone boots, and so its root
* vnode's vfs_t's ctime must be lower than those of any
* resources subsequently mounted by the zone. The
* ctimes are therefore monotonically increasing.
*/
!= 0)
} else {
}
} else {
}
}
/*
* Now walk through all the vfs_ts for this zone. For each one, find the
* corresponding database element, creating it first if necessary, and
* increment its reference count.
*/
/* CSTYLED */
/* Consider only visible entries. */
/*
* Walk through the existing database looking for either
* an element that matches the current vfs_t, or for the
* correct place in which to insert a new element.
*/
insert_before = 0;
elemp);
/* Compare the vfs_t with the element. */
&vfsp->vfs_hrctime);
/*
* If we encounter a database element newer than
* this vfs_t then we've stepped over a gap
* where the element for this vfs_t must be
* inserted.
*/
if (order == MNTFS_FIRST) {
insert_before = 1;
break;
}
/* Dead elements no longer interest us. */
if (MNTFS_ELEM_IS_DEAD(elemp))
continue;
/*
* If the time stamps are the same then the
* element is a potential match for the vfs_t,
* although it may later prove to be stale.
*/
if (order == MNTFS_NEITHER)
break;
/*
* This element must be older than the vfs_t.
* It must, therefore, correspond to a vfs_t
* that has been unmounted. Since the element is
* still alive, we kill it if it is visible.
*/
}
/* Create a new database element if required. */
if (new_entry_length > entry_length) {
}
/*
* We ran off the end of the database. Insert a
* new element at the end.
*/
if (prevp) {
} else {
}
} else if (insert_before) {
/*
* Insert a new element before the current one.
*/
if (prevp) {
} else {
}
/*
* The element corresponds to the vfs_t, but the
* vfs_t has changed; it must have been
* remounted. Kill the old element and insert a
* new one after it.
*/
}
/* We've found the corresponding element. Hold it. */
elemp->mnte_refcnt++;
/*
* Update the parameters used to initialise the
* snapshot.
*/
nmnts++;
if (!elemp->mnte_hidden)
if (!firstp)
}
break;
}
/*
* Any remaining visible database elements that are still alive must be
* killed now, because their corresponding vfs_ts must have been
* unmounted.
*/
if (MNTFS_ELEM_IS_ALIVE(elemp) &&
}
/* Initialise the snapshot. */
/*
* Record /etc/mnttab's current size and mtime for possible future use
* by mntgetattr().
*/
if (show_hidden) {
}
/* Clean up. */
}
/*
* Public function to convert vfs_mntopts into a string.
* A buffer of sufficient size is allocated, which is returned via bufp,
* and whose length is returned via lenp.
*/
void
{
char *buf;
return;
}
}
/* ARGSUSED */
static int
{
/*
* Not allowed to open for writing, return error.
*/
return (EPERM);
/*
* Create a new mnt/vnode for each open; this will give us a handle to
* hang the snapshot on.
*/
return (0);
}
/* ARGSUSED */
static int
{
/* Clean up any locks or shares held by the current process */
if (count > 1)
return (0);
}
return (0);
}
/* ARGSUSED */
static int
{
char *bufferp;
int error = 0;
return (EFAULT);
}
if (len == 0) {
return (0);
}
/*
* For the file offset provided, locate the corresponding database
* element and calculate the corresponding offset within its text. If
* the file offset is the same as that reached during the last read(2)
* then use the saved element and intra-element offset.
*/
} else {
/*
* Find the element corresponding to the requested file offset
* by walking through the database and summing the text sizes
* of the individual elements. If the requested file offset is
* greater than that reached on the last visit then we can start
* at the last seen element; otherwise, we have to start at the
* beginning.
*/
} else {
total_off = 0;
}
}
/* Calculate the intra-element offset. */
else
ieoffset = 0;
}
/*
* Create a buffer and populate it with the text from successive
* database elements until it is full.
*/
ieoffset = 0;
} else {
}
}
/*
* Write the populated buffer, update the snapshot's state if
* successful and then advertise our read.
*/
if (error == 0) {
}
/* Clean up. */
return (error);
}
static int
{
int error;
extern timespec_t vfs_mnttab_ctime;
/* AT_MODE, AT_UID and AT_GID are derived from the underlying file. */
return (error);
}
/*
* There are some minor subtleties in the determination of
* /etc/mnttab's size and mtime. We wish to avoid any condition in
* which, in the vicinity of a change to the in-kernel mnttab, we
* return an old value for one but a new value for the other. We cannot
* simply hold vfslist for the entire calculation because we might need
* to call mntfs_snapshot(), which calls vfs_list_read_lock().
*/
} else {
}
/*
* The mntnode already has at least one snapshot from
* which to take the size; the user will understand from
* mnttab(4) that the current size of the in-kernel
* mnttab is irrelevant.
*/
/*
* There is no existing valid snapshot but the in-kernel
* mnttab has not changed since the time that the last
* one was generated. Use the old file size; note that
* it is guaranteed to be consistent with mtime, which
* may be returned to the user later.
*/
} else {
/*
* There is no snapshot and the in-kernel mnttab has
* changed since the last one was created. We generate a
* new snapshot which we use for not only the size but
* also the mtime, thereby ensuring that the two are
* consistent.
*/
}
}
/* Always look like a regular file. */
/* Mode should basically be read only. */
/* Nodeid is always ROOTINO. */
/*
* Set nlink to the number of open vnodes for mnttab info
* plus one for existing.
*/
if (mask & AT_BLKSIZE)
if (mask & AT_NBLOCKS)
return (0);
}
static int
{
return (EROFS);
/*
* Do access check on the underlying directory vnode.
*/
}
/*
* New /mntfs vnode required; allocate it and fill in most of the fields.
*/
static mntnode_t *
{
return (mnp);
}
/*
* Free the storage obtained from mntgetnode().
*/
static void
{
vn_invalid(vp);
}
/* ARGSUSED */
static int
{
return (0);
}
/* ARGSUSED */
static void
{
}
/*
* lseek(2) is supported only to rewind the file by resetmnttab(3C). Rewinding
* has a special meaning for /etc/mnttab: it forces mntfs to refresh the
* snapshot at the next ioctl().
*
* mnttab(4) explains that "the snapshot...is taken any time a read(2) is
* performed at offset 0". We therefore ignore the read snapshot here.
*/
/* ARGSUSED */
static int
{
if (*noffp == 0) {
}
return (0);
}
/*
* Return the answer requested to poll().
* POLLRDBAND will return when the mtime of the mnttab
* information is newer than the latest one read for this open.
*/
/* ARGSUSED */
static int
{
else
*revp = 0;
if (ev & POLLRDNORM)
*revp |= POLLRDNORM;
if (ev & POLLRDBAND) {
*revp |= POLLRDBAND;
}
return (0);
}
/*
* If someone is polling for unsupported poll events (e.g.
* POLLOUT, POLLPRI, etc.), just return POLLERR revents.
* That way we will ensure that we don't return a 0
* revents with a NULL pollhead pointer.
*/
return (0);
}
/*
* mntfs_same_word() returns 1 if two words are the same in the context of
* MNTIOC_GETMNTANY and 0 otherwise.
*
* worda is a memory address that lies somewhere in the buffer bufa; it cannot
* be NULL since this is used to indicate to getmntany(3C) that the user does
* not wish to match a particular field. The text to which worda points is
* supplied by the user; if it is not null-terminated then it cannot match.
*
* Buffer bufb contains a line from /etc/mnttab, in which the fields are
* delimited by tab or new-line characters. offb is the offset of the second
* word within this buffer.
*
* mntfs_same_word() returns 1 if the words are the same and 0 otherwise.
*/
int
{
int bytes_remaining;
worda++;
wordb++;
}
if (bytes_remaining &&
return (1);
else
return (0);
}
/*
* mntfs_special_info_string() returns which, if either, of VBLK or VCHR
* corresponds to a supplied path. If the path is a special device then the
* function optionally sets the major and minor numbers.
*/
{
int error;
return (0);
}
return (type);
} else {
return (0);
}
}
/*
* mntfs_special_info_element() extracts the name of the mounted resource
* for a given element and copies it into a null-terminated string, which it
* then passes to mntfs_special_info_string().
*/
{
char *newpath;
return (type);
}
/*
* Convert an address that points to a byte within a user buffer into an
* address that points to the corresponding offset within a kernel buffer. If
* the user address is NULL then make no conversion. If the address does not
* lie within the buffer then reset it to NULL.
*/
char *
{
return (NULL);
else
}
/*
* These 32-bit versions are to support STRUCT_DECL(9F) etc. in
* mntfs_copyout_element() and mntioctl().
*/
#ifdef _SYSCALL32_IMPL
typedef struct extmnttab32 {
typedef struct mnttab32 {
} mnttab32_t;
struct mntentbuf32 {
};
#endif
/*
* mntfs_copyout_element() is common code for the MNTIOC_GETMNTENT,
* MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identified the
* database element desired by the user, this function copies out the text and
* the pointers to the relevant userland addresses. It returns 0 on success
* and non-zero otherwise.
*/
int
{
char *kbufp;
int error = 0;
/*
* We create a struct extmnttab within the kernel of the size
* determined by the user's data model. We then populate its
* fields by combining the start address of the text buffer
* supplied by the user, ubufp, with the offsets stored for
* this database element within dbtabp, a pointer to a struct
* extmnttab.
*
* Note that if the corresponding field is "-" this signifies
* no real content, and we set the address to NULL. This does
* not apply to mnt_time.
*/
if (cmd == MNTIOC_GETEXTMNTENT) {
} else {
}
return (EFAULT);
/*
* We create a text buffer in the kernel into which we copy the
* /etc/mnttab entry for this element. We change the tab and
* new-line delimiters to null bytes before copying out the
* buffer.
*/
return (error);
}
/* ARGSUSED */
static int
{
int error = 0;
switch (cmd) {
case MNTIOC_NMNTS: /* get no. of mounted resources */
{
if (snapp->mnts_nmnts == 0 ||
}
if (snapp->mnts_nmnts == 0 ||
}
break;
}
case MNTIOC_GETDEVLIST: /* get mounted device major/minor nos */
{
int i = 0;
if (snapp->mnts_nmnts == 0 ||
}
if (snapp->mnts_nmnts == 0 ||
}
/* Create a local buffer to hold the device numbers. */
/*
* Walk the database elements for this snapshot and add their
* major and minor numbers.
*/
i++;
}
break;
}
case MNTIOC_SETTAG: /* set tag on mounted file system */
case MNTIOC_CLRTAG: /* clear tag on mounted file system */
{
char *cptr;
char tagbuf[MAX_MNTOPT_TAG];
char *pbuf;
break;
}
if (zone != global_zone) {
/* truncate "/" and nul */
}
if (error) {
break;
}
break;
}
break;
}
if (cmd == MNTIOC_SETTAG)
else
break;
}
case MNTIOC_SHOWHIDDEN:
{
break;
}
case MNTIOC_GETMNTANY:
{
char *ubufp; /* uaddr of user's text buf */
char *prefbuf; /* our copy of user's text */
char *dbbufp; /* element's text buf */
/*
* embuf is a struct embuf within the kernel. We copy into it
* the struct embuf supplied by the user.
*/
STRUCT_SIZE(embuf))) {
break;
}
/*
* Check that the text buffer offered by the user is the
* agreed size.
*/
if (ubufsize != MNT_LINE_MAX) {
break;
}
/* Copy the user-supplied entry into a local buffer. */
break;
}
/* Ensure that any string within it is null-terminated. */
/* Copy in the user-supplied mpref */
break;
}
/*
* Copy the members of the user's pref struct into a local
* struct. The pointers need to be offset and verified to
* ensure that they lie within the bounds of the buffer.
*/
/*
* If the user specifies a mounted resource that is a special
* device then we capture its mode and major and minor numbers;
* cf. the block comment below.
*/
if (snapp->mnts_nmnts == 0 ||
/*
* This is the core functionality that implements getmntany().
* We walk through the mntfs database until we find an element
* matching the user's preferences that are contained in
* preftab. Typically, this means checking that the text
* matches. However, the mounted resource is special: if the
* user is looking for a special device then we must find a
* database element with the same major and minor numbers and
* the same type, i.e. VBLK or VCHR. The type is not recorded
* in the element because it cannot be inferred from the vfs_t.
* We therefore check the type of suitable candidates via
* mntfs_special_info_element(); since this calls into the
* underlying file system we make sure to drop the database lock
* first.
*/
for (;;) {
elemp)) {
if (((type &&
MNTFS_REAL_FIELD(dbbufp)) ||
dbbufsize)))) &&
dbbufsize)) &&
dbbufsize)) &&
dbbufsize)) &&
dbbufsize)))
break;
}
break;
}
/* If we failed to find a match then return EOF. */
break;
}
/*
* Check that the text buffer offered by the user will be large
* enough to accommodate the text for this entry.
*/
*rvalp = MNTFS_TOOLONG;
break;
}
/*
* Populate the user's struct mnttab and text buffer using the
* element's contents.
*/
} else {
}
break;
}
case MNTIOC_GETMNTENT:
case MNTIOC_GETEXTMNTENT:
{
char *ubufp; /* uaddr of user's text buf */
if (snapp->mnts_nmnts == 0 ||
break;
}
/*
* embuf is a struct embuf within the kernel. We copy into it
* the struct embuf supplied by the user.
*/
STRUCT_SIZE(embuf))) {
break;
}
/*
* Check that the text buffer offered by the user will be large
* enough to accommodate the text for this entry.
*/
*rvalp = MNTFS_TOOLONG;
break;
}
/*
* Populate the user's struct mnttab and text buffer using the
* element's contents.
*/
} else {
}
break;
}
default:
break;
}
return (error);
}
/*
* mntfs provides a new vnode for each open(2). Two vnodes will represent the
* same instance of /etc/mnttab if they share the same (zone-specific) vfs.
*/
/* ARGSUSED */
int
{
}
/*
* /mntfs vnode operations vector
*/
const fs_operation_def_t mnt_vnodeops_template[] = {
};