fs/mntfs/mntvnops.c

	mntvnops.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

#include <sys/file.h>
#include <sys/stat.h>
#include <sys/atomic.h>
#include <sys/mntio.h>
#include <sys/mnttab.h>
#include <sys/mount.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/fs/mntdata.h>
#include <fs/fs_subr.h>
#include <sys/vmsystm.h>
#include <vm/seg_vn.h>

/* Node id reported for the mnttab file (stored in va_nodeid by mntgetattr). */
#define MNTROOTINO  2

/* Forward declaration; mntopen() uses it before the definition appears. */
static mntnode_t *mntgetnode(vnode_t *);

/* Operations vector that mntgetnode() installs on the vnodes it creates. */
vnodeops_t *mntvnodeops;

/*
 * Design of kernel mnttab accounting.
 *
 * To support whitespace in mount names, we implement an ioctl
 * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
 * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
 * atop this interface.
 *
 * To minimize the amount of memory used in the kernel, we keep all the
 * necessary information in the user's address space.  Large server
 * configurations can have /etc/mnttab files in excess of 64k.
 *
 * To support both vanilla read() calls as well as ioctl() calls, we have two
 * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
 * These snapshots include the base location in user memory, the number of
 * mounts in the snapshot, and any metadata associated with it.  The metadata is
 * used only to support the ioctl() interface, and is a series of extmnttab
 * structures.  When the user issues an ioctl(), we simply copyout a pointer to
 * that structure, and the rest is handled in userland.
 */

/*
 * NOTE: The following variable enables the generation of the "dev=xxx"
 * entry in the option string for a mounted file system.  Ideally this
 * would be removed altogether, but it has been retained for the sake of
 * backwards compatibility.  The value is defined as a 32-bit device
 * number.  This means that when 64-bit device numbers are in use, if
 * either the major or minor part of the device number will not fit in a
 * 16 bit quantity, the "dev=" will be set to NODEV (0x7fffffff).  See
 * PSARC 1999/566 and 1999/131 for details.  The cmpldev() function used to
 * generate the 32-bit device number handles this check and assigns the
 * proper value.
 */
int mntfs_enabledev = 1;    /* enable old "dev=xxx" option */

/*
 * Return the number of characters (not counting the terminating NUL) that
 * the "dev=<hex>" option occupies for the given vfs.  Nothing is written;
 * snprintf() is used purely to measure.
 */
static int
mntfs_devsize(struct vfs *vfsp)
{
    dev32_t dev32;
    int nchars;

    (void) cmpldev(&dev32, vfsp->vfs_dev);
    nchars = snprintf(NULL, 0, "dev=%x", dev32);
    return (nchars);
}

/*
 * Format the "dev=<hex>" option for the given vfs into buf and return the
 * value reported by snprintf().  The write is bounded by MAX_MNTOPT_STR
 * rather than by the space actually left in the caller's buffer.
 */
static int
mntfs_devprint(struct vfs *vfsp, char *buf)
{
    dev32_t dev32;
    int nchars;

    (void) cmpldev(&dev32, vfsp->vfs_dev);
    nchars = snprintf(buf, MAX_MNTOPT_STR, "dev=%x", dev32);
    return (nchars);
}

/*
 * Compute the length (excluding the terminating NUL) of the option string
 * that mntfs_optprint() produces for the given vfs: the comma-separated
 * displayable options that are set, an optional "zone=<name>" entry for
 * non-global zones, and the optional "dev=<hex>" entry.  When nothing at
 * all would be printed, the length of the placeholder "-" is returned.
 */
static int
mntfs_optsize(struct vfs *vfsp)
{
    int len = 0;
    int idx;

    for (idx = 0; idx < vfsp->vfs_mntopts.mo_count; idx++) {
        mntopt_t *opt = &vfsp->vfs_mntopts.mo_list[idx];

        /* Only options that are set and not hidden are counted. */
        if ((opt->mo_flags & MO_NODISPLAY) ||
            !(opt->mo_flags & MO_SET))
            continue;

        if (len != 0)
            len++;  /* separating comma */
        len += strlen(opt->mo_name);
        if (opt->mo_arg != NULL)
            len += strlen(opt->mo_arg) + 1; /* '=' plus value */
    }

    if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
        /* Room for "zone=<zone_name>". */
        if (len != 0)
            len++;  /* separating comma */
        len += strlen("zone=");
        len += strlen(vfsp->vfs_zone->zone_name);
    }

    if (mntfs_enabledev) {
        if (len != 0)
            len++;  /* separating comma */
        len += mntfs_devsize(vfsp);
    }

    if (len == 0)
        len = strlen("-");
    return (len);
}

/*
 * Write the option string for the given vfs into buf and return the number
 * of characters produced (excluding the terminating NUL left by the last
 * snprintf()).  The layout mirrors mntfs_optsize(): displayable options
 * that are set, then "zone=<name>" for non-global zones, then "dev=<hex>"
 * if enabled, all comma separated; "-" if nothing else was printed.
 *
 * Each snprintf() is bounded by MAX_MNTOPT_STR, not by the remaining room
 * in buf, so callers size the buffer with mntfs_optsize() beforehand.
 */
static int
mntfs_optprint(struct vfs *vfsp, char *buf)
{
    char *cur = buf;
    int have_opt = 0;
    int idx;

    for (idx = 0; idx < vfsp->vfs_mntopts.mo_count; idx++) {
        mntopt_t *opt = &vfsp->vfs_mntopts.mo_list[idx];

        if ((opt->mo_flags & MO_NODISPLAY) ||
            !(opt->mo_flags & MO_SET))
            continue;

        if (have_opt)
            *cur++ = ',';
        have_opt = 1;

        cur += snprintf(cur, MAX_MNTOPT_STR, "%s", opt->mo_name);
        /* Append the option's value when it has one. */
        if (opt->mo_arg != NULL)
            cur += snprintf(cur, MAX_MNTOPT_STR, "=%s", opt->mo_arg);
    }

    if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
        if (have_opt)
            *cur++ = ',';
        have_opt = 1;
        cur += snprintf(cur, MAX_MNTOPT_STR, "zone=%s",
            vfsp->vfs_zone->zone_name);
    }

    if (mntfs_enabledev) {
        if (have_opt)
            *cur++ = ',';
        have_opt = 1;
        cur += mntfs_devprint(vfsp, cur);
    }

    if (!have_opt)
        cur += snprintf(cur, MAX_MNTOPT_STR, "-");

    return (cur - buf);
}

/*
 * Return the number of bytes one mnttab entry for vfsp occupies as seen
 * from the given zone: mount point, resource and file system type (each
 * followed by one separator byte), the option string, and finally the
 * separator, modification time and terminator.  Empty or missing mount
 * point/resource strings are accounted for as "-".
 */
static size_t
mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
{
    const char *mntpt = refstr_value(vfsp->vfs_mntpt);
    const char *resource = refstr_value(vfsp->vfs_resource);
    size_t len = 0;

    /* Mount point field. */
    if (mntpt == NULL || mntpt[0] == '\0')
        len += strlen("-") + 1;
    else
        len += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;

    /* Resource field. */
    if (resource == NULL || resource[0] == '\0') {
        len += strlen("-") + 1;
    } else if (resource[0] != '/') {
        /* Not a path; reported verbatim. */
        len += strlen(resource) + 1;
    } else if (ZONE_PATH_VISIBLE(resource, zone)) {
        len += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
    } else {
        /*
         * Path is not visible from the zone; the zone's view of the
         * mount point is reported in its place.
         */
        len += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
    }

    /* File system type, options, and trailing time stamp. */
    len += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
    len += mntfs_optsize(vfsp);
    len += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);

    return (len);
}

/*
 * Build a stand-in vfs_t describing the root of a zone whose root is not
 * itself a mount point.  The vfs that holds the zone's root vnode is copied
 * wholesale, then its mount point and resource are both replaced by a
 * refstr made from the zone's root path.  The two fields share that one
 * refstr; the callers in this file release it once through vfs_mntpt.
 */
static void
mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
{
    vfs_t *realvfsp = zone->zone_rootvp->v_vfsp;

    *rootvfsp = *realvfsp;
    rootvfsp->vfs_resource = rootvfsp->vfs_mntpt =
        refstr_alloc(zone->zone_rootpath);
}

/*
 * Compute the total text size of the mnttab as seen from a zone and store
 * the number of entries in *nent_ptr.  Mounts flagged VFS_NOMNTTAB are
 * skipped unless showhidden is set.
 *
 * If the zone has a root entry it is first on the zone's vfs list; when it
 * does not (empty list, or the first entry's mount point differs from the
 * zone root path) a synthetic root entry is counted as well.
 */
static size_t
mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
{
    struct vfs *head = zone->zone_vfslist;
    struct vfs *cur;
    size_t total = 0;
    uint_t nent = 0;

    ASSERT(zone->zone_rootpath != NULL);

    if (head == NULL ||
        strcmp(refstr_value(head->vfs_mntpt), zone->zone_rootpath) != 0) {
        vfs_t fakeroot;

        /*
         * The zone root is not a mount point, so report the vfs of the
         * zone's root vnode instead.
         */
        ASSERT(zone != global_zone);
        mntfs_zonerootvfs(zone, &fakeroot);
        total += mntfs_vfs_len(&fakeroot, zone);
        refstr_rele(fakeroot.vfs_mntpt);
        nent++;
    }

    if (head != NULL) {
        /* The zone list is circular; stop once we are back at the head. */
        cur = head;
        do {
            if (showhidden || !(cur->vfs_flag & VFS_NOMNTTAB)) {
                nent++;
                total += mntfs_vfs_len(cur, zone);
            }
            cur = cur->vfs_zone_next;
        } while (cur != head);
    }

    *nent_ptr = nent;
    return (total);
}

/*
 * Compute the total text size of the mnttab as seen from the global zone
 * and store the number of entries in *nent_ptr.  The list starting at
 * rootvfs is walked until it wraps back to rootvfs; mounts flagged
 * VFS_NOMNTTAB are skipped unless showhidden is set.
 */
static size_t
mntfs_global_len(uint_t *nent_ptr, int showhidden)
{
    struct vfs *cur = rootvfs;
    size_t total = 0;
    uint_t nent = 0;

    do {
        if (showhidden || !(cur->vfs_flag & VFS_NOMNTTAB)) {
            nent++;
            total += mntfs_vfs_len(cur, global_zone);
        }
        cur = cur->vfs_next;
    } while (cur != rootvfs);

    *nent_ptr = nent;
    return (total);
}

/*
 * Emit one mnttab entry for vfsp at *basep and advance *basep past it.
 *
 * Fields are written in the order resource, mount point, file system type,
 * options, time.  When forread is set the fields are separated by tabs and
 * the entry ends with a newline; otherwise every field is NUL terminated.
 * If tab is non-NULL, it receives pointers to the start of each field in
 * the output buffer plus the major/minor numbers of the vfs device.
 */
static void
mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
    char **basep, int forread)
{
    const char *mntpt = refstr_value(vfsp->vfs_mntpt);
    const char *resource = refstr_value(vfsp->vfs_resource);
    const char fieldsep = forread ? '\t' : '\0';
    const char recsep = forread ? '\n' : '\0';
    char *out = *basep;

    /* Resource. */
    if (tab != NULL)
        tab->mnt_special = out;
    if (resource == NULL || resource[0] == '\0') {
        out += snprintf(out, MAXPATHLEN, "-");
    } else if (resource[0] != '/') {
        out += snprintf(out, MAXPATHLEN, "%s", resource);
    } else if (ZONE_PATH_VISIBLE(resource, zone)) {
        out += snprintf(out, MAXPATHLEN, "%s",
            ZONE_PATH_TRANSLATE(resource, zone));
    } else {
        /* Resource is hidden from the zone; show the mount point. */
        out += snprintf(out, MAXPATHLEN, "%s",
            ZONE_PATH_TRANSLATE(mntpt, zone));
    }
    *out++ = fieldsep;

    /*
     * Mount point.  It is known to be visible from within the zone,
     * otherwise the vfs would not be on the zone's vfs list.
     */
    if (tab != NULL)
        tab->mnt_mountp = out;
    if (mntpt == NULL || mntpt[0] == '\0') {
        out += snprintf(out, MAXPATHLEN, "-");
    } else {
        out += snprintf(out, MAXPATHLEN, "%s",
            ZONE_PATH_TRANSLATE(mntpt, zone));
    }
    *out++ = fieldsep;

    /* File system type. */
    if (tab != NULL)
        tab->mnt_fstype = out;
    out += snprintf(out, MAXPATHLEN, "%s",
        vfssw[vfsp->vfs_fstype].vsw_name);
    *out++ = fieldsep;

    /* Mount options. */
    if (tab != NULL)
        tab->mnt_mntopts = out;
    out += mntfs_optprint(vfsp, out);
    *out++ = fieldsep;

    /* Mount time; this field closes the entry. */
    if (tab != NULL)
        tab->mnt_time = out;
    out += snprintf(out, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
    *out++ = recsep;

    if (tab != NULL) {
        tab->mnt_major = getmajor(vfsp->vfs_dev);
        tab->mnt_minor = getminor(vfsp->vfs_dev);
    }

    *basep = out;
}

/*
 * Generate the mnttab text (and, when tab is non-NULL, one extmnttab record
 * per entry) for a zone, starting at basep.  Entry selection matches
 * mntfs_zone_len(): a synthetic root entry is emitted first when the zone's
 * root is not a mount point, and VFS_NOMNTTAB mounts are skipped unless
 * showhidden is set.
 */
static void
mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
    char *basep, int forread)
{
    vfs_t *head = zone->zone_vfslist;
    vfs_t *cur;
    char *out = basep;

    if (head == NULL ||
        strcmp(refstr_value(head->vfs_mntpt), zone->zone_rootpath) != 0) {
        vfs_t fakeroot;

        /*
         * The zone root is not a mount point, so report the vfs of the
         * zone's root vnode instead.
         */
        ASSERT(zone != global_zone);
        mntfs_zonerootvfs(zone, &fakeroot);
        mntfs_vfs_generate(&fakeroot, zone, tab, &out, forread);
        refstr_rele(fakeroot.vfs_mntpt);
        if (tab != NULL)
            tab++;
    }

    if (head == NULL)
        return;

    /* The zone list is circular; stop once we are back at the head. */
    cur = head;
    do {
        if (showhidden || !(cur->vfs_flag & VFS_NOMNTTAB)) {
            mntfs_vfs_generate(cur, zone, tab, &out, forread);
            if (tab != NULL)
                tab++;
        }
        cur = cur->vfs_zone_next;
    } while (cur != head);
}

static void
mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
    int forread)
{
    vfs_t *vfsp;
    char *cp = basep;

    vfsp = rootvfs;
    do {
        /*
         * Skip mounts that should not show up in mnttab
         */
        if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
            vfsp = vfsp->vfs_next;
            continue;
        }
        mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
        if (tab)
            tab++;
        vfsp = vfsp->vfs_next;
    } while (vfsp != rootvfs);
}

/*
 * Create a new page-rounded mapping in the current process's address space
 * and copy size bytes from the kernel buffer base into it.  Returns the
 * user address of the mapping, or NULL if no address could be chosen, the
 * mapping could not be created, or the copyout failed (in which case the
 * mapping is removed again).
 */
static char *
mntfs_mapin(char *base, size_t size)
{
    struct as *uas = curproc->p_as;
    size_t maplen = roundup(size, PAGESIZE);
    char *uaddr;
    int failed;

    /* Address selection and mapping happen under the range lock. */
    as_rangelock(uas);
    map_addr(&uaddr, maplen, 0, 1, 0);
    failed = (uaddr == NULL ||
        as_map(uas, uaddr, maplen, segvn_create, zfod_argsp) != 0);
    as_rangeunlock(uas);

    if (failed)
        return (NULL);

    if (copyout(base, uaddr, size) != 0) {
        (void) as_unmap(uas, uaddr, maplen);
        return (NULL);
    }
    return (uaddr);
}

/*
 * Release the user-space mappings held by a snapshot and reset it to the
 * empty state.
 *
 * The text and metadata pointers are cleared once their mappings are gone.
 * A snapshot can be freed more than once (for example when a snapshot
 * attempt fails and the node is later closed), and a stale pointer would
 * otherwise cause a second unmap request against an address the process no
 * longer owns through this snapshot.
 */
static void
mntfs_freesnap(mntsnap_t *snap)
{
    if (snap->mnts_text != NULL) {
        (void) as_unmap(curproc->p_as, snap->mnts_text,
            roundup(snap->mnts_textsize, PAGESIZE));
        snap->mnts_text = NULL;
    }
    snap->mnts_textsize = snap->mnts_count = 0;

    if (snap->mnts_metadata != NULL) {
        (void) as_unmap(curproc->p_as, snap->mnts_metadata,
            roundup(snap->mnts_metasize, PAGESIZE));
        snap->mnts_metadata = NULL;
    }
    snap->mnts_metasize = 0;
}

#ifdef _SYSCALL32_IMPL

/*
 * 32-bit form of struct extmnttab, produced by mntfs_snapshot() for ILP32
 * callers.  The five string fields hold 32-bit user addresses pointing into
 * the snapshot text mapped in the caller's address space.
 */
typedef struct extmnttab32 {
    uint32_t    mnt_special;    /* user address of the resource string */
    uint32_t    mnt_mountp;     /* user address of the mount point string */
    uint32_t    mnt_fstype;     /* user address of the fs type string */
    uint32_t    mnt_mntopts;    /* user address of the option string */
    uint32_t    mnt_time;       /* user address of the mount time string */
    uint_t      mnt_major;      /* major number of the mounted device */
    uint_t      mnt_minor;      /* minor number of the mounted device */
} extmnttab32_t;

#endif

/*
 * Snapshot the latest version of the kernel mounted resource information
 *
 * There are two types of snapshots: one destined for reading, and one destined
 * for ioctl().  The difference is that the ioctl() interface is delimited by
 * NUL characters, while the read() interface is delimited by tabs and
 * newlines.
 *
 * mnp is the node whose snapshot (mnt_read when forread is set, mnt_ioctl
 * otherwise) is refreshed.  datamodel is only consulted for ioctl snapshots,
 * where it selects the layout of the metadata records handed to userland.
 *
 * An existing snapshot is kept as is when the mnttab modification time has
 * not changed since it was taken.  Returns 0 on success, or ENOMEM when the
 * text or metadata could not be mapped into the caller's address space; in
 * that case the snapshot is left empty.
 */
/* ARGSUSED */
static int
mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
{
    size_t size;
    timespec_t lastmodt;
    mntdata_t *mntdata = MTOD(mnp);
    zone_t *zone = mntdata->mnt_zone;
    boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
    boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
    struct extmnttab *metadata_baseaddr;
    char *text_baseaddr;
    int i;
    mntsnap_t *snap;

    if (forread)
        snap = &mnp->mnt_read;
    else
        snap = &mnp->mnt_ioctl;

    /*
     * The vfs list read lock is held across both the sizing pass and the
     * generation pass below, up to the vfs_list_unlock() that follows them.
     */
    vfs_list_read_lock();
    /*
     * Check if the mnttab info has changed since the last snapshot
     */
    vfs_mnttab_modtime(&lastmodt);
    if (snap->mnts_count &&
        lastmodt.tv_sec == snap->mnts_time.tv_sec &&
        lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
        vfs_list_unlock();
        return (0);
    }


    /* Drop the stale snapshot before building its replacement. */
    if (snap->mnts_count != 0)
        mntfs_freesnap(snap);
    /* First pass: count the entries and compute the text size. */
    if (global_view)
        size = mntfs_global_len(&snap->mnts_count, showhidden);
    else
        size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
    ASSERT(size != 0);

    /* Only ioctl snapshots carry per-entry metadata. */
    if (!forread)
        metadata_baseaddr = kmem_alloc(
            snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
    else
        metadata_baseaddr = NULL;

    text_baseaddr = kmem_alloc(size, KM_SLEEP);

    /* Second pass: fill the kernel staging buffers. */
    if (global_view)
        mntfs_global_generate(showhidden, metadata_baseaddr,
            text_baseaddr, forread);
    else
        mntfs_zone_generate(zone, showhidden,
            metadata_baseaddr, text_baseaddr, forread);

    /* Remember which mnttab modification time this snapshot reflects. */
    vfs_mnttab_modtime(&snap->mnts_time);
    vfs_list_unlock();

    /*
     * Copy the text into a fresh mapping in the caller's address space.
     * After the kmem_free(), text_baseaddr is used below only as the base
     * for pointer arithmetic and is never dereferenced.
     */
    snap->mnts_text = mntfs_mapin(text_baseaddr, size);
    snap->mnts_textsize = size;
    kmem_free(text_baseaddr, size);

    /*
     * The pointers in the metadata refer to addresses in the range
     * [base_addr, base_addr + size].  Now that we have mapped the text into
     * the user's address space, we have to convert these addresses into the
     * new (user) range.  We also handle the conversion for 32-bit and
     * 64-bit applications here.
     */
    if (!forread) {
        struct extmnttab *tab;
#ifdef _SYSCALL32_IMPL
        struct extmnttab32 *tab32;

        if (datamodel == DATAMODEL_ILP32) {
            /*
             * tab and tab32 alias the same buffer, so the records are
             * converted to the 32-bit layout in place.
             * NOTE(review): this appears to rely on extmnttab32
             * records being no larger than extmnttab records, and on
             * each native field being read before the 32-bit store
             * that lands on it -- confirm before reordering.
             */
            tab = (struct extmnttab *)metadata_baseaddr;
            tab32 = (struct extmnttab32 *)metadata_baseaddr;

            for (i = 0; i < snap->mnts_count; i++) {
                tab32[i].mnt_special =
                    (uintptr_t)snap->mnts_text +
                    (tab[i].mnt_special - text_baseaddr);
                tab32[i].mnt_mountp =
                    (uintptr_t)snap->mnts_text +
                    (tab[i].mnt_mountp - text_baseaddr);
                tab32[i].mnt_fstype =
                    (uintptr_t)snap->mnts_text +
                    (tab[i].mnt_fstype - text_baseaddr);
                tab32[i].mnt_mntopts =
                    (uintptr_t)snap->mnts_text +
                    (tab[i].mnt_mntopts - text_baseaddr);
                tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
                    (tab[i].mnt_time - text_baseaddr);
                tab32[i].mnt_major = tab[i].mnt_major;
                tab32[i].mnt_minor = tab[i].mnt_minor;
            }

            snap->mnts_metasize =
                snap->mnts_count * sizeof (struct extmnttab32);
            snap->mnts_metadata = mntfs_mapin(
                (char *)metadata_baseaddr,
                snap->mnts_metasize);

        } else {
#endif
            /* Native layout: rebase each string pointer in place. */
            tab = (struct extmnttab *)metadata_baseaddr;
            for (i = 0; i < snap->mnts_count; i++) {
                tab[i].mnt_special = snap->mnts_text +
                    (tab[i].mnt_special - text_baseaddr);
                tab[i].mnt_mountp = snap->mnts_text +
                    (tab[i].mnt_mountp - text_baseaddr);
                tab[i].mnt_fstype = snap->mnts_text +
                    (tab[i].mnt_fstype - text_baseaddr);
                tab[i].mnt_mntopts = snap->mnts_text +
                    (tab[i].mnt_mntopts - text_baseaddr);
                tab[i].mnt_time = snap->mnts_text +
                    (tab[i].mnt_time - text_baseaddr);
            }

            snap->mnts_metasize =
                snap->mnts_count * sizeof (struct extmnttab);
            snap->mnts_metadata = mntfs_mapin(
                (char *)metadata_baseaddr, snap->mnts_metasize);
#ifdef _SYSCALL32_IMPL
        }
#endif

        /*
         * The staging buffer was allocated in units of struct
         * extmnttab whatever the data model, so it is freed with that
         * size rather than with mnts_metasize.
         */
        kmem_free(metadata_baseaddr,
            snap->mnts_count * sizeof (struct extmnttab));
    }

    /* Latest known size; mntgetattr() falls back to it. */
    mntdata->mnt_size = size;

    /* Either mapping may have failed; if so, leave the snapshot empty. */
    if (snap->mnts_text == NULL ||
        (!forread && snap->mnts_metadata == NULL)) {
        mntfs_freesnap(snap);
        return (ENOMEM);
    }

    return (0);
}

/*
 * Public function to convert vfs_mntopts into a string.
 * A buffer of sufficient size is allocated, which is returned via bufp,
 * and whose length (including the terminating NUL) is returned via lenp.
 *
 * The allocation is made with KM_NOSLEEP while the vfs list read lock is
 * held and can therefore fail; in that case *bufp is set to NULL and *lenp
 * to 0 so that the caller never sees an uninitialized length.
 */
void
mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
{
    size_t len;
    char *buf;

    vfs_list_read_lock();

    len = mntfs_optsize(vfsp) + 1;
    buf = kmem_alloc(len, KM_NOSLEEP);
    if (buf == NULL) {
        vfs_list_unlock();
        *bufp = NULL;
        *lenp = 0;
        return;
    }

    /*
     * Plant a sentinel NUL in the last byte; if mntfs_optprint() wrote
     * more than mntfs_optsize() predicted, the ASSERT below catches it.
     */
    buf[len - 1] = '\0';
    (void) mntfs_optprint(vfsp, buf);
    ASSERT(buf[len - 1] == '\0');

    vfs_list_unlock();
    *bufp = buf;
    *lenp = len;
}


/*
 * Open the mnttab node.  Opens for writing are refused with EPERM.  Every
 * successful open gets its own mntnode/vnode, which serves as the handle
 * the snapshots hang on; *vpp is switched to the new vnode, the mount's
 * open count is bumped, and the hold on the vnode passed in is released.
 */
/* ARGSUSED */
static int
mntopen(vnode_t **vpp, int flag, cred_t *cr)
{
    vnode_t *oldvp;
    mntnode_t *newmnp;

    if (flag & FWRITE)
        return (EPERM);

    oldvp = *vpp;
    newmnp = mntgetnode(oldvp);
    *vpp = MTOV(newmnp);

    atomic_add_32(&MTOD(newmnp)->mnt_nopen, 1);
    VN_RELE(oldvp);
    return (0);
}

/*
 * Close the mnttab node.  Locks and shares held by the current process are
 * always cleaned up.  On the last close of the file (count not greater than
 * one) with a single remaining vnode reference, both snapshots are freed
 * and the mount's open count is dropped.
 */
/* ARGSUSED */
static int
mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
    mntnode_t *mnp = VTOM(vp);

    /* Clean up any locks or shares held by the current process */
    cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
    cleanshares(vp, ttoproc(curthread)->p_pid);

    if (count <= 1 && vp->v_count == 1) {
        mntfs_freesnap(&mnp->mnt_read);
        mntfs_freesnap(&mnp->mnt_ioctl);
        atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
    }
    return (0);
}

/*
 * Read from the text snapshot of the mnttab.
 *
 * A fresh snapshot is taken when reading from offset zero or when no
 * snapshot exists yet.  The request is then clamped to the snapshot size.
 * A negative offset fails with EFAULT; an offset at or beyond the end of
 * the snapshot is end-of-file and returns 0 without transferring anything.
 *
 * The bounds checks are ordered so that no unsigned subtraction can wrap:
 * the offset is validated against the snapshot size before the remaining
 * length is computed.
 */
/* ARGSUSED */
static int
mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
{
    int error = 0;
    off_t off = uio->uio_offset;
    size_t len = uio->uio_resid;
    mntnode_t *mnp = VTOM(vp);
    char *buf;
    mntsnap_t *snap = &mnp->mnt_read;
    int datamodel;

    if (off == (off_t)0 || snap->mnts_count == 0) {
        /*
         * It is assumed that any kernel callers wishing
         * to read mnttab will be using extmnttab entries
         * and not extmnttab32 entries, whether or not
         * the kernel is LP64 or ILP32.  Thus, force the
         * datamodel that mntfs_snapshot uses to be
         * DATAMODEL_LP64.
         */
        if (uio->uio_segflg == UIO_SYSSPACE)
            datamodel = DATAMODEL_LP64;
        else
            datamodel = get_udatamodel();
        if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0)
            return (error);
    }

    if (off < 0)
        return (EFAULT);

    /* At or past the end of the snapshot: plain end-of-file. */
    if ((size_t)off >= snap->mnts_textsize)
        return (0);

    /* Clamp the request to what is left of the snapshot. */
    if (len > snap->mnts_textsize - (size_t)off)
        len = snap->mnts_textsize - (size_t)off;

    if (len == 0)
        return (0);

    /*
     * The mnttab image is stored in the user's address space,
     * so we have to copy it into the kernel from userland,
     * then copy it back out to the specified address.
     */
    buf = kmem_alloc(len, KM_SLEEP);
    if (copyin(snap->mnts_text + off, buf, len))
        error = EFAULT;
    else
        error = uiomove(buf, len, UIO_READ, uio);
    kmem_free(buf, len);

    return (error);
}


/*
 * Report attributes for the mnttab node.  The attributes of the vnode the
 * file system is mounted on are fetched first and then overridden so that
 * the node looks like a read-only regular file whose size is that of the
 * current snapshot and whose times come from the vfs mnttab time stamps.
 * Returns the error from the underlying VOP_GETATTR(), or 0.
 */
static int
mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
{
    extern timespec_t vfs_mnttab_ctime;
    mntnode_t *mnp = VTOM(vp);
    mntdata_t *mntdata = MTOD(mnp);
    mntsnap_t *snap;
    int error;

    /* Prefer the read snapshot; fall back to the ioctl one. */
    if (mnp->mnt_read.mnts_count != 0)
        snap = &mnp->mnt_read;
    else
        snap = &mnp->mnt_ioctl;

    /*
     * Start from the attributes of the underlying vnode.  All of them are
     * returned, not just those asked for, and most are fabricated below.
     */
    error = VOP_GETATTR(mnp->mnt_mountvp, vap, flags, cr);
    if (error != 0)
        return (error);

    /* Always a regular file, and essentially read only. */
    vap->va_type = VREG;
    vap->va_mode &= 07444;
    vap->va_fsid = vp->v_vfsp->vfs_dev;
    vap->va_blksize = DEV_BSIZE;
    vap->va_rdev = 0;
    vap->va_seq = 0;

    /* One link for existing, plus one per open mnttab vnode. */
    vap->va_nlink = mntdata->mnt_nopen + 1;

    /*
     * Use this node's snapshot size when it has one; otherwise report
     * the size recorded by the most recent snapshot on this mount.
     */
    if (snap->mnts_textsize != 0)
        vap->va_size = snap->mnts_textsize;
    else
        vap->va_size = mntdata->mnt_size;

    /* Times come from the vfs mnttab time stamps. */
    vap->va_ctime = vfs_mnttab_ctime;
    vfs_list_read_lock();
    vfs_mnttab_modtime(&vap->va_mtime);
    vap->va_atime = vap->va_mtime;
    vfs_list_unlock();

    /* The node id is always MNTROOTINO. */
    vap->va_nodeid = (ino64_t)MNTROOTINO;
    vap->va_nblocks = btod(vap->va_size);
    return (0);
}


/*
 * Access check for the mnttab node.  Write and execute access are refused
 * with EROFS; anything else is decided by the access check on the vnode
 * the file system is mounted on.
 */
static int
mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr)
{
    mntnode_t *mnp = VTOM(vp);

    if ((mode & (VWRITE|VEXEC)) != 0)
        return (EROFS);

    return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr));
}


/*
 * Allocate a new mntnode together with its vnode and fill in most of the
 * fields.  The new node inherits the mounted-on vnode and the vfs of dp;
 * its snapshot state starts out zeroed.  Both allocations sleep, so the
 * function does not return NULL on its own account.
 */
static mntnode_t *
mntgetnode(vnode_t *dp)
{
    mntnode_t *newmnp = kmem_zalloc(sizeof (*newmnp), KM_SLEEP);
    vnode_t *newvp;

    newmnp->mnt_vnode = vn_alloc(KM_SLEEP);
    newmnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;

    newvp = MTOV(newmnp);
    newvp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
    vn_setops(newvp, mntvnodeops);
    newvp->v_vfsp = dp->v_vfsp;
    newvp->v_type = VREG;
    newvp->v_data = (caddr_t)newmnp;

    return (newmnp);
}

/*
 * Undo mntgetnode(): invalidate and free the vnode, then free the mntnode
 * itself.
 */
static void
mntfreenode(mntnode_t *mnp)
{
    vnode_t *nodevp = MTOV(mnp);

    vn_invalid(nodevp);
    vn_free(nodevp);
    kmem_free(mnp, sizeof (mntnode_t));
}


/*
 * fsync entry point.  No work is done here; the call always reports
 * success.
 */
/* ARGSUSED */
static int
mntfsync(vnode_t *vp, int syncflag, cred_t *cr)
{
    return (0);
}

/*
 * Inactive entry point: free the mntnode (and its vnode) that backs vp.
 */
/* ARGSUSED */
static void
mntinactive(vnode_t *vp, cred_t *cr)
{
    mntfreenode(VTOM(vp));
}

/*
 * Seek entry point.  Every seek is accepted.  A seek to offset zero also
 * rewinds the entry index used by MNTIOC_GETMNTENT.
 */
/* ARGSUSED */
static int
mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp)
{
    mntnode_t *mnp = VTOM(vp);

    if (*noffp != 0)
        return (0);

    mnp->mnt_offset = 0;
    return (0);
}

/*
 * Return the answer requested to poll().
 *
 * POLLIN and POLLRDNORM are reported whenever they are asked for.
 * POLLRDBAND is handed to vfs_mnttab_poll() together with the time stamp of
 * the more recent of this open's two snapshots; it is reported when that
 * call leaves *phpp NULL.
 */
/* ARGSUSED */
static int
mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp)
{
    mntnode_t *mnp = VTOM(vp);
    mntsnap_t *rsnap = &mnp->mnt_read;
    mntsnap_t *isnap = &mnp->mnt_ioctl;
    mntsnap_t *newest = rsnap;
    short revents = 0;

    /* Pick the ioctl snapshot only when it is strictly newer. */
    if (isnap->mnts_time.tv_sec > rsnap->mnts_time.tv_sec ||
        (isnap->mnts_time.tv_sec == rsnap->mnts_time.tv_sec &&
        isnap->mnts_time.tv_nsec > rsnap->mnts_time.tv_nsec))
        newest = isnap;

    *phpp = (pollhead_t *)NULL;

    if (ev & POLLIN)
        revents |= POLLIN;
    if (ev & POLLRDNORM)
        revents |= POLLRDNORM;
    if (ev & POLLRDBAND) {
        vfs_mnttab_poll(&newest->mnts_time, phpp);
        if (*phpp == (pollhead_t *)NULL)
            revents |= POLLRDBAND;
    }

    /*
     * If someone is polling for unsupported events (e.g. POLLOUT,
     * POLLPRI, etc.), report POLLERR so that we never return zero
     * revents together with a NULL pollhead pointer.
     */
    if (revents == 0 && *phpp == NULL && !any)
        revents = POLLERR;

    *revp = revents;
    return (0);
}
/*
 * ioctl entry point for the mnttab node.
 *
 *   MNTIOC_NMNTS       store the number of entries in the ioctl snapshot
 *                      at the user address arg.
 *   MNTIOC_GETDEVLIST  copy out a (major, minor) pair for every entry.
 *   MNTIOC_SETTAG /
 *   MNTIOC_CLRTAG      set or clear a tag on a mounted file system named by
 *                      a struct mnttagdesc; mount points given from inside
 *                      a non-global zone are prefixed with the zone root.
 *   MNTIOC_SHOWHIDDEN  make later snapshots include VFS_NOMNTTAB mounts.
 *   MNTIOC_GETMNTENT   store the user address of the next metadata record
 *                      at arg and advance; *rvalp is set to 1 at the end.
 *
 * Returns 0 or an errno value; unknown commands yield EINVAL.
 *
 * The per-entry records live in the snapshot's metadata mapping
 * (mnts_metadata); mnts_text holds only the strings they point at.
 */
/* ARGSUSED */
static int
mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
    cred_t *cr, int *rvalp)
{
    uint_t *up = (uint_t *)arg;
    mntnode_t *mnp = VTOM(vp);
    mntsnap_t *snap = &mnp->mnt_ioctl;
    int error;

    error = 0;
    switch (cmd) {

    case MNTIOC_NMNTS: {        /* get no. of mounted resources */
        if (snap->mnts_count == 0) {
            if ((error =
                mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
                return (error);
        }
        if (suword32(up, snap->mnts_count) != 0)
            error = EFAULT;
        break;
    }

    case MNTIOC_GETDEVLIST: {   /* get mounted device major/minor nos */
        uint_t *devlist;
        int i;
        size_t len;

        if (snap->mnts_count == 0) {
            if ((error =
                mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
                return (error);
        }

        len = 2 * snap->mnts_count * sizeof (uint_t);
        devlist = kmem_alloc(len, KM_SLEEP);
        for (i = 0; i < snap->mnts_count; i++) {

            /*
             * Fetch record i back from the metadata mapping in the
             * caller's address space, using the record layout that
             * matches the caller's data model.
             */
#ifdef _SYSCALL32_IMPL
            if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
                struct extmnttab32 tab;

                if ((error = xcopyin(snap->mnts_metadata +
                    i * sizeof (struct extmnttab32), &tab,
                    sizeof (tab))) != 0)
                    break;

                devlist[i*2] = tab.mnt_major;
                devlist[i*2+1] = tab.mnt_minor;
            } else {
#endif
                struct extmnttab tab;

                if ((error = xcopyin(snap->mnts_metadata +
                    i * sizeof (struct extmnttab), &tab,
                    sizeof (tab))) != 0)
                    break;

                devlist[i*2] = tab.mnt_major;
                devlist[i*2+1] = tab.mnt_minor;
#ifdef _SYSCALL32_IMPL
            }
#endif
        }

        /* Only a completely filled list is handed to the caller. */
        if (error == 0)
            error = xcopyout(devlist, up, len);
        kmem_free(devlist, len);
        break;
    }

    case MNTIOC_SETTAG:     /* set tag on mounted file system */
    case MNTIOC_CLRTAG:     /* clear tag on mounted file system */
    {
        struct mnttagdesc *dp = (struct mnttagdesc *)arg;
        STRUCT_DECL(mnttagdesc, tagdesc);
        char *cptr;
        uint32_t major, minor;
        char tagbuf[MAX_MNTOPT_TAG];
        char *pbuf;
        size_t len;
        uint_t start = 0;
        mntdata_t *mntdata = MTOD(mnp);
        zone_t *zone = mntdata->mnt_zone;

        STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
        if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
            error = EFAULT;
            break;
        }
        pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
        if (zone != global_zone) {
            /*
             * Prefix the mount point with the zone's root path; the
             * user string is copied in over the root path's trailing
             * '/' so the two join with a single slash.
             */
            (void) strcpy(pbuf, zone->zone_rootpath);
            /* truncate "/" and nul */
            start = zone->zone_rootpathlen - 2;
            ASSERT(pbuf[start] == '/');
        }
        cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
        error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
        if (error) {
            kmem_free(pbuf, MAXPATHLEN);
            break;
        }
        /* Inside a zone the supplied mount point must be absolute. */
        if (start != 0 && pbuf[start] != '/') {
            kmem_free(pbuf, MAXPATHLEN);
            error = EINVAL;
            break;
        }
        cptr = STRUCT_FGETP(tagdesc, mtd_tag);
        if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
            kmem_free(pbuf, MAXPATHLEN);
            break;
        }
        major = STRUCT_FGET(tagdesc, mtd_major);
        minor = STRUCT_FGET(tagdesc, mtd_minor);
        if (cmd == MNTIOC_SETTAG)
            error = vfs_settag(major, minor, pbuf, tagbuf, cr);
        else
            error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
        kmem_free(pbuf, MAXPATHLEN);
        break;
    }

    case MNTIOC_SHOWHIDDEN:
    {
        mutex_enter(&vp->v_lock);
        mnp->mnt_flags |= MNT_SHOWHIDDEN;
        mutex_exit(&vp->v_lock);
        break;
    }

    case MNTIOC_GETMNTENT:
    {
        size_t idx;
        uintptr_t addr;

        idx = mnp->mnt_offset;
        if (snap->mnts_count == 0 || idx == 0) {
            if ((error =
                mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
                return (error);
        }
        /*
         * If the next index is beyond the end of the current mnttab,
         * return EOF
         */
        if (idx >= snap->mnts_count) {
            *rvalp = 1;
            return (0);
        }

        /*
         * Hand back the user address of record idx.  The store
         * routines report failure with a non-zero value that is not
         * an errno, so it is translated to EFAULT here, matching
         * MNTIOC_NMNTS above.
         */
#ifdef _SYSCALL32_IMPL
        if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
            addr = (uintptr_t)(snap->mnts_metadata + idx *
                sizeof (struct extmnttab32));
            if (suword32((void *)arg, addr) != 0)
                error = EFAULT;
        } else {
#endif
            addr = (uintptr_t)(snap->mnts_metadata + idx *
                sizeof (struct extmnttab));
            if (sulword((void *)arg, addr) != 0)
                error = EFAULT;
#ifdef _SYSCALL32_IMPL
        }
#endif

        if (error != 0)
            return (error);

        mnp->mnt_offset++;
        break;
    }

    default:
        error = EINVAL;
        break;
    }

    return (error);
}


/*
 * /mntfs vnode operations vector
 *
 * Pairs each operation name with the handler defined in this file; the
 * table ends with a NULL, NULL pair.  DISPOSE and SHRLOCK are routed to
 * fs_error.
 */
const fs_operation_def_t mnt_vnodeops_template[] = {
    VOPNAME_OPEN, mntopen,
    VOPNAME_CLOSE, mntclose,
    VOPNAME_READ, mntread,
    VOPNAME_IOCTL, mntioctl,
    VOPNAME_GETATTR, mntgetattr,
    VOPNAME_ACCESS, mntaccess,
    VOPNAME_FSYNC, mntfsync,
    VOPNAME_INACTIVE, (fs_generic_func_p) mntinactive,
    VOPNAME_SEEK, mntseek,
    VOPNAME_POLL, (fs_generic_func_p) mntpoll,
    VOPNAME_DISPOSE, fs_error,
    VOPNAME_SHRLOCK, fs_error,
    NULL, NULL
};