solaris/SharedFolders/vboxfs_vnode.c

	vboxfs_vnode.c revision 72cc0d1a8b7c8406808a92eb0b9eb58bf6f8c3aa
/** @file
 * VirtualBox File System for Solaris Guests, vnode implementation.
 * Portions contributed by: Ronald.
 */

/*
 * Copyright (C) 2009-2010 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
 * VirtualBox OSE distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 */

/*
 * Shared Folder File System is used from Solaris when run as a guest operating
 * system on VirtualBox, though is meant to be usable with any hypervisor that
 * can provide similar functionality. The sffs code handles all the Solaris
 * specific semantics and relies on a provider module to actually access
 * directories, files, etc. The provider interfaces are described in
 * "vboxfs_prov.h" and the module implementing them is shipped as part of the
 * VirtualBox Guest Additions for Solaris.
 *
 * The shared folder file system is similar to a networked file system,
 * but with some caveats. The sffs code caches minimal information and proxies
 * out to the provider whenever possible. Here are some things that are
 * handled in this code and not by the proxy:
 *
 * - a way to open ".." from any already open directory
 * - st_ino numbers
 * - detecting directory changes that happened on the host.
 *
 * The implementation builds a cache of information for every file/directory
 * ever accessed in all mounted sffs filesystems using sf_node structures.
 *
 * This information for both open or closed files can become invalid if
 * asynchronous changes are made on the host. Solaris should not panic() in
 * this event, but some file system operations may return unexpected errors.
 * Information for such directories or files while they have active vnodes
 * is removed from the regular cache and stored in a "stale" bucket until
 * the vnode becomes completely inactive.
 *
 * We suppport only read-only mmap (VBOXVFS_WITH_MMAP) i.e. MAP_SHARED,
 * MAP_PRIVATE in PROT_READ, this data caching would not be coherent with
 * normal simultaneous read()/write() operations, nor will it be coherent
 * with data access on the host. Writable mmap(MAP_SHARED) access is not
 * implemented, as guaranteeing any kind of coherency with concurrent
 * activity on the host would be near impossible with the existing
 * interfaces.
 *
 * A note about locking. sffs is not a high performance file system.
 * No fine grained locking is done. The one sffs_lock protects just about
 * everything.
 */

#include <VBox/log.h>
#include <iprt/asm.h>

#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mntent.h>
#include <sys/param.h>
#include <sys/modctl.h>
#include <sys/mount.h>
#include <sys/policy.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/vfs.h>
#include <sys/vmsystm.h>
#include <vm/seg_kpm.h>
#include <vm/pvn.h>
#if !defined(VBOX_VFS_SOLARIS_10U6)
#include <sys/vfs_opreg.h>
#endif
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/fs_subr.h>
#include <sys/time.h>
#include "vboxfs_prov.h"
#include "vboxfs_vnode.h"
#include "vboxfs_vfs.h"

#define VBOXVFS_WITH_MMAP

static struct vnodeops *sffs_ops = NULL;

kmutex_t sffs_lock;
static avl_tree_t sfnodes;
static avl_tree_t stale_sfnodes;

/*
 * For now we'll use an I/O buffer that doesn't page fault for VirtualBox
 * to transfer data into.
 */
char *sffs_buffer;

/*
 * sfnode_compare() is needed for AVL tree functionality.
 * The nodes are sorted by mounted filesystem, then path. If the
 * nodes are stale, the node pointer itself is used to force uniqueness.
 */
static int
sfnode_compare(const void *a, const void *b)
{
    sfnode_t *x = (sfnode_t *)a;
    sfnode_t *y = (sfnode_t *)b;
    int diff;

    if (x->sf_is_stale) {
        ASSERT(y->sf_is_stale);
        diff = strcmp(x->sf_path, y->sf_path);
        if (diff == 0)
            diff = (uintptr_t)y - (uintptr_t)x;
    } else {
        ASSERT(!y->sf_is_stale);
        diff = (uintptr_t)y->sf_sffs - (uintptr_t)x->sf_sffs;
        if (diff == 0)
            diff = strcmp(x->sf_path, y->sf_path);
    }
    if (diff < 0)
        return (-1);
    if (diff > 0)
        return (1);
    return (0);
}

/*
 * Construct a new pathname given an sfnode plus an optional tail component.
 * This handles ".." and "."
 */
static char *
sfnode_construct_path(sfnode_t *node, char *tail)
{
    char *p;

    if (strcmp(tail, ".") == 0 || strcmp(tail, "..") == 0)
        panic("construct path for %s", tail);
    p = kmem_alloc(strlen(node->sf_path) + 1 + strlen(tail) + 1, KM_SLEEP);
    strcpy(p, node->sf_path);
    strcat(p, "/");
    strcat(p, tail);
    return (p);
}

/*
 * Clears the (cached) directory listing for the node.
 */
static void
sfnode_clear_dir_list(sfnode_t *node)
{
    ASSERT(MUTEX_HELD(&sffs_lock));

    while (node->sf_dir_list != NULL) {
        sffs_dirents_t *next = node->sf_dir_list->sf_next;
        kmem_free(node->sf_dir_list, SFFS_DIRENTS_SIZE);
        node->sf_dir_list = next;
    }
}

/*
 * Open the provider file associated with a vnode. Holding the file open is
 * the only way we have of trying to have a vnode continue to refer to the
 * same host file in the host in light of the possibility of host side renames.
 */
static void
sfnode_open(sfnode_t *node)
{
    int error;
    sfp_file_t *fp;

    if (node->sf_file != NULL)
        return;
    error = sfprov_open(node->sf_sffs->sf_handle, node->sf_path, &fp);
    if (error == 0)
        node->sf_file = fp;
}

/*
 * get a new vnode reference for an sfnode
 */
vnode_t *
sfnode_get_vnode(sfnode_t *node)
{
    vnode_t *vp;

    if (node->sf_vnode != NULL) {
        VN_HOLD(node->sf_vnode);
    } else {
        vp = vn_alloc(KM_SLEEP);
        LogFlowFunc(("  %s gets vnode 0x%p\n", node->sf_path, vp));
        vp->v_type = node->sf_type;
        vp->v_vfsp = node->sf_sffs->sf_vfsp;
        vn_setops(vp, sffs_ops);
        vp->v_flag = VNOSWAP;
#ifndef VBOXVFS_WITH_MMAP
        vp->v_flag |= VNOMAP;
#endif
        vn_exists(vp);
        vp->v_data = node;
        node->sf_vnode = vp;
    }
    sfnode_open(node);
    return (node->sf_vnode);
}

/*
 * Allocate and initialize a new sfnode and assign it a vnode
 */
sfnode_t *
sfnode_make(
    sffs_data_t *sffs,
    char        *path,
    vtype_t     type,
    sfp_file_t  *fp,
    sfnode_t    *parent,    /* can be NULL for root */
    sffs_stat_t *stat,
    uint64_t    stat_time)
{
    sfnode_t    *node;
    avl_index_t where;

    ASSERT(MUTEX_HELD(&sffs_lock));
    ASSERT(path != NULL);

    /*
     * build the sfnode
     */
    LogFlowFunc(("sffs_make(%s)\n", path));
    node = kmem_alloc(sizeof (*node), KM_SLEEP);
    node->sf_sffs = sffs;
    VFS_HOLD(node->sf_sffs->sf_vfsp);
    node->sf_path = path;
    node->sf_ino = sffs->sf_ino++;
    node->sf_type = type;
    node->sf_is_stale = 0;  /* never stale at creation */
    node->sf_file = fp;
    node->sf_vnode = NULL;  /* do this before any sfnode_get_vnode() */
    node->sf_children = 0;
    node->sf_parent = parent;
    if (parent)
        ++parent->sf_children;
    node->sf_dir_list = NULL;
    if (stat != NULL) {
        node->sf_stat = *stat;
        node->sf_stat_time = stat_time;
    } else {
        node->sf_stat_time = 0;
    }

    /*
     * add the new node to our cache
     */
    if (avl_find(&sfnodes, node, &where) != NULL)
        panic("sffs_create_sfnode(%s): duplicate sfnode_t", path);
    avl_insert(&sfnodes, node, where);
    return (node);
}

/*
 * destroy an sfnode
 */
static void
sfnode_destroy(sfnode_t *node)
{
    avl_index_t where;
    avl_tree_t *tree;
    sfnode_t *parent;
top:
    parent = node->sf_parent;
    ASSERT(MUTEX_HELD(&sffs_lock));
    ASSERT(node->sf_path != NULL);
    LogFlowFunc(("sffs_destroy(%s)%s\n", node->sf_path, node->sf_is_stale ? " stale": ""));
    if (node->sf_children != 0)
        panic("sfnode_destroy(%s) has %d children", node->sf_path, node->sf_children);
    if (node->sf_vnode != NULL)
        panic("sfnode_destroy(%s) has active vnode", node->sf_path);

    if (node->sf_is_stale)
        tree = &stale_sfnodes;
    else
        tree = &sfnodes;
    if (avl_find(tree, node, &where) == NULL)
        panic("sfnode_destroy(%s) not found", node->sf_path);
    avl_remove(tree, node);

    VFS_RELE(node->sf_sffs->sf_vfsp);
    sfnode_clear_dir_list(node);
    kmem_free(node->sf_path, strlen(node->sf_path) + 1);
    kmem_free(node, sizeof (*node));
    if (parent != NULL) {
        sfnode_clear_dir_list(parent);
        if (parent->sf_children == 0)
            panic("sfnode_destroy parent (%s) has no child", parent->sf_path);
        --parent->sf_children;
        if (parent->sf_children == 0 &&
            parent->sf_is_stale &&
            parent->sf_vnode == NULL) {
            node = parent;
            goto top;
        }
    }
}

/*
 * Some sort of host operation on an sfnode has failed or it has been
 * deleted. Mark this node and any children as stale, deleting knowledge
 * about any which do not have active vnodes or children
 * This also handle deleting an inactive node that was already stale.
 */
static void
sfnode_make_stale(sfnode_t *node)
{
    sfnode_t *n;
    int len;
    ASSERT(MUTEX_HELD(&sffs_lock));
    avl_index_t where;

    /*
     * First deal with any children of a directory node.
     * If a directory becomes stale, anything below it becomes stale too.
     */
    if (!node->sf_is_stale && node->sf_type == VDIR) {
        len = strlen(node->sf_path);

        n = node;
        while ((n = AVL_NEXT(&sfnodes, node)) != NULL) {
            ASSERT(!n->sf_is_stale);

            /*
             * quit when no longer seeing children of node
             */
            if (n->sf_sffs != node->sf_sffs ||
                strncmp(node->sf_path, n->sf_path, len) != 0 ||
                n->sf_path[len] != '/')
                break;

            /*
             * Either mark the child as stale or destroy it
             */
            if (n->sf_vnode == NULL && n->sf_children == 0) {
                sfnode_destroy(n);
            } else {
                LogFlowFunc(("sffs_make_stale(%s) sub\n", n->sf_path));
                sfnode_clear_dir_list(n);
                if (avl_find(&sfnodes, n, &where) == NULL)
                    panic("sfnode_make_stale(%s)"
                        " not in sfnodes", n->sf_path);
                avl_remove(&sfnodes, n);
                n->sf_is_stale = 1;
                if (avl_find(&stale_sfnodes, n, &where) != NULL)
                    panic("sffs_make_stale(%s) duplicates",
                        n->sf_path);
                avl_insert(&stale_sfnodes, n, where);
            }
        }
    }

    /*
     * Now deal with the given node.
     */
    if (node->sf_vnode == NULL && node->sf_children == 0) {
        sfnode_destroy(node);
    } else if (!node->sf_is_stale) {
        LogFlowFunc(("sffs_make_stale(%s)\n", node->sf_path));
        sfnode_clear_dir_list(node);
        if (node->sf_parent)
            sfnode_clear_dir_list(node->sf_parent);
        if (avl_find(&sfnodes, node, &where) == NULL)
            panic("sfnode_make_stale(%s) not in sfnodes",
                node->sf_path);
        avl_remove(&sfnodes, node);
        node->sf_is_stale = 1;
        if (avl_find(&stale_sfnodes, node, &where) != NULL)
            panic("sffs_make_stale(%s) duplicates", node->sf_path);
        avl_insert(&stale_sfnodes, node, where);
    }
}

static uint64_t
sfnode_cur_time_usec(void)
{
    clock_t now = drv_hztousec(ddi_get_lbolt());
    return now;
}

static int
sfnode_stat_cached(sfnode_t *node)
{
    return (sfnode_cur_time_usec() - node->sf_stat_time) <
        node->sf_sffs->sf_stat_ttl * 1000L;
}

static int
sfnode_get_stat(sfp_mount_t *mnt, char *path, sffs_stat_t *stat)
{
    return sfprov_get_attr(mnt, path, &stat->sf_mode, &stat->sf_size,
        &stat->sf_atime, &stat->sf_mtime, &stat->sf_ctime);
}

static void
sfnode_invalidate_stat_cache(sfnode_t *node)
{
    node->sf_stat_time = 0;
}

static int
sfnode_update_stat_cache(sfnode_t *node)
{
    int error;

    error = sfnode_get_stat(node->sf_sffs->sf_handle, node->sf_path,
        &node->sf_stat);
    if (error == ENOENT)
        sfnode_make_stale(node);
    if (error == 0)
        node->sf_stat_time = sfnode_cur_time_usec();

    return (error);
}

/*
 * Rename a file or a directory
 */
static void
sfnode_rename(sfnode_t *node, sfnode_t *newparent, char *path)
{
    sfnode_t *n;
    sfnode_t template;
    avl_index_t where;
    int len = strlen(path);
    int old_len;
    char *new_path;
    char *tail;
    ASSERT(MUTEX_HELD(&sffs_lock));

    ASSERT(!node->sf_is_stale);

    /*
     * Have to remove anything existing that had the new name.
     */
    template.sf_sffs = node->sf_sffs;
    template.sf_path = path;
    template.sf_is_stale = 0;
    n = avl_find(&sfnodes, &template, &where);
    if (n != NULL)
        sfnode_make_stale(n);

    /*
     * Do the renaming, deal with any children of this node first.
     */
    if (node->sf_type == VDIR) {
        old_len = strlen(node->sf_path);
        while ((n = AVL_NEXT(&sfnodes, node)) != NULL) {

            /*
             * quit when no longer seeing children of node
             */
            if (n->sf_sffs != node->sf_sffs ||
                strncmp(node->sf_path, n->sf_path, old_len) != 0 ||
                n->sf_path[old_len] != '/')
                break;

            /*
             * Rename the child:
             * - build the new path name
             * - unlink the AVL node
             * - assign the new name
             * - re-insert the AVL name
             */
            ASSERT(strlen(n->sf_path) > old_len);
            tail = n->sf_path + old_len; /* includes initial "/" */
            new_path = kmem_alloc(len + strlen(tail) + 1,
                KM_SLEEP);
            strcpy(new_path, path);
            strcat(new_path, tail);
            if (avl_find(&sfnodes, n, &where) == NULL)
                panic("sfnode_rename(%s) not in sfnodes",
                   n->sf_path);
            avl_remove(&sfnodes, n);
            LogFlowFunc(("sfnode_rname(%s to %s) sub\n", n->sf_path, new_path));
            kmem_free(n->sf_path, strlen(n->sf_path) + 1);
            n->sf_path = new_path;
            if (avl_find(&sfnodes, n, &where) != NULL)
                panic("sfnode_rename(%s) duplicates",
                    n->sf_path);
            avl_insert(&sfnodes, n, where);
        }
    }

    /*
     * Deal with the given node.
     */
    if (avl_find(&sfnodes, node, &where) == NULL)
        panic("sfnode_rename(%s) not in sfnodes", node->sf_path);
    avl_remove(&sfnodes, node);
    LogFlowFunc(("sfnode_rname(%s to %s)\n", node->sf_path, path));
    kmem_free(node->sf_path, strlen(node->sf_path) + 1);
    node->sf_path = path;
    if (avl_find(&sfnodes, node, &where) != NULL)
        panic("sfnode_rename(%s) duplicates", node->sf_path);
    avl_insert(&sfnodes, node, where);

    /*
     * change the parent
     */
    if (node->sf_parent == NULL)
        panic("sfnode_rename(%s) no parent", node->sf_path);
    if (node->sf_parent->sf_children == 0)
        panic("sfnode_rename(%s) parent has no child", node->sf_path);
    sfnode_clear_dir_list(node->sf_parent);
    sfnode_clear_dir_list(newparent);
    --node->sf_parent->sf_children;
    node->sf_parent = newparent;
    ++newparent->sf_children;
}

/*
 * Look for a cached node, if not found either handle ".." or try looking
 * via the provider. Create an entry in sfnodes if found but not cached yet.
 * If the create flag is set, a file or directory is created. If the file
 * already existed, an error is returned.
 * Nodes returned from this routine always have a vnode with its ref count
 * bumped by 1.
 */
static sfnode_t *
sfnode_lookup(
    sfnode_t *dir,
    char *name,
    vtype_t create,
    sffs_stat_t *stat,
    uint64_t stat_time,
    int *err)
{
    avl_index_t where;
    sfnode_t    template;
    sfnode_t    *node;
    int     error = 0;
    int     type;
    char        *fullpath;
    sfp_file_t  *fp;
    sffs_stat_t tmp_stat;

    ASSERT(MUTEX_HELD(&sffs_lock));

    if (err)
        *err = error;

    /*
     * handle referencing myself
     */
    if (strcmp(name, "") == 0 || strcmp(name, ".") == 0)
        return (dir);

    /*
     * deal with parent
     */
    if (strcmp(name, "..") == 0)
        return (dir->sf_parent);

    /*
     * Look for an existing node.
     */
    fullpath = sfnode_construct_path(dir, name);
    template.sf_sffs = dir->sf_sffs;
    template.sf_path = fullpath;
    template.sf_is_stale = 0;
    node = avl_find(&sfnodes, &template, &where);
    if (node != NULL) {
        kmem_free(fullpath, strlen(fullpath) + 1);
        if (create != VNON)
            return (NULL);
        return (node);
    }

    /*
     * No entry for this path currently.
     * Check if the file exists with the provider and get the type from
     * there.
     */
    if (create == VREG) {
        type = VREG;
        error = sfprov_create(dir->sf_sffs->sf_handle, fullpath, &fp);
    } else if (create == VDIR) {
        type = VDIR;
        error = sfprov_mkdir(dir->sf_sffs->sf_handle, fullpath, &fp);
    } else {
        mode_t m;
        fp = NULL;
        type = VNON;
        if (stat == NULL) {
            stat = &tmp_stat;
            error = sfnode_get_stat(dir->sf_sffs->sf_handle,
                fullpath, stat);
            stat_time = sfnode_cur_time_usec();
        } else {
            error = 0;
        }
        m = stat->sf_mode;
        if (error != 0)
            error = ENOENT;
        else if (S_ISDIR(m))
            type = VDIR;
        else if (S_ISREG(m))
            type = VREG;
    }

    if (err)
        *err = error;

    /*
     * If no errors, make a new node and return it.
     */
    if (error) {
        kmem_free(fullpath, strlen(fullpath) + 1);
        return (NULL);
    }
    node = sfnode_make(dir->sf_sffs, fullpath, type, fp, dir, stat,
        stat_time);
    return (node);
}


/*
 * uid and gid in sffs determine owner and group for all files.
 */
static int
sfnode_access(sfnode_t *node, mode_t mode, cred_t *cr)
{
    sffs_data_t *sffs = node->sf_sffs;
    mode_t m;
    int shift = 0;
    int error;
    vnode_t *vp;

    ASSERT(MUTEX_HELD(&sffs_lock));

    /*
     * get the mode from the cache or provider
     */
    if (sfnode_stat_cached(node))
        error = 0;
    else
        error = sfnode_update_stat_cache(node);
    m = (error == 0) ? node->sf_stat.sf_mode : 0;

    /*
     * mask off the permissions based on uid/gid
     */
    if (crgetuid(cr) != sffs->sf_uid) {
        shift += 3;
        if (groupmember(sffs->sf_gid, cr) == 0)
            shift += 3;
    }
    mode &= ~(m << shift);

    if (mode == 0) {
        error = 0;
    } else {
        vp = sfnode_get_vnode(node);
        error = secpolicy_vnode_access(cr, vp, sffs->sf_uid, mode);
        VN_RELE(vp);
    }
    return (error);
}


/*
 *
 * Everything below this point are the vnode operations used by Solaris VFS
 */
static int
sffs_readdir(
    vnode_t     *vp,
    uio_t       *uiop,
    cred_t      *cred,
    int     *eofp,
    caller_context_t *ct,
    int     flags)
{
    sfnode_t *dir = VN2SFN(vp);
    sfnode_t *node;
    struct sffs_dirent *dirent = NULL;
    sffs_dirents_t *cur_buf;
    offset_t offset = 0;
    offset_t orig_off = uiop->uio_loffset;
    int dummy_eof;
    int error = 0;

    if (uiop->uio_iovcnt != 1)
        return (EINVAL);

    if (vp->v_type != VDIR)
        return (ENOTDIR);

    if (eofp == NULL)
        eofp = &dummy_eof;
    *eofp = 0;

    if (uiop->uio_loffset >= MAXOFFSET_T) {
        *eofp = 1;
        return (0);
    }

    /*
     * Get the directory entry names from the host. This gets all
     * entries. These are stored in a linked list of sffs_dirents_t
     * buffers, each of which contains a list of dirent64_t's.
     */
    mutex_enter(&sffs_lock);

    if (dir->sf_dir_list == NULL) {
        error = sfprov_readdir(dir->sf_sffs->sf_handle, dir->sf_path,
            &dir->sf_dir_list);
        if (error != 0)
            goto done;
    }

    /*
     * Validate and skip to the desired offset.
     */
    cur_buf = dir->sf_dir_list;
    offset = 0;

    while (cur_buf != NULL &&
        offset + cur_buf->sf_len <= uiop->uio_loffset) {
        offset += cur_buf->sf_len;
        cur_buf = cur_buf->sf_next;
    }

    if (cur_buf == NULL && offset != uiop->uio_loffset) {
        error = EINVAL;
        goto done;
    }
    if (cur_buf != NULL && offset != uiop->uio_loffset) {
        offset_t off = offset;
        int step;
        dirent = &cur_buf->sf_entries[0];

        while (off < uiop->uio_loffset) {
            if (dirent->sf_entry.d_off == uiop->uio_loffset)
                break;
            step = sizeof(sffs_stat_t) + dirent->sf_entry.d_reclen;
            dirent = (struct sffs_dirent *) (((char *) dirent) + step);
            off += step;
        }

        if (off >= uiop->uio_loffset) {
            error = EINVAL;
            goto done;
        }
    }

    offset = uiop->uio_loffset - offset;

    /*
     * Lookup each of the names, so that we have ino's, and copy to
     * result buffer.
     */
    while (cur_buf != NULL) {
        if (offset >= cur_buf->sf_len) {
            cur_buf = cur_buf->sf_next;
            offset = 0;
            continue;
        }

        dirent = (struct sffs_dirent *)
            (((char *) &cur_buf->sf_entries[0]) + offset);
        if (dirent->sf_entry.d_reclen > uiop->uio_resid)
            break;

        if (strcmp(dirent->sf_entry.d_name, ".") == 0) {
            node = dir;
        } else if (strcmp(dirent->sf_entry.d_name, "..") == 0) {
            node = dir->sf_parent;
            if (node == NULL)
                node = dir;
        } else {
            node = sfnode_lookup(dir, dirent->sf_entry.d_name, VNON,
                &dirent->sf_stat, sfnode_cur_time_usec(), NULL);
            if (node == NULL)
                panic("sffs_readdir() lookup failed");
        }
        dirent->sf_entry.d_ino = node->sf_ino;

        error = uiomove(&dirent->sf_entry, dirent->sf_entry.d_reclen, UIO_READ, uiop);
        if (error != 0)
            break;

        uiop->uio_loffset= dirent->sf_entry.d_off;
        offset += sizeof(sffs_stat_t) + dirent->sf_entry.d_reclen;
    }
    if (error == 0 && cur_buf == NULL)
        *eofp = 1;
done:
    mutex_exit(&sffs_lock);
    if (error != 0)
        uiop->uio_loffset = orig_off;
    return (error);
}


#if defined(VBOX_VFS_SOLARIS_10U6)
/*
 * HERE JOE.. this may need more logic, need to look at other file systems
 */
static int
sffs_pathconf(
    vnode_t *vp,
    int cmd,
    ulong_t *valp,
    cred_t  *cr)
{
    return (fs_pathconf(vp, cmd, valp, cr));
}
#else
/*
 * HERE JOE.. this may need more logic, need to look at other file systems
 */
static int
sffs_pathconf(
    vnode_t *vp,
    int cmd,
    ulong_t *valp,
    cred_t  *cr,
    caller_context_t *ct)
{
    return (fs_pathconf(vp, cmd, valp, cr, ct));
}
#endif

static int
sffs_getattr(
    vnode_t     *vp,
    vattr_t     *vap,
    int     flags,
    cred_t      *cred,
    caller_context_t *ct)
{
    sfnode_t    *node = VN2SFN(vp);
    sffs_data_t *sffs = node->sf_sffs;
    mode_t      mode;
    int     error = 0;

    mutex_enter(&sffs_lock);
    vap->va_type = vp->v_type;
    vap->va_uid = sffs->sf_uid;
    vap->va_gid = sffs->sf_gid;
    vap->va_fsid = sffs->sf_vfsp->vfs_dev;
    vap->va_nodeid = node->sf_ino;
    vap->va_nlink = 1;
    vap->va_rdev =  sffs->sf_vfsp->vfs_dev;
    vap->va_seq = 0;

    if (!sfnode_stat_cached(node)) {
        error = sfnode_update_stat_cache(node);
        if (error != 0)
            goto done;
    }

    vap->va_atime = node->sf_stat.sf_atime;
    vap->va_mtime = node->sf_stat.sf_mtime;
    vap->va_ctime = node->sf_stat.sf_ctime;

    mode = node->sf_stat.sf_mode;
    vap->va_mode = mode & MODEMASK;
    if (S_ISDIR(mode))
    {
        vap->va_type = VDIR;
        vap->va_mode = sffs->sf_dmode != ~0 ? (sffs->sf_dmode & 0777) : vap->va_mode;
        vap->va_mode &= ~sffs->sf_dmask;
        vap->va_mode |= S_IFDIR;
    }
    else if (S_ISREG(mode))
    {
        vap->va_type = VREG;
        vap->va_mode = sffs->sf_fmode != ~0 ? (sffs->sf_fmode & 0777) : vap->va_mode;
        vap->va_mode &= ~sffs->sf_fmask;
        vap->va_mode |= S_IFREG;
    }
    else if (S_ISFIFO(mode))
        vap->va_type = VFIFO;
    else if (S_ISCHR(mode))
        vap->va_type = VCHR;
    else if (S_ISBLK(mode))
        vap->va_type = VBLK;
    else if (S_ISLNK(mode))
    {
        vap->va_type = VLNK;
        vap->va_mode = sffs->sf_fmode != ~0 ? (sffs->sf_fmode & 0777) : vap->va_mode;
        vap->va_mode &= ~sffs->sf_fmask;
        vap->va_mode |= S_IFLNK;
    }
    else if (S_ISSOCK(mode))
        vap->va_type = VSOCK;

    vap->va_size = node->sf_stat.sf_size;
    vap->va_blksize = 512;
    vap->va_nblocks = (vap->va_size + 511) / 512;

done:
    mutex_exit(&sffs_lock);
    return (error);
}

static int
sffs_setattr(
    vnode_t     *vp,
    vattr_t     *vap,
    int     flags,
    cred_t      *cred,
    caller_context_t *ct)
{
    sfnode_t    *node = VN2SFN(vp);
    int     error;
    mode_t      mode;

    mode = vap->va_mode;
    if (vp->v_type == VREG)
        mode |= S_IFREG;
    else if (vp->v_type == VDIR)
        mode |= S_IFDIR;
    else if (vp->v_type == VBLK)
        mode |= S_IFBLK;
    else if (vp->v_type == VCHR)
        mode |= S_IFCHR;
    else if (vp->v_type == VLNK)
        mode |= S_IFLNK;
    else if (vp->v_type == VFIFO)
        mode |= S_IFIFO;
    else if (vp->v_type == VSOCK)
        mode |= S_IFSOCK;

    mutex_enter(&sffs_lock);

    sfnode_invalidate_stat_cache(node);
    error = sfprov_set_attr(node->sf_sffs->sf_handle, node->sf_path,
        vap->va_mask, mode, vap->va_atime, vap->va_mtime, vap->va_ctime);
    if (error == ENOENT)
        sfnode_make_stale(node);

    mutex_exit(&sffs_lock);
    return (error);
}

static int
sffs_space(
    vnode_t     *vp,
    int     cmd,
    struct flock64  *bfp,
    int     flags,
    offset_t    off,
    cred_t      *cred,
    caller_context_t *ct)
{
    sfnode_t    *node = VN2SFN(vp);
    int     error;

    /* we only support changing the length of the file */
    if (bfp->l_whence != SEEK_SET || bfp->l_len != 0)
        return ENOSYS;

    mutex_enter(&sffs_lock);

    sfnode_invalidate_stat_cache(node);

    error = sfprov_set_size(node->sf_sffs->sf_handle, node->sf_path,
        bfp->l_start);
    if (error == ENOENT)
        sfnode_make_stale(node);

    mutex_exit(&sffs_lock);
    return (error);
}

/*ARGSUSED*/
static int
sffs_read(
    vnode_t     *vp,
    struct uio  *uio,
    int     ioflag,
    cred_t      *cred,
    caller_context_t *ct)
{
    sfnode_t    *node = VN2SFN(vp);
    int     error = 0;
    uint32_t    bytes;
    uint32_t    done;
    ulong_t     offset;
    ssize_t     total;

    if (vp->v_type == VDIR)
        return (EISDIR);
    if (vp->v_type != VREG)
        return (EINVAL);
    if (uio->uio_loffset >= MAXOFFSET_T)
    {
        proc_t *p = ttoproc(curthread);
        mutex_enter(&p->p_lock);
        (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
            p, RCA_UNSAFE_SIGINFO);
        mutex_exit(&p->p_lock);
        return (EFBIG);
    }
    if (uio->uio_loffset < 0)
        return (EINVAL);
    total = uio->uio_resid;
    if (total == 0)
        return (0);

    mutex_enter(&sffs_lock);
    sfnode_open(node);
    if (node->sf_file == NULL) {
        mutex_exit(&sffs_lock);
        return (EINVAL);
    }

    do {
        offset = uio->uio_offset;
        done = bytes = MIN(PAGESIZE, uio->uio_resid);
        error = sfprov_read(node->sf_file, sffs_buffer, offset, &done);
        if (error == 0 && done > 0)
            error = uiomove(sffs_buffer, done, UIO_READ, uio);
    } while (error == 0 && uio->uio_resid > 0 && done > 0);

    mutex_exit(&sffs_lock);

    /*
     * a partial read is never an error
     */
    if (total != uio->uio_resid)
        error = 0;
    return (error);
}

/*ARGSUSED*/
static int
sffs_write(
    vnode_t     *vp,
    struct uio  *uiop,
    int     ioflag,
    cred_t      *cred,
    caller_context_t *ct)
{
    sfnode_t    *node = VN2SFN(vp);
    int     error = 0;
    uint32_t    bytes;
    uint32_t    done;
    ulong_t     offset;
    ssize_t     total;
    rlim64_t    limit = uiop->uio_llimit;

    if (vp->v_type == VDIR)
        return (EISDIR);
    if (vp->v_type != VREG)
        return (EINVAL);

    /*
     * We have to hold this lock for a long time to keep
     * multiple FAPPEND writes from intermixing
     */
    mutex_enter(&sffs_lock);
    sfnode_open(node);
    if (node->sf_file == NULL) {
        mutex_exit(&sffs_lock);
        return (EINVAL);
    }

    sfnode_invalidate_stat_cache(node);

    if (ioflag & FAPPEND) {
        uint64_t endoffile;

        error = sfprov_get_size(node->sf_sffs->sf_handle,
            node->sf_path, &endoffile);
        if (error == ENOENT)
            sfnode_make_stale(node);
        if (error != 0) {
            mutex_exit(&sffs_lock);
            return (error);
        }
        uiop->uio_loffset = endoffile;
    }

    if (vp->v_type != VREG || uiop->uio_loffset < 0) {
        mutex_exit(&sffs_lock);
        return (EINVAL);
    }
    if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
        limit = MAXOFFSET_T;

    if (uiop->uio_loffset >= limit) {
        proc_t *p = ttoproc(curthread);
        mutex_enter(&p->p_lock);
        (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
            p, RCA_UNSAFE_SIGINFO);
        mutex_exit(&p->p_lock);
        mutex_exit(&sffs_lock);
        return (EFBIG);
    }

    if (uiop->uio_loffset >= MAXOFFSET_T) {
        mutex_exit(&sffs_lock);
        return (EFBIG);
    }

    total = uiop->uio_resid;
    if (total == 0) {
        mutex_exit(&sffs_lock);
        return (0);
    }

    do {
        offset = uiop->uio_offset;
        bytes = MIN(PAGESIZE, uiop->uio_resid);
        if (offset + bytes >= limit) {
            if (offset >= limit) {
                error = EFBIG;
                break;
            }
            bytes = limit - offset;
        }
        error = uiomove(sffs_buffer, bytes, UIO_WRITE, uiop);
        if (error != 0)
            break;
        done = bytes;
        if (error == 0)
            error = sfprov_write(node->sf_file, sffs_buffer,
                offset, &done);
        total -= done;
        if (done != bytes) {
            uiop->uio_resid += bytes - done;
            break;
        }
    } while (error == 0 && uiop->uio_resid > 0 && done > 0);

    mutex_exit(&sffs_lock);

    /*
     * A short write is never really an error.
     */
    if (total != uiop->uio_resid)
        error = 0;
    return (error);
}

/*ARGSUSED*/
static int
sffs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
    sfnode_t *node = VN2SFN(vp);
    int error;

    mutex_enter(&sffs_lock);
    error = sfnode_access(node, mode, cr);
    mutex_exit(&sffs_lock);
    return (error);
}

/*
 * Lookup an entry in a directory and create a new vnode if found.
 */
/* ARGSUSED3 */
static int
sffs_lookup(
    vnode_t     *dvp,       /* the directory vnode */
    char        *name,      /* the name of the file or directory */
    vnode_t     **vpp,      /* the vnode we found or NULL */
    struct pathname *pnp,
    int     flags,
    vnode_t     *rdir,
    cred_t      *cred,
    caller_context_t *ct,
    int     *direntflags,
    struct pathname *realpnp)
{
    int     error;
    sfnode_t    *node;

    /*
     * dvp must be a directory
     */
    if (dvp->v_type != VDIR)
        return (ENOTDIR);

    /*
     * An empty component name or just "." means the directory itself.
     * Don't do any further lookup or checking.
     */
    if (strcmp(name, "") == 0 || strcmp(name, ".") == 0) {
        VN_HOLD(dvp);
        *vpp = dvp;
        return (0);
    }

    /*
     * Check permission to look at this directory. We always allow "..".
     */
    mutex_enter(&sffs_lock);
    if (strcmp(name, "..") != 0) {
        error = sfnode_access(VN2SFN(dvp), VEXEC, cred);
        if (error) {
            mutex_exit(&sffs_lock);
            return (error);
        }
    }

    /*
     * Lookup the node.
     */
    node = sfnode_lookup(VN2SFN(dvp), name, VNON, NULL, 0, NULL);
    if (node != NULL)
        *vpp = sfnode_get_vnode(node);
    mutex_exit(&sffs_lock);
    return ((node == NULL) ? ENOENT : 0);
}

/*ARGSUSED*/
static int
sffs_create(
        vnode_t     *dvp,
        char        *name,
        struct vattr    *vap,
        vcexcl_t    exclusive,
        int     mode,
        vnode_t     **vpp,
        cred_t      *cr,
        int     flag,
        caller_context_t *ct,
        vsecattr_t  *vsecp)
{
    vnode_t     *vp;
    sfnode_t    *node;
    int     error;

    ASSERT(name != NULL);

    /*
     * this is used for regular files, not mkdir
     */
    if (vap->va_type == VDIR)
        return (EISDIR);
    if (vap->va_type != VREG)
        return (EINVAL);

    /*
     * is this a pre-existing file?
     */
    error = sffs_lookup(dvp, name, &vp,
        NULL, 0, NULL, cr, ct, NULL, NULL);
    if (error == ENOENT)
        vp = NULL;
    else if (error != 0)
        return (error);

    /*
     * Operation on a pre-existing file.
     */
    if (vp != NULL) {
        if (exclusive == EXCL) {
            VN_RELE(vp);
            return (EEXIST);
        }
        if (vp->v_type == VDIR && (mode & VWRITE) == VWRITE) {
            VN_RELE(vp);
            return (EISDIR);
        }

        mutex_enter(&sffs_lock);
        node = VN2SFN(vp);
        error = sfnode_access(node, mode, cr);
        if (error != 0) {
            mutex_exit(&sffs_lock);
            VN_RELE(vp);
            return (error);
        }

        sfnode_invalidate_stat_cache(VN2SFN(dvp));

        /*
         * handle truncating an existing file
         */
        if (vp->v_type == VREG && (vap->va_mask & AT_SIZE) &&
            vap->va_size == 0) {
            sfnode_open(node);
            if (node->sf_path == NULL)
                error = ENOENT;
            else
                error = sfprov_trunc(node->sf_sffs->sf_handle,
                    node->sf_path);
            if (error) {
                mutex_exit(&sffs_lock);
                VN_RELE(vp);
                return (error);
            }
        }
        mutex_exit(&sffs_lock);
        *vpp = vp;
        return (0);
    }

    /*
     * Create a new node. First check for a race creating it.
     */
    mutex_enter(&sffs_lock);
    node = sfnode_lookup(VN2SFN(dvp), name, VNON, NULL, 0, NULL);
    if (node != NULL) {
        mutex_exit(&sffs_lock);
        return (EEXIST);
    }

    /*
     * Doesn't exist yet and we have the lock, so create it.
     */
    sfnode_invalidate_stat_cache(VN2SFN(dvp));
    int lookuperr;
    node = sfnode_lookup(VN2SFN(dvp), name, VREG, NULL, 0, &lookuperr);
    if (node && (vap->va_mask & AT_MODE)) {
        timestruc_t dummy;
        error = sfprov_set_attr(node->sf_sffs->sf_handle, node->sf_path,
            AT_MODE, vap->va_mode, dummy, dummy, dummy);
        if (error)
            cmn_err(CE_WARN, "sffs_create: set_mode(%s, %o) failed"
                " rc=%d", node->sf_path, vap->va_mode, error);
    }

    if (node && node->sf_parent)
        sfnode_clear_dir_list(node->sf_parent);

    mutex_exit(&sffs_lock);
    if (node == NULL)
        return (lookuperr);
    *vpp = sfnode_get_vnode(node);
    return (0);
}

/*ARGSUSED*/
static int
sffs_mkdir(
    vnode_t     *dvp,
    char        *nm,
    vattr_t     *va,
    vnode_t     **vpp,
    cred_t      *cred,
    caller_context_t *ct,
    int     flags,
    vsecattr_t  *vsecp)
{
    sfnode_t    *node;
    vnode_t     *vp;
    int     error;

    /*
     * These should never happen
     */
    ASSERT(nm != NULL);
    ASSERT(strcmp(nm, "") != 0);
    ASSERT(strcmp(nm, ".") != 0);
    ASSERT(strcmp(nm, "..") != 0);

    /*
     * Do an unlocked look up first
     */
    error = sffs_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);
    if (error == 0) {
        VN_RELE(vp);
        return (EEXIST);
    }
    if (error != ENOENT)
        return (error);

    /*
     * Must be able to write in current directory
     */
    mutex_enter(&sffs_lock);
    error = sfnode_access(VN2SFN(dvp), VWRITE, cred);
    if (error) {
        mutex_exit(&sffs_lock);
        return (error);
    }

    sfnode_invalidate_stat_cache(VN2SFN(dvp));
    int lookuperr = EACCES;
    node = sfnode_lookup(VN2SFN(dvp), nm, VDIR, NULL, 0, &lookuperr);
    if (node && (va->va_mask & AT_MODE)) {
        timestruc_t dummy;
        error = sfprov_set_attr(node->sf_sffs->sf_handle, node->sf_path,
            AT_MODE, va->va_mode, dummy, dummy, dummy);
        if (error)
            cmn_err(CE_WARN, "sffs_mkdir: set_mode(%s, %o) failed"
                " rc=%d", node->sf_path, va->va_mode, error);
    }

    if (node && node->sf_parent)
        sfnode_clear_dir_list(node->sf_parent);

    mutex_exit(&sffs_lock);
    if (node == NULL)
        return (lookuperr);
    *vpp = sfnode_get_vnode(node);
    return (0);
}

/*ARGSUSED*/
static int
sffs_rmdir(
    struct vnode    *dvp,
    char        *nm,
    vnode_t     *cdir,
    cred_t      *cred,
    caller_context_t *ct,
    int     flags)
{
    sfnode_t     *node;
    vnode_t     *vp;
    int     error;

    /*
     * Return error when removing . and ..
     */
    if (strcmp(nm, ".") == 0 || strcmp(nm, "") == 0)
        return (EINVAL);
    if (strcmp(nm, "..") == 0)
        return (EEXIST);

    error = sffs_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);
    if (error)
        return (error);
    if (vp->v_type != VDIR) {
        VN_RELE(vp);
        return (ENOTDIR);
    }

#ifdef VBOXVFS_WITH_MMAP
    if (vn_vfswlock(vp)) {
        VN_RELE(vp);
        return (EBUSY);
    }
#endif

    if (vn_mountedvfs(vp)) {
        VN_RELE(vp);
        return (EBUSY);
    }

    node = VN2SFN(vp);

    mutex_enter(&sffs_lock);
    error = sfnode_access(VN2SFN(dvp), VEXEC | VWRITE, cred);
    if (error)
        goto done;

    /*
     * If anything else is using this vnode, then fail the remove.
     * Why?  Windows hosts can't remove something that is open,
     * so we have to sfprov_close() it first.
     * There is no errno for this - since it's not a problem on UNIX,
     * but EINVAL is the closest.
     */
    if (node->sf_file != NULL) {
        if (vp->v_count > 1) {
            error = EINVAL;
            goto done;
        }
        (void)sfprov_close(node->sf_file);
        node->sf_file = NULL;
    }

    /*
     * Remove the directory on the host and mark the node as stale.
     */
    sfnode_invalidate_stat_cache(VN2SFN(dvp));
    error = sfprov_rmdir(node->sf_sffs->sf_handle, node->sf_path);
    if (error == ENOENT || error == 0)
        sfnode_make_stale(node);

    if (node->sf_parent)
        sfnode_clear_dir_list(node->sf_parent);
done:
    mutex_exit(&sffs_lock);
#ifdef VBOXVFS_WITH_MMAP
    vn_vfsunlock(vp);
#endif
    VN_RELE(vp);
    return (error);
}


#ifdef VBOXVFS_WITH_MMAP
static caddr_t
sffs_page_map(
    page_t *ppage,
    enum seg_rw segaccess)
{
    /* Use seg_kpm driver if possible (64-bit) */
    if (kpm_enable)
        return (hat_kpm_mapin(ppage, NULL));
    ASSERT(segaccess == S_READ || segaccess == S_WRITE);
    return (ppmapin(ppage, PROT_READ | ((segaccess == S_WRITE) ? PROT_WRITE : 0), (caddr_t)-1));
}


static void
sffs_page_unmap(
    page_t *ppage,
    caddr_t addr)
{
    if (kpm_enable)
        hat_kpm_mapout(ppage, NULL, addr);
    else
        ppmapout(addr);
}


/*
 * Called when there's no page in the cache. This will create new page(s) and read
 * the file data into it.
 */
static int
sffs_readpages(
    vnode_t     *dvp,
    offset_t    off,
    page_t      *pagelist[],
    size_t      pagelistsize,
    struct seg  *segp,
    caddr_t     addr,
    enum seg_rw segaccess)
{
    ASSERT(MUTEX_HELD(&sffs_lock));

    int error = 0;
    u_offset_t io_off, total;
    size_t io_len;
    page_t *ppages;
    page_t *pcur;

    sfnode_t *node = VN2SFN(dvp);
    ASSERT(node);
    ASSERT(node->sf_file);

    if (pagelistsize == PAGESIZE)
    {
        io_off = off;
        io_len = PAGESIZE;
        ppages = page_create_va(dvp, io_off, io_len, PG_WAIT | PG_EXCL, segp, addr);
    }
    else
        ppages = pvn_read_kluster(dvp, off, segp, addr, &io_off, &io_len, off, pagelistsize, 0);

    /* If page already exists return success */
    if (!ppages)
    {
        *pagelist = NULL;
        return (0);
    }

    /*
     * Map & read page-by-page.
     */
    total = io_off + io_len;
    pcur = ppages;
    while (io_off < total)
    {
        ASSERT3U(io_off, ==, pcur->p_offset);

        caddr_t virtaddr = sffs_page_map(pcur, segaccess);
        uint32_t bytes = PAGESIZE;
        error = sfprov_read(node->sf_file, virtaddr, io_off, &bytes);
        /*
         * If we reuse pages without zero'ing them, one process can mmap() and read-past the length
         * to read previously mmap'd contents (from possibly other processes).
         */
        if (error == 0 && bytes < PAGESIZE)
            memset(virtaddr + bytes, 0, PAGESIZE - bytes);
        sffs_page_unmap(pcur, virtaddr);
        if (error != 0)
        {
            cmn_err(CE_WARN, "sffs_readpages: sfprov_read() failed. error=%d bytes=%u\n", error, bytes);
            /* Get rid of all kluster pages read & bail.  */
            pvn_read_done(ppages, B_ERROR);
            return (error);
        }
        pcur = pcur->p_next;
        io_off += PAGESIZE;
    }

    /*
     * Fill in the pagelist from kluster at the requested offset.
     */
    pvn_plist_init(ppages, pagelist, pagelistsize, off, io_len, segaccess);
    ASSERT(pagelist == NULL || (*pagelist)->p_offset == off);
    return (0);
}


/*ARGSUSED*/
static int
sffs_getpage(
    vnode_t     *dvp,
    offset_t    off,
    size_t      len,
    uint_t      *protp,
    page_t      *pagelist[],
    size_t      pagelistsize,
    struct seg  *segp,
    caddr_t     addr,
    enum seg_rw segaccess,
    cred_t      *credp
#if !defined(VBOX_VFS_SOLARIS_10U6)
    , caller_context_t *ct
#endif
    )
{
    int error = 0;
    int is_recursive = 0;
    page_t **pageliststart = pagelist;
    sfnode_t *node = VN2SFN(dvp);
    ASSERT(node);
    ASSERT(node->sf_file);

    if (segaccess == S_WRITE)
        return (ENOSYS);    /* Will this ever happen? */

    /* Don't bother about faultahead for now. */
    if (pagelist == NULL)
        return (0);

    if (len > pagelistsize)
        len = pagelistsize;
    else
        len = P2ROUNDUP(len, PAGESIZE);
    ASSERT(pagelistsize >= len);

    if (protp)
        *protp = PROT_ALL;

    /*
     * The buffer passed to sffs_write may be mmap'd so we may get a
     * pagefault there, in which case we'll end up here with this thread
     * already owning the mutex. Mutexes aren't recursive.
     */
    if (mutex_owner(&sffs_lock) == curthread)
        is_recursive = 1;
    else
        mutex_enter(&sffs_lock);

    /* Don't map pages past end of the file. */
    if (off + len > node->sf_stat.sf_size + PAGEOFFSET)
    {
        mutex_exit(&sffs_lock);
        return (EFAULT);
    }

    while (len > 0)
    {
        /*
         * Look for pages in the requested offset range, or create them if we can't find any.
         */
        if ((*pagelist = page_lookup(dvp, off, SE_SHARED)) != NULL)
            *(pagelist + 1) = NULL;
        else if ((error = sffs_readpages(dvp, off, pagelist, pagelistsize, segp, addr, segaccess)) != 0)
        {
            while (pagelist > pageliststart)
                page_unlock(*--pagelist);

            *pagelist = NULL;
            if (!is_recursive)
                mutex_exit(&sffs_lock);
            return (error);
        }

        while (*pagelist)
        {
            ASSERT3U((*pagelist)->p_offset, ==, off);
            off += PAGESIZE;
            addr += PAGESIZE;
            if (len > 0)
            {
                ASSERT3U(len, >=, PAGESIZE);
                len -= PAGESIZE;
            }

            ASSERT3U(pagelistsize,  >=, PAGESIZE);
            pagelistsize -= PAGESIZE;
            pagelist++;
        }
    }

    /*
     * Fill the page list array with any pages left in the cache.
     */
    while (   pagelistsize > 0
           && (*pagelist++ = page_lookup_nowait(dvp, off, SE_SHARED)))
    {
        off += PAGESIZE;
        pagelistsize -= PAGESIZE;
    }

    *pagelist = NULL;
    if (!is_recursive)
        mutex_exit(&sffs_lock);
    return (error);
}


/*ARGSUSED*/
static int
sffs_putpage(
    vnode_t     *dvp,
    offset_t    off,
    size_t      len,
    int         flags,
    cred_t      *credp
#if !defined(VBOX_VFS_SOLARIS_10U6)
    , caller_context_t *ct
#endif
    )
{
    /*
     * We don't support PROT_WRITE mmaps. For normal writes we do not map and IO via
     * vop_putpage() either, therefore, afaik this shouldn't ever be called.
     */
    return (ENOSYS);
}


/*ARGSUSED*/
static int
sffs_discardpage(
    vnode_t     *dvp,
    page_t      *ppage,
    u_offset_t  *poff,
    size_t      *plen,
    int         flags,
    cred_t      *pcred)
{
    /*
     * This would not get invoked i.e. via pvn_vplist_dirty() since we don't support
     * PROT_WRITE mmaps and therefore will not have dirty pages.
     */
    pvn_write_done(ppage, B_INVAL | B_ERROR | B_FORCE);
    return (0);
}


/*ARGSUSED*/
static int
sffs_map(
    vnode_t     *dvp,
    offset_t    off,
    struct as   *asp,
    caddr_t     *addrp,
    size_t      len,
    uchar_t     prot,
    uchar_t     maxprot,
    uint_t      flags,
    cred_t      *credp
#if !defined(VBOX_VFS_SOLARIS_10U6)
    , caller_context_t *ct
#endif
    )
{
    /*
     * Invocation: mmap()->smmap_common()->VOP_MAP()->sffs_map(). Once the
     * segment driver creates the new segment via segvn_create(), it'll
     * invoke down the line VOP_ADDMAP()->sffs_addmap()
     */
    int error = 0;
    sfnode_t *node = VN2SFN(dvp);
    ASSERT(node);
    if ((prot & PROT_WRITE))
        return (ENOTSUP);

    if (off < 0 || len > MAXOFFSET_T - off)
        return (ENXIO);

    if (dvp->v_type != VREG)
        return (ENODEV);

    if (dvp->v_flag & VNOMAP)
        return (ENOSYS);

    if (vn_has_mandatory_locks(dvp, node->sf_stat.sf_mode))
        return (EAGAIN);

    mutex_enter(&sffs_lock);
    as_rangelock(asp);

#if defined(VBOX_VFS_SOLARIS_10U6)
    if ((flags & MAP_FIXED) == 0)
    {
        map_addr(addrp, len, off, 1, flags);
        if (*addrp == NULL)
            error = ENOMEM;
    }
    else
        as_unmap(asp, *addrp, len); /* User specified address, remove any previous mappings */
#else
    error = choose_addr(asp, addrp, len, off, ADDR_VACALIGN, flags);
#endif

    if (error)
    {
        as_rangeunlock(asp);
        mutex_exit(&sffs_lock);
        return (error);
    }

    segvn_crargs_t vnodeargs;
    memset(&vnodeargs, 0, sizeof(vnodeargs));
    vnodeargs.vp = dvp;
    vnodeargs.cred = credp;
    vnodeargs.offset = off;
    vnodeargs.type = flags & MAP_TYPE;
    vnodeargs.prot = prot;
    vnodeargs.maxprot = maxprot;
    vnodeargs.flags = flags & ~MAP_TYPE;
    vnodeargs.amp = NULL;       /* anon. mapping */
    vnodeargs.szc = 0;          /* preferred page size code */
    vnodeargs.lgrp_mem_policy_flags = 0;

    error = as_map(asp, *addrp, len, segvn_create, &vnodeargs);

    as_rangeunlock(asp);
    mutex_exit(&sffs_lock);
    return (error);
}


/*ARGSUSED*/
static int
sffs_addmap(
    vnode_t     *dvp,
    offset_t    off,
    struct as   *asp,
    caddr_t     addr,
    size_t      len,
    uchar_t     prot,
    uchar_t     maxprot,
    uint_t      flags,
    cred_t      *credp
#if !defined(VBOX_VFS_SOLARIS_10U6)
    , caller_context_t *ct
#endif
    )
{
    if (dvp->v_flag & VNOMAP)
        return (ENOSYS);
    return (0);
}


/*ARGSUSED*/
static int
sffs_delmap(
    vnode_t     *dvp,
    offset_t    off,
    struct as   *asp,
    caddr_t     addr,
    size_t      len,
    uint_t      prot,
    uint_t      maxprot,
    uint_t      flags,
    cred_t      *credp
#if !defined(VBOX_VFS_SOLARIS_10U6)
    , caller_context_t *ct
#endif
    )
{
    if (dvp->v_flag & VNOMAP)
        return (ENOSYS);
    return (0);
}
#endif /* VBOXVFS_WITH_MMAP */


/*ARGSUSED*/
static int
sffs_remove(
    vnode_t     *dvp,
    char        *name,
    cred_t      *cred,
    caller_context_t *ct,
    int     flags)
{
    vnode_t     *vp;
    sfnode_t    *node;
    int     error;

    /*
     * These should never happen
     */
    ASSERT(name != NULL);
    ASSERT(strcmp(name, "..") != 0);

    error = sffs_lookup(dvp, name, &vp,
        NULL, 0, NULL, cred, ct, NULL, NULL);
    if (error)
        return (error);
    node = VN2SFN(vp);

    mutex_enter(&sffs_lock);
    error = sfnode_access(VN2SFN(dvp), VEXEC | VWRITE, cred);
    if (error)
        goto done;

    /*
     * If anything else is using this vnode, then fail the remove.
     * Why?  Windows hosts can't sfprov_remove() a file that is open,
     * so we have to sfprov_close() it first.
     * There is no errno for this - since it's not a problem on UNIX,
     * but ETXTBSY is the closest.
     */
    if (node->sf_file != NULL) {
        if (vp->v_count > 1) {
            error = ETXTBSY;
            goto done;
        }
        (void)sfprov_close(node->sf_file);
        node->sf_file = NULL;
    }

    /*
     * Remove the file on the host and mark the node as stale.
     */
    sfnode_invalidate_stat_cache(VN2SFN(dvp));

    error = sfprov_remove(node->sf_sffs->sf_handle, node->sf_path);
    if (error == ENOENT || error == 0)
        sfnode_make_stale(node);

    if (node->sf_parent)
        sfnode_clear_dir_list(node->sf_parent);
done:
    mutex_exit(&sffs_lock);
    VN_RELE(vp);
    return (error);
}

/*ARGSUSED*/
static int
sffs_rename(
    vnode_t     *old_dir,
    char        *old_nm,
    vnode_t     *new_dir,
    char        *new_nm,
    cred_t      *cred,
    caller_context_t *ct,
    int     flags)
{
    char        *newpath;
    int     error;
    sfnode_t    *node;

    if (strcmp(new_nm, "") == 0 ||
        strcmp(new_nm, ".") == 0 ||
        strcmp(new_nm, "..") == 0 ||
        strcmp(old_nm, "") == 0 ||
        strcmp(old_nm, ".") == 0 ||
        strcmp(old_nm, "..") == 0)
        return (EINVAL);

    /*
     * make sure we have permission to do the rename
     */
    mutex_enter(&sffs_lock);
    error = sfnode_access(VN2SFN(old_dir), VEXEC | VWRITE, cred);
    if (error == 0 && new_dir != old_dir)
        error = sfnode_access(VN2SFN(new_dir), VEXEC | VWRITE, cred);
    if (error)
        goto done;

    node = sfnode_lookup(VN2SFN(old_dir), old_nm, VNON, NULL, 0, NULL);
    if (node == NULL) {
        error = ENOENT;
        goto done;
    }

    /*
     * Rename the file on the host and in our caches.
     */
    sfnode_invalidate_stat_cache(node);
    sfnode_invalidate_stat_cache(VN2SFN(old_dir));
    sfnode_invalidate_stat_cache(VN2SFN(new_dir));

    newpath = sfnode_construct_path(VN2SFN(new_dir), new_nm);
    error = sfprov_rename(node->sf_sffs->sf_handle, node->sf_path, newpath,
        node->sf_type == VDIR);
    if (error == 0)
        sfnode_rename(node, VN2SFN(new_dir), newpath);
    else {
        kmem_free(newpath, strlen(newpath) + 1);
        if (error == ENOENT)
            sfnode_make_stale(node);
    }
done:
    mutex_exit(&sffs_lock);
    return (error);
}


/*ARGSUSED*/
static int
sffs_fsync(vnode_t *vp, int flag, cred_t *cr, caller_context_t *ct)
{
    sfnode_t *node;
    int     error;

    /*
     * Ask the host to sync any data it may have cached for open files.
     */
    mutex_enter(&sffs_lock);
    node = VN2SFN(vp);
    if (node->sf_file == NULL)
        error = EBADF;
    else if (node->sf_sffs->sf_fsync)
        error = sfprov_fsync(node->sf_file);
    else
        error = 0;
    mutex_exit(&sffs_lock);
    return (error);
}

/*
 * This may be the last reference, possibly time to close the file and
 * destroy the vnode. If the sfnode is stale, we'll destroy that too.
 */
/*ARGSUSED*/
static void
#if defined(VBOX_VFS_SOLARIS_10U6)
sffs_inactive(vnode_t *vp, cred_t *cr)
#else
sffs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
#endif
{
    sfnode_t *node;

    /*
     * nothing to do if this isn't the last use
     */
    mutex_enter(&sffs_lock);
    node = VN2SFN(vp);
    mutex_enter(&vp->v_lock);
    if (vp->v_count > 1) {
        --vp->v_count;
        mutex_exit(&vp->v_lock);
        mutex_exit(&sffs_lock);
        return;
    }

    if (vn_has_cached_data(vp)) {
#ifdef VBOXVFS_WITH_MMAP
        /* We're fine with releasing the vnode lock here as we should be covered by the sffs_lock */
        mutex_exit(&vp->v_lock);
        /* We won't have any dirty pages, this will just invalidate (destroy) the pages and move it to the cachelist. */
        pvn_vplist_dirty(vp, 0 /* offset */, sffs_discardpage, B_INVAL, cr);
        mutex_enter(&vp->v_lock);
#else
        panic("sffs_inactive() found cached data");
#endif
    }

    /*
     * destroy the vnode
     */
    node->sf_vnode = NULL;
    mutex_exit(&vp->v_lock);
    vn_invalid(vp);
    vn_free(vp);
    LogFlowFunc(("  %s vnode cleared\n", node->sf_path));

    /*
     * Close the sf_file for the node.
     */
    if (node->sf_file != NULL) {
        (void)sfprov_close(node->sf_file);
        node->sf_file = NULL;
    }

    /*
     * Free the directory entries for the node. This should normally
     * have been taken care of in sffs_close(), but better safe than
     * sorry.
     */
    sfnode_clear_dir_list(node);

    /*
     * If the node is stale, we can also destroy it.
     */
    if (node->sf_is_stale && node->sf_children == 0)
        sfnode_destroy(node);

    mutex_exit(&sffs_lock);
    return;
}

/*
 * All the work for this is really done in lookup.
 */
/*ARGSUSED*/
static int
sffs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
    return (0);
}

/*
 * All the work for this is really done in inactive.
 */
/*ARGSUSED*/
static int
sffs_close(
    vnode_t *vp,
    int flag,
    int count,
    offset_t offset,
    cred_t *cr,
    caller_context_t *ct)
{
    sfnode_t *node;

    mutex_enter(&sffs_lock);
    node = VN2SFN(vp);

    /*
     * Free the directory entries for the node. We do this on this call
     * here because the directory node may not become inactive for a long
     * time after the readdir is over. Case in point, if somebody cd's into
     * the directory then it won't become inactive until they cd away again.
     * In such a case we would end up with the directory listing not getting
     * updated (i.e. the result of 'ls' always being the same) until they
     * change the working directory.
     */
    sfnode_clear_dir_list(node);

    sfnode_invalidate_stat_cache(node);

    mutex_exit(&sffs_lock);
    return (0);
}

/* ARGSUSED */
static int
sffs_seek(vnode_t *v, offset_t o, offset_t *no, caller_context_t *ct)
{
    if (*no < 0 || *no > MAXOFFSET_T)
        return (EINVAL);

    if (v->v_type == VDIR)
    {
        sffs_dirents_t *cur_buf = VN2SFN(v)->sf_dir_list;
        off_t offset = 0;

        if (cur_buf == NULL)
            return (0);

        while (cur_buf != NULL) {
            if (*no >= offset && *no <= offset + cur_buf->sf_len)
                return (0);
            offset += cur_buf->sf_len;
            cur_buf = cur_buf->sf_next;
        }
        return (EINVAL);
    }
    return (0);
}


/*
 * By returning an error for this, we prevent anything in sffs from
 * being re-exported by NFS
 */
/* ARGSUSED */
static int
sffs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
    return (ENOTSUP);
}

/*
 * vnode operations for regular files
 */
const fs_operation_def_t sffs_ops_template[] = {
#if defined(VBOX_VFS_SOLARIS_10U6)
    VOPNAME_ACCESS,     sffs_access,
    VOPNAME_CLOSE,      sffs_close,
    VOPNAME_CREATE,     sffs_create,
    VOPNAME_FID,        sffs_fid,
    VOPNAME_FSYNC,      sffs_fsync,
    VOPNAME_GETATTR,    sffs_getattr,
    VOPNAME_INACTIVE,   sffs_inactive,
    VOPNAME_LOOKUP,     sffs_lookup,
    VOPNAME_MKDIR,      sffs_mkdir,
    VOPNAME_OPEN,       sffs_open,
    VOPNAME_PATHCONF,   sffs_pathconf,
    VOPNAME_READ,       sffs_read,
    VOPNAME_READDIR,    sffs_readdir,
    VOPNAME_REMOVE,     sffs_remove,
    VOPNAME_RENAME,     sffs_rename,
    VOPNAME_RMDIR,      sffs_rmdir,
    VOPNAME_SEEK,       sffs_seek,
    VOPNAME_SETATTR,    sffs_setattr,
    VOPNAME_SPACE,      sffs_space,
    VOPNAME_WRITE,      sffs_write,

# ifdef VBOXVFS_WITH_MMAP
    VOPNAME_MAP,        sffs_map,
    VOPNAME_ADDMAP,     sffs_addmap,
    VOPNAME_DELMAP,     sffs_delmap,
    VOPNAME_GETPAGE,    sffs_getpage,
    VOPNAME_PUTPAGE,    sffs_putpage,
# endif

    NULL,           NULL
#else
    VOPNAME_ACCESS,     { .vop_access = sffs_access },
    VOPNAME_CLOSE,      { .vop_close = sffs_close },
    VOPNAME_CREATE,     { .vop_create = sffs_create },
    VOPNAME_FID,        { .vop_fid = sffs_fid },
    VOPNAME_FSYNC,      { .vop_fsync = sffs_fsync },
    VOPNAME_GETATTR,    { .vop_getattr = sffs_getattr },
    VOPNAME_INACTIVE,   { .vop_inactive = sffs_inactive },
    VOPNAME_LOOKUP,     { .vop_lookup = sffs_lookup },
    VOPNAME_MKDIR,      { .vop_mkdir = sffs_mkdir },
    VOPNAME_OPEN,       { .vop_open = sffs_open },
    VOPNAME_PATHCONF,   { .vop_pathconf = sffs_pathconf },
    VOPNAME_READ,       { .vop_read = sffs_read },
    VOPNAME_READDIR,    { .vop_readdir = sffs_readdir },
    VOPNAME_REMOVE,     { .vop_remove = sffs_remove },
    VOPNAME_RENAME,     { .vop_rename = sffs_rename },
    VOPNAME_RMDIR,      { .vop_rmdir = sffs_rmdir },
    VOPNAME_SEEK,       { .vop_seek = sffs_seek },
    VOPNAME_SETATTR,    { .vop_setattr = sffs_setattr },
    VOPNAME_SPACE,      { .vop_space = sffs_space },
    VOPNAME_WRITE,      { .vop_write = sffs_write },

# ifdef VBOXVFS_WITH_MMAP
    VOPNAME_MAP,        { .vop_map = sffs_map },
    VOPNAME_ADDMAP,     { .vop_addmap = sffs_addmap },
    VOPNAME_DELMAP,     { .vop_delmap = sffs_delmap },
    VOPNAME_GETPAGE,    { .vop_getpage = sffs_getpage },
    VOPNAME_PUTPAGE,    { .vop_putpage = sffs_putpage },
# endif

    NULL,           NULL
#endif
};

/*
 * Also, init and fini functions...
 */
int
sffs_vnode_init(void)
{
    int err;

    err = vn_make_ops("sffs", sffs_ops_template, &sffs_ops);
    if (err)
        return (err);

    avl_create(&sfnodes, sfnode_compare, sizeof (sfnode_t),
        offsetof(sfnode_t, sf_linkage));
    avl_create(&stale_sfnodes, sfnode_compare, sizeof (sfnode_t),
        offsetof(sfnode_t, sf_linkage));

    sffs_buffer = kmem_alloc(PAGESIZE, KM_SLEEP);

    return (0);
}

void
sffs_vnode_fini(void)
{
    if (sffs_ops)
        vn_freevnodeops(sffs_ops);
    ASSERT(avl_first(&sfnodes) == NULL);
    avl_destroy(&sfnodes);
    if (sffs_buffer != NULL) {
        kmem_free(sffs_buffer, PAGESIZE);
        sffs_buffer = NULL;
    }
}

/*
 * Utility at unmount to get all nodes in that mounted filesystem removed.
 */
int
sffs_purge(struct sffs_data *sffs)
{
    sfnode_t *node;
    sfnode_t *prev;

    /*
     * Check that no vnodes are active.
     */
    if (sffs->sf_rootnode->v_count > 1)
        return (-1);
    for (node = avl_first(&sfnodes); node;
        node = AVL_NEXT(&sfnodes, node)) {
        if (node->sf_sffs == sffs && node->sf_vnode &&
            node->sf_vnode != sffs->sf_rootnode)
            return (-1);
    }
    for (node = avl_first(&stale_sfnodes); node;
        node = AVL_NEXT(&stale_sfnodes, node)) {
        if (node->sf_sffs == sffs && node->sf_vnode &&
            node->sf_vnode != sffs->sf_rootnode)
            return (-1);
    }

    /*
     * All clear to destroy all node information. Since there are no
     * vnodes, the make stale will cause deletion.
     */
    VN_RELE(sffs->sf_rootnode);
    mutex_enter(&sffs_lock);
    for (prev = NULL;;) {
        if (prev == NULL)
            node = avl_first(&sfnodes);
        else
            node = AVL_NEXT(&sfnodes, prev);

        if (node == NULL)
            break;

        if (node->sf_sffs == sffs) {
            if (node->sf_vnode != NULL)
                panic("vboxfs: purge hit active vnode");
            sfnode_make_stale(node);
        } else {
            prev = node;
        }
    }
    mutex_exit(&sffs_lock);
    return (0);
}

#if 0
/* Debug helper functions */
static void
sfnode_print(sfnode_t *node)
{
    Log(("0x%p", node));
    Log((" type=%s (%d)",
        node->sf_type == VDIR ? "VDIR" :
        node->sf_type == VNON ? "VNON" :
        node->sf_type == VREG ? "VREG" : "other", node->sf_type));
    Log((" ino=%d", (uint_t)node->sf_ino));
    Log((" path=%s", node->sf_path));
    Log((" parent=0x%p", node->sf_parent));
    if (node->sf_children)
        Log((" children=%d", node->sf_children));
    if (node->sf_vnode)
        Log((" vnode=0x%p", node->sf_vnode));
    Log(("%s\n", node->sf_is_stale ? " STALE" : ""));
}

static void
sfnode_list(void)
{
    sfnode_t *n;
    for (n = avl_first(&sfnodes); n != NULL; n = AVL_NEXT(&sfnodes, n))
        sfnode_print(n);
    for (n = avl_first(&stale_sfnodes); n != NULL;
        n = AVL_NEXT(&stale_sfnodes, n))
        sfnode_print(n);
}
#endif