/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <vm/hat.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/pvn.h>
#include <vm/page.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/fs/xmem.h>
extern void *xpgget(struct xmount *);
extern void xpgput(struct xmount *, void *);
#define MODESHIFT 3
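/*
 * Cap on dynamic kmem allocations for xmemfs metadata, and the running
 * count of bytes currently allocated (see xmem_memalloc()/xmem_memfree()).
 */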
size_t xmemfs_maxkmem = 32768;
size_t xmemfs_kmemcnt;
int
xmem_xaccess(void *vxp, int mode, struct cred *cred)
{
struct xmemnode *xp = vxp;
int shift = 0;
/*
* Check access based on owner, group and
* public permissions in xmemnode.
*/
if (crgetuid(cred) != xp->xn_uid) {
shift += MODESHIFT;
if (groupmember(xp->xn_gid, cred) == 0)
shift += MODESHIFT;
}
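	/*
	 * Shifting xn_mode left aligns the selected class bits (owner,
	 * group or other) with the requested VREAD/VWRITE/VEXEC bits;
	 * whatever remains in 'mode' after the mask was denied.
	 */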
mode &= ~(xp->xn_mode << shift);
if (mode == 0)
return (0);
return (secpolicy_vnode_access(cred, XNTOV(xp), xp->xn_uid, mode));
}
/*
* Decide whether it is okay to remove within a sticky directory.
 * Write access to the directory is always required. In sticky
 * directories, write access alone is not sufficient: you may remove
 * an entry only if you own the directory, you own the entry, the
 * entry is a plain file and you have write access to it, or you are
 * privileged.
 * Returns 0 if remove access is granted.
*/
int
xmem_sticky_remove_access(struct xmemnode *dir, struct xmemnode *entry,
struct cred *cr)
{
uid_t uid;
if ((dir->xn_mode & S_ISVTX) &&
(uid = crgetuid(cr)) != dir->xn_uid &&
uid != entry->xn_uid &&
(entry->xn_type != VREG ||
xmem_xaccess(entry, VWRITE, cr) != 0))
return (secpolicy_vnode_remove(cr));
return (0);
}
/*
* Allocate zeroed memory if xmemfs_maxkmem has not been exceeded
* or the 'musthave' flag is set. 'musthave' allocations should
* always be subordinate to normal allocations so that xmemfs_maxkmem
* can't be exceeded by more than a few KB. Example: when creating
* a new directory, the xmemnode is a normal allocation; if that
* succeeds, the dirents for "." and ".." are 'musthave' allocations.
*/
void *
xmem_memalloc(size_t size, int musthave)
{
void *ptr = NULL;
if (musthave) {
atomic_add_long(&xmemfs_kmemcnt, size);
ptr = kmem_zalloc(size, KM_SLEEP);
} else if (xmemfs_kmemcnt + size < xmemfs_maxkmem) {
/*
		 * xmemfs_kmemcnt may have increased since the check above,
		 * so a little more than xmemfs_maxkmem may be allocated.
*/
ptr = kmem_zalloc(size, KM_NOSLEEP);
if (ptr)
atomic_add_long(&xmemfs_kmemcnt, size);
}
return (ptr);
}
void
xmem_memfree(void *cp, size_t size)
{
kmem_free(cp, size);
atomic_add_long(&xmemfs_kmemcnt, -size);
}
/* add to the number of pages we have created */
int
xmem_mem_add(struct xmount *xm, size_t size)
{
mutex_enter(&xm->xm_contents);
	/* fail if this block would take us past the configured maximum */
if ((xm->xm_mem + size) > xm->xm_max) {
mutex_exit(&xm->xm_contents);
return (1);
}
xm->xm_mem += size;
mutex_exit(&xm->xm_contents);
return (0);
}
/* subtract from the number of pages we have created */
static void
xmem_mem_sub(struct xmount *xm, size_t size)
{
mutex_enter(&xm->xm_contents);
xm->xm_mem -= size;
mutex_exit(&xm->xm_contents);
}
/*
 * xmem_acquire_pages: returns an array of btop(xm_bsize) page pointers,
 * covering one block of xm_bsize bytes.
 *
 * With large pages, the array holds one entry per constituent PAGESIZE
 * page, e.g. 1024 entries for a 4MB block or 512 for a 2MB block.
 *
 * Without large pages (xm_ppb == 1) there is no array; the single
 * page_t * is returned directly, cast to page_t **.
*/
static page_t **
xmem_acquire_pages(struct xmount *xm, struct vnode *vp, offset_t off)
{
page_t **ppa, *pp, *pplist;
uint_t pindex;
size_t bsize;
struct seg tmpseg;
bsize = xm->xm_bsize;
if (xmem_mem_add(xm, 1))
return (NULL);
if (xm->xm_flags & XARGS_RESERVEMEM) {
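		/*
		 * Memory was reserved at mount time; take the next
		 * preallocated block from this mount's page pool.
		 */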
mutex_enter(&xm->xm_contents);
ppa = xpgget(xm);
mutex_exit(&xm->xm_contents);
if (xm->xm_ppb == 1) {
/* ppa is a direct page pointer */
if (!page_hashin((page_t *)ppa, vp, off, NULL)) {
panic("xmem_acquire_pages: hashin failed"
" %p %llx", (void *)vp, off);
}
pindex = xm->xm_ppb; /* bypass for loop */
} else {
pindex = 0;
}
for (; pindex < xm->xm_ppb; pindex++, off += PAGESIZE) {
pp = ppa[pindex];
if (!page_hashin(pp, vp, off, NULL)) {
panic("xmem_acquire_pages: hashin failed"
" %p %p %llx", (void *)pp, (void *)vp, off);
}
}
return (ppa);
}
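	/*
	 * No reserved pool; take pages from the system free list,
	 * using a dummy kernel segment as the placement hint.
	 */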
bzero(&tmpseg, sizeof (struct seg));
tmpseg.s_as = &kas;
if ((freemem - xm->xm_ppb) < xmemfs_minfree ||
page_resv(xm->xm_ppb, KM_NOSLEEP) == 0) {
cmn_err(CE_WARN, "%s: File system full, no memory",
xm->xm_mntpath);
return (NULL);
}
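	/* Claim xm_ppb pages against the system page count. */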
(void) page_create_wait(xm->xm_ppb, PG_WAIT);
pplist = page_get_freelist(vp, off, &tmpseg,
(caddr_t)(uintptr_t)off, bsize, 0, NULL);
if (pplist == NULL && xm->xm_ppb == 1) {
pplist = page_get_cachelist(vp, off, &tmpseg,
(caddr_t)(uintptr_t)off, 0, NULL);
}
if (pplist == NULL) {
page_create_putback(xm->xm_ppb);
page_unresv(xm->xm_ppb);
return (NULL);
}
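	/*
	 * A page with PP_ISAGED clear came from the cachelist and may
	 * still carry a stale vnode identity; dissolve it before the
	 * page is hashed in under our vnode below.
	 */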
if (PP_ISAGED(pplist) == 0) {
ASSERT(xm->xm_ppb == 1);
page_hashout(pplist, NULL);
}
if (xm->xm_ppb > 1)
ppa = kmem_alloc(sizeof (*ppa) * xm->xm_ppb, KM_SLEEP);
for (pindex = 0; pindex < xm->xm_ppb; pindex++, off += PAGESIZE) {
pp = pplist;
page_sub(&pplist, pp);
ASSERT(PAGE_EXCL(pp));
ASSERT(pp->p_vnode == NULL);
ASSERT(!hat_page_is_mapped(pp));
PP_CLRFREE(pp);
PP_CLRAGED(pp);
if (xm->xm_ppb == 1)
ppa = (page_t **)pp;
else
ppa[pindex] = pp;
if (!page_hashin(pp, vp, off, NULL)) {
panic("xmem_acquire_pages: hashin failed"
" %p %p %llx", (void *)pp, (void *)vp, off);
}
page_downgrade(pp); /* XXX */
}
return (ppa);
}
static void
xmem_release_pages(struct xmount *xm, page_t **ppa)
{
uint_t pindex;
page_t *pp;
xmem_mem_sub(xm, 1);
if (xm->xm_flags & XARGS_RESERVEMEM) {
/*
		 * When ppb == 1, ppa is actually a direct page_t *:
		 * this lessens the load on kmem by not allocating a
		 * million 4 byte pointers for a 4 GB file system.
*/
if (xm->xm_ppb == 1) {
page_hashout((page_t *)ppa, NULL);
pindex = xm->xm_ppb; /* bypass for loop */
} else
pindex = 0;
for (; pindex < xm->xm_ppb; pindex++) {
pp = ppa[pindex];
page_hashout(pp, NULL);
}
mutex_enter(&xm->xm_contents);
xpgput(xm, ppa);
mutex_exit(&xm->xm_contents);
} else {
int flag = B_INVAL;
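		/*
		 * Dynamically allocated pages: invalidate them and hand
		 * them back to the system via VN_DISPOSE.
		 */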
if (xm->xm_ppb == 1) {
VN_DISPOSE((page_t *)ppa, flag, 0, kcred);
} else {
for (pindex = 0; pindex < xm->xm_ppb; pindex++)
VN_DISPOSE(ppa[pindex], flag, 0, kcred);
kmem_free(ppa, sizeof (*ppa) * xm->xm_ppb);
}
page_unresv(xm->xm_ppb);
}
}
/*
 * Initialize an xmemnode and add it to the file list under the mount point.
*/
void
xmemnode_init(struct xmount *xm, struct xmemnode *xp,
vattr_t *vap, cred_t *cred)
{
struct vnode *vp;
timestruc_t now;
ASSERT(vap != NULL);
ASSERT(cred != NULL);
rw_init(&xp->xn_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&xp->xn_tlock, NULL, MUTEX_DEFAULT, NULL);
xp->xn_mode = MAKEIMODE(vap->va_type, vap->va_mode);
if (S_ISREG(xp->xn_mode))
xp->xn_mode &= ~(S_IXUSR | S_IXGRP | S_IXOTH);
xp->xn_mask = 0;
xp->xn_type = vap->va_type;
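	/*
	 * Fabricate an inode number from the xmemnode's kernel address,
	 * dropping the low, always-zero alignment bits.
	 */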
xp->xn_nodeid = (ino64_t)(uint32_t)((uintptr_t)xp >> 3);
xp->xn_nlink = 1;
xp->xn_size = 0;
xp->xn_uid = crgetuid(cred);
xp->xn_gid = crgetgid(cred);
xp->xn_fsid = xm->xm_dev;
xp->xn_rdev = vap->va_rdev;
xp->xn_blksize = PAGESIZE;
xp->xn_nblocks = 0;
gethrestime(&now);
xp->xn_atime = now;
xp->xn_mtime = now;
xp->xn_ctime = now;
xp->xn_seq = 0;
xp->xn_dir = NULL;
vp = XNTOV(xp);
vn_reinit(vp);
vn_setops(vp, xmem_vnodeops);
vp->v_vfsp = xm->xm_vfsp;
vp->v_type = vap->va_type;
vp->v_rdev = vap->va_rdev;
vp->v_data = (caddr_t)xp;
mutex_enter(&xm->xm_contents);
/*
* Increment the pseudo generation number for this xmemnode.
* Since xmemnodes are allocated and freed, there really is no
* particular generation number for a new xmemnode. Just fake it
* by using a counter in each file system.
*/
xp->xn_gen = xm->xm_gen++;
/*
	 * Add the new xmemnode to the end of the linked list of xmemnodes
	 * for this xmemfs. The root directory is handled specially in
	 * xmem_mount.
*/
if (xm->xm_rootnode != (struct xmemnode *)NULL) {
xp->xn_forw = NULL;
xp->xn_back = xm->xm_rootnode->xn_back;
xp->xn_back->xn_forw = xm->xm_rootnode->xn_back = xp;
}
mutex_exit(&xm->xm_contents);
}
/*
 * xmem_fillpages - acquire backing pages for any holes in the range
 * [off, off + len), optionally zero-filling the newly acquired blocks.
 */
int
xmem_fillpages(struct xmemnode *xp, struct vnode *vp, offset_t off,
offset_t len, int zerofill)
{
uint_t blockno, endblock;
caddr_t base;
int error = 0;
struct xmount *xm = (struct xmount *)VTOXM(vp);
offset_t poff;
size_t bsize = xm->xm_bsize;
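	/* Round the starting offset down to a block boundary. */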
blockno = off >> xm->xm_bshift;
poff = (offset_t)blockno << xm->xm_bshift;
endblock = howmany(off + len, (offset_t)bsize);
if (endblock > xp->xn_ppasz)
return (EINVAL);
/* Create missing pages if any */
for (; blockno < endblock; ) {
if (!xp->xn_ppa[blockno]) {
xp->xn_ppa[blockno] = xmem_acquire_pages(xm, vp, poff);
if (!xp->xn_ppa[blockno])
return (ENOSPC);
if (zerofill) {
page_t **ppp;
if (xm->xm_ppb == 1)
ppp = (page_t **)&xp->xn_ppa[blockno];
else
ppp = xp->xn_ppa[blockno];
base = segxmem_getmap(xm->xm_map, vp, poff,
bsize, ppp, S_WRITE);
(void) kzero(base, bsize);
segxmem_release(xm->xm_map, base, bsize);
}
xp->xn_nblocks++;
}
blockno++;
poff += bsize;
}
return (error);
}
/*
* xmemnode_trunc - set length of xmemnode and deal with resources
*/
int
xmemnode_trunc(struct xmount *xm, struct xmemnode *xp, u_offset_t newsize)
{
u_offset_t oldsize = xp->xn_size;
timestruc_t now;
int error = 0;
size_t zlen;
ulong_t newblocks, oldblocks;
ASSERT(RW_WRITE_HELD(&xp->xn_rwlock));
ASSERT(RW_WRITE_HELD(&xp->xn_contents));
if (newsize == oldsize) {
/* Required by POSIX */
goto stamp_out;
}
switch (xp->xn_type) {
case VREG:
oldblocks = howmany(oldsize, xm->xm_bsize);
newblocks = howmany(newsize, xm->xm_bsize);
		XMEMPRINTF(4, ("xmemnode_trunc: xp %p old %lx new %lx\n",
		    (void *)xp, oldblocks, newblocks));
/*
* xn_ppasz is the size of the ppa array which may not
* be fully populated if pages cannot be allocated.
*/
ASSERT(xp->xn_ppasz >= oldblocks);
/* Growing the file */
if (newblocks > oldblocks) {
if (xp->xn_ppasz < newblocks) {
page_t ***ppa;
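				/*
				 * Grow the ppa array; only xn_ppasz
				 * entries exist in the old array.
				 */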
ppa = kmem_zalloc(newblocks * sizeof (*ppa), KM_SLEEP);
if (xp->xn_ppasz) {
					bcopy(xp->xn_ppa, ppa,
					    xp->xn_ppasz * sizeof (*ppa));
kmem_free(xp->xn_ppa,
xp->xn_ppasz * sizeof (*ppa));
}
xp->xn_ppa = ppa;
xp->xn_ppasz = newblocks;
}
}
		/* Free pages if the file shrinks across a block boundary. */
if (newblocks < oldblocks) {
uint_t next;
page_t ***ppa = NULL;
next = newblocks;
if (next) {
ppa = kmem_zalloc(next * sizeof (*ppa),
KM_SLEEP);
bcopy(xp->xn_ppa, ppa, next * sizeof (*ppa));
}
for (; next < oldblocks; next++) {
if (!xp->xn_ppa[next])
continue;
xmem_release_pages(xm, xp->xn_ppa[next]);
xp->xn_nblocks--;
}
kmem_free(xp->xn_ppa, xp->xn_ppasz * sizeof (*ppa));
xp->xn_ppa = ppa;
xp->xn_ppasz = newblocks;
}
/*
		 * Update the file size now to reflect the pages we just
		 * blew away, since we're about to drop the contents lock
		 * to zero the partial page (which could re-enter xmemfs
		 * via getpage and try to reacquire the lock). Once we
		 * drop the lock, faulters can fill in holes in the file,
		 * and if we haven't updated the size they may fill in
		 * holes that are beyond EOF, which will then never get
		 * cleared.
*/
xp->xn_size = newsize;
if (newsize) {
/* Zero new size of file to page boundary. */
zlen = PAGESIZE - ((ulong_t)newsize & PAGEOFFSET);
rw_exit(&xp->xn_contents);
pvn_vpzero(XNTOV(xp), (u_offset_t)newsize, zlen);
rw_enter(&xp->xn_contents, RW_WRITER);
}
break;
case VLNK:
/*
		 * Don't do anything here;
		 * xmem_inactive frees the memory.
*/
if (newsize != 0)
error = EINVAL;
goto out;
case VDIR:
/*
* Remove all the directory entries under this directory.
*/
if (newsize != 0) {
error = EINVAL;
goto out;
}
xdirtrunc(xp);
ASSERT(xp->xn_nlink == 0);
break;
default:
goto out;
}
stamp_out:
gethrestime(&now);
xp->xn_mtime = now;
xp->xn_ctime = now;
out:
/*
* xmemnode_trunc() cannot fail when newsize == 0.
*/
ASSERT(error == 0 || newsize != 0);
return (error);
}