/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/*
* Vnode operations for the High Sierra filesystem
*/
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/fbuf.h>
#include <sys/dirent.h>
#include <sys/dkio.h>
#include <sys/atomic.h>
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/avl.h>
#include <sys/sunldi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>
/*
* For struct modlinkage
*/
#include <sys/modctl.h>
#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
#include <sys/fs/hsfs_impl.h>
#include <sys/fs/hsfs_susp.h>
#include <sys/fs/hsfs_rrip.h>
#include <fs/fs_subr.h>
/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;
/*
 * This is the max number of taskq threads that will be created
* if required. Since we are using a Dynamic TaskQ by default only
* one thread is created initially.
*
* NOTE: In the usual hsfs use case this per fs instance number
* of taskq threads should not place any undue load on a system.
* Even on an unusual system with say 100 CDROM drives, 800 threads
* will not be created unless all the drives are loaded and all
 * of them are saturated with I/O at the same time! If such an
 * unusual case ever causes a complaint about system load, it
 * should be easy enough to change to one per-machine Dynamic TaskQ
 * for all hsfs mounts with an nthreads of, say, 32.
*/
static int hsfs_taskq_nthreads = 8; /* # of taskq threads per fs */
/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;
/*
* Kmem caches for heavily used small allocations. Using these kmem
* caches provides a factor of 3 reduction in system time and greatly
* aids overall throughput esp. on SPARC.
*/
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;
/*
* This tunable allows us to ignore inode numbers from rrip-1.12.
* In this case, we fall back to our default inode algorithm.
*/
extern int use_rrip_inodes;
/*
* Free behind logic from UFS to tame our thirst for
* the page cache.
* See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
* explanation.
*/
static int freebehind = 1;
static int smallfile = 0;
static int cache_read_ahead = 0;
static u_offset_t smallfile64 = 32 * 1024;
#define SMALLFILE1_D 1000
#define SMALLFILE2_D 10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0; /* when to recompute */
static uint_t smallfile1_d = SMALLFILE1_D;
static uint_t smallfile2_d = SMALLFILE2_D;
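/*
 * Illustrative example (numbers assumed, not measured): with 8 GB of
 * free memory and 4 CPUs online, percpufreeb in hsfs_read() works out
 * to 2 GB, so smallfile1 becomes 2 GB / SMALLFILE1_D (1000) = ~2 MB
 * and smallfile2 becomes 2 GB / SMALLFILE2_D (10) = ~200 MB. Once
 * free-behind applies, reads at offsets beyond smallfile2 also get
 * SM_DONTNEED (when cache_read_ahead is not set), so their pages are
 * not kept in the page cache.
 */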
static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
int hsched_invoke_strategy(struct hsfs *fsp);
/* ARGSUSED */
static int
hsfs_fsync(vnode_t *cp,
int syncflag,
cred_t *cred,
caller_context_t *ct)
{
return (0);
}
/*ARGSUSED*/
static int
hsfs_read(struct vnode *vp,
struct uio *uiop,
int ioflag,
struct cred *cred,
struct caller_context *ct)
{
caddr_t base;
offset_t diff;
int error;
struct hsnode *hp;
uint_t filesize;
int dofree;
hp = VTOH(vp);
/*
* if vp is of type VDIR, make sure dirent
* is filled up with all info (because of ptbl)
*/
if (vp->v_type == VDIR) {
if (hp->hs_dirent.ext_size == 0)
hs_filldirent(vp, &hp->hs_dirent);
}
filesize = hp->hs_dirent.ext_size;
/* Sanity checks. */
if (uiop->uio_resid == 0 || /* No data wanted. */
uiop->uio_loffset > HS_MAXFILEOFF || /* Offset too big. */
uiop->uio_loffset >= filesize) /* Past EOF. */
return (0);
do {
/*
* We want to ask for only the "right" amount of data.
* In this case that means:-
*
* We can't get data from beyond our EOF. If asked,
* we will give a short read.
*
* segmap_getmapflt returns buffers of MAXBSIZE bytes.
* These buffers are always MAXBSIZE aligned.
* If our starting offset is not MAXBSIZE aligned,
* we can only ask for less than MAXBSIZE bytes.
*
* If our requested offset and length are such that
* they belong in different MAXBSIZE aligned slots
* then we'll be making more than one call on
* segmap_getmapflt.
*
* This diagram shows the variables we use and their
* relationships.
*
* |<-----MAXBSIZE----->|
* +--------------------------...+
* |.....mapon->|<--n-->|....*...|EOF
* +--------------------------...+
* uio_loffset->|
* uio_resid....|<---------->|
* diff.........|<-------------->|
*
* So, in this case our offset is not aligned
* and our request takes us outside of the
* MAXBSIZE window. We will break this up into
* two segmap_getmapflt calls.
*/
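/*
 * Worked example (values assumed, MAXBSIZE taken as 8K): with
 * uio_loffset = 6K, uio_resid = 4K and filesize = 20K, mapon is
 * 6K, diff is 14K, nbytes is MIN(8K - 6K, 4K) = 2K and n is
 * MIN(14K, 2K) = 2K. The first pass moves 2K; the loop then
 * comes around with uio_loffset = 8K and mapon = 0, and a second
 * segmap_getmapflt() call moves the remaining 2K.
 */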
size_t nbytes;
offset_t mapon;
size_t n;
uint_t flags;
mapon = uiop->uio_loffset & MAXBOFFSET;
diff = filesize - uiop->uio_loffset;
nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
n = MIN(diff, nbytes);
if (n <= 0) {
/* EOF or request satisfied. */
return (0);
}
/*
* Freebehind computation taken from:
* usr/src/uts/common/fs/ufs/ufs_vnops.c
*/
if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
uint64_t percpufreeb;
if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
smallfile1 = percpufreeb / smallfile1_d;
smallfile2 = percpufreeb / smallfile2_d;
smallfile1 = MAX(smallfile1, smallfile);
smallfile1 = MAX(smallfile1, smallfile64);
smallfile2 = MAX(smallfile1, smallfile2);
smallfile_update = drv_hztousec(ddi_get_lbolt())
+ 1000000;
}
dofree = freebehind &&
hp->hs_prev_offset == uiop->uio_loffset &&
hp->hs_ra_bytes > 0;
base = segmap_getmapflt(segkmap, vp,
(u_offset_t)uiop->uio_loffset, n, 1, S_READ);
error = uiomove(base + mapon, n, UIO_READ, uiop);
if (error == 0) {
/*
 * If we read a whole block, or read to EOF,
 * we won't need this buffer again soon.
*/
if (n + mapon == MAXBSIZE ||
uiop->uio_loffset == filesize)
flags = SM_DONTNEED;
else
flags = 0;
if (dofree) {
flags = SM_FREE | SM_ASYNC;
if ((cache_read_ahead == 0) &&
uiop->uio_loffset > smallfile2)
flags |= SM_DONTNEED;
}
error = segmap_release(segkmap, base, flags);
} else
(void) segmap_release(segkmap, base, 0);
} while (error == 0 && uiop->uio_resid > 0);
return (error);
}
/*ARGSUSED2*/
static int
hsfs_getattr(
struct vnode *vp,
struct vattr *vap,
int flags,
struct cred *cred,
caller_context_t *ct)
{
struct hsnode *hp;
struct vfs *vfsp;
struct hsfs *fsp;
hp = VTOH(vp);
fsp = VFS_TO_HSFS(vp->v_vfsp);
vfsp = vp->v_vfsp;
if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
hs_filldirent(vp, &hp->hs_dirent);
}
vap->va_type = IFTOVT(hp->hs_dirent.mode);
vap->va_mode = hp->hs_dirent.mode;
vap->va_uid = hp->hs_dirent.uid;
vap->va_gid = hp->hs_dirent.gid;
vap->va_fsid = vfsp->vfs_dev;
vap->va_nodeid = (ino64_t)hp->hs_nodeid;
vap->va_nlink = hp->hs_dirent.nlink;
vap->va_size = (offset_t)hp->hs_dirent.ext_size;
vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
if (vp->v_type == VCHR || vp->v_type == VBLK)
vap->va_rdev = hp->hs_dirent.r_dev;
else
vap->va_rdev = 0;
vap->va_blksize = vfsp->vfs_bsize;
/* no. of blocks = no. of data blocks + no. of xar blocks */
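/*
 * Example (values assumed): for a 10000-byte file with xar_len = 1
 * and lbn_shift = 11 (2K logical blocks), the XAR contributes 2048
 * bytes, so va_nblocks = howmany(10000 + 2048, DEV_BSIZE) = 24
 * 512-byte blocks.
 */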
vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
(hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
vap->va_seq = hp->hs_seq;
return (0);
}
/*ARGSUSED*/
static int
hsfs_readlink(struct vnode *vp,
struct uio *uiop,
struct cred *cred,
caller_context_t *ct)
{
struct hsnode *hp;
if (vp->v_type != VLNK)
return (EINVAL);
hp = VTOH(vp);
if (hp->hs_dirent.sym_link == (char *)NULL)
return (ENOENT);
return (uiomove(hp->hs_dirent.sym_link,
(size_t)MIN(hp->hs_dirent.ext_size,
uiop->uio_resid), UIO_READ, uiop));
}
/*ARGSUSED*/
static void
hsfs_inactive(struct vnode *vp,
struct cred *cred,
caller_context_t *ct)
{
struct hsnode *hp;
struct hsfs *fsp;
int nopage;
hp = VTOH(vp);
fsp = VFS_TO_HSFS(vp->v_vfsp);
/*
* Note: acquiring and holding v_lock for quite a while
* here serializes on the vnode; this is unfortunate, but
* likely not to overly impact performance, as the underlying
* device (CDROM drive) is quite slow.
*/
rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
mutex_enter(&hp->hs_contents_lock);
mutex_enter(&vp->v_lock);
if (vp->v_count < 1) {
panic("hsfs_inactive: v_count < 1");
/*NOTREACHED*/
}
if (vp->v_count > 1 || (hp->hs_flags & HREF) == 0) {
vp->v_count--; /* release hold from vn_rele */
mutex_exit(&vp->v_lock);
mutex_exit(&hp->hs_contents_lock);
rw_exit(&fsp->hsfs_hash_lock);
return;
}
vp->v_count--; /* release hold from vn_rele */
if (vp->v_count == 0) {
/*
* Free the hsnode.
* If there are no pages associated with the
* hsnode, give it back to the kmem_cache,
* else put at the end of this file system's
* internal free list.
*/
nopage = !vn_has_cached_data(vp);
hp->hs_flags = 0;
/*
* exit these locks now, since hs_freenode may
* kmem_free the hsnode and embedded vnode
*/
mutex_exit(&vp->v_lock);
mutex_exit(&hp->hs_contents_lock);
hs_freenode(vp, fsp, nopage);
} else {
mutex_exit(&vp->v_lock);
mutex_exit(&hp->hs_contents_lock);
}
rw_exit(&fsp->hsfs_hash_lock);
}
/*ARGSUSED*/
static int
hsfs_lookup(
struct vnode *dvp,
char *nm,
struct vnode **vpp,
struct pathname *pnp,
int flags,
struct vnode *rdir,
struct cred *cred,
caller_context_t *ct,
int *direntflags,
pathname_t *realpnp)
{
int error;
int namelen = (int)strlen(nm);
if (*nm == '\0') {
VN_HOLD(dvp);
*vpp = dvp;
return (0);
}
/*
* If we're looking for ourself, life is simple.
*/
if (namelen == 1 && *nm == '.') {
if (error = hs_access(dvp, (mode_t)VEXEC, cred))
return (error);
VN_HOLD(dvp);
*vpp = dvp;
return (0);
}
return (hs_dirlook(dvp, nm, namelen, vpp, cred));
}
/*ARGSUSED*/
static int
hsfs_readdir(
struct vnode *vp,
struct uio *uiop,
struct cred *cred,
int *eofp,
caller_context_t *ct,
int flags)
{
struct hsnode *dhp;
struct hsfs *fsp;
struct hs_direntry hd;
struct dirent64 *nd;
int error;
uint_t offset; /* real offset in directory */
uint_t dirsiz; /* real size of directory */
uchar_t *blkp;
int hdlen; /* length of hs directory entry */
long ndlen; /* length of dirent entry */
int bytes_wanted;
size_t bufsize; /* size of dirent buffer */
char *outbuf; /* ptr to dirent buffer */
char *dname;
int dnamelen;
size_t dname_size;
struct fbuf *fbp;
uint_t last_offset; /* last index into current dir block */
ino64_t dirino; /* temporary storage before storing in dirent */
off_t diroff;
dhp = VTOH(vp);
fsp = VFS_TO_HSFS(vp->v_vfsp);
if (dhp->hs_dirent.ext_size == 0)
hs_filldirent(vp, &dhp->hs_dirent);
dirsiz = dhp->hs_dirent.ext_size;
if (uiop->uio_loffset >= dirsiz) { /* at or beyond EOF */
if (eofp)
*eofp = 1;
return (0);
}
ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
offset = uiop->uio_loffset;
dname_size = fsp->hsfs_namemax + 1; /* 1 for the ending NUL */
dname = kmem_alloc(dname_size, KM_SLEEP);
bufsize = uiop->uio_resid + sizeof (struct dirent64);
outbuf = kmem_alloc(bufsize, KM_SLEEP);
nd = (struct dirent64 *)outbuf;
while (offset < dirsiz) {
bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));
error = fbread(vp, (offset_t)(offset & MAXBMASK),
(unsigned int)bytes_wanted, S_READ, &fbp);
if (error)
goto done;
blkp = (uchar_t *)fbp->fb_addr;
last_offset = (offset & MAXBMASK) + fbp->fb_count;
#define rel_offset(offset) ((offset) & MAXBOFFSET) /* index into blkp */
while (offset < last_offset) {
/*
* Very similar validation code is found in
* process_dirblock(), hsfs_node.c.
* For an explanation, see there.
* It may make sense for the future to
* "consolidate" the code in hs_parsedir(),
* process_dirblock() and hsfs_readdir() into
* a single utility function.
*/
hdlen = (int)((uchar_t)
HDE_DIR_LEN(&blkp[rel_offset(offset)]));
if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
offset + hdlen > last_offset) {
/*
* advance to next sector boundary
*/
offset = roundup(offset + 1, HS_SECTOR_SIZE);
if (hdlen)
hs_log_bogus_disk_warning(fsp,
HSFS_ERR_TRAILING_JUNK, 0);
continue;
}
bzero(&hd, sizeof (hd));
/*
* Just ignore invalid directory entries.
* XXX - maybe hs_parsedir() will detect EXISTENCE bit
*/
if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
&hd, dname, &dnamelen, last_offset - offset)) {
/*
* Determine if there is enough room
*/
ndlen = (long)DIRENT64_RECLEN((dnamelen));
if ((ndlen + ((char *)nd - outbuf)) >
uiop->uio_resid) {
fbrelse(fbp, S_READ);
goto done; /* output buffer full */
}
diroff = offset + hdlen;
/*
* If the media carries rrip-v1.12 or newer,
* and we trust the inodes from the rrip data
* (use_rrip_inodes != 0), use that data. If the
* media has been created by a recent mkisofs
* version, we may trust all numbers in the
* starting extent number; otherwise, we cannot
* do this for zero sized files and symlinks,
* because if we did we'd end up mapping all of
* them to the same node. We use HS_DUMMY_INO
* in this case and make sure that we will not
* map all files to the same meta data.
*/
if (hd.inode != 0 && use_rrip_inodes) {
dirino = hd.inode;
} else if ((hd.ext_size == 0 ||
hd.sym_link != (char *)NULL) &&
(fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
dirino = HS_DUMMY_INO;
} else {
dirino = hd.ext_lbn;
}
/* strncpy(9f) will zero uninitialized bytes */
ASSERT(strlen(dname) + 1 <=
DIRENT64_NAMELEN(ndlen));
(void) strncpy(nd->d_name, dname,
DIRENT64_NAMELEN(ndlen));
nd->d_reclen = (ushort_t)ndlen;
nd->d_off = (offset_t)diroff;
nd->d_ino = dirino;
nd = (struct dirent64 *)((char *)nd + ndlen);
/*
* free up space allocated for symlink
*/
if (hd.sym_link != (char *)NULL) {
kmem_free(hd.sym_link,
(size_t)(hd.ext_size+1));
hd.sym_link = (char *)NULL;
}
}
offset += hdlen;
}
fbrelse(fbp, S_READ);
}
/*
* Got here for one of the following reasons:
* 1) outbuf is full (error == 0)
* 2) end of directory reached (error == 0)
* 3) error reading directory sector (error != 0)
* 4) directory entry crosses sector boundary (error == 0)
*
* If any directory entries have been copied, don't report
* case 4. Instead, return the valid directory entries.
*
* If no entries have been copied, report the error.
 * In case 4, this will be indistinguishable from EOF.
*/
done:
ndlen = ((char *)nd - outbuf);
if (ndlen != 0) {
error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
uiop->uio_loffset = offset;
}
kmem_free(dname, dname_size);
kmem_free(outbuf, bufsize);
if (eofp && error == 0)
*eofp = (uiop->uio_loffset >= dirsiz);
return (error);
}
/*ARGSUSED2*/
static int
hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
struct hsnode *hp;
struct hsfid *fid;
if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
return (ENOSPC);
}
fid = (struct hsfid *)fidp;
fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
hp = VTOH(vp);
mutex_enter(&hp->hs_contents_lock);
fid->hf_dir_lbn = hp->hs_dir_lbn;
fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
fid->hf_ino = hp->hs_nodeid;
mutex_exit(&hp->hs_contents_lock);
return (0);
}
/*ARGSUSED*/
static int
hsfs_open(struct vnode **vpp,
int flag,
struct cred *cred,
caller_context_t *ct)
{
return (0);
}
/*ARGSUSED*/
static int
hsfs_close(
struct vnode *vp,
int flag,
int count,
offset_t offset,
struct cred *cred,
caller_context_t *ct)
{
(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
cleanshares(vp, ttoproc(curthread)->p_pid);
return (0);
}
/*ARGSUSED2*/
static int
hsfs_access(struct vnode *vp,
int mode,
int flags,
cred_t *cred,
caller_context_t *ct)
{
return (hs_access(vp, (mode_t)mode, cred));
}
/*
 * The seek time of a CD-ROM is very slow, and the data transfer
 * rate is even worse (max. 150K per sec). The design
 * decision is to reduce access to the CD-ROM as much as possible,
 * and to transfer a sizable block (read-ahead) of data at a time.
 * The UFS style of reading ahead one block at a time is not
 * appropriate, and is not supported.
*/
/*
* KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
*/
#define KLUSTSIZE (56 * 1024)
/* we don't support read ahead */
int hsfs_lostpage; /* no. of times we lost original page */
/*
* Used to prevent biodone() from releasing buf resources that
* we didn't allocate in quite the usual way.
*/
/*ARGSUSED*/
int
hsfs_iodone(struct buf *bp)
{
sema_v(&bp->b_io);
return (0);
}
/*
 * The taskq thread invokes the scheduling function to ensure
 * that all readaheads are complete, cleans up the associated
 * memory and releases the page lock.
*/
void
hsfs_ra_task(void *arg)
{
struct hio_info *info = arg;
uint_t count;
struct buf *wbuf;
ASSERT(info->pp != NULL);
for (count = 0; count < info->bufsused; count++) {
wbuf = &(info->bufs[count]);
DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
while (sema_tryp(&(info->sema[count])) == 0) {
if (hsched_invoke_strategy(info->fsp)) {
sema_p(&(info->sema[count]));
break;
}
}
sema_destroy(&(info->sema[count]));
DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
biofini(&(info->bufs[count]));
}
for (count = 0; count < info->bufsused; count++) {
if (info->vas[count] != NULL) {
ppmapout(info->vas[count]);
}
}
kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));
pvn_read_done(info->pp, 0);
kmem_cache_free(hio_info_cache, info);
}
/*
* Submit asynchronous readahead requests to the I/O scheduler
* depending on the number of pages to read ahead. These requests
* are asynchronous to the calling thread but I/O requests issued
* subsequently by other threads with higher LBNs must wait for
* these readaheads to complete since we have a single ordered
* I/O pipeline. Thus these readaheads are semi-asynchronous.
* A TaskQ handles waiting for the readaheads to complete.
*
* This function is mostly a copy of hsfs_getapage but somewhat
* simpler. A readahead request is aborted if page allocation
* fails.
*/
/*ARGSUSED*/
static int
hsfs_getpage_ra(
struct vnode *vp,
u_offset_t off,
struct seg *seg,
caddr_t addr,
struct hsnode *hp,
struct hsfs *fsp,
int xarsiz,
offset_t bof,
int chunk_lbn_count,
int chunk_data_bytes)
{
struct buf *bufs;
caddr_t *vas;
caddr_t va;
struct page *pp, *searchp, *lastp;
struct vnode *devvp;
ulong_t byte_offset;
size_t io_len_tmp;
uint_t io_off, io_len;
uint_t xlen;
uint_t filsiz;
uint_t secsize;
uint_t bufcnt;
uint_t bufsused;
uint_t count;
uint_t io_end;
uint_t which_chunk_lbn;
uint_t offset_lbn;
uint_t offset_extra;
offset_t offset_bytes;
uint_t remaining_bytes;
uint_t extension;
int remainder; /* must be signed */
diskaddr_t driver_block;
u_offset_t io_off_tmp;
ksema_t *fio_done;
struct hio_info *info;
size_t len;
ASSERT(fsp->hqueue != NULL);
if (addr >= seg->s_base + seg->s_size) {
return (-1);
}
devvp = fsp->hsfs_devvp;
secsize = fsp->hsfs_vol.lbn_size; /* bytes per logical block */
/* file data size */
filsiz = hp->hs_dirent.ext_size;
if (off >= filsiz)
return (0);
extension = 0;
pp = NULL;
extension += hp->hs_ra_bytes;
/*
* Some CD writers (e.g. Kodak Photo CD writers)
* create CDs in TAO mode and reserve tracks that
* are not completely written. Some sectors remain
* unreadable for this reason and give I/O errors.
* Also, there's no point in reading sectors
* we'll never look at. So, if we're asked to go
* beyond the end of a file, truncate to the length
* of that file.
*
* Additionally, this behaviour is required by section
* 6.4.5 of ISO 9660:1988(E).
*/
len = MIN(extension ? extension : PAGESIZE, filsiz - off);
/* A little paranoia */
if (len <= 0)
return (-1);
/*
* After all that, make sure we're asking for things in units
* that bdev_strategy() will understand (see bug 4202551).
*/
len = roundup(len, DEV_BSIZE);
pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
&io_len_tmp, off, len, 1);
if (pp == NULL) {
hp->hs_num_contig = 0;
hp->hs_ra_bytes = 0;
hp->hs_prev_offset = 0;
return (-1);
}
io_off = (uint_t)io_off_tmp;
io_len = (uint_t)io_len_tmp;
/* check for truncation */
/*
* xxx Clean up and return EIO instead?
* xxx Ought to go to u_offset_t for everything, but we
* xxx call lots of things that want uint_t arguments.
*/
ASSERT(io_off == io_off_tmp);
/*
* get enough buffers for worst-case scenario
* (i.e., no coalescing possible).
*/
bufcnt = (len + secsize - 1) / secsize;
bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
/*
 * Allocate an array of semaphores since we are doing I/O
* scheduling.
*/
fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);
/*
* If our filesize is not an integer multiple of PAGESIZE,
* we zero that part of the last page that's between EOF and
* the PAGESIZE boundary.
*/
xlen = io_len & PAGEOFFSET;
if (xlen != 0)
pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);
va = NULL;
lastp = NULL;
searchp = pp;
io_end = io_off + io_len;
for (count = 0, byte_offset = io_off;
byte_offset < io_end;
count++) {
ASSERT(count < bufcnt);
bioinit(&bufs[count]);
bufs[count].b_edev = devvp->v_rdev;
bufs[count].b_dev = cmpdev(devvp->v_rdev);
bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
bufs[count].b_iodone = hsfs_iodone;
bufs[count].b_vp = vp;
bufs[count].b_file = vp;
/* Compute disk address for interleaving. */
/* considered without skips */
which_chunk_lbn = byte_offset / chunk_data_bytes;
/* factor in skips */
offset_lbn = which_chunk_lbn * chunk_lbn_count;
/* convert to physical byte offset for lbn */
offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
/* don't forget offset into lbn */
offset_extra = byte_offset % chunk_data_bytes;
/* get virtual block number for driver */
driver_block = lbtodb(bof + xarsiz
+ offset_bytes + offset_extra);
if (lastp != searchp) {
/* this branch taken first time through loop */
va = vas[count] = ppmapin(searchp, PROT_WRITE,
(caddr_t)-1);
/* ppmapin() guarantees not to return NULL */
} else {
vas[count] = NULL;
}
bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
bufs[count].b_offset =
(offset_t)(byte_offset - io_off + off);
/*
* We specifically use the b_lblkno member here
* as even in the 32 bit world driver_block can
* get very large in line with the ISO9660 spec.
*/
bufs[count].b_lblkno = driver_block;
remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
- byte_offset;
/*
* remaining_bytes can't be zero, as we derived
* which_chunk_lbn directly from byte_offset.
*/
if ((remaining_bytes + byte_offset) < (off + len)) {
/* coalesce-read the rest of the chunk */
bufs[count].b_bcount = remaining_bytes;
} else {
/* get the final bits */
bufs[count].b_bcount = off + len - byte_offset;
}
remainder = PAGESIZE - (byte_offset % PAGESIZE);
if (bufs[count].b_bcount > remainder) {
bufs[count].b_bcount = remainder;
}
bufs[count].b_bufsize = bufs[count].b_bcount;
if (((offset_t)byte_offset + bufs[count].b_bcount) >
HS_MAXFILEOFF) {
break;
}
byte_offset += bufs[count].b_bcount;
/*
* We are scheduling I/O so we need to enqueue
* requests rather than calling bdev_strategy
* here. A later invocation of the scheduling
* function will take care of doing the actual
* I/O as it selects requests from the queue as
* per the scheduling logic.
*/
struct hio *hsio = kmem_cache_alloc(hio_cache,
KM_SLEEP);
sema_init(&fio_done[count], 0, NULL,
SEMA_DEFAULT, NULL);
hsio->bp = &bufs[count];
hsio->sema = &fio_done[count];
hsio->io_lblkno = bufs[count].b_lblkno;
hsio->nblocks = howmany(hsio->bp->b_bcount,
DEV_BSIZE);
/* used for deadline */
hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());
/* for I/O coalescing */
hsio->contig_chain = NULL;
hsched_enqueue_io(fsp, hsio, 1);
lwp_stat_update(LWP_STAT_INBLK, 1);
lastp = searchp;
if ((remainder - bufs[count].b_bcount) < 1) {
searchp = searchp->p_next;
}
}
bufsused = count;
info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
info->bufs = bufs;
info->vas = vas;
info->sema = fio_done;
info->bufsused = bufsused;
info->bufcnt = bufcnt;
info->fsp = fsp;
info->pp = pp;
(void) taskq_dispatch(fsp->hqueue->ra_task,
hsfs_ra_task, info, KM_SLEEP);
/*
* The I/O locked pages are unlocked in our taskq thread.
*/
return (0);
}
/*
* Each file may have a different interleaving on disk. This makes
* things somewhat interesting. The gist is that there are some
* number of contiguous data sectors, followed by some other number
* of contiguous skip sectors. The sum of those two sets of sectors
* defines the interleave size. Unfortunately, it means that we generally
* can't simply read N sectors starting at a given offset to satisfy
* any given request.
*
* What we do is get the relevant memory pages via pvn_read_kluster(),
* then stride through the interleaves, setting up a buf for each
* sector that needs to be brought in. Instead of kmem_alloc'ing
* space for the sectors, though, we just point at the appropriate
* spot in the relevant page for each of them. This saves us a bunch
* of copying.
*
* NOTICE: The code below in hsfs_getapage is mostly same as the code
* in hsfs_getpage_ra above (with some omissions). If you are
* making any change to this function, please also look at
* hsfs_getpage_ra.
*/
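/*
 * Worked example (geometry assumed): with 2K logical blocks, an
 * interleave of intlf_sz = 2 data lbns and intlf_sk = 2 skip lbns
 * gives chunk_data_bytes = 4096 and chunk_lbn_count = 4. For file
 * byte_offset 10000, which_chunk_lbn = 10000 / 4096 = 2,
 * offset_lbn = 2 * 4 = 8, offset_bytes = 8 * 2048 = 16384 and
 * offset_extra = 10000 % 4096 = 1808, so that data lives at
 * physical byte bof + xarsiz + 16384 + 1808 on the device.
 */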
/*ARGSUSED*/
static int
hsfs_getapage(
struct vnode *vp,
u_offset_t off,
size_t len,
uint_t *protp,
struct page *pl[],
size_t plsz,
struct seg *seg,
caddr_t addr,
enum seg_rw rw,
struct cred *cred)
{
struct hsnode *hp;
struct hsfs *fsp;
int err;
struct buf *bufs;
caddr_t *vas;
caddr_t va;
struct page *pp, *searchp, *lastp;
page_t *pagefound;
offset_t bof;
struct vnode *devvp;
ulong_t byte_offset;
size_t io_len_tmp;
uint_t io_off, io_len;
uint_t xlen;
uint_t filsiz;
uint_t secsize;
uint_t bufcnt;
uint_t bufsused;
uint_t count;
uint_t io_end;
uint_t which_chunk_lbn;
uint_t offset_lbn;
uint_t offset_extra;
offset_t offset_bytes;
uint_t remaining_bytes;
uint_t extension;
int remainder; /* must be signed */
int chunk_lbn_count;
int chunk_data_bytes;
int xarsiz;
diskaddr_t driver_block;
u_offset_t io_off_tmp;
ksema_t *fio_done;
int calcdone;
/*
* We don't support asynchronous operation at the moment, so
* just pretend we did it. If the pages are ever actually
* needed, they'll get brought in then.
*/
if (pl == NULL)
return (0);
hp = VTOH(vp);
fsp = VFS_TO_HSFS(vp->v_vfsp);
devvp = fsp->hsfs_devvp;
secsize = fsp->hsfs_vol.lbn_size; /* bytes per logical block */
/* file data size */
filsiz = hp->hs_dirent.ext_size;
/* disk addr for start of file */
bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);
/* xarsiz byte must be skipped for data */
xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;
/* how many logical blocks in an interleave (data+skip) */
chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;
if (chunk_lbn_count == 0) {
chunk_lbn_count = 1;
}
/*
* Convert interleaving size into bytes. The zero case
* (no interleaving) optimization is handled as a side-
* effect of the read-ahead logic.
*/
if (hp->hs_dirent.intlf_sz == 0) {
chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
/*
* Optimization: If our pagesize is a multiple of LBN
* bytes, we can avoid breaking up a page into individual
* lbn-sized requests.
*/
if (PAGESIZE % chunk_data_bytes == 0) {
chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
chunk_data_bytes = PAGESIZE;
}
} else {
chunk_data_bytes =
LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
}
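/*
 * Example (sizes assumed): with 2K logical blocks, no interleaving
 * and an 8K PAGESIZE, chunk_data_bytes starts at 2048; since
 * PAGESIZE % 2048 == 0, it is bumped to PAGESIZE (8192) and
 * chunk_lbn_count to 4, so a whole page can be read with a single
 * request instead of four lbn-sized ones.
 */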
reread:
err = 0;
pagefound = 0;
calcdone = 0;
/*
* Do some read-ahead. This mostly saves us a bit of
* system cpu time more than anything else when doing
* sequential reads. At some point, could do the
* read-ahead asynchronously which might gain us something
* on wall time, but it seems unlikely....
*
* We do the easy case here, which is to read through
* the end of the chunk, minus whatever's at the end that
* won't exactly fill a page.
*/
if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
which_chunk_lbn = (off + len) / chunk_data_bytes;
extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
extension -= (extension % PAGESIZE);
} else {
extension = roundup(len, PAGESIZE);
}
atomic_inc_64(&fsp->total_pages_requested);
pp = NULL;
again:
/* search for page in buffer */
if ((pagefound = page_exists(vp, off)) == 0) {
/*
* Need to really do disk IO to get the page.
*/
if (!calcdone) {
extension += hp->hs_ra_bytes;
/*
* Some cd writers don't write sectors that aren't
* used. Also, there's no point in reading sectors
* we'll never look at. So, if we're asked to go
* beyond the end of a file, truncate to the length
* of that file.
*
* Additionally, this behaviour is required by section
* 6.4.5 of ISO 9660:1988(E).
*/
len = MIN(extension ? extension : PAGESIZE,
filsiz - off);
/* A little paranoia. */
ASSERT(len > 0);
/*
* After all that, make sure we're asking for things
* in units that bdev_strategy() will understand
* (see bug 4202551).
*/
len = roundup(len, DEV_BSIZE);
calcdone = 1;
}
pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
&io_len_tmp, off, len, 0);
if (pp == NULL) {
/*
* Pressure on memory, roll back readahead
*/
hp->hs_num_contig = 0;
hp->hs_ra_bytes = 0;
hp->hs_prev_offset = 0;
goto again;
}
io_off = (uint_t)io_off_tmp;
io_len = (uint_t)io_len_tmp;
/* check for truncation */
/*
* xxx Clean up and return EIO instead?
* xxx Ought to go to u_offset_t for everything, but we
* xxx call lots of things that want uint_t arguments.
*/
ASSERT(io_off == io_off_tmp);
/*
* get enough buffers for worst-case scenario
* (i.e., no coalescing possible).
*/
bufcnt = (len + secsize - 1) / secsize;
bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
/*
 * Allocate an array of semaphores if we are doing I/O
* scheduling.
*/
if (fsp->hqueue != NULL)
fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
KM_SLEEP);
for (count = 0; count < bufcnt; count++) {
bioinit(&bufs[count]);
bufs[count].b_edev = devvp->v_rdev;
bufs[count].b_dev = cmpdev(devvp->v_rdev);
bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
bufs[count].b_iodone = hsfs_iodone;
bufs[count].b_vp = vp;
bufs[count].b_file = vp;
}
/*
* If our filesize is not an integer multiple of PAGESIZE,
* we zero that part of the last page that's between EOF and
* the PAGESIZE boundary.
*/
xlen = io_len & PAGEOFFSET;
if (xlen != 0)
pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
va = NULL;
lastp = NULL;
searchp = pp;
io_end = io_off + io_len;
for (count = 0, byte_offset = io_off;
byte_offset < io_end; count++) {
ASSERT(count < bufcnt);
/* Compute disk address for interleaving. */
/* considered without skips */
which_chunk_lbn = byte_offset / chunk_data_bytes;
/* factor in skips */
offset_lbn = which_chunk_lbn * chunk_lbn_count;
/* convert to physical byte offset for lbn */
offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
/* don't forget offset into lbn */
offset_extra = byte_offset % chunk_data_bytes;
/* get virtual block number for driver */
driver_block =
lbtodb(bof + xarsiz + offset_bytes + offset_extra);
if (lastp != searchp) {
/* this branch taken first time through loop */
va = vas[count] =
ppmapin(searchp, PROT_WRITE, (caddr_t)-1);
/* ppmapin() guarantees not to return NULL */
} else {
vas[count] = NULL;
}
bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
bufs[count].b_offset =
(offset_t)(byte_offset - io_off + off);
/*
* We specifically use the b_lblkno member here
* as even in the 32 bit world driver_block can
* get very large in line with the ISO9660 spec.
*/
bufs[count].b_lblkno = driver_block;
remaining_bytes =
((which_chunk_lbn + 1) * chunk_data_bytes)
- byte_offset;
/*
* remaining_bytes can't be zero, as we derived
* which_chunk_lbn directly from byte_offset.
*/
if ((remaining_bytes + byte_offset) < (off + len)) {
/* coalesce-read the rest of the chunk */
bufs[count].b_bcount = remaining_bytes;
} else {
/* get the final bits */
bufs[count].b_bcount = off + len - byte_offset;
}
/*
* It would be nice to do multiple pages'
* worth at once here when the opportunity
* arises, as that has been shown to improve
* our wall time. However, to do that
* requires that we use the pageio subsystem,
* which doesn't mix well with what we're
* already using here. We can't use pageio
* all the time, because that subsystem
* assumes that a page is stored in N
* contiguous blocks on the device.
* Interleaving violates that assumption.
*
* Update: This is now not so big a problem
* because of the I/O scheduler sitting below
* that can re-order and coalesce I/O requests.
*/
remainder = PAGESIZE - (byte_offset % PAGESIZE);
if (bufs[count].b_bcount > remainder) {
bufs[count].b_bcount = remainder;
}
bufs[count].b_bufsize = bufs[count].b_bcount;
if (((offset_t)byte_offset + bufs[count].b_bcount) >
HS_MAXFILEOFF) {
break;
}
byte_offset += bufs[count].b_bcount;
if (fsp->hqueue == NULL) {
(void) bdev_strategy(&bufs[count]);
} else {
/*
* We are scheduling I/O so we need to enqueue
* requests rather than calling bdev_strategy
* here. A later invocation of the scheduling
* function will take care of doing the actual
* I/O as it selects requests from the queue as
* per the scheduling logic.
*/
struct hio *hsio = kmem_cache_alloc(hio_cache,
KM_SLEEP);
sema_init(&fio_done[count], 0, NULL,
SEMA_DEFAULT, NULL);
hsio->bp = &bufs[count];
hsio->sema = &fio_done[count];
hsio->io_lblkno = bufs[count].b_lblkno;
hsio->nblocks = howmany(hsio->bp->b_bcount,
DEV_BSIZE);
/* used for deadline */
hsio->io_timestamp =
drv_hztousec(ddi_get_lbolt());
/* for I/O coalescing */
hsio->contig_chain = NULL;
hsched_enqueue_io(fsp, hsio, 0);
}
lwp_stat_update(LWP_STAT_INBLK, 1);
lastp = searchp;
if ((remainder - bufs[count].b_bcount) < 1) {
searchp = searchp->p_next;
}
}
bufsused = count;
/* Now wait for everything to come in */
if (fsp->hqueue == NULL) {
for (count = 0; count < bufsused; count++) {
if (err == 0) {
err = biowait(&bufs[count]);
} else
(void) biowait(&bufs[count]);
}
} else {
for (count = 0; count < bufsused; count++) {
struct buf *wbuf;
/*
* Invoke scheduling function till our buf
* is processed. In doing this it might
* process bufs enqueued by other threads
* which is good.
*/
wbuf = &bufs[count];
DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
while (sema_tryp(&fio_done[count]) == 0) {
/*
* hsched_invoke_strategy will return 1
* if the I/O queue is empty. This means
* that there is another thread who has
* issued our buf and is waiting. So we
* just block instead of spinning.
*/
if (hsched_invoke_strategy(fsp)) {
sema_p(&fio_done[count]);
break;
}
}
sema_destroy(&fio_done[count]);
DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
if (err == 0) {
err = geterror(wbuf);
}
}
kmem_free(fio_done, bufcnt * sizeof (ksema_t));
}
/* Don't leak resources */
for (count = 0; count < bufcnt; count++) {
biofini(&bufs[count]);
if (count < bufsused && vas[count] != NULL) {
ppmapout(vas[count]);
}
}
kmem_free(vas, bufcnt * sizeof (caddr_t));
kmem_free(bufs, bufcnt * sizeof (struct buf));
}
if (err) {
pvn_read_done(pp, B_ERROR);
return (err);
}
/*
* Lock the requested page, and the one after it if possible.
* Don't bother if our caller hasn't given us a place to stash
* the page pointers, since otherwise we'd lock pages that would
* never get unlocked.
*/
if (pagefound) {
int index;
ulong_t soff;
/*
* Make sure it's in memory before we say it's here.
*/
if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
hsfs_lostpage++;
goto reread;
}
pl[0] = pp;
index = 1;
atomic_inc_64(&fsp->cache_read_pages);
/*
* Try to lock the next page, if it exists, without
* blocking.
*/
plsz -= PAGESIZE;
/* LINTED (plsz is unsigned) */
for (soff = off + PAGESIZE; plsz > 0;
soff += PAGESIZE, plsz -= PAGESIZE) {
pp = page_lookup_nowait(vp, (u_offset_t)soff,
SE_SHARED);
if (pp == NULL)
break;
pl[index++] = pp;
}
pl[index] = NULL;
/*
* Schedule a semi-asynchronous readahead if we are
* accessing the last cached page for the current
* file.
*
* Doing this here means that readaheads will be
* issued only if cache-hits occur. This is an advantage
* since cache-hits would mean that readahead is giving
* the desired benefit. If cache-hits do not occur there
* is no point in reading ahead of time - the system
* is loaded anyway.
*/
if (fsp->hqueue != NULL &&
hp->hs_prev_offset - off == PAGESIZE &&
hp->hs_prev_offset < filsiz &&
hp->hs_ra_bytes > 0 &&
!page_exists(vp, hp->hs_prev_offset)) {
(void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
addr + PAGESIZE, hp, fsp, xarsiz, bof,
chunk_lbn_count, chunk_data_bytes);
}
return (0);
}
if (pp != NULL) {
pvn_plist_init(pp, pl, plsz, off, io_len, rw);
}
return (err);
}
/*ARGSUSED*/
static int
hsfs_getpage(
struct vnode *vp,
offset_t off,
size_t len,
uint_t *protp,
struct page *pl[],
size_t plsz,
struct seg *seg,
caddr_t addr,
enum seg_rw rw,
struct cred *cred,
caller_context_t *ct)
{
uint_t filsiz;
struct hsfs *fsp;
struct hsnode *hp;
fsp = VFS_TO_HSFS(vp->v_vfsp);
hp = VTOH(vp);
/* does not support write */
if (rw == S_WRITE) {
return (EROFS);
}
if (vp->v_flag & VNOMAP) {
return (ENOSYS);
}
ASSERT(off <= HS_MAXFILEOFF);
/*
* Determine file data size for EOF check.
*/
filsiz = hp->hs_dirent.ext_size;
if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
return (EFAULT); /* beyond EOF */
/*
* Async Read-ahead computation.
* This attempts to detect sequential access pattern and
* enables reading extra pages ahead of time.
*/
if (fsp->hqueue != NULL) {
/*
* This check for sequential access also takes into
* account segmap weirdness when reading in chunks
* less than the segmap size of 8K.
*/
if (hp->hs_prev_offset == off || (off <
hp->hs_prev_offset && off + MAX(len, PAGESIZE)
>= hp->hs_prev_offset)) {
if (hp->hs_num_contig <
(seq_contig_requests - 1)) {
hp->hs_num_contig++;
} else {
/*
 * We increase the readahead quantum till
 * a predefined max. max_ra_bytes
 * is a multiple of PAGESIZE.
*/
if (hp->hs_ra_bytes <
fsp->hqueue->max_ra_bytes) {
hp->hs_ra_bytes += PAGESIZE;
}
}
} else {
/*
* Not contiguous so reduce read ahead counters.
*/
if (hp->hs_ra_bytes > 0)
hp->hs_ra_bytes -= PAGESIZE;
if (hp->hs_ra_bytes <= 0) {
hp->hs_ra_bytes = 0;
if (hp->hs_num_contig > 0)
hp->hs_num_contig--;
}
}
/*
 * Length must be rounded up to a page boundary,
 * since we read in units of pages.
*/
hp->hs_prev_offset = off + roundup(len, PAGESIZE);
DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp);
}
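/*
 * Example of the ramp-up above (sizes assumed, 8K PAGESIZE, so
 * max_ra_bytes is 64K per hsched_init()): the first contiguous
 * request (with the default seq_contig_requests of 2) only bumps
 * hs_num_contig; each further contiguous getpage call then grows
 * hs_ra_bytes by PAGESIZE until it reaches max_ra_bytes, and a
 * non-contiguous access shrinks it again by PAGESIZE per call.
 */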
if (protp != NULL)
*protp = PROT_ALL;
return (pvn_getpages(hsfs_getapage, vp, off, len, protp, pl, plsz,
seg, addr, rw, cred));
}
/*
* This function should never be called. We need to have it to pass
* it as an argument to other functions.
*/
/*ARGSUSED*/
int
hsfs_putapage(
vnode_t *vp,
page_t *pp,
u_offset_t *offp,
size_t *lenp,
int flags,
cred_t *cr)
{
/* should never happen - just destroy it */
cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page");
pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags);
return (0);
}
/*
* The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
* B_INVAL is set by:
*
* 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
* 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
* which translates to an MC_SYNC with the MS_INVALIDATE flag.
*
* The B_FREE (as well as the B_DONTNEED) flag is set when the
* MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
* from SEGVN to release pages behind a pagefault.
*/
/*ARGSUSED*/
static int
hsfs_putpage(
struct vnode *vp,
offset_t off,
size_t len,
int flags,
struct cred *cr,
caller_context_t *ct)
{
int error = 0;
if (vp->v_count == 0) {
panic("hsfs_putpage: bad v_count");
/*NOTREACHED*/
}
if (vp->v_flag & VNOMAP)
return (ENOSYS);
ASSERT(off <= HS_MAXFILEOFF);
if (!vn_has_cached_data(vp)) /* no pages mapped */
return (0);
if (len == 0) { /* from 'off' to EOF */
error = pvn_vplist_dirty(vp, off, hsfs_putapage, flags, cr);
} else {
offset_t end_off = off + len;
offset_t file_size = VTOH(vp)->hs_dirent.ext_size;
offset_t io_off;
file_size = (file_size + PAGESIZE - 1) & PAGEMASK;
if (end_off > file_size)
end_off = file_size;
for (io_off = off; io_off < end_off; io_off += PAGESIZE) {
page_t *pp;
/*
* We insist on getting the page only if we are
* about to invalidate, free or write it and
* the B_ASYNC flag is not set.
*/
if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
pp = page_lookup(vp, io_off,
(flags & (B_INVAL | B_FREE)) ?
SE_EXCL : SE_SHARED);
} else {
pp = page_lookup_nowait(vp, io_off,
(flags & B_FREE) ? SE_EXCL : SE_SHARED);
}
if (pp == NULL)
continue;
/*
* Normally pvn_getdirty() should return 0, which
 * implies that it has done the job for us.
* The shouldn't-happen scenario is when it returns 1.
* This means that the page has been modified and
* needs to be put back.
* Since we can't write on a CD, we fake a failed
* I/O and force pvn_write_done() to destroy the page.
*/
if (pvn_getdirty(pp, flags) == 1) {
cmn_err(CE_NOTE,
"hsfs_putpage: dirty HSFS page");
pvn_write_done(pp, flags |
B_ERROR | B_WRITE | B_INVAL | B_FORCE);
}
}
}
return (error);
}
/*ARGSUSED*/
static int
hsfs_map(
struct vnode *vp,
offset_t off,
struct as *as,
caddr_t *addrp,
size_t len,
uchar_t prot,
uchar_t maxprot,
uint_t flags,
struct cred *cred,
caller_context_t *ct)
{
struct segvn_crargs vn_a;
int error;
/* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
if (vp->v_flag & VNOMAP)
return (ENOSYS);
if ((prot & PROT_WRITE) && (flags & MAP_SHARED))
return (ENOSYS);
if (off > HS_MAXFILEOFF || off < 0 ||
(off + len) < 0 || (off + len) > HS_MAXFILEOFF)
return (ENXIO);
if (vp->v_type != VREG) {
return (ENODEV);
}
/*
* If file is being locked, disallow mapping.
*/
if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode))
return (EAGAIN);
as_rangelock(as);
error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
if (error != 0) {
as_rangeunlock(as);
return (error);
}
vn_a.vp = vp;
vn_a.offset = off;
vn_a.type = flags & MAP_TYPE;
vn_a.prot = prot;
vn_a.maxprot = maxprot;
vn_a.flags = flags & ~MAP_TYPE;
vn_a.cred = cred;
vn_a.amp = NULL;
vn_a.szc = 0;
vn_a.lgrp_mem_policy_flags = 0;
error = as_map(as, *addrp, len, segvn_create, &vn_a);
as_rangeunlock(as);
return (error);
}
/* ARGSUSED */
static int
hsfs_addmap(
struct vnode *vp,
offset_t off,
struct as *as,
caddr_t addr,
size_t len,
uchar_t prot,
uchar_t maxprot,
uint_t flags,
struct cred *cr,
caller_context_t *ct)
{
struct hsnode *hp;
if (vp->v_flag & VNOMAP)
return (ENOSYS);
hp = VTOH(vp);
mutex_enter(&hp->hs_contents_lock);
hp->hs_mapcnt += btopr(len);
mutex_exit(&hp->hs_contents_lock);
return (0);
}
/*ARGSUSED*/
static int
hsfs_delmap(
struct vnode *vp,
offset_t off,
struct as *as,
caddr_t addr,
size_t len,
uint_t prot,
uint_t maxprot,
uint_t flags,
struct cred *cr,
caller_context_t *ct)
{
struct hsnode *hp;
if (vp->v_flag & VNOMAP)
return (ENOSYS);
hp = VTOH(vp);
mutex_enter(&hp->hs_contents_lock);
hp->hs_mapcnt -= btopr(len); /* Count released mappings */
ASSERT(hp->hs_mapcnt >= 0);
mutex_exit(&hp->hs_contents_lock);
return (0);
}
/* ARGSUSED */
static int
hsfs_seek(
struct vnode *vp,
offset_t ooff,
offset_t *noffp,
caller_context_t *ct)
{
return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}
/* ARGSUSED */
static int
hsfs_frlock(
struct vnode *vp,
int cmd,
struct flock64 *bfp,
int flag,
offset_t offset,
struct flk_callback *flk_cbp,
cred_t *cr,
caller_context_t *ct)
{
struct hsnode *hp = VTOH(vp);
/*
* If the file is being mapped, disallow fs_frlock.
* We are not holding the hs_contents_lock while checking
* hs_mapcnt because the current locking strategy drops all
* locks before calling fs_frlock.
* So, hs_mapcnt could change before we enter fs_frlock making
* it meaningless to have held hs_contents_lock in the first place.
*/
if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode))
return (EAGAIN);
return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}
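/*
 * AVL comparators for the two views of the I/O queue: the deadline
 * tree orders requests by submission timestamp (then logical block
 * number), while the read tree orders them by logical block number
 * for the C-LOOK scan below. The final comparison on the node
 * addresses is only a tie-breaker, so that distinct hio structures
 * never compare equal when added to the same tree.
 */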
static int
hsched_deadline_compare(const void *x1, const void *x2)
{
const struct hio *h1 = x1;
const struct hio *h2 = x2;
if (h1->io_timestamp < h2->io_timestamp)
return (-1);
if (h1->io_timestamp > h2->io_timestamp)
return (1);
if (h1->io_lblkno < h2->io_lblkno)
return (-1);
if (h1->io_lblkno > h2->io_lblkno)
return (1);
if (h1 < h2)
return (-1);
if (h1 > h2)
return (1);
return (0);
}
static int
hsched_offset_compare(const void *x1, const void *x2)
{
const struct hio *h1 = x1;
const struct hio *h2 = x2;
if (h1->io_lblkno < h2->io_lblkno)
return (-1);
if (h1->io_lblkno > h2->io_lblkno)
return (1);
if (h1 < h2)
return (-1);
if (h1 > h2)
return (1);
return (0);
}
void
hsched_init_caches(void)
{
hio_cache = kmem_cache_create("hsfs_hio_cache",
sizeof (struct hio), 0, NULL,
NULL, NULL, NULL, NULL, 0);
hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
sizeof (struct hio_info), 0, NULL,
NULL, NULL, NULL, NULL, 0);
}
void
hsched_fini_caches(void)
{
kmem_cache_destroy(hio_cache);
kmem_cache_destroy(hio_info_cache);
}
/*
 * Initialize I/O scheduling structures. This is called via hsfs_mount.
*/
void
hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
{
struct hsfs_queue *hqueue = fsp->hqueue;
struct vnode *vp = fsp->hsfs_devvp;
/* TaskQ name of the form: hsched_task_ + stringof(int) */
char namebuf[23];
int error, err;
struct dk_cinfo info;
ldi_handle_t lh;
ldi_ident_t li;
/*
* Default maxtransfer = 16k chunk
*/
hqueue->dev_maxtransfer = 16384;
/*
* Try to fetch the maximum device transfer size. This is used to
* ensure that a coalesced block does not exceed the maxtransfer.
*/
err = ldi_ident_from_mod(modlinkage, &li);
if (err) {
cmn_err(CE_NOTE, "hsched_init: Querying device failed");
cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n",
err);
goto set_ra;
}
err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li);
ldi_ident_release(li);
if (err) {
cmn_err(CE_NOTE, "hsched_init: Querying device failed");
cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err);
goto set_ra;
}
error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL,
CRED(), &err);
err = ldi_close(lh, FREAD, CRED());
if (err) {
cmn_err(CE_NOTE, "hsched_init: Querying device failed");
cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err);
}
if (error == 0) {
hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer);
}
set_ra:
/*
* Max size of data to read ahead for sequential access pattern.
 * Conservative, to avoid letting the underlying CD drive spin
 * down in case the application is reading slowly.
 * We read ahead up to a max of 8 pages.
*/
hqueue->max_ra_bytes = PAGESIZE * 8;
mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
avl_create(&(hqueue->read_tree), hsched_offset_compare,
sizeof (struct hio), offsetof(struct hio, io_offset_node));
avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
sizeof (struct hio), offsetof(struct hio, io_deadline_node));
(void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);
hqueue->next = NULL;
hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
}
void
hsched_fini(struct hsfs_queue *hqueue)
{
if (hqueue != NULL) {
/*
* Remove the sentinel if there was one.
*/
if (hqueue->next != NULL) {
avl_remove(&hqueue->read_tree, hqueue->next);
kmem_cache_free(hio_cache, hqueue->next);
}
avl_destroy(&(hqueue->read_tree));
avl_destroy(&(hqueue->deadline_tree));
mutex_destroy(&(hqueue->hsfs_queue_lock));
mutex_destroy(&(hqueue->strategy_lock));
/*
* If there are any existing readahead threads running
* taskq_destroy will wait for them to finish.
*/
taskq_destroy(hqueue->ra_task);
kmem_free(hqueue->nbuf, sizeof (struct buf));
}
}
/*
* Determine if two I/O requests are adjacent to each other so
 * that they can be coalesced.
*/
#define IS_ADJACENT(io, nio) \
(((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
(io)->bp->b_edev == (nio)->bp->b_edev)
/*
* This performs the actual I/O scheduling logic. We use the Circular
* Look algorithm here. Sort the I/O requests in ascending order of
* logical block number and process them starting with the lowest
* numbered block and progressing towards higher block numbers in the
* queue. Once there are no more higher numbered blocks, start again
* with the lowest one. This is good for CD/DVD as you keep moving
* the head in one direction along the outward spiral track and avoid
* too many seeks as much as possible. The re-ordering also allows
* us to coalesce adjacent requests into one larger request.
* This is thus essentially a 1-way Elevator with front merging.
*
* In addition each read request here has a deadline and will be
* processed out of turn if the deadline (500ms) expires.
*
* This function is necessarily serialized via hqueue->strategy_lock.
* This function sits just below hsfs_getapage and processes all read
 * requests originating from that function.
*/
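/*
 * Illustrative example (block numbers assumed): if the read tree
 * holds pending requests at lblkno 10 and 40 and the sentinel left
 * by the previous pass sits at 25, the next invocation starts with
 * 40; once no higher-numbered request remains, the scan wraps
 * around to 10. Requests at lblkno 40 (4 blocks) and 44 on the same
 * device satisfy IS_ADJACENT and may be coalesced into one larger
 * transfer, subject to dev_maxtransfer.
 */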
int
hsched_invoke_strategy(struct hsfs *fsp)
{
struct hsfs_queue *hqueue;
struct buf *nbuf;
struct hio *fio, *nio, *tio, *prev, *last;
size_t bsize, soffset, offset, data;
int bioret, bufcount;
struct vnode *fvp;
ksema_t *io_done;
caddr_t iodata;
hqueue = fsp->hqueue;
mutex_enter(&hqueue->strategy_lock);
mutex_enter(&hqueue->hsfs_queue_lock);
/*
* Check for Deadline expiration first
*/
fio = avl_first(&hqueue->deadline_tree);
/*
* Paranoid check for empty I/O queue. Both deadline
* and read trees contain same data sorted in different
* ways. So empty deadline tree = empty read tree.
*/
if (fio == NULL) {
/*
* Remove the sentinel if there was one.
*/
if (hqueue->next != NULL) {
avl_remove(&hqueue->read_tree, hqueue->next);
kmem_cache_free(hio_cache, hqueue->next);
hqueue->next = NULL;
}
mutex_exit(&hqueue->hsfs_queue_lock);
mutex_exit(&hqueue->strategy_lock);
return (1);
}
if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp
< HSFS_READ_DEADLINE) {
/*
* Apply standard scheduling logic. This uses the
* C-LOOK approach. Process I/O requests in ascending
* order of logical block address till no subsequent
* higher numbered block request remains. Then start
* again from the lowest numbered block in the queue.
*
* We do this cheaply here by means of a sentinel.
* The last processed I/O structure from the previous
 * invocation of this function is left dangling in the
* read_tree so that we can easily scan to the next
* higher numbered request and remove the sentinel.
*/
fio = NULL;
if (hqueue->next != NULL) {
fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
avl_remove(&hqueue->read_tree, hqueue->next);
kmem_cache_free(hio_cache, hqueue->next);
hqueue->next = NULL;
}
if (fio == NULL) {
fio = avl_first(&hqueue->read_tree);
}
} else if (hqueue->next != NULL) {
DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio);
avl_remove(&hqueue->read_tree, hqueue->next);
kmem_cache_free(hio_cache, hqueue->next);
hqueue->next = NULL;
}
/*
* In addition we try to coalesce contiguous
* requests into one bigger request.
*/
bufcount = 1;
bsize = ldbtob(fio->nblocks);
fvp = fio->bp->b_file;
nio = AVL_NEXT(&hqueue->read_tree, fio);
tio = fio;
while (nio != NULL && IS_ADJACENT(tio, nio) &&
bsize < hqueue->dev_maxtransfer) {
avl_remove(&hqueue->deadline_tree, tio);
avl_remove(&hqueue->read_tree, tio);
tio->contig_chain = nio;
bsize += ldbtob(nio->nblocks);
prev = tio;
tio = nio;
/*
* This check is required to detect the case where
* we are merging adjacent buffers belonging to
* different files. fvp is used to set the b_file
* parameter in the coalesced buf. b_file is used
 * by DTrace, so we do not want requests to two
 * different files to be accrued to any one file.
*/
if (fvp && tio->bp->b_file != fvp) {
fvp = NULL;
}
nio = AVL_NEXT(&hqueue->read_tree, nio);
bufcount++;
}
/*
* tio is not removed from the read_tree as it serves as a sentinel
* to cheaply allow us to scan to the next higher numbered I/O
* request.
*/
hqueue->next = tio;
avl_remove(&hqueue->deadline_tree, tio);
mutex_exit(&hqueue->hsfs_queue_lock);
DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount,
size_t, bsize);
/*
 * The benefit of coalescing occurs if the savings in I/O outweigh
 * the cost of doing the additional work below.
* It was observed that coalescing 2 buffers results in diminishing
* returns, so we do coalescing if we have >2 adjacent bufs.
*/
if (bufcount > hsched_coalesce_min) {
/*
* We have coalesced blocks. First allocate mem and buf for
* the entire coalesced chunk.
* Since we are guaranteed single-threaded here we pre-allocate
* one buf at mount time and that is re-used every time. This
 * is a synthesized buf structure that uses a kmem_alloc'ed chunk.
* Not quite a normal buf attached to pages.
*/
fsp->coalesced_bytes += bsize;
nbuf = hqueue->nbuf;
bioinit(nbuf);
nbuf->b_edev = fio->bp->b_edev;
nbuf->b_dev = fio->bp->b_dev;
nbuf->b_flags = fio->bp->b_flags;
nbuf->b_iodone = fio->bp->b_iodone;
iodata = kmem_alloc(bsize, KM_SLEEP);
nbuf->b_un.b_addr = iodata;
nbuf->b_lblkno = fio->bp->b_lblkno;
nbuf->b_vp = fvp;
nbuf->b_file = fvp;
nbuf->b_bcount = bsize;
nbuf->b_bufsize = bsize;
DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int,
bufcount, size_t, bsize);
/*
* Perform I/O for the coalesced block.
*/
(void) bdev_strategy(nbuf);
/*
* Duplicate the last IO node to leave the sentinel alone.
* The sentinel is freed in the next invocation of this
* function.
*/
prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP);
prev->contig_chain->bp = tio->bp;
prev->contig_chain->sema = tio->sema;
tio = prev->contig_chain;
tio->contig_chain = NULL;
soffset = ldbtob(fio->bp->b_lblkno);
nio = fio;
bioret = biowait(nbuf);
data = bsize - nbuf->b_resid;
biofini(nbuf);
mutex_exit(&hqueue->strategy_lock);
/*
* We use the b_resid parameter to detect how much
 * data was successfully transferred. We will signal
* a success to all the fully retrieved actual bufs
* before coalescing, rest is signaled as error,
* if any.
*/
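/*
 * Worked example (sizes assumed): if three 8K bufs were coalesced
 * (bsize = 24K) and the device returned b_resid = 8K, then
 * data = 16K; the first two original bufs are copied out and
 * signalled as successful, while the third is marked with the
 * coalesced I/O's error and its own b_resid of 8K.
 */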
tio = nio;
DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio,
int, bioret, size_t, data);
/*
* Copy data and signal success to all the bufs
* which can be fully satisfied from b_resid.
*/
while (nio != NULL && data >= nio->bp->b_bcount) {
offset = ldbtob(nio->bp->b_lblkno) - soffset;
bcopy(iodata + offset, nio->bp->b_un.b_addr,
nio->bp->b_bcount);
data -= nio->bp->b_bcount;
bioerror(nio->bp, 0);
biodone(nio->bp);
sema_v(nio->sema);
tio = nio;
nio = nio->contig_chain;
kmem_cache_free(hio_cache, tio);
}
/*
* Signal error to all the leftover bufs (if any)
* after b_resid data is exhausted.
*/
while (nio != NULL) {
nio->bp->b_resid = nio->bp->b_bcount - data;
bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid);
bioerror(nio->bp, bioret);
biodone(nio->bp);
sema_v(nio->sema);
tio = nio;
nio = nio->contig_chain;
kmem_cache_free(hio_cache, tio);
data = 0;
}
kmem_free(iodata, bsize);
} else {
nbuf = tio->bp;
io_done = tio->sema;
nio = fio;
last = tio;
while (nio != NULL) {
(void) bdev_strategy(nio->bp);
nio = nio->contig_chain;
}
nio = fio;
mutex_exit(&hqueue->strategy_lock);
while (nio != NULL) {
if (nio == last) {
(void) biowait(nbuf);
sema_v(io_done);
break;
/* sentinel last not freed. See above. */
} else {
(void) biowait(nio->bp);
sema_v(nio->sema);
}
tio = nio;
nio = nio->contig_chain;
kmem_cache_free(hio_cache, tio);
}
}
return (0);
}
/*
* Insert an I/O request in the I/O scheduler's pipeline
* Using AVL tree makes it easy to reorder the I/O request
* based on logical block number.
*/
static void
hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
{
struct hsfs_queue *hqueue = fsp->hqueue;
mutex_enter(&hqueue->hsfs_queue_lock);
fsp->physical_read_bytes += hsio->bp->b_bcount;
if (ra)
fsp->readahead_bytes += hsio->bp->b_bcount;
avl_add(&hqueue->deadline_tree, hsio);
avl_add(&hqueue->read_tree, hsio);
DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
struct hsfs_queue *, hqueue, int, ra);
mutex_exit(&hqueue->hsfs_queue_lock);
}
/* ARGSUSED */
static int
hsfs_pathconf(struct vnode *vp,
int cmd,
ulong_t *valp,
struct cred *cr,
caller_context_t *ct)
{
struct hsfs *fsp;
int error = 0;
switch (cmd) {
case _PC_NAME_MAX:
fsp = VFS_TO_HSFS(vp->v_vfsp);
*valp = fsp->hsfs_namemax;
break;
case _PC_FILESIZEBITS:
*valp = 33; /* Without multi extent support: 4 GB - 2k */
break;
case _PC_TIMESTAMP_RESOLUTION:
/*
* HSFS keeps, at best, 1/100 second timestamp resolution.
*/
*valp = 10000000L;
break;
default:
error = fs_pathconf(vp, cmd, valp, cr, ct);
break;
}
return (error);
}
const fs_operation_def_t hsfs_vnodeops_template[] = {
VOPNAME_OPEN, { .vop_open = hsfs_open },
VOPNAME_CLOSE, { .vop_close = hsfs_close },
VOPNAME_READ, { .vop_read = hsfs_read },
VOPNAME_GETATTR, { .vop_getattr = hsfs_getattr },
VOPNAME_ACCESS, { .vop_access = hsfs_access },
VOPNAME_LOOKUP, { .vop_lookup = hsfs_lookup },
VOPNAME_READDIR, { .vop_readdir = hsfs_readdir },
VOPNAME_READLINK, { .vop_readlink = hsfs_readlink },
VOPNAME_FSYNC, { .vop_fsync = hsfs_fsync },
VOPNAME_INACTIVE, { .vop_inactive = hsfs_inactive },
VOPNAME_FID, { .vop_fid = hsfs_fid },
VOPNAME_SEEK, { .vop_seek = hsfs_seek },
VOPNAME_FRLOCK, { .vop_frlock = hsfs_frlock },
VOPNAME_GETPAGE, { .vop_getpage = hsfs_getpage },
VOPNAME_PUTPAGE, { .vop_putpage = hsfs_putpage },
VOPNAME_MAP, { .vop_map = hsfs_map },
VOPNAME_ADDMAP, { .vop_addmap = hsfs_addmap },
VOPNAME_DELMAP, { .vop_delmap = hsfs_delmap },
VOPNAME_PATHCONF, { .vop_pathconf = hsfs_pathconf },
NULL, NULL
};
struct vnodeops *hsfs_vnodeops;