hsfs_vnops.c revision 84b82766376a981b4beff87bdba0efa9e2aa7a39
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Vnode operations for the High Sierra filesystem
*/
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs_opreg.h>
#include <sys/pathname.h>
#include <vm/seg_kmem.h>
/*
* For struct modlinkage
*/
/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;
/*
* This is the max number of taskq threads that will be created
* if required. Since we are using a Dynamic TaskQ, by default only
* one thread is created initially.
*
* NOTE: In the usual hsfs use case, this per-fs-instance number
* of taskq threads should not place any undue load on a system.
* Even on an unusual system with, say, 100 CDROM drives, 800 threads
* will not be created unless all the drives are loaded and all
* of them are saturated with I/O at the same time! If there is at
* all a complaint of system load due to such an unusual case, it
* should be easy enough to change to one per-machine Dynamic TaskQ
* for all hsfs mounts with an nthreads of, say, 32.
*/
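/*
* A minimal sketch (an assumption, not part of the original file) of
* the per-machine alternative suggested above: one dynamic taskq shared
* by all hsfs mounts. The name, thread count, and <sys/taskq.h> usage
* are illustrative only.
*/
static taskq_t *hsfs_shared_taskq;

static void
hsfs_shared_taskq_init(void)
{
	/* TASKQ_DYNAMIC: worker threads are created on demand, up to 32 */
	hsfs_shared_taskq = taskq_create("hsfs_shared_taskq", 32,
	    minclsyspri, 1, 32, TASKQ_DYNAMIC);
}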
/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;
/*
* Kmem caches for heavily used small allocations. Using these kmem
* caches provides a factor of 3 reduction in system time and greatly
* aids overall throughput esp. on SPARC.
*/
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;
/*
* This tunable allows us to ignore inode numbers from rrip-1.12.
* In this case, we fall back to our default inode algorithm.
*/
extern int use_rrip_inodes;
/*
* Free behind logic from UFS to tame our thirst for
* the page cache.
* See the UFS sources for a more detailed explanation.
*/
static int freebehind = 1;
static int smallfile = 0;
static int cache_read_ahead = 0;
#define SMALLFILE1_D 1000
#define SMALLFILE2_D 10
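/*
* Sketch (an assumption modeled on the UFS free-behind logic cited
* above) of how the divisors above could turn per-CPU free memory into
* small-file thresholds; files below such a threshold are not worth
* freeing behind. The variable names are illustrative.
*/
static uint64_t smallfile1_sketch, smallfile2_sketch;

static void
hsfs_smallfile_update_sketch(void)
{
	/* free memory per online CPU, in bytes */
	uint64_t percpufreeb = ptob((uint64_t)freemem) / ncpus_online;

	smallfile1_sketch = percpufreeb / SMALLFILE1_D;	/* 0.1% of it */
	smallfile2_sketch = percpufreeb / SMALLFILE2_D;	/* 10% of it */
}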
/* ARGSUSED */
static int
{
return (0);
}
/*ARGSUSED*/
static int
struct caller_context *ct)
{
int error;
int dofree;
/*
* if vp is of type VDIR, make sure dirent
* is filled up with all info (because of ptbl)
*/
}
/* Sanity checks. */
return (0);
do {
/*
* We want to ask for only the "right" amount of data.
* In this case that means:
*
* We can't get data from beyond our EOF. If asked,
* we will give a short read.
*
* segmap_getmapflt returns buffers of MAXBSIZE bytes.
* These buffers are always MAXBSIZE aligned.
* If our starting offset is not MAXBSIZE aligned,
* we can only ask for less than MAXBSIZE bytes.
*
* If our requested offset and length are such that
* they belong in different MAXBSIZE aligned slots
* then we'll be making more than one call on
* segmap_getmapflt.
*
* This diagram shows the variables we use and their
* relationships.
*
* |<-----MAXBSIZE----->|
* +--------------------------...+
* |.....mapon->|<--n-->|....*...|EOF
* +--------------------------...+
* uio_loffset->|
* uio_resid....|<---------->|
* diff.........|<-------------->|
*
* So, in this case our offset is not aligned
* and our request takes us outside of the
* MAXBSIZE window. We will break this up into
* two segmap_getmapflt calls.
*/
size_t n;
if (n <= 0) {
/* EOF or request satisfied. */
return (0);
}
/*
* Freebehind computation taken from the UFS sources.
*/
+ 1000000;
}
dofree = freebehind &&
hp->hs_ra_bytes > 0;
if (error == 0) {
/*
* If we read a whole block, or read to EOF, we
* won't need this buffer again soon.
*/
flags = SM_DONTNEED;
else
flags = 0;
if (dofree) {
if ((cache_read_ahead == 0) &&
flags |= SM_DONTNEED;
}
} else
return (error);
}
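/*
* A minimal sketch (an assumption, not the original code) of the
* read-size clamping that the diagram in hsfs_read above describes:
* never cross a MAXBSIZE slot, never read past uio_resid, and give a
* short read at EOF. All names here are hypothetical.
*/
static ssize_t
hsfs_clamp_read_sketch(offset_t loffset, ssize_t resid, offset_t filesize)
{
	ulong_t mapon = loffset & MAXBOFFSET;	/* offset within the slot */
	offset_t diff = filesize - loffset;	/* bytes left to EOF */
	ssize_t n;

	if (diff <= 0)
		return (0);			/* at or beyond EOF */
	n = MIN((ssize_t)(MAXBSIZE - mapon), resid);
	if (diff < (offset_t)n)
		n = (ssize_t)diff;		/* short read at EOF */
	return (n);
}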
/*ARGSUSED2*/
static int
int flags,
{
}
else
/* no. of blocks = no. of data blocks + no. of xar blocks */
return (0);
}
/*ARGSUSED*/
static int
{
return (EINVAL);
return (ENOENT);
}
/*ARGSUSED*/
static void
{
int nopage;
/*
* Note: acquiring and holding v_lock for quite a while
* here serializes on the vnode; this is unfortunate, but
* likely not to overly impact performance, as the underlying
* device (CDROM drive) is quite slow.
*/
panic("hsfs_inactive: v_count < 1");
/*NOTREACHED*/
}
return;
}
/*
* Free the hsnode.
* If there are no pages associated with the
* hsnode, give it back to the kmem_cache,
* else put at the end of this file system's
* internal free list.
*/
/*
* exit these locks now, since hs_freenode may
* kmem_free the hsnode and embedded vnode
*/
} else {
}
}
/*ARGSUSED*/
static int
char *nm,
int flags,
{
int error;
if (*nm == '\0') {
return (0);
}
/*
* If we're looking for ourself, life is simple.
*/
return (error);
return (0);
}
}
/*ARGSUSED*/
static int
int *eofp)
{
struct hs_direntry hd;
int error;
int hdlen; /* length of hs directory entry */
long ndlen; /* length of dirent entry */
int bytes_wanted;
char *outbuf; /* ptr to dirent buffer */
char *dname;
int dnamelen;
if (eofp)
*eofp = 1;
return (0);
}
if (error)
goto done;
while (offset < last_offset) {
/*
* Very similar validation code is found in
* process_dirblock(), hsfs_node.c.
* For an explanation, see there.
* It may make sense for the future to
* "consolidate" the code in hs_parsedir(),
* process_dirblock() and hsfs_readdir() into
* a single utility function.
*/
if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
/*
* advance to next sector boundary
*/
if (hdlen)
continue;
}
/*
* Just ignore invalid directory entries.
* XXX - maybe hs_parsedir() will detect EXISTENCE bit
*/
/*
* Determine if there is enough room
*/
goto done; /* output buffer full */
}
/*
* If the media carries rrip-v1.12 or newer,
* and we trust the inodes from the rrip data
* (use_rrip_inodes != 0), use that data. If the
* media has been created by a recent mkisofs
* version, we may trust inode numbers derived from
* the starting extent number; otherwise, we cannot
* do this for zero sized files and symlinks,
* because if we did we'd end up mapping all of
* them to the same node. We use HS_DUMMY_INO
* in this case and make sure that we will not
* map all files to the same meta data.
*/
} else {
}
/* strncpy(9f) will zero uninitialized bytes */
/*
* free up space allocated for symlink
*/
}
}
}
}
/*
* Got here for one of the following reasons:
* 1) outbuf is full (error == 0)
* 2) end of directory reached (error == 0)
* 3) error reading directory sector (error != 0)
* 4) directory entry crosses sector boundary (error == 0)
*
* If any directory entries have been copied, don't report
* case 4. Instead, return the valid directory entries.
*
* If no entries have been copied, report the error.
* If case 4, this will be indistinguishable from EOF.
*/
done:
if (ndlen != 0) {
}
return (error);
}
static int
{
return (ENOSPC);
}
return (0);
}
/*ARGSUSED*/
static int
{
return (0);
}
/*ARGSUSED*/
static int
int flag,
int count,
{
return (0);
}
/*ARGSUSED2*/
static int
{
}
/*
* The seek time of a CD-ROM is very long, and the data transfer
* rate is even worse (max. 150K per sec). The design
* decision is to reduce access to the CD-ROM as much as possible,
* and to transfer a sizable block (read-ahead) of data at a time.
* The UFS style of reading ahead one block at a time is not
* appropriate, and is not supported.
*/
/*
* KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
*/
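/*
* Purely as an illustration of the constraint above (an assumption;
* the actual definition is not reproduced here), a value such as
*
*	#define	KLUSTSIZE	(56 * 1024)
*
* would qualify: 56K is a multiple of both 4K and 8K pages and does
* not exceed the traditional 56K MAXPHYS.
*/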
/* we don't support read ahead */
int hsfs_lostpage; /* no. of times we lost original page */
/*
* Used to prevent biodone() from releasing buf resources that
* we didn't allocate in quite the usual way.
*/
/*ARGSUSED*/
int
{
return (0);
}
/*
* The taskq thread invokes the scheduling function to ensure
* that all readaheads are complete, then cleans up the associated
* memory and releases the page locks.
*/
void
hsfs_ra_task(void *arg)
{
break;
}
}
}
}
}
}
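/*
* A minimal sketch (an assumption; the body above is abbreviated) of
* the per-buf wait pattern such a taskq thread can use: keep driving
* the scheduler until this buf's semaphore is posted, and block only
* once the queue is empty, i.e. another thread has already issued our
* buf. The parameter names are hypothetical.
*/
static void
hsfs_ra_wait_sketch(struct hsfs *fsp, ksema_t *sema)
{
	while (sema_tryp(sema) == 0) {
		if (hsched_invoke_strategy(fsp)) {
			/* queue empty: our buf is in flight; just block */
			sema_p(sema);
			break;
		}
	}
	sema_destroy(sema);
}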
/*
* Submit asynchronous readahead requests to the I/O scheduler
* depending on the number of pages to read ahead. These requests
* are asynchronous to the calling thread but I/O requests issued
* subsequently by other threads with higher LBNs must wait for
* these readaheads to complete since we have a single ordered
* I/O pipeline. Thus these readaheads are semi-asynchronous.
* A TaskQ handles waiting for the readaheads to complete.
*
* This function is mostly a copy of hsfs_getapage but somewhat
* simpler. A readahead request is aborted if page allocation
* fails.
*/
/*ARGSUSED*/
static int
int xarsiz,
int chunk_lbn_count,
int chunk_data_bytes)
{
int remainder; /* must be signed */
return (-1);
}
/* file data size */
return (0);
extension = 0;
/*
* Some cd writers don't write sectors that aren't used. Also,
* there's no point in reading sectors we'll never look at. So,
* if we're asked to go beyond the end of a file, truncate to the
* length of that file.
*
* Additionally, this behaviour is required by section 6.4.5 of
* ISO 9660:1988(E).
*/
/* A little paranoia */
if (len <= 0)
return (-1);
/*
* After all that, make sure we're asking for things in units
* that bdev_strategy() will understand (see bug 4202551).
*/
hp->hs_num_contig = 0;
hp->hs_ra_bytes = 0;
hp->hs_prev_offset = 0;
return (-1);
}
/* check for truncation */
/*
* xxx Clean up and return EIO instead?
* xxx Ought to go to u_offset_t for everything, but we
* xxx call lots of things that want uint_t arguments.
*/
/*
* get enough buffers for worst-case scenario
* (i.e., no coalescing possible).
*/
/*
* Allocate an array of semaphores since we are doing I/O
* scheduling.
*/
/*
* If our filesize is not an integer multiple of PAGESIZE,
* we zero that part of the last page that's between EOF and
* the PAGESIZE boundary.
*/
if (xlen != 0)
count++) {
/* Compute disk address for interleaving. */
/* considered without skips */
/* factor in skips */
/* convert to physical byte offset for lbn */
/* don't forget offset into lbn */
/* get virtual block number for driver */
+ offset_bytes + offset_extra);
/* this branch taken first time through loop */
(caddr_t)-1);
/* ppmapin() guarantees not to return NULL */
} else {
}
/*
* We specifically use the b_lblkno member here
* as even in the 32 bit world driver_block can
* get very large in line with the ISO9660 spec.
*/
- byte_offset;
/*
* remaining_bytes can't be zero, as we derived
* which_chunk_lbn directly from byte_offset.
*/
/* coalesce-read the rest of the chunk */
} else {
/* get the final bits */
}
}
break;
}
/*
* We are scheduling I/O so we need to enqueue
* requests rather than calling bdev_strategy
* here. A later invocation of the scheduling
* function will take care of doing the actual
* I/O as it selects requests from the queue as
* per the scheduling logic.
*/
KM_SLEEP);
SEMA_DEFAULT, NULL);
/* used for deadline */
/* for I/O coalescing */
}
}
/*
* The I/O locked pages are unlocked in our taskq thread.
*/
return (0);
}
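/*
* Sketch (an assumption, not the original code) of how the bookkeeping
* for a batch of readahead bufs might be handed off so that
* hsfs_ra_task() can wait for the I/O and unlock the pages. The
* "ra_task" taskq and the hio_info contents are hypothetical names.
*/
static void
hsfs_ra_dispatch_sketch(taskq_t *ra_task, struct hio_info *info)
{
	(void) taskq_dispatch(ra_task, hsfs_ra_task, info, TQ_SLEEP);
}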
/*
* Each file may have a different interleaving on disk. This makes
* things somewhat interesting. The gist is that there are some
* number of contiguous data sectors, followed by some other number
* of contiguous skip sectors. The sum of those two sets of sectors
* defines the interleave size. Unfortunately, it means that we generally
* can't simply read N sectors starting at a given offset to satisfy
* any given request.
*
* What we do is get the relevant memory pages via pvn_read_kluster(),
* then stride through the interleaves, setting up a buf for each
* sector that needs to be brought in. Instead of kmem_alloc'ing
* space for the sectors, though, we just point at the appropriate
* spot in the relevant page for each of them. This saves us a bunch
* of copying.
*
* NOTICE: The code below in hsfs_getapage is mostly the same as the code
* in hsfs_getpage_ra above (with some omissions). If you are
* making any change to this function, please also look at
* hsfs_getpage_ra.
*/
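/*
* A compact sketch (an assumption; the loop bodies above and below are
* abbreviated) of the per-buf disk address arithmetic that the
* "Compute disk address for interleaving" comments refer to.
* LBN_TO_BYTE and lbtodb are the conversions hsfs uses elsewhere; the
* parameter names are illustrative. "bof" is the byte offset of the
* start of the file on disk.
*/
static daddr_t
hsfs_interleave_block_sketch(struct vfs *vfsp, uint_t byte_offset,
    u_offset_t bof, int xarsiz, int chunk_lbn_count, int chunk_data_bytes)
{
	uint_t which_chunk_lbn, offset_lbn;
	u_offset_t offset_bytes, offset_extra;

	/* which interleave chunk, considered without skips */
	which_chunk_lbn = byte_offset / chunk_data_bytes;

	/* factor in skips to find the chunk's on-disk lbn */
	offset_lbn = which_chunk_lbn * chunk_lbn_count;

	/* convert that lbn to a physical byte offset */
	offset_bytes = LBN_TO_BYTE(offset_lbn, vfsp);

	/* don't forget the offset into the lbn itself */
	offset_extra = byte_offset % chunk_data_bytes;

	/* hand the driver a virtual block number */
	return ((daddr_t)lbtodb(bof + xarsiz + offset_bytes + offset_extra));
}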
/*ARGSUSED*/
static int
{
int err;
int remainder; /* must be signed */
int chunk_lbn_count;
int chunk_data_bytes;
int xarsiz;
int calcdone;
/*
* We don't support asynchronous operation at the moment, so
* just pretend we did it. If the pages are ever actually
* needed, they'll get brought in then.
*/
return (0);
/* file data size */
/* disk addr for start of file */
/* xarsiz byte must be skipped for data */
/* how many logical blocks in an interleave (data+skip) */
if (chunk_lbn_count == 0) {
chunk_lbn_count = 1;
}
/*
* Convert interleaving size into bytes. The zero case
* (no interleaving) optimization is handled as a side-
* effect of the read-ahead logic.
*/
/*
* Optimization: If our pagesize is a multiple of LBN
* bytes, we can avoid breaking up a page into individual
* lbn-sized requests.
*/
if (PAGESIZE % chunk_data_bytes == 0) {
}
} else {
}
err = 0;
pagefound = 0;
calcdone = 0;
/*
* Do some read-ahead. This mostly saves us a bit of
* system cpu time more than anything else when doing
* sequential reads. At some point, could do the
* read-ahead asynchronously which might gain us something
* on wall time, but it seems unlikely....
*
* We do the easy case here, which is to read through
* the end of the chunk, minus whatever's at the end that
* won't exactly fill a page.
*/
} else {
}
/* search for page in buffer */
/*
* Need to really do disk IO to get the page.
*/
if (!calcdone) {
/*
* Some cd writers don't write sectors that aren't
* used. Also, there's no point in reading sectors
* we'll never look at. So, if we're asked to go
* beyond the end of a file, truncate to the length
* of that file.
*
* Additionally, this behaviour is required by section
* 6.4.5 of ISO 9660:1988(E).
*/
/* A little paranoia. */
/*
* After all that, make sure we're asking for things
* in units that bdev_strategy() will understand
* (see bug 4202551).
*/
calcdone = 1;
}
/*
* Pressure on memory, roll back readahead
*/
hp->hs_num_contig = 0;
hp->hs_ra_bytes = 0;
hp->hs_prev_offset = 0;
goto again;
}
/* check for truncation */
/*
* xxx Clean up and return EIO instead?
* xxx Ought to go to u_offset_t for everything, but we
* xxx call lots of things that want uint_t arguments.
*/
/*
* get enough buffers for worst-case scenario
* (i.e., no coalescing possible).
*/
/*
* Allocate an array of semaphores if we are doing I/O
* scheduling.
*/
KM_SLEEP);
}
/*
* If our filesize is not an integer multiple of PAGESIZE,
* we zero that part of the last page that's between EOF and
* the PAGESIZE boundary.
*/
if (xlen != 0)
/* Compute disk address for interleaving. */
/* considered without skips */
/* factor in skips */
/* convert to physical byte offset for lbn */
/* don't forget offset into lbn */
/* get virtual block number for driver */
/* this branch taken first time through loop */
/* ppmapin() guarantees not to return NULL */
} else {
}
/*
* We specifically use the b_lblkno member here
* as even in the 32 bit world driver_block can
* get very large in line with the ISO9660 spec.
*/
- byte_offset;
/*
* remaining_bytes can't be zero, as we derived
* which_chunk_lbn directly from byte_offset.
*/
/* coalesce-read the rest of the chunk */
} else {
/* get the final bits */
}
/*
* It would be nice to do multiple pages'
* worth at once here when the opportunity
* arises, as that has been shown to improve
* our wall time. However, to do that
* requires that we use the pageio subsystem,
* which doesn't mix well with what we're
* already using here. We can't use pageio
* all the time, because that subsystem
* assumes that a page is stored in N
* contiguous blocks on the device.
* Interleaving violates that assumption.
*
* Update: This is now not so big a problem
* because of the I/O scheduler sitting below
* that can re-order and coalesce I/O requests.
*/
}
break;
}
} else {
/*
* We are scheduling I/O so we need to enqueue
* requests rather than calling bdev_strategy
* here. A later invocation of the scheduling
* function will take care of doing the actual
* I/O as it selects requests from the queue as
* per the scheduling logic.
*/
KM_SLEEP);
SEMA_DEFAULT, NULL);
/* used for deadline */
hsio->io_timestamp =
/* for I/O coalescing */
}
}
}
/* Now wait for everything to come in */
if (err == 0) {
} else
}
} else {
/*
* Invoke the scheduling function till our buf
* is processed. In doing this it might
* process bufs enqueued by other threads,
* which is good.
*/
/*
* hsched_invoke_strategy will return 1
* if the I/O queue is empty. This means
* that there is another thread that has
* issued our buf and is waiting. So we
* just block instead of spinning.
*/
if (hsched_invoke_strategy(fsp)) {
break;
}
}
if (err == 0) {
}
}
}
/* Don't leak resources */
}
}
}
if (err) {
return (err);
}
/*
* Lock the requested page, and the one after it if possible.
* Don't bother if our caller hasn't given us a place to stash
* the page pointers, since otherwise we'd lock pages that would
* never get unlocked.
*/
if (pagefound) {
int index;
/*
* Make sure it's in memory before we say it's here.
*/
goto reread;
}
index = 1;
/*
* Try to lock the next page, if it exists, without
* blocking.
*/
/* LINTED (plsz is unsigned) */
break;
}
/*
* Schedule a semi-asynchronous readahead if we are
* accessing the last cached page for the current
* file.
*
* Doing this here means that readaheads will be
* issued only if cache-hits occur. This is an advantage
* since cache-hits would mean that readahead is giving
* the desired benefit. If cache-hits do not occur there
* is no point in reading ahead of time - the system
* is loaded anyway.
*/
hp->hs_ra_bytes > 0 &&
}
return (0);
}
}
return (err);
}
static int
{
int err;
/* does not support write */
panic("write attempt on READ ONLY HSFS");
/*NOTREACHED*/
}
return (ENOSYS);
}
/*
* Determine file data size for EOF check.
*/
return (EFAULT); /* beyond EOF */
/*
* Async Read-ahead computation.
* This attempts to detect sequential access pattern and
* enables reading extra pages ahead of time.
*/
/*
* This check for sequential access also takes into
* account segmap weirdness when reading in chunks
* less than the segmap size of 8K.
*/
>= hp->hs_prev_offset)) {
if (hp->hs_num_contig <
(seq_contig_requests - 1)) {
hp->hs_num_contig++;
} else {
/*
* We increase readahead quantum till
* a predefined max. max_readahead_bytes
* is a multiple of PAGESIZE.
*/
if (hp->hs_ra_bytes <
}
}
} else {
/*
* Not contiguous, so reduce the read-ahead counters.
*/
if (hp->hs_ra_bytes > 0)
if (hp->hs_ra_bytes <= 0) {
hp->hs_ra_bytes = 0;
if (hp->hs_num_contig > 0)
hp->hs_num_contig--;
}
}
/*
* Length must be rounded up to a page boundary,
* since we read in units of pages.
*/
}
else
return (err);
}
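/*
* Sketch (an assumption, not the original code) of the readahead
* quantum adjustment described above: grow by one page per detected
* sequential access up to a per-fs maximum, and back off on random
* access. Names mirror the hsnode fields used above; "max_ra_bytes"
* stands in for the per-mount limit.
*/
static void
hsfs_ra_adjust_sketch(struct hsnode *hp, int is_sequential, int max_ra_bytes)
{
	if (is_sequential) {
		if (hp->hs_ra_bytes < max_ra_bytes)
			hp->hs_ra_bytes += PAGESIZE;	/* grow the quantum */
	} else if (hp->hs_ra_bytes > 0) {
		hp->hs_ra_bytes -= PAGESIZE;		/* back off */
		if (hp->hs_ra_bytes <= 0) {
			hp->hs_ra_bytes = 0;
			if (hp->hs_num_contig > 0)
				hp->hs_num_contig--;
		}
	}
}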
/*
* This function should never be called. We need to have it to pass
* it as an argument to other functions.
*/
/*ARGSUSED*/
int
int flags,
{
/* should never happen - just destroy it */
return (0);
}
/*
* The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
* B_INVAL is set by:
*
* 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
* 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
* which translates to an MC_SYNC with the MS_INVALIDATE flag.
*
* The B_FREE (as well as the B_DONTNEED) flag is set when the
* MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
* from SEGVN to release pages behind a pagefault.
*/
/*ARGSUSED*/
static int
int flags,
{
int error = 0;
panic("hsfs_putpage: bad v_count");
/*NOTREACHED*/
}
return (ENOSYS);
return (0);
if (len == 0) { /* from 'off' to EOF */
} else {
/*
* We insist on getting the page only if we are
* about to invalidate, free or write it and
* the B_ASYNC flag is not set.
*/
} else {
}
continue;
/*
* Normally pvn_getdirty() should return 0, which
* implies that it has done the job for us.
* The shouldn't-happen scenario is when it returns 1.
* This means that the page has been modified and
* needs to be put back.
* Since we can't write on a CD, we fake a failed
* I/O and force pvn_write_done() to destroy the page.
*/
"hsfs_putpage: dirty HSFS page");
}
}
}
return (error);
}
/*ARGSUSED*/
static int
{
struct segvn_crargs vn_a;
int error;
/* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
return (ENOSYS);
return (ENXIO);
return (ENODEV);
}
/*
* If file is being locked, disallow mapping.
*/
return (EAGAIN);
return (ENOMEM);
}
} else {
/*
* User specified address - blow away any previous mappings
*/
}
return (error);
}
/* ARGSUSED */
static int
{
return (ENOSYS);
return (0);
}
/*ARGSUSED*/
static int
{
return (ENOSYS);
return (0);
}
/* ARGSUSED */
static int
{
}
/* ARGSUSED */
static int
int cmd,
int flag,
struct flk_callback *flk_cbp,
{
/*
* If the file is being mapped, disallow fs_frlock.
* We are not holding the hs_contents_lock while checking
* hs_mapcnt because the current locking strategy drops all
* locks before calling fs_frlock.
* So, hs_mapcnt could change before we enter fs_frlock making
* it meaningless to have held hs_contents_lock in the first place.
*/
return (EAGAIN);
}
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
void
hsched_init_caches(void)
{
}
void
hsched_fini_caches(void)
{
}
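/*
* A plausible sketch (an assumption; the bodies above are abbreviated)
* of the cache setup and teardown these functions perform for the
* caches declared near the top of the file. Cache names and object
* sizes are illustrative.
*/
static void
hsched_init_caches_sketch(void)
{
	hio_cache = kmem_cache_create("hsfs_hio_cache",
	    sizeof (struct hio), 0, NULL, NULL, NULL, NULL, NULL, 0);
	hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
	    sizeof (struct hio_info), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

static void
hsched_fini_caches_sketch(void)
{
	kmem_cache_destroy(hio_cache);
	kmem_cache_destroy(hio_info_cache);
}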
/*
* Initialize I/O scheduling structures. This is called via hsfs_mount.
*/
void
{
/* TaskQ name of the form: hsched_task_ + stringof(int) */
char namebuf[23];
/*
* Default maxtransfer = 16k chunk
*/
/*
* Try to fetch the maximum device transfer size. This is used to
* ensure that a coalesced block does not exceed the maxtransfer.
*/
if (err) {
err);
goto set_ra;
}
if (err) {
goto set_ra;
}
if (err) {
}
if (error == 0) {
}
/*
* Max size of data to read ahead for sequential access pattern.
* Conservative, to avoid letting the underlying CD drive spin
* down in case the application is reading slowly.
* We read ahead up to a max of 4 pages.
*/
}
void
{
/*
* If there are any existing readahead threads running,
* taskq_destroy will wait for them to finish.
*/
}
}
}
/*
* Determine if two I/O requests are adjacent to each other so
* that they can be coalesced.
*/
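/*
* Sketch (an assumption) of such an adjacency test: two requests can
* coalesce only when the second starts at the disk block right after
* the first ends and both target the same file vnode. "bp", "nblocks"
* and "fvp" are assumed struct hio members here.
*/
#define	HSFS_IS_ADJACENT_SKETCH(io, nio)	\
	(((io)->bp->b_lblkno + (io)->nblocks == (nio)->bp->b_lblkno) && \
	(io)->fvp == (nio)->fvp)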
/*
* This performs the actual I/O scheduling logic. We use the Circular
* Look algorithm here. Sort the I/O requests in ascending order of
* logical block number and process them starting with the lowest
* numbered block and progressing towards higher block numbers in the
* queue. Once there are no more higher numbered blocks, start again
* from the lowest numbered block. This moves the head in one
* direction along the outward spiral track and avoids too many
* seeks as much as possible. The re-ordering also allows
* us to coalesce adjacent requests into one larger request.
* This is thus essentially a 1-way Elevator with front merging.
*
* In addition each read request here has a deadline and will be
* processed out of turn if the deadline (500ms) expires.
*
* This function is necessarily serialized via hqueue->strategy_lock.
* This function sits just below hsfs_getapage and processes all read
* requests originating from that function.
*/
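/*
* Sketch (an assumption, not the original code) of a C-LOOK selection
* step over the AVL read tree: continue the upward sweep from the
* sentinel left by the previous invocation, wrapping around to the
* lowest numbered request once the high end is exhausted.
*/
static struct hio *
hsched_pick_sketch(avl_tree_t *read_tree, struct hio *sentinel)
{
	struct hio *nio = NULL;

	if (sentinel != NULL)
		nio = AVL_NEXT(read_tree, sentinel);	/* keep sweeping up */
	if (nio == NULL)
		nio = avl_first(read_tree);		/* wrap: C-LOOK */
	return (nio);
}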
int
{
struct hsfs_queue *hqueue;
/*
* Check for Deadline expiration first
*/
/*
* Paranoid check for empty I/O queue. Both deadline
* and read trees contain the same data sorted in different
* ways. So empty deadline tree = empty read tree.
*/
/*
* Remove the sentinel if there was one.
*/
}
return (1);
}
< HSFS_READ_DEADLINE) {
/*
* Apply standard scheduling logic. This uses the
* C-LOOK approach. Process I/O requests in ascending
* order of logical block address till no subsequent
* higher numbered block request remains. Then start
* again from the lowest numbered block in the queue.
*
* We do this cheaply here by means of a sentinel.
* The last processed I/O structure from the previous
* invocation of this func is left dangling in the
* read_tree so that we can easily scan to the next
* higher numbered request and remove the sentinel.
*/
}
}
}
/*
* In addition we try to coalesce contiguous
* requests into one bigger request.
*/
bufcount = 1;
/*
* This check is required to detect the case where
* we are merging adjacent buffers belonging to
* different files. fvp is used to set the b_file
* parameter in the coalesced buf. b_file is used
* by DTrace, so we do not want DTrace to accrue
* requests for two different files to any one file.
*/
}
bufcount++;
}
/*
* tio is not removed from the read_tree as it serves as a sentinel
* to cheaply allow us to scan to the next higher numbered I/O
* request.
*/
/*
* The benefit of coalescing occurs if the savings in I/O outweigh
* the cost of doing the additional work below.
* It was observed that coalescing 2 buffers results in diminishing
* returns, so we do coalescing if we have >2 adjacent bufs.
*/
if (bufcount > hsched_coalesce_min) {
/*
* We have coalesced blocks. First allocate mem and buf for
* the entire coalesced chunk.
* Since we are guaranteed single-threaded here, we pre-allocate
* one buf at mount time and that is re-used every time. This
* is a synthesized buf structure that uses a kmem_alloc'ed chunk;
* it is not quite a normal buf attached to pages.
*/
/*
* Perform I/O for the coalesced block.
*/
(void) bdev_strategy(nbuf);
/*
* Duplicate the last IO node to leave the sentinel alone.
* The sentinel is freed in the next invocation of this
* function.
*/
/*
* We use the b_resid parameter to detect how much
* data was successfully transferred. We signal
* success to all the fully retrieved actual bufs
* from before coalescing; the rest, if any, are
* signaled as errors.
*/
/*
* Copy data and signal success to all the bufs
* which can be fully satisfied from b_resid.
*/
}
/*
* Signal error to all the leftover bufs (if any)
* after b_resid data is exhausted.
*/
data = 0;
}
} else {
}
break;
/* the last sentinel is not freed. See above. */
} else {
}
}
}
return (0);
}
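/*
* Sketch (an assumption, not the original code) of the b_resid-driven
* completion fan-out described above: component bufs that fit entirely
* within the transferred byte count are copied out and completed; any
* leftovers are failed. "nbuf" is the coalesced buf, "bufs"/"nbufs"
* the original component bufs; all names are hypothetical.
*/
static void
hsched_fanout_sketch(struct buf *nbuf, struct buf **bufs, int nbufs)
{
	size_t data = nbuf->b_bcount - nbuf->b_resid;	/* bytes that made it */
	caddr_t addr = nbuf->b_un.b_addr;
	int i;

	for (i = 0; i < nbufs; i++) {
		struct buf *tbuf = bufs[i];

		if (data >= tbuf->b_bcount) {
			/* fully satisfied by the coalesced transfer */
			bcopy(addr, tbuf->b_un.b_addr, tbuf->b_bcount);
			addr += tbuf->b_bcount;
			data -= tbuf->b_bcount;
		} else {
			/* ran out of good data: fail the leftovers */
			bioerror(tbuf, EIO);
			data = 0;
		}
		biodone(tbuf);
	}
}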
/*
* Insert an I/O request into the I/O scheduler's pipeline.
* Using an AVL tree makes it easy to reorder the I/O requests
* based on logical block number.
*/
static void
{
if (ra)
}
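/*
* Sketch (an assumption) of the dual-tree insertion described above:
* the same request becomes visible both in deadline order and in LBN
* order. The hsfs_queue field names are illustrative.
*/
static void
hsched_enqueue_sketch(struct hsfs_queue *hqueue, struct hio *hsio)
{
	avl_add(&hqueue->deadline_tree, hsio);	/* sorted by timestamp */
	avl_add(&hqueue->read_tree, hsio);	/* sorted by b_lblkno */
}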
/* ARGSUSED */
static int
{
int error = 0;
switch (cmd) {
case _PC_NAME_MAX:
break;
case _PC_FILESIZEBITS:
break;
default:
}
return (error);
}
const fs_operation_def_t hsfs_vnodeops_template[] = {
};
struct vnodeops *hsfs_vnodeops;