/* bio.c revision 9468939ef8704ee9aba7596c1e9ff9b059109cac */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2011 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#include <sys/sysmacros.h>
#include <vm/seg_kmem.h>
#include <sys/tnf_probe.h>
/* Locks */
static int nbuf; /* number of buffer headers allocated */
static int lastindex; /* Reference point on where to start */
/* when looking for free buffers */
static kcondvar_t bio_flushinval_cv;
static int bio_doingflush; /* flush in progress */
static int bio_doinginval; /* inval in progress */
static int bio_flinv_cv_wanted; /* someone waiting for cv */
/*
* Statistics on the buffer cache
*/
{ "buffer_cache_lookups", KSTAT_DATA_UINT32 },
{ "buffer_cache_hits", KSTAT_DATA_UINT32 },
{ "new_buffer_requests", KSTAT_DATA_UINT32 },
{ "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
{ "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
{ "duplicate_buffers_found", KSTAT_DATA_UINT32 }
};
/*
* kstat data
*/
sizeof (kstat_named_t));
/*
* Statistics on ufs buffer cache
* Not protected by locks
*/
struct ufsbiostats ub = {
{ "breads", KSTAT_DATA_UINT32 },
{ "bwrites", KSTAT_DATA_UINT32 },
{ "fbiwrites", KSTAT_DATA_UINT32 },
{ "getpages", KSTAT_DATA_UINT32 },
{ "getras", KSTAT_DATA_UINT32 },
{ "putsyncs", KSTAT_DATA_UINT32 },
{ "putasyncs", KSTAT_DATA_UINT32 },
{ "putpageios", KSTAT_DATA_UINT32 },
};
/*
* more UFS Logging eccentricities...
*
* required since "#pragma weak ..." doesn't work in reverse order.
* i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
* to ufs routines don't get plugged into bio.c calls so
* we initialize it when setting up the "lufsops" table
* in "lufs.c:_init()"
*/
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);
/* Private routines */
static struct buf *bio_getfreeblk(long);
static void bio_mem_get(long);
static void bio_bhdr_free(struct buf *);
static struct buf *bio_bhdr_alloc(void);
static void bio_recycle(int, long);
static void bio_pageio_done(struct buf *);
/*
* Buffer cache constants
*/
/* Flags for bio_recycle() */
#define BIO_HEADER 0x01
#define BIO_MEM 0x02
extern int bufhwm; /* User tunable - high water mark for mem */
extern int bufhwm_pct; /* ditto - given in % of physmem */
/*
* The following routines allocate and free
* buffers with various side effects. In general the
* arguments to an allocate routine are a device and
* a block number, and the value is a pointer
* to the buffer header; the buffer returned is locked with a
* binary semaphore so that no one else can touch it. If the block was
* already in core, no I/O need be done; if it is
* already locked, the process waits until it becomes free.
* The following routines allocate a buffer:
* getblk
* breada
* Eventually the buffer must be released, possibly with the
* side effect of writing it out, by using one of
* bawrite
* brelse
*
* Instead, a binary semaphore, b_sem is used to gain exclusive access to
* a buffer and a binary semaphore, b_io is used for I/O synchronization.
* B_DONE is still used to denote a buffer with I/O complete on it.
*
* The bfreelist.b_bcount field is computed every time fsflush runs. It
* should not be used where a very accurate count of the free buffers is
* needed.
*/
/*
* Read in (if necessary) the block and return a buffer pointer.
*
* This interface is provided for binary compatibility. Using
* BREAD() directly avoids the extra function call overhead invoked
* by calling this routine.
*/
struct buf *
{
}
/*
* Common code for reading a buffer with various options
*
* Read in (if necessary) the block and return a buffer pointer.
*/
struct buf *
{
return (bp);
(void) bdev_strategy(bp);
/* ufs && logging */
/* ufs && snapshots */
} else {
(void) bdev_strategy(bp);
}
return (bp);
}
/*
* Read in the block, like bread, but also start I/O on the
* read-ahead block (which is not allocated to the caller).
*/
struct buf *
{
(void) bdev_strategy(bp);
}
}
else {
(void) bdev_strategy(rabp);
}
}
return (bp);
}
/*
* Common code for writing a buffer with various options.
*
* force_wait - wait for write completion regardless of B_ASYNC flag
* do_relse - release the buffer when we are done
* clear_flags - flags to clear from the buffer
*/
void
int do_relse, int clear_flags)
{
register int do_wait;
int flag;
if (do_wait == 0)
(void) bdev_strategy(bp);
/* ufs && logging */
/* ufs && snapshots */
} else {
(void) bdev_strategy(bp);
}
if (do_wait) {
if (do_relse) {
}
}
}
/*
* Write the buffer, waiting for completion (unless B_ASYNC is set).
* Then release the buffer.
* This interface is provided for binary compatibility. Using
* BWRITE() directly avoids the extra function call overhead invoked
* by calling this routine.
*/
void
{
}
/*
* Write the buffer, waiting for completion.
* But don't release the buffer afterwards.
* This interface is provided for binary compatibility. Using
* BWRITE2() directly avoids the extra function call overhead.
*/
void
{
}
/*
* Release the buffer, marking it so that if it is grabbed
* for another purpose it will be written out before being
* given up (e.g. when writing a partial block where it is
* assumed that another write for the same block will soon follow).
* Also save the time that the block is first marked as delayed
* so that it will be written in a reasonable time.
*/
void
{
/*
* B_DONE allows others to use the buffer, B_DELWRI causes the
* buffer to be written before being reused, and setting b_resid
* to zero says the buffer is complete.
*/
}
/*
* Release the buffer, start I/O on it, but don't wait for completion.
*/
void
{
/* Use bfreelist.b_bcount as a weird-ass heuristic */
}
/*
* Release the buffer, with no I/O implied.
*/
void
{
/*
* Clear the retry write flag if the buffer was written without
* error. The presence of B_DELWRI means the buffer has not yet
* been written and the presence of B_ERROR means that an error
* is still occurring.
*/
}
/* Check for anomalous conditions */
/* Don't add to the freelist. Destroy it now */
return;
}
/*
* If a write failed and we are supposed to retry write,
* don't toss the buffer. Keep it around and mark it
* delayed write in the hopes that it will eventually
* get flushed (and still keep the system running.)
*/
/* keep fsflush from trying continuously to flush */
} else
}
/*
* If delayed write is set then put in on the delayed
* write list instead of the free buffer list.
*/
/*
* Make sure that the number of entries on this list are
* Zero <= count <= total # buffers
*/
/*
* This buffer goes on the delayed write buffer list
*/
}
} else {
}
/*
* Should come here very very rarely.
*/
}
}
/*
* Don't let anyone get the buffer off the freelist before we
* release our hold on it.
*/
}
/*
* Return a count of the number of B_BUSY buffers in the system
* Can only be used as a good estimate. If 'cleanit' is set,
* try to flush all bufs.
*/
int
{
int busy = 0;
int i;
for (i = 0; i < v.v_hbuf; i++) {
busy++;
}
}
}
return (busy);
}
/*
* this interface is provided for binary compatibility.
*
* Assign a buffer for the given block. If the appropriate
* block is already associated, return it; otherwise search
* for the oldest non-busy buffer and reassign it.
*/
struct buf *
{
}
/*
* Assign a buffer for the given block. If the appropriate
* block is already associated, return it; otherwise search
* for the oldest non-busy buffer and reassign it.
*/
struct buf *
{
loop:
continue;
/*
* Avoid holding the hash lock in the event that
* the buffer is locked by someone. Since the hash chain
* may change when we drop the hash lock
* we have to start at the beginning of the chain if the
*/
/*
* OK, we are dealing with a busy buffer.
* In the case that we are panicking and we
* got called from bread(), we have some chance
* for error recovery. So better bail out from
* here since sema_p() won't block. If we got
* called directly from ufs routines, there is
* no way to report an error yet.
*/
goto errout;
/*
* For the following line of code to work
* correctly never kmem_free the buffer "header".
*/
goto loop; /* start over */
}
}
/* Found */
/*
*/
/*
* Make the common path short.
*/
return (bp);
}
/*
* The buffer must have entered during the lock upgrade
* so free the new buffer we allocated and return the
* found buffer.
*/
/*
* Account for the memory
*/
/*
* Destroy buf identity, and place on avail list
*/
return (bp);
}
/*
* bio_getfreeblk may block so check the hash chain again.
*/
goto loop;
}
/*
* New buffer. Assign nbp and stick it on the hash.
*/
/*
* If we are given a ufsvfsp and the vfs_root field is NULL
* then this must be I/O for a superblock. A superblock's
* buffer is set up in mountfs() and there is no root vnode
* at that point.
*/
} else {
}
return (nbp);
/*
* Come here in case of an internal error. At this point we couldn't
* get a buffer, but we have to return one. Hence we allocate some
* kind of error reply buffer on the fly. This buffer is marked as
* B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
* - B_ERROR will indicate error to the caller.
* - B_DONE will prevent us from reading the buffer from
* the device.
* - B_NOCACHE will cause that this buffer gets free'd in
* brelse().
*/
return (errbp);
}
/*
* Get an empty block, not assigned to any particular device.
* Returns a locked buffer that is not on any hash or free list.
*/
struct buf *
{
return (bp);
}
/*
* Interface of geteblk() is kept intact to maintain driver compatibility.
* Use ngeteblk() to allocate block size other than 1 KB.
*/
struct buf *
geteblk(void)
{
	/*
	 * Historical default: callers of this compatibility interface
	 * always receive a 1 KB buffer.  Use ngeteblk() directly to
	 * request any other block size.
	 */
	const long default_bsize = 1024;

	return (ngeteblk(default_bsize));
}
/*
* Return a buffer w/o sleeping
*/
struct buf *
{
if (!mutex_tryenter(hmp))
return (NULL);
continue;
/*
* Get access to a valid buffer without sleeping
*/
return (bp);
} else {
break;
}
}
break;
}
return (NULL);
}
/*
* Wait for I/O completion on the buffer; return errors
* to the user.
*/
int
{
}
/*
* Mark I/O complete on a buffer, release it if I/O is asynchronous,
* and wake up anyone waiting for it.
*/
void
{
}
/*
* Zero the core associated with a buffer.
*/
void
{
}
/*
* Make sure all write-behind blocks on dev (or NODEV for all)
* are flushed out.
*/
void
{
int i, index;
/*
* Wait for any invalidates or flushes ahead of us to finish.
* We really could split blist_lock up per device for better
* parallelism here.
*/
while (bio_doinginval || bio_doingflush) {
bio_flinv_cv_wanted = 1;
}
/*
* Gather all B_DELWRI buffer for device.
* Lock ordering is b_sem > hash lock (brelse).
* Since we are finding the buffer via the delayed write list,
* it may be busy and we would block trying to get the
* b_sem lock while holding hash lock. So transfer all the
* candidates on the delwri_list and then drop the hash locks.
*/
for (i = 0; i < v.v_hbuf; i++) {
delwri_list = bp;
}
}
}
}
/*
* Now that the hash locks have been dropped grab the semaphores
* and write back all the buffers that have B_DELWRI set.
*/
while (delwri_list != EMPTY_LIST) {
bp = delwri_list;
continue; /* No longer a candidate */
}
} else { /* ufs */
}
} else {
}
}
if (bio_flinv_cv_wanted) {
bio_flinv_cv_wanted = 0;
}
}
/*
* Ensure that a specified block is up-to-date on disk.
*/
void
{
/*
* Identify the buffer in the cache belonging to
* this device and blkno (if any).
*/
continue;
break;
}
return;
/*
* Now check the buffer we have identified and
* make sure it still belongs to the device and is B_DELWRI
*/
/*
* XXX - There is nothing to guarantee a synchronous
* write here if the B_ASYNC flag is set. This needs
* some investigation.
*/
} else { /* ufs */
}
} else {
}
}
/*
* Same as binval, except can force-invalidate delayed-write buffers
* (which may not already be flushed because of device errors). Also
* makes sure that the retry write flag is cleared.
*/
int
{
int i, error = 0;
/*
* Wait for any flushes ahead of us to finish, it's ok to
* do invalidates in parallel.
*/
while (bio_doingflush) {
bio_flinv_cv_wanted = 1;
}
/* Gather bp's */
for (i = 0; i < v.v_hbuf; i++) {
binval_list = bp;
}
}
}
}
/* Invalidate all bp's found */
while (binval_list != EMPTY_LIST) {
bp = binval_list;
/* clear B_DELWRI, move to non-dw freelist */
/* remove from delayed write freelist */
/* add to B_AGE side of non-dw freelist */
/*
* make sure write retries and busy are cleared
*/
}
else
}
}
if (bio_flinv_cv_wanted) {
bio_flinv_cv_wanted = 0;
}
return (error);
}
/*
* If possible, invalidate blocks for a dev on demand
*/
void
{
}
/*
* Initialize the buffer I/O system by freeing
* all buffers and setting all device hash buffer lists to empty.
*/
void
binit(void)
{
unsigned int i, pct;
/*
* - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
* - 1/4 of kernel virtual memory
* - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
* Additionally, in order to allow simple tuning by percentage of
* physical memory, bufhwm_pct is used to calculate the default if
* the value of this tunable is between 0 and BIO_MAX_PERCENT.
*
* Since the unit for v.v_bufhwm is kilobytes, this allows for
* a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
*/
if (bufhwm_pct != 0 &&
/*
* Invalid user specified value, emit a warning.
*/
range(1..%d). Using %d as default.",
}
v.v_bufhwm = bio_default_hwm;
v.v_bufhwm = (int)bio_max_hwm;
/*
* Invalid user specified value, emit a warning.
*/
"binit: bufhwm(%d) out \
of range(%d..%lu). Using %lu as default",
}
/*
* Determine the number of hash buckets. Default is to
* create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
* Round up number to the next power of 2.
*/
v.v_buf = BIO_BHDR_POOL;
for (i = 0; i < v.v_hbuf; i++) {
/*
* Initialize the delayed write buffer list.
*/
}
}
/*
* Wait for I/O completion on the buffer; return error code.
* If bp was for synchronous I/O, bp is invalid and associated
* resources are freed on return.
*/
int
{
int error = 0;
/*
* In case of panic, busy wait for completion
*/
if (panicstr) {
drv_usecwait(10);
} else
}
return (error);
}
static void
{
/* Kernel probe */
}
/*
* Mark I/O complete on a buffer, release it if I/O is asynchronous,
* and wake up anyone waiting for it.
*/
void
{
}
/*
* Call the TNF probe here instead of the inline code
* to force our compiler to use the tail call optimization.
*/
return;
}
else
} else {
}
}
/*
* Pick up the device's error number and pass it to the user;
* if there is an error but the number is 0 set a generalized code.
*/
int
{
int error = 0;
if (!error)
}
return (error);
}
/*
* Support for pageio buffers.
*
* This stuff should be generalized to provide a generalized bp
* header facility that can be used for things other than pageio.
*/
/*
* Allocate and initialize a buf struct for use with pageio.
*/
struct buf *
{
/* Kernel probe */
}
/*
* Update statistics for pages being paged in
*/
} else {
} else {
}
}
}
"page_ws_in:pp %p", pp);
/* Kernel probe */
}
/* Initialize bp->b_sem in "locked" state */
/*
* Caller sets dev & blkno and can adjust
* b_addr for page offset and can use bp_mapin
* to make pages kernel addressable.
*/
return (bp);
}
void
{
/* A sema_v(bp->b_sem) is implied if we are destroying it */
}
/*
* Check to see whether the buffers, except the one pointed by sbp,
* associated with the device are busy.
* NOTE: This expensive operation shall be improved together with ufs_icheck().
*/
int
{
int i;
/*
* check for busy bufs for this filesystem
*/
for (i = 0; i < v.v_hbuf; i++) {
/*
* if buf is busy or dirty, then filesystem is busy
*/
return (1);
}
}
}
return (0);
}
/*
* Hash two 32 bit entities.
*/
int
hash2ints(int x, int y)
{
int hash = 0;
/*
 * NOTE(review): as written, only x contributes to the result -- y is
 * never read and the zero-initialization of hash is immediately
 * overwritten.  Given the stated purpose ("hash two 32 bit entities")
 * this body looks truncated; confirm against the upstream revision
 * before relying on its distribution properties.
 */
hash = x - 1;
return (hash);
}
/*
* Return a new buffer struct.
* Create a new buffer if we haven't gone over our high water
* mark for memory, otherwise try to get one off the freelist.
*
* Returns a locked buf that has no id and is not on any hash or free
* list.
*/
static struct buf *
bio_getfreeblk(long bsize)
{
/*
* mutex_enter(&bfree_lock);
* bfreelist.b_bufsize represents the amount of memory
* mutex_exit(&bfree_lock); protect ref to bfreelist
* we are allowed to allocate in the cache before we hit our hwm.
*/
/*
* Make the common path short
*/
return (bp);
} else {
/*
* Memory isn't available from the system now. Scan
* the hash buckets till enough space is found.
*/
do {
continue;
}
/*
* Since we are going down the freelist
* associated with this hash bucket the
* B_DELWRI flag should not be set.
*/
/*
* Didn't kmem_alloc any more, so don't
* count it twice.
*/
/*
* Update the lastindex value.
*/
/*
* Put our saved bp back on the list
*/
return (bp);
}
}
}
return (bp);
}
/*
* Allocate a buffer header. If none currently available, allocate
* a new pool.
*/
static struct buf *
bio_bhdr_alloc(void)
{
int i;
for (;;) {
return (bp);
}
/*
* Need to allocate a new pool. If the system is currently
* out of memory, then try freeing things on the freelist.
*/
/*
* System can't give us a pool of headers, try
* recycling from the free lists.
*/
bio_recycle(BIO_HEADER, 0);
} else {
/*
* The next two lines are needed since NODEV
* is -1 and not NULL
*/
NULL);
NULL);
}
return (bp);
}
}
}
static void
{
}
/*
* If we haven't gone over the high water mark, it's o.k. to
* allocate more buffer space, otherwise recycle buffers
* from the freelist until enough memory is free for a bsize request.
*
* We account for this memory, even though
* we don't allocate it here.
*/
static void
bio_mem_get(long bsize)
{
return;
}
}
/*
* flush a list of delayed write buffers.
* (currently used only by bio_recycle below.)
*/
static void
{
while (delwri_list != EMPTY_LIST) {
bp = delwri_list;
} else { /* ufs */
}
}
}
/*
* Start recycling buffers on the freelist for one of 2 reasons:
* - we need a buffer header
* - we need to free up memory
* Once started we continue to recycle buffers until the B_AGE
* buffers are gone.
*/
static void
{
int found = 0;
/*
* Recycle buffers.
*/
top:
do {
continue;
}
/*
* Do we really want to nuke all of the B_AGE stuff??
*/
return; /* All done */
}
/*
* Remove bhdr from cache, free up memory,
* and add the hdr to the freelist.
*/
}
if (want == BIO_HEADER) {
found = 1;
} else {
/* Account for the memory we want */
found = 1;
}
}
}
/*
* Since we dropped hmp start from the
* beginning.
*/
}
/*
* Look at the delayed write list.
* First gather into a private list, then write them.
*/
continue;
/*
* Do we really want to nuke all of the B_AGE stuff??
*/
if (bio_flinv_cv_wanted) {
bio_flinv_cv_wanted = 0;
}
return; /* All done */
}
/*
* If the buffer is already on a flush or
* invalidate list then just skip it.
*/
continue;
}
/*
* We are still on the same bucket.
*/
delwri_list = bp;
}
if (bio_flinv_cv_wanted) {
bio_flinv_cv_wanted = 0;
}
if (found)
return;
/*
* Free lists exhausted and we haven't satisfied the request.
* Wait here for more entries to be added to freelist.
* Because this might have just happened, make it timed.
*/
goto top;
}
/*
* See if the block is associated with some buffer
* (mainly to avoid getting hung up on a wait in breada).
*/
static int
{
return (1);
}
}
return (0);
}
static void
{
else
} else {
}
}
/*
* bioerror(9F) - indicate error in buffer header
* If 'error' is zero, remove the error indication.
*/
void
{
if (error != 0) {
} else {
}
}
/*
* bioreset(9F) - reuse a private buffer header after I/O is complete
*/
void
{
}
/*
* biosize(9F) - return size of a buffer header
*/
int
biosize(void)
{
	/*
	 * biosize(9F): return the size of a buf(9S) structure so DDI
	 * consumers can size allocations without knowing the layout.
	 * The explicit `int` return type is required; implicit int was
	 * removed in C99.
	 */
	return (sizeof (struct buf));
}
/*
* biomodified(9F) - check if buffer is modified
*/
int
{
int npf;
int ppattr;
return (-1);
}
while (npf > 0) {
return (1);
npf--;
}
return (0);
}
/*
* bioinit(9F) - initialize a buffer structure
*/
void
{
}
/*
* biofini(9F) - uninitialize a buffer structure
*/
void
{
}
/*
* bioclone(9F) - clone a buffer
*/
struct buf *
{
return (NULL);
}
} else {
}
/*
* The cloned buffer does not inherit the B_REMAPPED flag.
*/
} else {
off_t o;
int i;
for (i = btop(o); i > 0; i--) {
}
} else {
}
}
return (bufp);
}