/* ufs_directio.c revision 80d3443290aca22ad7fb6c18568d19d37517ebbf */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Portions of this source code were derived from Berkeley 4.3 BSD
* under license from the Regents of the University of California.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/pathname.h>
#include <sys/fssnap_if.h>
#include <vm/seg_kmem.h>
static void *ufs_directio_zero_buf;
static int ufs_directio_zero_len = 8192;
/*
 * for kstats reader
 */
/*
 * NOTE(review): the kstat_named_t member declarations of this struct
 * appear to be missing from this copy -- the struct body is empty while
 * the initializer below supplies eight named-kstat entries.  Confirm
 * against the original ufs_directio.c before building.
 */
struct ufs_directio_kstats {
} ufs_directio_kstats = {
{ "logical_reads", KSTAT_DATA_UINT64 },
{ "phys_reads", KSTAT_DATA_UINT64 },
{ "hole_reads", KSTAT_DATA_UINT64 },
{ "nread", KSTAT_DATA_UINT64 },
{ "logical_writes", KSTAT_DATA_UINT64 },
{ "phys_writes", KSTAT_DATA_UINT64 },
{ "nwritten", KSTAT_DATA_UINT64 },
{ "nflushes", KSTAT_DATA_UINT64 },
};
/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 * per request
 */
/*
 * One header per in-flight direct I/O request; headers are linked in
 * reverse issue order (see the wait routines below).
 * NOTE(review): additional members (e.g. the embedded buf and byte
 * count referenced elsewhere in this file) appear to be missing from
 * this copy -- confirm against the original source.
 */
struct directio_buf {
struct directio_buf *next;	/* previously issued request in the list */
char *addr;	/* buffer address for this chunk -- TODO confirm */
};
/* cache of directio_buf headers; created in directio_bufs_init() */
static struct kmem_cache *directio_buf_cache;
/* ARGSUSED */
/*
 * NOTE(review): the function name and parameter list are missing from
 * this copy.  The no-op body returning 0 suggests this is the
 * kmem_cache constructor for directio_buf_cache -- confirm against the
 * original source.
 */
static int
{
return (0);
}
/* ARGSUSED */
/*
 * NOTE(review): the function name and parameter list are missing from
 * this copy.  The empty body suggests this is the no-op kmem_cache
 * destructor paired with the constructor above -- confirm against the
 * original source.
 */
static void
{
}
/*
 * Create the kmem cache used to allocate directio_buf headers.
 * NOTE(review): the kmem_cache_create() call is truncated in this copy;
 * only the object-size and align arguments survive.  Restore the full
 * call (name, constructor/destructor, etc.) from the original source.
 */
void
directio_bufs_init(void)
{
sizeof (struct directio_buf), 0,
}
/*
 * One-time initialization for UFS direct I/O: install the kstats and
 * allocate the private zero buffer.
 */
void
ufs_directio_init(void)
{
/*
 * kstats
 */
/*
 * NOTE(review): the kstat_create()/kstat_install() calls appear to be
 * missing from this copy; only the named-kstat count argument and the
 * install guard survive.
 */
sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
if (ufs_directio_kstatsp) {
}
/*
 * kzero is broken so we have to use a private buf of zeroes
 */
/*
 * NOTE(review): the allocation of ufs_directio_zero_buf (of
 * ufs_directio_zero_len bytes) is missing from this copy.
 */
}
/*
 * Wait for the first direct IO operation to finish
 */
/*
 * NOTE(review): the function name and parameter list are missing from
 * this copy (the body references a *bytes_iop out-parameter), and
 * several statements (the wait itself, the error fetch, the residual
 * computation and the resource release) have been dropped -- the
 * dangling if/else chain below does not parse as-is.  Restore from the
 * original source.
 */
static int
{
int error;
/*
 * Wait for IO to finish
 */
/*
 * bytes_io will be used to figure out a resid
 * for the caller. The resid is approximated by reporting
 * the bytes following the first failed IO as the residual.
 *
 * I am cautious about using b_resid because I
 * am not sure how well the disk drivers maintain it.
 */
if (error)
else
*bytes_iop = 0;
else
/*
 * Release direct IO resources
 */
return (error);
}
/*
 * Wait for all of the direct IO operations to finish
 */
/*
 * NOTE(review): the function name, parameter list, the declaration of
 * `error`, and the loop that walks the dbp list calling the single-wait
 * routine are missing from this copy -- the body does not parse as-is.
 * Restore from the original source.
 */
static int
{
struct directio_buf *dbp;
/*
 * The linked list of directio buf structures is maintained
 * in reverse order (tail->last request->penultimate request->...)
 */
/*
 * This is the k_pri_req hack. Large numbers of threads
 * sleeping with kernel priority will cause scheduler thrashing
 * on an MP machine. This can be seen running Oracle using
 * directio to ufs files. Sleep at normal priority here to
 * more closely mimic physio to a device partition. This
 * workaround is disabled by default as a niced thread could
 * be starved from running while holding i_rwlock and i_contents.
 */
if (ufs_directio_drop_kpri) {
curthread->t_kpri_req = 0;
}
if (error == 0)
}
return (error);
}
/*
 * Initiate direct IO request
 */
/*
 * NOTE(review): the function name and parameter list are missing from
 * this copy, as are the directio_buf allocation, the buf-header
 * initialization, and the conditionals guarding the strategy calls --
 * the dangling else branches below do not parse as-is.  Restore from
 * the original source.
 */
static void
{
struct directio_buf *dbp;
/*
 * Allocate a directio buf header
 * Note - list is maintained in reverse order.
 * directio_wait_one() depends on this fact when
 * adjusting the ``bytes_io'' param. bytes_io
 * is used to compute a residual in the case of error.
 */
/*
 * Initialize buf header
 */
/*
 * Note that S_WRITE implies B_READ and vice versa: a read(2)
 * will B_READ data from the filesystem and S_WRITE it into
 * the user's buffer; a write(2) will S_READ data from the
 * user's buffer and B_WRITE it to the filesystem.
 */
} else {
}
/*
 * Issue I/O request.
 */
if (ufsvfsp->vfs_snapshot)
else
(void) bdev_strategy(bp);
else
}
/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
/*
 * Direct Write
 */
/*
 * NOTE(review): the function name (presumably the UFS direct-write
 * entry point) and its parameter list are missing from this copy, as
 * are the declarations of most locals used below (error, newerror,
 * rewrite, exclusive, uoff, n, nbytes, iov, pglck_len, bmap_peek,
 * has_holes, ufsvfsp, vp, ...) and the guard conditions for many of the
 * bare `return (0)` statements.  The body does not parse as-is; restore
 * the dropped lines from the original source before making any code
 * change.  Returns 0 when direct I/O is not attempted (caller falls
 * back to the buffered path), otherwise the I/O error status.
 */
int
{
long resid, bytes_written;
struct directio_buf *tail;
char *copy_base;
long copy_resid;
/*
 * assume that directio isn't possible (normal case)
 */
/*
 * Don't go direct
 */
if (ufs_directio_enabled == 0)
return (0);
/*
 * mapped file; nevermind
 */
return (0);
/*
 * CAN WE DO DIRECT IO?
 */
/*
 * beyond limit
 */
return (0);
/*
 * must be sector aligned
 */
return (0);
/*
 * SHOULD WE DO DIRECT IO?
 */
has_holes = -1;
/*
 * only on regular files; no metadata
 */
return (0);
/*
 * Synchronous, allocating writes run very slow in Direct-Mode
 * XXX - can be fixed with bmap_write changes for large writes!!!
 * XXX - can be fixed for updates to "almost-full" files
 * XXX - WARNING - system hangs if bmap_write() has to
 * allocate lots of pages since pageout
 * suspends on locked inode
 */
return (0);
if (has_holes)
return (0);
}
/*
 * Each iovec must be short aligned and sector aligned. If
 * one is not, then kmem_alloc a new buffer and copy all of
 * the smaller buffers into the new buffer. This new
 * buffer will be short aligned and sector aligned.
 */
while (nbytes--) {
return (0);
if (error) {
return (0);
}
break;
}
iov++;
}
/*
 * From here on down, all error exits must go to errout and
 * not simply return a 0.
 */
/*
 * DIRECTIO
 */
/*
 * POSIX check. If attempting a concurrent re-write, make sure
 * that this will be a single request to the driver to meet
 * POSIX synchronous data integrity requirements.
 */
bmap_peek = 0;
int upgrade = 0;
/* check easy conditions first */
upgrade = 1;
} else {
/* now look for contiguous allocation */
goto errout;
/* save a call to bmap_read later */
bmap_peek = 1;
upgrade = 1;
}
if (upgrade) {
}
}
/*
 * allocate space
 */
/*
 * If attempting a re-write, there is no allocation to do.
 * bmap_write would trip an ASSERT if i_contents is held shared.
 */
if (rewrite)
goto skip_alloc;
do {
/* Caller is responsible for updating i_seq if needed */
if (error)
break;
} else if (n == MAXBSIZE) {
/* Caller is responsible for updating i_seq if needed */
} else {
if (has_holes < 0)
if (has_holes) {
/*
 * Caller is responsible for updating
 * i_seq if needed
 */
} else
error = 0;
}
if (error)
break;
uoff += n;
resid -= n;
/*
 * if file has grown larger than 2GB, set flag
 * in superblock if not already set
 */
}
} while (resid);
if (error) {
/*
 * restore original state
 */
if (resid) {
goto errout;
}
/*
 * try non-directio path
 */
goto errout;
}
/*
 * get rid of cached pages
 */
if (vn_has_cached_data(vp)) {
if (!exclusive) {
/*
 * Still holding i_rwlock, so no allocations
 * can happen after dropping contents.
 */
}
if (vn_has_cached_data(vp))
goto errout;
if (!exclusive)
}
/*
 * Direct Writes
 */
if (!exclusive) {
if (ncur > ufs_maxcur_writes)
}
/*
 * proc and as are for VM operations in directio_start()
 */
} else {
}
error = 0;
newerror = 0;
bytes_written = 0;
/*
 * Adjust number of bytes
 */
if (pglck_len == 0) {
uio->uio_iovcnt--;
continue;
}
/*
 * Try to Lock down the largest chunk of pages possible.
 */
if (error)
break;
while (pglck_len) {
if (!bmap_peek) {
/*
 * Re-adjust number of bytes to contiguous
 * range. May have already called bmap_read
 * in the case of a concurrent rewrite.
 */
if (error)
break;
break;
}
bmap_peek = 0;
/*
 * Get the pagelist pointer for this offset to be
 * passed to directio_start.
 */
else
/*
 * Kick off the direct write requests
 */
/*
 * Adjust pointers and counters
 */
}
/*
 * Wait for outstanding requests
 */
/*
 * Release VM resources
 */
}
if (!exclusive) {
/*
 * If this write was done shared, readers may
 * have pulled in unmodified pages. Get rid of
 * these potentially stale pages.
 */
if (vn_has_cached_data(vp)) {
}
}
/*
 * If error, adjust resid to begin at the first
 * un-writable byte.
 */
if (error == 0)
if (error)
if (!rewrite) {
/* Caller will update i_seq */
}
/*
 * If there is a residual; adjust the EOF if necessary
 */
if (resid) {
}
}
return (error);
return (0);
}
/*
 * Direct read of a hole
 */
/*
 * NOTE(review): the function name and parameter list are missing from
 * this copy, as is most of the body -- only a trailing argument of some
 * call (&phys_uio) and the return survive, and the brace nesting does
 * not match.  Restore from the original source.
 */
static int
{
&phys_uio);
}
return (error);
}
/*
 * Direct Read
 */
/*
 * NOTE(review): the function name (presumably the UFS direct-read entry
 * point) and its parameter list are missing from this copy, as are the
 * declarations of most locals used below (error, newerror, bytes_read,
 * nbytes, pglck_len, uio, vp, ...) and the guard conditions for many of
 * the bare `return (0)` statements.  The body does not parse as-is;
 * restore the dropped lines from the original source before making any
 * code change.  Returns 0 when direct I/O is not attempted (caller
 * falls back to the buffered path), otherwise the I/O error status.
 */
int
{
struct directio_buf *tail;
/*
 * assume that directio isn't possible (normal case)
 */
/*
 * Don't go direct
 */
if (ufs_directio_enabled == 0)
return (0);
/*
 * mapped file; nevermind
 */
return (0);
/*
 * CAN WE DO DIRECT IO?
 */
/*
 * must be sector aligned
 */
return (0);
/*
 * must be short aligned and sector aligned
 */
while (nbytes--) {
return (0);
return (0);
}
/*
 * DIRECTIO
 */
/*
 * don't read past EOF
 */
/*
 * The file offset is past EOF so bail out here; we don't want
 * to update uio_resid and make it look like we read something.
 * We say that direct I/O was a success to avoid having rdip()
 * go through the same "read past EOF logic".
 */
return (0);
}
/*
 * The read would extend past EOF so make it smaller.
 */
/*
 * recheck sector alignment
 */
return (0);
}
/*
 * At this point, we know there is some real work to do.
 */
/*
 * get rid of cached pages
 */
if (vn_has_cached_data(vp)) {
if (vn_has_cached_data(vp))
return (0);
}
/*
 * Direct Reads
 */
/*
 * proc and as are for VM operations in directio_start()
 */
} else {
}
error = 0;
newerror = 0;
bytes_read = 0;
/*
 * Adjust number of bytes
 */
if (pglck_len == 0) {
uio->uio_iovcnt--;
continue;
}
/*
 * Try to Lock down the largest chunk of pages possible.
 */
if (error)
break;
while (pglck_len) {
/*
 * Re-adjust number of bytes to contiguous range
 */
if (error)
break;
/*
 * Hole reads are not added to the list
 * processed by directio_wait() below so
 * account for bytes read here.
 */
if (!error)
bytes_read += nbytes;
} else {
/*
 * Get the pagelist pointer for this offset
 * to be passed to directio_start.
 */
else
/*
 * Kick off the direct read requests
 */
}
if (error)
break;
/*
 * Adjust pointers and counters
 */
}
/*
 * Wait for outstanding requests
 */
/*
 * Release VM resources
 */
}
/*
 * If error, adjust resid to begin at the first
 * un-read byte.
 */
if (error == 0)
return (error);
}