ufs_trans.c revision 80d3443290aca22ad7fb6c18568d19d37517ebbf
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Portions of this source code were derived from Berkeley 4.3 BSD
* under license from the Regents of the University of California.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>
extern kmutex_t ufsvfs_mutex;
extern struct ufsvfs *ufs_instances;
/*
* hlock any file systems w/errored logs
*/
int
ufs_trans_hlock()
{
struct ufsvfs *ufsvfsp;
struct lockfs lockfs;
int error;
int retry = 0;
/*
* find fs's that paniced or have errored logging devices
*/
mutex_enter(&ufsvfs_mutex);
for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
/*
* not mounted; continue
*/
if ((ufsvfsp->vfs_vfs == NULL) ||
(ufsvfsp->vfs_validfs == UT_UNMOUNTED))
continue;
/*
* disallow unmounts (hlock occurs below)
*/
if (TRANS_ISERROR(ufsvfsp))
ufsvfsp->vfs_validfs = UT_HLOCKING;
}
mutex_exit(&ufsvfs_mutex);
/*
* hlock the fs's that paniced or have errored logging devices
*/
again:
mutex_enter(&ufsvfs_mutex);
for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
if (ufsvfsp->vfs_validfs == UT_HLOCKING)
break;
mutex_exit(&ufsvfs_mutex);
if (ufsvfsp == NULL)
return (retry);
/*
* hlock the file system
*/
(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
if (!LOCKFS_IS_ELOCK(&lockfs)) {
lockfs.lf_lock = LOCKFS_HLOCK;
lockfs.lf_flags = 0;
lockfs.lf_comlen = 0;
lockfs.lf_comment = NULL;
error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
/*
* retry after a while; another app is currently doing lockfs
*/
if (error == EBUSY || error == EINVAL)
retry = 1;
} else {
if (ufsfx_get_failure_qlen() > 0) {
if (mutex_tryenter(&ufs_fix.uq_mutex)) {
ufs_fix.uq_lowat = ufs_fix.uq_ne;
cv_broadcast(&ufs_fix.uq_cv);
mutex_exit(&ufs_fix.uq_mutex);
}
}
retry = 1;
}
/*
* allow unmounts
*/
ufsvfsp->vfs_validfs = UT_MOUNTED;
goto again;
}
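/*
 * The logging code detected an error; prime the hlock queue
 * (ufs_hlock) and wake its thread so that the errored file
 * systems get hard-locked (see ufs_trans_hlock() above).
 */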
/*ARGSUSED*/
void
ufs_trans_onerror()
{
mutex_enter(&ufs_hlock.uq_mutex);
ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
cv_broadcast(&ufs_hlock.uq_cv);
mutex_exit(&ufs_hlock.uq_mutex);
}
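/*
 * Update the on-disk superblock via sbupdate().  If the caller is
 * already in a transaction (T_DONTBLOCK is set), just do the update;
 * otherwise wrap it in an async transaction.  Do nothing when
 * panicking with logging enabled.
 */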
void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
if (curthread->t_flag & T_DONTBLOCK) {
sbupdate(vfsp);
return;
} else {
if (panicstr && TRANS_ISTRANS(ufsvfsp))
return;
curthread->t_flag |= T_DONTBLOCK;
TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
sbupdate(vfsp);
TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
curthread->t_flag &= ~T_DONTBLOCK;
}
}
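/*
 * Write the inode to disk (ufs_iupdat) with i_contents held as
 * reader, wrapping the update in an async transaction unless the
 * caller already holds one (T_DONTBLOCK).  Do nothing when
 * panicking with logging enabled.
 */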
void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
struct ufsvfs *ufsvfsp;
if (curthread->t_flag & T_DONTBLOCK) {
rw_enter(&ip->i_contents, RW_READER);
ufs_iupdat(ip, waitfor);
rw_exit(&ip->i_contents);
return;
} else {
ufsvfsp = ip->i_ufsvfs;
if (panicstr && TRANS_ISTRANS(ufsvfsp))
return;
curthread->t_flag |= T_DONTBLOCK;
TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
rw_enter(&ip->i_contents, RW_READER);
ufs_iupdat(ip, waitfor);
rw_exit(&ip->i_contents);
TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
curthread->t_flag &= ~T_DONTBLOCK;
}
}
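/*
 * Write the superblock (ufs_sbwrite) under vfs_lock, with the same
 * T_DONTBLOCK and panic handling as ufs_trans_sbupdate() above.
 */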
void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
if (curthread->t_flag & T_DONTBLOCK) {
mutex_enter(&ufsvfsp->vfs_lock);
ufs_sbwrite(ufsvfsp);
mutex_exit(&ufsvfsp->vfs_lock);
return;
} else {
if (panicstr && TRANS_ISTRANS(ufsvfsp))
return;
curthread->t_flag |= T_DONTBLOCK;
TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
mutex_enter(&ufsvfsp->vfs_lock);
ufs_sbwrite(ufsvfsp);
mutex_exit(&ufsvfsp->vfs_lock);
TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
curthread->t_flag &= ~T_DONTBLOCK;
}
}
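/*
 * Push the in-core cylinder group summary information (fs_csp)
 * into the log so its delta can be rolled.
 */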
/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
struct fs *fs;
fs = ufsvfsp->vfs_fs;
mutex_enter(&ufsvfsp->vfs_lock);
TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
(caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
mutex_exit(&ufsvfsp->vfs_lock);
return (0);
}
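/*
 * Push the delayed-write buffer for block bno, if one exists.
 * If no dirty buffer is found, clear and release the placeholder
 * buf and return ENOENT.
 */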
/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
struct buf *bp;
bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
if (bp == NULL)
return (ENOENT);
if (bp->b_flags & B_DELWRI) {
/*
* Do not use brwrite() here since the buffer is already
* marked for retry or not by the code that called
* TRANS_BUF().
*/
UFS_BWRITE(ufsvfsp, bp);
return (0);
}
/*
* If we did not find the real buf for this block above then
* clear the dev so the buf won't be found by mistake
* for this block later. We had to allocate at least a 1 byte
* buffer to keep brelse happy.
*/
if (bp->b_bufsize == 1) {
bp->b_dev = (o_dev_t)NODEV;
bp->b_edev = NODEV;
bp->b_flags = 0;
}
brelse(bp);
return (ENOENT);
}
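/*
 * Push the given inode to disk.  The inode is looked up with
 * ufs_iget(); if it is dirty it is written out with ufs_iupdat(),
 * otherwise ENOENT is returned.
 */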
/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
int error;
struct inode *ip;
/*
* Grab the quota lock (if the file system has not been forcibly
* unmounted).
*/
if (ufsvfsp)
rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);
if (ufsvfsp)
rw_exit(&ufsvfsp->vfs_dqrwlock);
if (error)
return (ENOENT);
if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
rw_enter(&ip->i_contents, RW_READER);
ufs_iupdat(ip, 1);
rw_exit(&ip->i_contents);
VN_RELE(ITOV(ip));
return (0);
}
VN_RELE(ITOV(ip));
return (ENOENT);
}
#ifdef DEBUG
/*
* These routines maintain the metadata map (matamap)
*/
/*
* update the metadata map at mount
*/
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
/*
* wrong file system; keep looking
*/
if (ip->i_ufsvfs != (struct ufsvfs *)arg)
return (0);
/*
* load the metadata map
*/
rw_enter(&ip->i_contents, RW_WRITER);
ufs_trans_mata_iget(ip);
rw_exit(&ip->i_contents);
return (0);
}
void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
struct fs *fs = ufsvfsp->vfs_fs;
ino_t ino;
int i;
/*
* put static metadata into matamap
* superblock
* cylinder groups
* inode groups
* existing inodes
*/
TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);
for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
TRANS_MATAADD(ufsvfsp,
ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
TRANS_MATAADD(ufsvfsp,
ldbtob(fsbtodb(fs, itod(fs, ino))),
fs->fs_ipg * sizeof (struct dinode));
}
(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}
/*
* clear the metadata map at umount
*/
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
top_mataclr(ufsvfsp);
}
/*
* summary info (may be extended during growfs test)
*/
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
fs->fs_cssize);
}
/*
* scan an allocation block (either inode or true block)
*/
static void
ufs_trans_mata_direct(
struct inode *ip,
daddr_t *fragsp,
daddr32_t *blkp,
unsigned int nblk)
{
int i;
daddr_t frag;
ulong_t nb;
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
struct fs *fs = ufsvfsp->vfs_fs;
for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
if ((frag = *blkp) != 0) {
if (*fragsp > fs->fs_frag) {
nb = fs->fs_bsize;
*fragsp -= fs->fs_frag;
} else {
nb = *fragsp * fs->fs_fsize;
*fragsp = 0;
}
TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}
}
/*
* scan an indirect allocation block (either inode or true block)
*/
static void
ufs_trans_mata_indir(
struct inode *ip,
daddr_t *fragsp,
daddr_t frag,
int level)
{
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
struct fs *fs = ufsvfsp->vfs_fs;
int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
int i;
struct buf *bp;
daddr32_t *blkp;
o_mode_t ifmt = ip->i_mode & IFMT;
bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
if (bp->b_flags & B_ERROR) {
brelse(bp);
return;
}
blkp = bp->b_un.b_daddr;
if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
(ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
ufs_trans_mata_direct(ip, fragsp, blkp, ne);
if (level)
for (i = 0; i < ne && *fragsp; ++i, ++blkp)
ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
brelse(bp);
}
/*
* put appropriate metadata into matamap for this inode
*/
void
ufs_trans_mata_iget(struct inode *ip)
{
int i;
daddr_t frags = dbtofsb(ip->i_fs, ip->i_blocks);
o_mode_t ifmt = ip->i_mode & IFMT;
if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
(ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);
if (frags)
ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);
for (i = 0; i < NIADDR && frags; ++i)
if (ip->i_ib[i])
ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}
/*
* freeing possible metadata (block of user data)
*/
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
top_matadel(ufsvfsp, mof, nb);
}
/*
* allocating metadata
*/
void
ufs_trans_mata_alloc(
struct ufsvfs *ufsvfsp,
struct inode *ip,
daddr_t frag,
ulong_t nb,
int indir)
{
struct fs *fs = ufsvfsp->vfs_fs;
o_mode_t ifmt = ip->i_mode & IFMT;
if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
(ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}
#endif /* DEBUG */
/*
* ufs_trans_dir is used to declare a directory delta
*/
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
daddr_t bn;
int contig = 0, error;
ASSERT(ip);
ASSERT(RW_WRITE_HELD(&ip->i_contents));
error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
if (error || (bn == UFS_HOLE)) {
cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
" number error = %d bn = %d\n", error, (int)bn);
if (error == 0) /* treat UFS_HOLE as an I/O error */
error = EIO;
return (error);
}
TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
return (error);
}
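/*
 * Push a modified quota record (dquot) into the log and clear the
 * logging-specific state (DQ_TRANS and the extra dquot hold) that
 * ufs_trans_quota() established.
 */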
/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
/*
* Lock the quota subsystem (ufsvfsp can be NULL
* if the DQ_ERROR is set).
*/
if (ufsvfsp)
rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
mutex_enter(&dqp->dq_lock);
/*
* If this transaction has been cancelled by closedq_scan_inode(),
* then bail out now. We don't call dqput() in this case because
* it has already been done.
*/
if ((dqp->dq_flags & DQ_TRANS) == 0) {
mutex_exit(&dqp->dq_lock);
if (ufsvfsp)
rw_exit(&ufsvfsp->vfs_dqrwlock);
return (0);
}
if (dqp->dq_flags & DQ_ERROR) {
/*
* Paranoia to make sure that there is at least one
* reference to the dquot struct. We are done with
* the dquot (due to an error) so clear logging
* specific markers.
*/
ASSERT(dqp->dq_cnt >= 1);
dqp->dq_flags &= ~DQ_TRANS;
dqput(dqp);
mutex_exit(&dqp->dq_lock);
if (ufsvfsp)
rw_exit(&ufsvfsp->vfs_dqrwlock);
return (1);
}
if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
/*
* Paranoia to make sure that there is at least one
* reference to the dquot struct. Clear the
* modification flag because the operation is now in
* the log. Also clear the logging specific markers
* that were set in ufs_trans_quota().
*/
ASSERT(dqp->dq_cnt >= 1);
dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
dqput(dqp);
}
/*
* At this point, the logging specific flag should be clear,
* but add paranoia just in case something has gone wrong.
*/
ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
mutex_exit(&dqp->dq_lock);
if (ufsvfsp)
rw_exit(&ufsvfsp->vfs_dqrwlock);
return (0);
}
/*
* ufs_trans_quota takes a modified dquot whose disk space has already
* been allocated, places the quota record into the metamap, then
* declares the delta.
*/
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{
struct inode *qip = dqp->dq_ufsvfsp->vfs_qinod;
ASSERT(qip);
ASSERT(MUTEX_HELD(&dqp->dq_lock));
ASSERT(dqp->dq_flags & DQ_MOD);
ASSERT(dqp->dq_mof != 0);
ASSERT(dqp->dq_mof != UFS_HOLE);
/*
* Mark this dquot to indicate that we are starting a logging
* file system operation for this dquot. Also increment the
* reference count so that the dquot does not get reused while
* it is on the mapentry_t list. DQ_TRANS is cleared and the
* reference count is decremented by ufs_trans_push_quota.
*
* If the file system is force-unmounted while there is a
* pending quota transaction, then closedq_scan_inode() will
* clear the DQ_TRANS flag and decrement the reference count.
*
* Since deltamap_add() drops multiple transactions to the
* same dq_mof and ufs_trans_push_quota() won't get called,
* we use DQ_TRANS to prevent repeat transactions from
* incrementing the reference count (or calling TRANS_DELTA()).
*/
if ((dqp->dq_flags & DQ_TRANS) == 0) {
dqp->dq_flags |= DQ_TRANS;
dqp->dq_cnt++;
TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
}
}
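/*
 * Release a hold on a dquot inside its own async TOP_QUOTA
 * transaction, taking vfs_dqrwlock as reader around the dqrele().
 */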
void
ufs_trans_dqrele(struct dquot *dqp)
{
struct ufsvfs *ufsvfsp = dqp->dq_ufsvfsp;
curthread->t_flag |= T_DONTBLOCK;
TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
dqrele(dqp);
rw_exit(&ufsvfsp->vfs_dqrwlock);
TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
curthread->t_flag &= ~T_DONTBLOCK;
}
int ufs_trans_max_resv = TOP_MAX_RESV; /* will be adjusted for testing */
long ufs_trans_avgbfree = 0; /* will be adjusted for testing */
#define TRANS_MAX_WRITE (1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;
/*
* Calculate the log reservation for the given write or truncate
*/
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
long ncg, last2blk;
long niblk = 0;
u_offset_t writeend, offblk;
int resv;
daddr_t nblk, maxfblk;
long avgbfree;
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
struct fs *fs = ufsvfsp->vfs_fs;
long fni = NINDIR(fs);
int bsize = fs->fs_bsize;
/*
* Assume that the request will fit in 1 or 2 cg's,
* resv is the amount of log space to reserve (in bytes).
*/
resv = SIZECG(ip) * 2 + INODESIZE + 1024;
/*
* get max position of write in fs blocks
*/
writeend = offset + resid;
maxfblk = lblkno(fs, writeend);
offblk = lblkno(fs, offset);
/*
* request size in fs blocks
*/
nblk = lblkno(fs, blkroundup(fs, resid));
/*
* Adjust for sparse files
*/
if (trunc)
nblk = MIN(nblk, ip->i_blocks);
/*
* Adjust avgbfree (for testing)
*/
avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;
/*
* Calculate maximum number of blocks of triple indirect
* pointers to write.
*/
last2blk = NDADDR + fni + fni * fni;
if (maxfblk > last2blk) {
long nl2ptr;
long n3blk;
if (offblk > last2blk)
n3blk = maxfblk - offblk;
else
n3blk = maxfblk - last2blk;
niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
nl2ptr = roundup(niblk, fni) / fni + 1;
niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
maxfblk -= n3blk;
}
/*
* calculate maximum number of blocks of double indirect
* pointers to write.
*/
if (maxfblk > NDADDR + fni) {
long n2blk;
if (offblk > NDADDR + fni)
n2blk = maxfblk - offblk;
else
n2blk = maxfblk - NDADDR + fni;
niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
maxfblk -= n2blk;
}
/*
* Add in indirect pointer block write
*/
if (maxfblk > NDADDR) {
niblk += 1;
}
/*
* Calculate deltas for indirect pointer writes
*/
resv += niblk * (fs->fs_bsize + sizeof (struct delta));
/*
* maximum number of cg's needed for request
*/
ncg = nblk / avgbfree;
if (ncg > fs->fs_ncg)
ncg = fs->fs_ncg;
/*
* maximum amount of log space needed for request
*/
if (ncg > 2)
resv += (ncg - 2) * SIZECG(ip);
return (resv);
}
/*
* Calculate the amount of log space that needs to be reserved for this
* trunc request. If the amount of log space is too large, then
* calculate the size that the request needs to be split into.
*/
void
ufs_trans_trunc_resv(
struct inode *ip,
u_offset_t length,
int *resvp,
u_offset_t *residp)
{
ulong_t resv;
u_offset_t size, offset, resid;
int nchunks;
/*
* *resvp is the amount of log space to reserve (in bytes).
* When nonzero, *residp is the number of bytes to truncate.
*/
*residp = 0;
if (length < ip->i_size) {
size = ip->i_size - length;
} else {
resv = SIZECG(ip) * 2 + INODESIZE + 1024;
/*
* truncate up, doesn't really use much space,
* the default above should be sufficient.
*/
goto done;
}
offset = length;
resid = size;
nchunks = 1;
for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
offset = length + (nchunks - 1) * resid) {
nchunks++;
resid = size / nchunks;
}
/*
* If this request takes too much log space, it will be split
*/
if (nchunks > 1) {
*residp = resid;
}
done:
*resvp = resv;
}
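/*
 * Truncate a file to `length' under the transaction protocol.  The
 * log reservation comes from ufs_trans_trunc_resv(); if the truncate
 * is too large for a single transaction, the file is trimmed in
 * chunks (from the top down) in separate transactions until the
 * target length is reached.
 */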
int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
int err, issync, resv;
u_offset_t resid;
int do_block = 0;
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
struct fs *fs = ufsvfsp->vfs_fs;
/*
* Not logging; just do the trunc
*/
if (!TRANS_ISTRANS(ufsvfsp)) {
rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
rw_enter(&ip->i_contents, RW_WRITER);
err = ufs_itrunc(ip, length, flags, cr);
rw_exit(&ip->i_contents);
rw_exit(&ufsvfsp->vfs_dqrwlock);
return (err);
}
/*
* within the lockfs protocol but *not* part of a transaction
*/
do_block = curthread->t_flag & T_DONTBLOCK;
curthread->t_flag |= T_DONTBLOCK;
/*
* Trunc the file (in pieces, if necessary)
*/
again:
ufs_trans_trunc_resv(ip, length, &resv, &resid);
TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
rw_enter(&ip->i_contents, RW_WRITER);
if (resid) {
/*
* resid is only set if we have to truncate in chunks
*/
ASSERT(length + resid < ip->i_size);
/*
* Partially trunc file down to desired size (length).
* Only retain I_FREE on the last partial trunc.
* Round up size to a block boundary, to ensure the truncate
* doesn't have to allocate blocks. This is done both for
* performance and to fix a bug where if the block can't be
* allocated then the inode delete fails, but the inode
* is still freed with attached blocks and non-zero size
* (bug 4348738).
*/
err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
flags & ~I_FREE, cr);
ASSERT(ip->i_size != length);
} else
err = ufs_itrunc(ip, length, flags, cr);
if (!do_block)
curthread->t_flag &= ~T_DONTBLOCK;
rw_exit(&ip->i_contents);
rw_exit(&ufsvfsp->vfs_dqrwlock);
TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);
if ((err == 0) && resid) {
ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
goto again;
}
return (err);
}
/*
* Fault in the pages of the first n bytes specified by the uio structure.
* 1 byte in each page is touched and the uio struct is unmodified.
* Any error will terminate the process as this is only a best
* attempt to get the pages resident.
*/
static void
ufs_trans_touch(ssize_t n, struct uio *uio)
{
struct iovec *iov;
ulong_t cnt, incr;
caddr_t p;
uint8_t tmp;
iov = uio->uio_iov;
while (n) {
cnt = MIN(iov->iov_len, n);
if (cnt == 0) {
/* empty iov entry */
iov++;
continue;
}
n -= cnt;
/*
* touch each page in this segment.
*/
p = iov->iov_base;
while (cnt) {
switch (uio->uio_segflg) {
case UIO_USERSPACE:
case UIO_USERISPACE:
if (fuword8(p, &tmp))
return;
break;
case UIO_SYSSPACE:
if (kcopy(p, &tmp, 1))
return;
break;
}
incr = MIN(cnt, PAGESIZE);
p += incr;
cnt -= incr;
}
/*
* touch the last byte in case it straddles a page.
*/
p--;
switch (uio->uio_segflg) {
case UIO_USERSPACE:
case UIO_USERISPACE:
if (fuword8(p, &tmp))
return;
break;
case UIO_SYSSPACE:
if (kcopy(p, &tmp, 1))
return;
break;
}
iov++;
}
}
/*
* Calculate the amount of log space that needs to be reserved for this
* write request. If the amount of log space is too large, then
* calculate the size that the request needs to be split into.
* First try fixed chunks of size ufs_trans_max_resid. If that
* is too big, iterate down to the largest size that will fit.
* Pagein the pages in the first chunk here, so that the pagein is
* avoided later when the transaction is open.
*/
void
ufs_trans_write_resv(
struct inode *ip,
struct uio *uio,
int *resvp,
int *residp)
{
ulong_t resv;
offset_t offset;
ssize_t resid;
int nchunks;
*residp = 0;
offset = uio->uio_offset;
resid = MIN(uio->uio_resid, ufs_trans_max_resid);
resv = ufs_log_amt(ip, offset, resid, 0);
if (resv <= ufs_trans_max_resv) {
ufs_trans_touch(resid, uio);
if (resid != uio->uio_resid)
*residp = resid;
*resvp = resv;
return;
}
resid = uio->uio_resid;
nchunks = 1;
for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
offset = uio->uio_offset + (nchunks - 1) * resid) {
nchunks++;
resid = uio->uio_resid / nchunks;
}
ufs_trans_touch(resid, uio);
/*
* If this request takes too much log space, it will be split
*/
if (nchunks > 1)
*residp = resid;
*resvp = resv;
}
/*
* Issue write request.
*
* Split a large request into smaller chunks.
*/
int
ufs_trans_write(
struct inode *ip,
struct uio *uio,
int ioflag,
cred_t *cr,
int resv,
long resid)
{
long realresid;
int err;
struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
/*
* Since the write is too big and would "HOG THE LOG", it needs to
* be broken up and done in pieces.  NOTE: the caller will
* issue the EOT after the request has been completed.
*/
realresid = uio->uio_resid;
again:
/*
* Perform partial request (uiomove will update uio for us)
* Request is split up into "resid" size chunks until
* "realresid" bytes have been transferred.
*/
uio->uio_resid = MIN(resid, realresid);
realresid -= uio->uio_resid;
err = wrip(ip, uio, ioflag, cr);
/*
* Error or request is done; caller issues final EOT
*/
if (err || uio->uio_resid || (realresid == 0)) {
uio->uio_resid += realresid;
return (err);
}
/*
* Generate EOT for this part of the request
*/
rw_exit(&ip->i_contents);
rw_exit(&ufsvfsp->vfs_dqrwlock);
if (ioflag & (FSYNC|FDSYNC)) {
TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
} else {
TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
}
/*
* Make sure the input buffer is resident before starting
* the next transaction.
*/
ufs_trans_touch(MIN(resid, realresid), uio);
/*
* Generate BOT for next part of the request
*/
if (ioflag & (FSYNC|FDSYNC)) {
int error;
TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
ASSERT(!error);
} else {
TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
}
rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
rw_enter(&ip->i_contents, RW_WRITER);
/*
* Error during EOT (probably device error while writing commit rec)
*/
if (err)
return (err);
goto again;
}