/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#include <sys/condvar_impl.h>
#include <sys/sysmacros.h>
static daddr_t fragextend();
static daddr_t alloccgblk();
static int findlogstartcg();
void delay();
/*
* Allocate a block in the file system.
*
* The size of the requested block is given, which must be some
* multiple of fs_fsize and <= fs_bsize.
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate a block:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate a block in the same cylinder group.
* 4) quadratically rehash into other cylinder groups, until an
* available block is located.
* If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
* 2) quadratically rehash into other cylinder groups, until an
* available block is located.
*/
int
{
int cg;
int err;
" bsize = %d, size = %d, fs = %s\n",
return (err);
}
goto nospace;
goto nospace;
/* Note that we may not have err, but may have errmsg */
}
if (err)
return (err);
bpref = 0;
if (bpref == 0)
else
if (bno > 0) {
return (0);
}
/*
* hashalloc() failed because some other thread grabbed
* the last block so unwind the quota operation. We can
* ignore the return because subtractions don't fail and
* size is guaranteed to be >= zero by our caller.
*/
now = ddi_get_lbolt();
}
return (ENOSPC);
}
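/*
* Illustrative sketch of the size rule stated above: a request must be a
* positive multiple of the fragment size and no larger than one block.
* The parameter names (bsize, fsize, size) are hypothetical stand-ins for
* fs_bsize, fs_fsize and the requested size; this is not the driver's own
* validation code, just the rule it enforces.
*/
static int
sketch_valid_alloc_size(int bsize, int fsize, int size)
{
        /* reject zero-length, oversize, and non-fragment-multiple requests */
        if (size <= 0 || size > bsize)
                return (0);
        if (size % fsize != 0)
                return (0);
        return (1);
}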
/*
* Reallocate a fragment to a bigger size
*
* The number and size of the old block is given, and a preference
* and new size is also specified. The allocator attempts to extend
* the original block. Failing that, the regular block allocator is
* invoked to get an appropriate block.
*/
int
{
int err;
"realloccg: bad size, dev=0x%lx, bsize=%d, "
"osize=%d, nsize=%d, fs=%s\n",
return (err);
}
goto nospace;
if (bprev == 0) {
"realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
return (err);
}
/* Note that we may not have err, but may have errmsg */
}
if (err)
return (err);
if (bno != 0) {
return (0);
}
bpref = 0;
/*
* When optimizing for time we allocate a full block and
* then only use the upper portion for this request. When
* this file grows again it will grow into the unused portion
* of the block (See fragextend() above). This saves time
* because an extra disk write would be needed if the frags
* following the current allocation were not free. The extra
* disk write is needed to move the data from its current
* location into the newly allocated position.
*
* When optimizing for space we allocate a run of frags
* that is just the right size for this request.
*/
if (bno > 0) {
return (0);
}
/*
* hashalloc() failed because some other thread grabbed
* the last block so unwind the quota operation. We can
* ignore the return because subtractions don't fail, and
* our caller guarantees nsize >= osize.
*/
now = ddi_get_lbolt();
}
return (ENOSPC);
}
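/*
* Illustrative sketch of the request-sizing policy described in the comment
* above: when optimizing for time a whole block is requested, when
* optimizing for space only enough whole fragments to cover nsize are
* requested.  The names are hypothetical; in the real code the choice is
* driven by fs_optim and the result is handed to the block allocator.
*/
static int
sketch_realloc_request(int opt_time, int bsize, int fsize, int nsize)
{
        if (opt_time)
                return (bsize);         /* grab a whole block */
        /* round nsize up to a whole number of fragments */
        return (((nsize + fsize - 1) / fsize) * fsize);
}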
/*
* Allocate an inode in the file system.
*
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate an inode:
* 1) allocate the requested inode.
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
* If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
* available inode is located.
*/
int
{
int cg;
int err;
int nifree;
loop:
if (nifree == 0)
goto noinodes;
/*
* Shadow inodes don't count against a user's inode allocation.
* They are an implementation method and not a resource.
*/
/*
* As we haven't acquired any locks yet, dump the message
* now.
*/
}
if (err)
return (err);
}
ipref = 0;
if (ino == 0) {
/*
* We can safely ignore the return from chkiq()
* because deallocations can only fail if we
* can't get the user's quota info record off
* the disk due to an I/O error. In that case,
* the quota subsystem is already messed up.
*/
}
goto noinodes;
}
if (err) {
/*
* See above comment about why it is safe to ignore an
* error return here.
*/
}
return (err);
}
/*
* Check if we really got a free inode; if not, complain
* and mark the inode ISTALE so that it will eventually be freed
* by the ufs idle thread and will not be sent to ufs_delete().
*/
"%s: unexpected allocated inode %d, run fsck(1M)%s",
goto loop;
}
/*
* Check that the inode has no size or data blocks.
* This could have happened if the truncation failed when
* deleting the inode. It used to be possible for this to occur
* if a block allocation failed when iteratively truncating a
* large file using logging and with a full file system.
* This was fixed with bug fix 4348738. However, truncation may
* still fail on an I/O error. So in all cases for safety and
* security we clear out the size; the blocks allocated; and
* pointers to the blocks. This will ultimately cause a fsck
* error of unaccounted-for blocks, but it's a fairly benign error,
* and possibly the correct thing to do anyway as accessing those
* blocks again may lead to more I/O errors.
*/
int i;
"%s: free inode %d had size 0x%llx, run fsck(1M)%s",
}
/*
* Clear any garbage left behind.
*/
for (i = 0; i < NDADDR; i++)
        ip->i_db[i] = 0;
for (i = 0; i < NIADDR; i++)
        ip->i_ib[i] = 0;
}
/*
* Initialize the link count
*/
/*
* Clear the old flags
*/
/*
* Access times are not really defined if the fs is mounted
* with 'noatime'. But it can cause nfs clients to fail
* open() if the atime is not a legal value. Set a legal value
* here when the inode is allocated.
*/
if (ufsvfsp->vfs_noatime) {
}
return (0);
return (ENOSPC);
}
/*
* Find a cylinder group to place a directory.
* Returns an inumber within the selected cylinder group.
* Note, the vfs_lock is not needed as we don't require exact cg summary info.
*
* If the switch ufs_close_dirs is set, then the policy is to use
* the current cg if it has more than 25% free inodes and more
* than 25% free blocks. Otherwise the cgs are searched from
* the beginning and the first cg with the same criteria is
* used. If that is also null then we revert to the old algorithm.
* This tends to cluster files at the beginning of the disk
* until the disk gets full.
*
* Otherwise if ufs_close_dirs is not set then the original policy is
* used which is to select from among those cylinder groups with
* above the average number of free inodes, the one with the smallest
* number of directories.
*/
{
if (ufs_close_dirs &&
}
mincg = 0;
if (ufs_close_dirs &&
}
}
}
}
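/*
* Illustrative sketch of the ufs_close_dirs policy described above.
* nifree[]/nbfree[] are hypothetical stand-ins for the per-cylinder-group
* free inode and free block counts kept in the cg summary information, and
* ipg/bpg for the inode and data block capacity of a group.  A return of -1
* means no group passed the 25% test and the caller falls back to the old
* policy.
*/
static int
sketch_close_dirs_cg(int curcg, int ncg, const long *nifree,
    const long *nbfree, long ipg, long bpg)
{
        int cg;

        /* prefer the current cg if it is more than 25% free */
        if (nifree[curcg] * 4 > ipg && nbfree[curcg] * 4 > bpg)
                return (curcg);

        /* otherwise take the first cg from the start that qualifies */
        for (cg = 0; cg < ncg; cg++)
                if (nifree[cg] * 4 > ipg && nbfree[cg] * 4 > bpg)
                        return (cg);

        return (-1);            /* revert to the old algorithm */
}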
/*
* Select the desired position for the next block in a file. The file is
* logically divided into sections. The first section is composed of the
* direct blocks. Each additional section contains fs_maxbpg blocks.
*
* If no blocks have been allocated in the first section, the policy is to
* request a block in the same cylinder group as the inode that describes
* the file. If no blocks have been allocated in any other section, the
* policy is to place the section in a cylinder group with a greater than
* average number of free blocks. An appropriate cylinder group is found
* by using a rotor that sweeps the cylinder groups. When a new group of
* blocks is needed, the sweep begins in the cylinder group following the
* cylinder group from which the previous allocation was made. The sweep
* continues until a cylinder group with greater than the average number
* of free blocks is found. If the allocation is for the first block in an
* indirect block, the information on the previous allocation is unavailable;
* here a best guess is made based upon the logical block number being
* allocated.
*
* If a section is already partially allocated, the policy is to
* contiguously allocate fs_maxcontig blocks. The end of one of these
* contiguous blocks and the beginning of the next is physically separated
* so that the disk head will be in transit between them for at least
* fs_rotdelay milliseconds. This is to allow time for the processor to
* schedule another I/O transfer.
*/
{
int cg;
}
/*
* Find a cylinder with greater than average
* number of unused data blocks.
*/
else
/*
*/
}
}
return (NULL);
}
/*
* One or more previous blocks have been laid out. If less
* than fs_maxcontig previous blocks are contiguous, the
* next block is requested contiguously, otherwise it is
* requested rotationally delayed by fs_rotdelay milliseconds.
*/
/*
* Provision for fallocate to return positive
* blk preference based on last allocation
*/
} else {
}
return (nextblk);
}
if (fs->fs_rotdelay != 0)
/*
* Here we convert ms of delay to frags as:
* (frags) = (ms) * (rev/sec) * (sect/rev) /
* ((sect/frag) * (ms/sec))
* then round up to the next block.
*/
return (nextblk);
}
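/*
* Illustrative sketch of the rotor sweep described above for starting a new
* section: pick the first cylinder group, at or after startcg, whose free
* block count is at least the file-system-wide average.  nbfree[], totfree
* and ncg are hypothetical stand-ins for the cg summary counts, the total
* free block count and fs_ncg.
*/
static int
sketch_blkpref_sweep(int startcg, int ncg, const long *nbfree, long totfree)
{
        long avgbfree = totfree / ncg;
        int cg;

        for (cg = startcg; cg < ncg; cg++)
                if (nbfree[cg] >= avgbfree)
                        return (cg);
        for (cg = 0; cg < startcg; cg++)
                if (nbfree[cg] >= avgbfree)
                        return (cg);
        return (-1);            /* no group is above average; caller punts */
}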
/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*/
void
{
int i;
int *blktot;
short *blks;
/*
* fallocate'd files will have negative block address.
* So negate it again to get original block address.
*/
}
"free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
return;
}
return;
}
if (!(flags & I_NOCANCEL))
}
"dev:0x%lx, block:%ld, ino:%lu, fs:%s",
return;
}
}
} else {
/*
* Decrement the counts associated with the old frags
*/
/*
* Deallocate the fragment
*/
"free: freeing free frag, "
"dev:0x%lx, blk:%ld, cg:%d, "
"ino:%lu, fs:%s",
bno + i,
return;
}
}
}
/*
* Add back in counts associated with the new frags
*/
/*
* If a complete block has been reassembled, account for it
*/
}
}
}
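/*
* Illustrative sketch of the sign convention noted in the comment above for
* fallocate'd blocks: a pre-allocated but uninitialized block is stored as
* the negative of its disk address, and the free path negates it again to
* recover the real address.  These helpers are hypothetical, shown only to
* make the convention concrete.
*/
static long
sketch_fallocate_encode(long bno)
{
        return (-bno);          /* mark as allocated-but-uninitialized */
}

static long
sketch_fallocate_decode(long bno)
{
        return (bno < 0 ? -bno : bno);  /* recover the on-disk address */
}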
/*
* Free an inode.
*
* The specified inode is placed back in the free map.
*/
void
{
unsigned int inot;
int cg;
char *iused;
"ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
"fs = %s\n",
return;
}
"ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
return;
}
return;
}
"mode: (imode) %o, (omode) %o, ino:%d, "
"fs:%s",
return;
}
}
}
/*
* Implement the cylinder overflow algorithm.
*
* The policy implemented by this algorithm is:
* 1) allocate the block in its requested cylinder group.
* 2) quadratically rehash on the cylinder group number.
* 3) brute force search for a free block.
* The size parameter means size for data blocks, mode for inodes.
*/
static ino_t
{
int i;
long result;
/*
* 1: preferred cylinder group
*/
if (result)
return (result);
/*
* 2: quadratic rehash
*/
cg += i;
if (result)
return (result);
}
/*
* 3: brute force search
* Note that we start at i == 2, since 0 was checked initially,
* and 1 is always checked in the quadratic rehash.
*/
if (result)
return (result);
cg++;
cg = 0;
}
return (NULL);
}
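/*
* Illustrative sketch of the three-stage search implemented by hashalloc():
* preferred cylinder group, quadratic rehash, then brute force starting two
* groups past the original.  try_cg and arg are hypothetical stand-ins for
* the per-cg allocator (alloccg, ialloccg, ...) and its state; it returns
* nonzero on success.
*/
static long
sketch_hashalloc(int cg, int ncg, long (*try_cg)(void *arg, int cg), void *arg)
{
        long result;
        int i, icg = cg;

        /* 1: preferred cylinder group */
        result = try_cg(arg, cg);
        if (result)
                return (result);

        /* 2: quadratic rehash */
        for (i = 1; i < ncg; i *= 2) {
                cg += i;
                if (cg >= ncg)
                        cg -= ncg;
                result = try_cg(arg, cg);
                if (result)
                        return (result);
        }

        /* 3: brute force, starting at i == 2 (0 and 1 already covered) */
        cg = (icg + 2) % ncg;
        for (i = 2; i < ncg; i++) {
                result = try_cg(arg, cg);
                if (result)
                        return (result);
                cg++;
                if (cg == ncg)
                        cg = 0;
        }
        return (0);
}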
/*
* Determine whether a fragment can be extended.
*
* Check to see if the necessary fragments are available, and
* if they are, allocate them.
*/
static daddr_t
{
long bno;
int i, j;
return (NULL);
/* cannot extend across a block boundary */
return (NULL);
}
return (NULL);
}
return (NULL);
}
return (NULL);
}
}
/*
* The current fragment can be extended,
* deduct the count on fragment being extended into
* increase the count on the remaining fragment (if any)
* allocate the extended piece.
*/
break;
if (i != frags)
}
}
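/*
* Illustrative sketch of the fragextend() test described above.  freemap[]
* is a hypothetical per-fragment free map (nonzero = free), bno the first
* fragment of the existing allocation, osize/nsize the old and new sizes in
* fragments, and frag the number of fragments per block.
*/
static int
sketch_fragextend_ok(char *freemap, long bno, int osize, int nsize, int frag)
{
        int i;

        /* cannot extend across a block boundary */
        if ((bno % frag) + nsize > frag)
                return (0);

        /* every additional fragment must currently be free */
        for (i = osize; i < nsize; i++)
                if (!freemap[bno + i])
                        return (0);

        /* claim the additional fragments */
        for (i = osize; i < nsize; i++)
                freemap[bno + i] = 0;
        return (1);
}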
/*
* Determine whether a block can be allocated.
*
* Check to see if a block of the appropriate size
* is available, and if it is, allocate it.
*/
static daddr_t
{
int allocsiz;
int i;
/*
* Searching for space could be time-consuming, so do some
* up-front checking to verify that there is actually space
* available (free blocks or free frags).
*/
return (0);
/*
* If there are not enough free frags then return.
*/
return (0);
}
return (0);
}
goto errout;
return (bno);
}
/*
* Check fragment bitmap to see if any fragments are already available.
* mapsearch() may fail because the fragment that fits this request
* might still be on the cancel list and not available for re-use yet.
* Look for a bigger sized fragment to allocate first before we have
* to give up and fragment a whole new block eventually.
*/
break;
allocsiz++;
goto next_size;
}
}
/*
* No fragments were available, so a block
* will be allocated and hacked up.
*/
goto errout;
goto errout;
return (bno);
}
for (i = 0; i < frags; i++)
}
return (0);
}
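/*
* Illustrative sketch of the fragment-sizing step used by alloccg():
* choose the smallest free run that is at least as long as the request;
* if none exists, a whole block must be split up.  frsum[] is a
* hypothetical stand-in for the cg_frsum[] table counting free runs of
* each length, frag the fragments per block, and frags the length needed.
*/
static int
sketch_pick_allocsiz(const int *frsum, int frag, int frags)
{
        int allocsiz;

        /* smallest run that is at least as long as the request */
        for (allocsiz = frags; allocsiz < frag; allocsiz++)
                if (frsum[allocsiz] != 0)
                        return (allocsiz);

        return (frag);          /* fall back to carving up a full block */
}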
/*
* Allocate a block in a cylinder group.
*
* This algorithm implements the following policy:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate the next available block on the block rotor for the
* specified cylinder group.
* Note that this routine only allocates fs_bsize blocks; these
* blocks may be fragmented by the routine that allocates them.
*/
static daddr_t
{
short *cylbp;
int i;
short *blks;
if (bpref == 0) {
goto norot;
}
/*
* If the requested block is available, use it.
*/
goto gotit;
}
/*
* Check for a block available on the same cylinder.
*/
goto norot;
/*
* Block layout info is not available, so just
* have to take any block in this cylinder.
*/
goto norot;
}
/*
* Check the summary information to see if a block is
* available in the requested cylinder starting at the
* requested rotational position and proceeding around.
*/
if (cylbp[i] > 0)
break;
for (i = 0; i < pos; i++)
if (cylbp[i] > 0)
break;
if (cylbp[i] > 0) {
/*
* Found a rotational position, now find the actual
* block. A "panic" if none is actually there.
*/
/*
* Up to this point, "pos" has referred to the rotational
* position of the desired block. From now on, it holds
* the offset of the current cylinder within a cylinder
* cycle. (A cylinder cycle refers to a set of cylinders
* which are described by a single rotational table; the
* size of the cycle is fs_cpc.)
*
* bno is set to the block number of the first block within
* the current cylinder cycle.
*/
/*
* The blocks within a cylinder are grouped into equivalence
* classes according to their "rotational position." There
* are two tables used to determine these classes.
*
* The positional offset table (fs_postbl) has an entry for
* each rotational position of each cylinder in a cylinder
* cycle. This entry contains the relative block number
* (counting from the start of the cylinder cycle) of the
* first block in the equivalence class for that position
* and that cylinder. Positions for which no blocks exist
* are indicated by a -1.
*
* The rotational delta table (fs_rotbl) has an entry for
* each block in a cylinder cycle. This entry contains
* the offset from that block to the next block in the
* same equivalence class. The last block in the class
* is indicated by a zero in the table.
*
* The following code, then, walks through all of the blocks
* in the cylinder (cylno) which we're allocating within
* which are in the equivalence class for the rotational
* position (i) which we're allocating within.
*/
"alloccgblk: cyl groups corrupted, pos = %d, "
return (0);
}
/*
* There is one entry in the rotational table for each block
* in the cylinder cycle. These are whole blocks, not frags.
*/
/*
* As we start, "i" is the rotational position within which
* we're searching. After the next line, it will be a block
* number (relative to the start of the cylinder cycle)
* within the equivalence class of that rotational position.
*/
for (;;) {
goto gotit;
}
if (delta <= 0 || /* End of chain, or */
break; /* If so, panic. */
i += delta;
}
"alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
return (0);
}
/*
* No blocks in the requested cylinder, so take
* next available one in this cylinder group.
*/
if (bno < 0)
return (0);
goto norot;
/*
*/
return (frag);
}
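/*
* Illustrative sketch of the positional/rotational table walk described
* above.  postbl_entry is the first block (relative to the cylinder cycle)
* in the equivalence class for the chosen rotational position, rotbl[] the
* per-block delta to the next class member (0 ends the chain), and is_free
* a hypothetical predicate over relative block numbers.  A return of -1
* means the chain was exhausted, which the real code treats as corruption.
*/
static int
sketch_rotwalk(int postbl_entry, const short *rotbl,
    int (*is_free)(void *arg, int relblk), void *arg)
{
        int i = postbl_entry;
        int delta;

        if (i < 0)
                return (-1);    /* no blocks at this rotational position */

        for (;;) {
                if (is_free(arg, i))
                        return (i);
                delta = rotbl[i];
                if (delta <= 0) /* end of chain */
                        break;
                i += delta;
        }
        return (-1);
}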
/*
* Determine whether an inode can be allocated.
*
* Check to see if an inode is available, and if it is,
* allocate it using the following policy:
* 1) allocate the requested inode.
* 2) allocate the next available inode after the requested
* inode in the specified cylinder group.
*/
static ino_t
{
char *iused;
return (0);
return (0);
}
/*
* While we are waiting for the mutex, someone may have taken
* the last available inode. Need to recheck.
*/
return (0);
}
if (ipref) {
goto gotit;
}
if (loc == 0) {
start = 0;
if (loc == 0) {
"ialloccg: map corrupted, cg = %d, irotor = %d, "
return (0);
}
}
if ((map & i) == 0) {
goto gotit;
}
}
return (0);
}
}
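/*
* Illustrative sketch of the inode map scan described above: starting at a
* rotor position, find the first byte of the used-inode bitmap with a clear
* bit, wrapping to the start of the map if necessary.  iused[] and len are
* hypothetical stand-ins for the cg inode map and its size in bytes; the
* return value is the inode number within the cg, or -1 if the map is
* (unexpectedly) full.
*/
static int
sketch_ialloccg_scan(const unsigned char *iused, int len, int start)
{
        int i, bit;

        for (i = 0; i < len; i++) {
                int byte = (start + i) % len;

                if (iused[byte] == 0xff)
                        continue;       /* every inode in this byte is used */
                for (bit = 0; bit < 8; bit++)
                        if ((iused[byte] & (1 << bit)) == 0)
                                return (byte * 8 + bit);
        }
        return (-1);            /* map corrupted or cg actually full */
}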
/*
* Find a block of the specified size in the specified cylinder group.
*
* It is a panic if a request is made to find a block if none are
* available.
*/
static daddr_t
int allocsiz)
{
int gotit;
/*
* ufsvfs->vfs_lock is held when calling this.
*/
/*
* Find the fragment by searching through the
* free block map for an appropriate bit pattern.
*/
if (bpref)
else
/*
* the following loop performs two scans -- the first scan
* searches the bottom half of the array for a match and the
* second scan searches the top half of the array. The loops
* have been merged just to make things difficult.
*/
secondtime = 0;
/*
* search the array for a match
*/
/*
* match found
*/
if (loc) {
/*
* Found the byte in the map, sift
* through the bits to find the selected frag
*/
gotit = 0;
blk <<= 1;
for (pos = 0;
pos++) {
gotit++;
break;
}
field <<= 1;
subfield <<= 1;
}
if (gotit)
break;
}
/*
* success if block is *not* being converted from
* metadata into userdata (harpy). If so, ignore.
*/
if (!TRANS_ISCANCEL(ufsvfsp,
return (bno);
/*
* keep looking -- this block is being converted
*/
loc = 0;
continue;
}
/*
* no usable matches in bottom half -- now search the top half
*/
if (secondtime)
/*
* no usable matches in top half -- all done
*/
break;
secondtime = 1;
first = 0;
}
/*
* no usable matches
*/
return ((daddr_t)-1);
}
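/*
* Illustrative sketch of the merged two-pass scan used by mapsearch():
* search from the preferred starting byte to the end of the map, and if
* nothing usable is found, search from the beginning of the map up to that
* starting point.  match() is a hypothetical predicate standing in for the
* bit-pattern test against fragtbl[]; the return value is the matching
* index, or -1 if both halves come up empty.
*/
static int
sketch_mapsearch_scan(int start, int len,
    int (*match)(void *arg, int loc), void *arg)
{
        int first = start;
        int last = len;
        int secondtime = 0;
        int loc;

        for (;;) {
                for (loc = first; loc < last; loc++)
                        if (match(arg, loc))
                                return (loc);
                if (secondtime)
                        break;          /* both halves searched, give up */
                secondtime = 1;
                last = first;           /* second pass covers [0, start) */
                first = 0;
        }
        return (-1);
}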
/*
* Acquire a write lock, and keep trying till we get it
*/
static int
{
int err = 0;
do {
if (err)
return (err);
} while (!LOCKFS_IS_ULOCK(lf));
goto lockagain;
return (err);
}
/*
* Release the write lock
*/
static int
{
int err = 0;
return (err);
}
struct allocsp_undo {
};
/*
* ufs_allocsp() can be used to pre-allocate blocks for a file on a given
* file system. For direct blocks, the blocks are allocated from the offset
* requested to the block boundary, then any full blocks are allocated,
* and finally any remainder.
* For indirect blocks the blocks are not initialized and are
* only marked as allocated. These addresses are then stored as negative
* block numbers in the inode to imply special handling. UFS has been modified
* where necessary to understand this new notion.
* Successfully fallocated files will have IFALLOCATE cflag set in the inode.
*/
int
{
int cnt = 0;
goto out_allocsp;
}
return (EINVAL);
/* Quickly check to make sure we have space before we proceed */
if (TRANS_ISTRANS(ufsvfsp)) {
return (ENOSPC);
} else
return (ENOSPC);
}
/*
* We will keep i_rwlock locked as WRITER throughout the function
* since we don't want anyone else reading or writing to the inode
* while we are in the middle of fallocating the file.
*/
/* Back up the direct block list, used for undo later if necessary */
for (i = 0; i < NDADDR; i++)
/* Write lock the file system */
goto exit;
/*
* Allocate any direct blocks now.
* Blocks are allocated from the offset requested to the block
* boundary, then any full blocks are allocated, and finally any
* remainder.
*/
done_len = 0;
/* Yikes error, quit */
if (berr) {
TOP_ALLOCSP, resv);
goto exit;
}
if (allocblk) {
totblks++;
}
}
/* start offset for indirect allocation */
}
/* Break the transactions into vfs_iotransz units */
/* Now go about fallocating necessary indirect blocks */
if (berr) {
TOP_ALLOCSP, resv);
goto exit;
}
/* Update the blk counter only if new block was added */
if (allocblk) {
/* Save undo information */
KM_SLEEP);
totblks++;
}
cnt++;
/* Being a good UFS citizen, let others get a share */
/*
* If there are waiters or the fs is hard locked,
* error locked, or read-only error locked,
* quit with EIO
*/
TOP_ALLOCSP, resv);
return (EIO);
}
/* End the current transaction */
TOP_ALLOCSP, resv);
/* Release the write lock */
goto exit;
/* Wake up others waiting to do operations */
/* Grab the write lock again */
goto exit;
} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
/* Reserve more space in log for this file */
cnt = 0; /* reset cnt b/c of new transaction */
}
}
/* If the file has grown then correct the file size */
/* Release locks, end log transaction and unlock fs */
/*
* @ exit label, we should no longer be holding the fs write lock, and
* all logging transactions should have been ended. We still hold
* ip->i_rwlock.
*/
exit:
/*
* File has grown larger than 2GB. Set flag
* in superblock to indicate this, if it
* is not already set.
*/
}
/*
* Since we couldn't allocate completely, we will undo the allocations.
*/
if (berr) {
/* Direct blocks */
for (i = 0; i < NDADDR; i++) {
/*
* Only free the block if they are not same, and
* the old one isn't zero (the fragment was
* re-allocated).
*/
}
}
/* Undo the indirect blocks */
if (err)
"undo allocation of block %ld",
}
return (berr);
}
/*
* Don't forget to free the undo chain :)
*/
}
return (err);
}
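/*
* Illustrative sketch of how the direct-block portion of a pre-allocation
* request is carved up, as described in the comment above ufs_allocsp():
* first from the starting offset to the next block boundary, then whole
* blocks, then any remainder.  bsize stands in for fs_bsize; alloc_range is
* a hypothetical callback and does not correspond to a real UFS routine.
*/
static int
sketch_split_prealloc(long long off, long long len, int bsize,
    int (*alloc_range)(void *arg, long long off, long long len), void *arg)
{
        long long head, nblocks, tail;
        int err;

        head = (bsize - (off % bsize)) % bsize; /* bytes up to block boundary */
        if (head > len)
                head = len;
        if (head != 0 && (err = alloc_range(arg, off, head)) != 0)
                return (err);
        off += head;
        len -= head;

        nblocks = len / bsize;                  /* whole blocks */
        tail = len % bsize;                     /* remainder */
        if (nblocks != 0 &&
            (err = alloc_range(arg, off, nblocks * bsize)) != 0)
                return (err);
        off += nblocks * bsize;
        if (tail != 0 && (err = alloc_range(arg, off, tail)) != 0)
                return (err);
        return (0);
}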
/*
* Free storage space associated with the specified inode. The portion
* to be freed is specified by lp->l_start and lp->l_len (already
* normalized to a "whence" of 0).
*
* This is an experimental facility whose continued existence is not
* guaranteed. Currently, we only support the special case
* of l_len == 0, meaning free to end of file.
*
* Blocks are freed in reverse order. This FILO algorithm will tend to
* maintain a contiguous free list much longer than FIFO.
* See also ufs_itrunc() in ufs_inode.c.
*
* Bug: unused bytes in the last retained block are not cleared.
* This may result in a "hole" in the file that does not read as zeroes.
*/
/* ARGSUSED */
int
{
int i;
int error;
return (EINVAL);
return (0);
}
/*
* Check if there is any active mandatory lock on the
*/
/*
* "Truncate up" case: need to make sure there
* is no lock beyond current end-of-file. To
* do so, we need to set l_start to the size
* of the file temporarily.
*/
}
return (i ? i : EAGAIN);
}
}
/*
* Make sure a write isn't in progress (allocating blocks)
* by acquiring i_rwlock (we promised ufs_bmap we wouldn't
* truncate while it was allocating blocks).
* Grab the locks in the right order.
*/
return (error);
}
/*
* Find a cg with as close to nb contiguous bytes as possible
* THIS MAY TAKE MANY DISK READS!
*
* Implemented in an attempt to allocate contiguous blocks for
* writing the ufs log file to, minimizing future disk head seeking
*/
{
savenblk = 0;
savecg = 0;
savebno = 0;
cg = 0; /* Nothing suitable found */
else
/*
* find the largest contiguous range in this cg
*/
continue;
}
cgbno = 0;
/* find a free block */
if (startcg != -1) {
goto done;
} else
break;
}
}
/* count the number of free blocks */
break;
break;
}
}
}
break;
}
done:
/* convert block offset in cg to frag offset in cg */
/* convert frag offset in cg to frag offset in fs */
return (savebno);
}
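/*
* Illustrative sketch of the "largest contiguous range" scan this routine
* performs in each cylinder group.  freeblk[] is a hypothetical per-block
* free map (nonzero = free) of nblk entries; the best run's starting index
* is stored through bestbno and its length is returned.
*/
static int
sketch_largest_free_run(const char *freeblk, int nblk, int *bestbno)
{
        int i, runstart = 0, runlen = 0, bestlen = 0;

        *bestbno = 0;
        for (i = 0; i < nblk; i++) {
                if (!freeblk[i]) {
                        runlen = 0;     /* run broken by an allocated block */
                        continue;
                }
                if (runlen++ == 0)
                        runstart = i;   /* new run begins here */
                if (runlen > bestlen) {
                        bestlen = runlen;
                        *bestbno = runstart;
                }
        }
        return (bestlen);
}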
/*
* The object of this routine is to find a start point for the UFS log.
* Ideally the space should be allocated from the smallest possible number
* of contiguous cylinder groups. This is found by using a sliding window
* technique. The smallest window of contiguous cylinder groups, which is
* still able to accommodate the target, is found by moving the window
* through the cylinder groups in a single pass. The end of the window is
* advanced until the space is accommodated, then the start is advanced until
* it no longer fits, the end is then advanced again and so on until the
* final cylinder group is reached. The first suitable instance is recorded
* and its starting cg number is returned.
*
* If we are not able to find a minimum amount of space, represented by
* minblk, or to do so uses more than the available extents, then return -1.
*/
int
{
int s; /* index of the first element in the current window */
int e; /* index of the first element + the width */
/* (i.e. 1 + index of last element) */
btotal = -1;
s = e = 0;
while (e < ncgs) {
/* Advance the end of the window until it accommodates the target. */
e++;
}
/*
* Advance the start of the window until it no longer
* accommodates the target.
*/
/* See if this is the smallest window so far. */
cwidth = e - s;
goto more;
bs = s;
}
more:
s++;
}
}
/*
* If we cannot allocate the minimum required or we use too many
* extents to do so, return -1.
*/
bs = -1;
return (bs);
}
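/*
* Illustrative sketch of the sliding window described above: find the
* shortest run of consecutive cylinder groups whose combined free space
* covers the target.  nbfree[] is a hypothetical per-cg free block count;
* the return value is the starting cg of the best window, or -1 if even all
* the groups together fall short.
*/
static int
sketch_log_window(const long *nbfree, int ncgs, long target)
{
        int s = 0, e = 0;               /* window is [s, e) */
        int bs = -1, bwidth = ncgs + 1;
        long sum = 0;

        while (e < ncgs) {
                /* advance the end until the window covers the target */
                sum += nbfree[e++];
                /* then advance the start while it still covers the target */
                while (sum >= target) {
                        if (e - s < bwidth) {   /* smallest window so far */
                                bwidth = e - s;
                                bs = s;
                        }
                        sum -= nbfree[s++];
                }
        }
        return (bs);
}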