ufs_inode.h revision 7f63b8c301509bdd1176e2db41c3c20d7666a2de
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
#ifndef _SYS_FS_UFS_INODE_H
#define _SYS_FS_UFS_INODE_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/isa_defs.h>
#include <sys/fdbuffer.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* The I node is the focus of all local file activity in UNIX.
* There is a unique inode allocated for each active file,
* each current directory, each mounted-on file, and each mapping.
* Data in icommon is read in from permanent inode on volume.
*
* Each inode has 5 locks associated with it:
* i_rwlock: Serializes ufs_write and ufs_setattr request
* and allows ufs_read requests to proceed in parallel.
* vfs_dqrwlock: Manages quota sub-system quiescence. See below.
* i_contents: Protects almost all of the fields in the inode
* except for those listed below. When held
* in writer mode also protects those fields
* listed under i_tlock.
* i_tlock: When i_tlock is held with the i_contents reader
* lock the i_atime, i_mtime, i_ctime,
* i_delayoff, i_delaylen, i_nextrio, i_writes, i_flag
* i_seq, i_writer & i_mapcnt fields are protected.
* For more i_flag locking info see below.
* ih_lock: Protects inode hash chain buckets
* ifree_lock: Protects inode freelist
*
* Lock ordering:
* i_rwlock > i_contents > i_tlock
* i_rwlock > vfs_dqrwlock > i_contents(writer) > i_tlock
* i_contents > i_tlock
* vfs_dqrwlock > i_contents(writer) > i_tlock
* ih_lock > i_contents > i_tlock
*
* Making major changes to quota sub-system state while the file
* system is mounted required the addition of another lock. The
* primary lock in the quota sub-system is vfs_dqrwlock in the ufsvfs
* structure. This lock is used to manage quota sub-system quiescence
* for a particular file system. Major changes to quota sub-system
* state (disabling quotas, enabling quotas, and setting new quota
* limits) all require the file system to be quiescent and grabbing
* vfs_dqrwlock as writer accomplishes this. On the other hand,
* grabbing vfs_dqrwlock as reader makes the quota sub-system
* non-quiescent and lets the quota sub-system know that now is not a
* good time to change major quota sub-system state. Typically
* vfs_dqrwlock is grabbed for reading before i_contents is grabbed for
* writing. However, there are cases where vfs_dqrwlock is grabbed for
* reading without a corresponding i_contents write grab because there
* is no relevant inode. There are also cases where i_contents is
* grabbed for writing when a vfs_dqrwlock read grab is not needed
* because the inode changes do not affect quotas.
*
* Unfortunately, performance considerations have required that we be more
* intelligent about using i_tlock when updating i_flag. Ideally, we would
* have simply separated out several of the bits in i_flag into their own
* ints to avoid problems. But, instead, we have implemented the following
* rules:
*
* o You can update any i_flag field while holding the writer-contents,
* or by holding the reader-contents AND holding i_tlock.
* You can only call ITIMES_NOLOCK while holding the writer-contents,
* or by holding the reader-contents AND holding i_tlock.
*
* o For a directory, holding the reader-rw_lock is sufficient for setting
* IACC.
*
* o Races with IREF are avoided by holding the reader contents lock
* and by holding i_tlock in ufs_rmidle, ufs_putapage, and ufs_getpage.
* And by holding the writer-contents in ufs_iinactive.
*
* o The callers are no longer required to handle the calls to ITIMES
* and ITIMES_NOLOCK. The functions that set the i_flag bits are
* responsible for managing those calls. The exceptions are the
* bmap routines.
*
* SVR4 Extended Fundamental Type (EFT) support:
* The inode structure has been enhanced to support
* 32-bit user-id, 32-bit group-id, and 32-bit device number.
* Standard SVR4 ufs also supports 32-bit mode field. For the reason
* of backward compatibility with the previous ufs disk format,
* 32-bit mode field is not supported.
*
* The current inode structure is 100% backward compatible with
* the previous inode structure if no user-id or group-id exceeds
* USHRT_MAX, and no major or minor number of a device number
* stored in an inode exceeds 255.
*
* Rules for managing i_seq:
* o i_seq is locked under the same rules as i_flag
* o The i_ctime or i_mtime MUST never change without increasing
* the value of i_seq.
* o You may increase the value of i_seq without the timestamps
* changing; this may decrease the caller's performance but will
* be functionally correct.
* o The common case is when IUPD or ICHG is set, increase i_seq
* and immediately call ITIMES* or ufs_iupdat to create a new timestamp.
* o A less common case is the setting of IUPD or ICHG and while still
* holding the correct lock defer the timestamp and i_seq update
* until later, but it must still be done before the lock is released.
* bmap_write is an example of this, where the caller does the update.
* o If multiple changes are being made with the timestamps being
* updated only at the end, a single increase of i_seq is allowed.
* o If changes are made with IUPD or ICHG being set, but
* the controlling lock is being dropped before the timestamp is
* updated, there is a risk that another thread will also change
* the file, update i_flag, and push just one timestamp update.
* There is also the risk that another thread calls ITIMES or
* ufs_iupdat without setting IUPD|ICHG and thus not changing i_seq,
* this will cause ufs_imark to change the timestamps without changing
* i_seq. If the controlling lock is dropped, ISEQ must be set to
* force i_seq to be increased on next ufs_imark, but i_seq MUST still
* be increased by the original setting thread before its deferred
* call to ITIMES to ensure it is increased the correct number of times.
*/
/* flag value to indicate uid is 32-bit long */
/* flag value to indicate gid is 32-bit long */
/* max fast symbolic name length is 56 */
struct icommon {
short ic_nlink; /* 2: number of links to file */
#ifdef _KERNEL
#else
#endif
};
/*
* Large directories can be cached. Directory caching can take the following
* states:
*/
typedef enum {
CD_DISABLED_NOMEM = -2,
} cachedir_t;
/*
* Large Files: Note we use the inline functions load_double, store_double
* to load and store the long long values of i_size. Therefore the
* address of i_size must be eight byte aligned. Kmem_alloc of incore
* inode structure makes sure that the structure is 8-byte aligned.
* XX64 - reorder this structure?
*/
/*
* In-core inode. See the locking rules at the top of this file for
* which lock protects which field.
*/
typedef struct inode {
/* just a hint - no locking needed */
/* next byte read offset (read-ahead) */
/* No lock required */
/* */
/* - no locking needed */
/* i_mapcnt: protected by i_tlock held with the i_contents reader lock */
long i_mapcnt; /* mappings to file pages */
int *i_map; /* block list for the corresponding file */
/* i_writes: protected by i_tlock held with the i_contents reader lock */
long i_writes; /* number of outstanding bytes in write q */
} inode_t;
/*
* NOTE(review): presumably the on-disk inode; the 128-byte member below
* looks like it fixes the size of the union - confirm.
*/
struct dinode {
union {
char di_size[128]; /* 128-byte member of the union */
} di_un;
};
#ifdef _LITTLE_ENDIAN
/*
* Originally done on x86, but carried on to all other little-endian
* architectures, which provides for file system compatibility.
*/
#else
#endif
/* EFT transition aids - obsolete */
#define oEFT_MAGIC 0x90909090
#ifdef _LITTLE_ENDIAN
#else
#endif
/* flags */
/* filesystem won't become active */
/* write operation asynchronously */
/* cflags */
/* modes */
#define IWRITE 0200
#define IEXEC 0100
/* specify how the inode info is written in ufs_syncip() */
/* only if IATTCHG is set */
#define I_ASYNC 0 /* don't wait for the inode to be written */
/* flags passed to ufs_itrunc(), indirtrunc(), and free() */
/*
* If ufs_dircheckforname() fails to find an entry with the given name,
* this "slot" structure holds state for ufs_direnter_*() as to where
* there is space to put an entry with that name.
* If ufs_dircheckforname() finds an entry with the given name, this structure
* holds state for ufs_dirrename() and ufs_dirremove() as to where the
* entry is. "status" indicates what ufs_dircheckforname() found:
* NONE name not found, large enough free slot not found,
* FOUND name not found, large enough free slot found
* EXIST name found
* If ufs_dircheckforname() fails due to an error, this structure is not
* filled in.
*
* After ufs_dircheckforname() succeeds the values are:
* status offset size fbp, ep
* ------ ------ ---- -------
* NONE end of dir needed not valid
* FOUND start of entry of ent both valid if fbp != NULL
* EXIST start of entry of prev ent valid
*
* "endoff" is set to 0 if an entry with the given name is found, or if no
* free slot could be found or made; this means that the directory should not
* be truncated. If the entry was found, the search terminates so
* ufs_dircheckforname() didn't find out where the last valid entry in the
* directory was, so it doesn't know where to cut the directory off; if no free
* slot could be found or made, the directory has to be extended to make room
* for the new entry, so there's nothing to cut off.
* Otherwise, "endoff" is set to the larger of the offset of the last
* non-empty entry in the directory and the offset at which the new entry will
* be placed. This is used by ufs_diraddentry(); if a new
* entry is to be added to the directory, any complete directory blocks at the
* end of the directory that contain no non-empty entries are lopped off the
* end, thus shrinking the directory dynamically.
*/
/*
* Directory slot state produced by ufs_dircheckforname() and consumed by
* the ufs_direnter_*(), ufs_dirrename() and ufs_dirremove() paths.
*/
struct slot {
int size; /* size of area at slotoffset */
int cached; /* cached directory */
};
/*
* Statistics on inodes
* Not protected by locks
*/
struct instats {
};
#ifdef _KERNEL
/*
* Extended attributes
*/
#define XATTR_DIR_NAME "/@/"
extern int ufs_ninode; /* high-water mark for inode cache */
extern const struct fs_operation_def ufs_vnodeops_template[];
/*
* Convert between inode pointers and vnode pointers
*/
/*
* convert to fs
*/
/*
* Convert between vnode types and inode formats
*/
extern enum vtype iftovt_tab[];
#ifdef notneeded
extern int vttoif_tab[];
#endif
/*
* Mark an inode with the current (unique) timestamp.
* (Note that UFS's concept of time only keeps 32 bits of seconds
* in the on-disk format).
*/
extern kmutex_t ufs_iuniqtime_lock;
ITIMES_NOLOCK(ip); \
}
/*
* The following interfaces are used to do atomic loads and stores
* of an inode's i_size, which is a long long data type.
*
* For LP64, we just do a load or a store - atomicity and alignment
* are 8-byte guaranteed. For x86 there are no such instructions,
* so we grab i_contents as reader to get the size; we already hold
* it as writer when we're setting the size.
*/
#ifdef _LP64
#else /* _LP64 */
{ \
}
{ \
}
#endif /* _LP64 */
/*
* Allocate the specified block in the inode
* and make sure any in-core pages are initialized.
*/
/*
* enums
*/
/* direnter ops */
/* dirremove ops */
/*
* This overlays the fid structure (see vfs.h)
*
* LP64 note: we use int32_t instead of ino_t since UFS does not use
* inode numbers larger than 32-bits and ufid's are passed to NFS
* which expects them to not grow in size beyond 10 bytes (12 including
* the length).
*/
struct ufid {
};
/*
* each ufs thread (see ufs_thread.c) is managed by this struct
*/
/*
* Control block for one ufs thread (see ufs_thread.c).
* NOTE(review): "ne" in the member comments is presumably the number of
* entries currently on the queue - confirm.
*/
struct ufs_q {
union uq_head {
void *_uq_generic; /* first entry on q */
} _uq_head;
int uq_lowat; /* thread runs when ne == lowat */
int uq_hiwat; /* synchronous idle if ne >= hiwat */
};
/*
* uq_flags
*/
/*
* When logging is enabled, statvfs must account for blocks and files that
* may be on the delete queue. Protected by ufsvfsp->vfs_delete.uq_mutex
*/
struct ufs_delq_info {
};
/*
* global idle queues
* The queues are sized dynamically in proportion to ufs_ninode
* which, unless overridden, scales with the amount of memory.
* The idle queue is halved whenever it hits the low water mark
* (1/4 of ufs_ninode), but can burst to sizes much larger. The number
* of hash queues is currently maintained to give on average IQHASHQLEN
* entries when the idle queue is at the low water mark.
* Note, we do not need to search along the hash queues, but use them
* in order to batch together geographically local inodes to allow
* their updates (via the log or buffer cache) to require fewer disk seeks.
* This gives an incredible performance boost for logging and a boost for
* non logging file systems.
*/
typedef struct {
} iqhead_t;
extern int ufs_njunk_iq; /* number of entries in junk iq */
extern int ufs_nuseful_iq; /* number of entries in useful iq */
extern int ufs_niqhash; /* number of iq hash qs - power of 2 */
extern int ufs_iqhashmask; /* iq hash mask = ufs_niqhash - 1 */
/*
* vfs_lfflags flags
*/
/*
* vfs_dfritime flags
*/
/*
* UFS VFS private data.
*
* UFS file system instances may be linked on several lists.
*
* - The vfs_next field chains together every extant ufs instance; this
* list is rooted at ufs_instances and should be used in preference to
* the overall vfs list (which is properly the province of the generic
* file system code, not of file system implementations). This same list
* link is used during forcible unmounts to chain together instances that
* can't yet be completely dismantled.
*
* - The vfs_wnext field is used within ufs_update to form a work list of
* UFS instances to be synced out.
*/
/*
* Per-instance private data for a UFS file system.
*/
typedef struct ufsvfs {
/*
* some fs local threads
*/
/*
* This is copied from the super block at mount time.
*/
int vfs_nrpos; /* # rotational positions */
/*
* This lock protects cg's and super block pointed at by
* vfs_bufp->b_fs. Locks contents of fs and cg's and contents
* of vfs_dio.
*/
/*
* trans (logging ufs) stuff
*/
/*
* Some useful constants
*/
int vfs_nindirshift; /* calc. from fs_nindir */
int vfs_nindiroffset; /* calc. from fs_nindir */
int vfs_ioclustsz; /* bytes in read/write cluster */
int vfs_iotransz; /* max device i/o transfer size */
/*
* More useful constants
*/
int vfs_minfrags; /* calc. from fs_minfree */
/*
* Force DirectIO on all files
*/
/*
* Deferred inode time related fields
*/
/*
* Some more useful info
*/
/*
* snapshot stuff
*/
void *vfs_snapshot; /* snapshot handle */
/*
* Controls logging "file system full" messages to messages file
*/
int vfs_nolog_si; /* not logging summary info */
/* takes one of the UT_* values defined below this structure */
int vfs_validfs; /* indicates mounted fs */
/*
* Additional information about vfs_delete above
*/
} ufsvfs_t;
/*
* values for vfs_validfs
*/
#define UT_UNMOUNTED 0
#define UT_MOUNTED 1
#define UT_HLOCKING 2
/* inohsz is guaranteed to be a power of 2 */
union ihead {
};
extern int *ih_ne;
extern int inohsz;
extern clock_t ufs_iowait;
#endif /* _KERNEL */
/*
* ufs function prototypes
*/
extern void ufs_iinit(void);
cred_t *);
extern void ufs_reset_vnode(vnode_t *);
extern void ufs_iinactive(struct inode *);
extern void ufs_iupdat(struct inode *, int);
extern int ufs_rmidle(struct inode *);
extern int ufs_iaccess(void *, int, cred_t *);
extern void ufs_itimes_nolock(struct inode *);
cred_t *, int);
struct cred *);
cred_t *);
extern int bmap_has_holes(struct inode *);
extern void ufs_vfs_add(struct ufsvfs *);
extern void ufs_vfs_remove(struct ufsvfs *);
extern void ufs_sbwrite(struct ufsvfs *);
extern void ufs_update(int);
extern int ufs_sync_indir(struct inode *);
extern void ufs_notclean(struct ufsvfs *);
extern void ufs_checkclean(struct vfs *);
extern void ufs_free_inode(inode_t *);
/*
* special stuff
*/
extern void ufs_setreclaim(struct inode *);
extern int ufs_scan_inodes(int, int (*)(struct inode *, void *), void *,
struct ufsvfs *);
extern int ufs_sync_inode(struct inode *, void *);
struct cred *);
/*
* quota
*/
/*
* ufs thread stuff
*/
extern void ufs_thread_delete(struct vfs *);
extern void ufs_delete_drain(struct vfs *, int, int);
extern void ufs_inode_cache_reclaim(void *);
extern void ufs_idle_drain(struct vfs *);
extern void ufs_idle_some(int);
extern void ufs_thread_idle(void);
extern void ufs_thread_reclaim(struct vfs *);
extern void ufs_thread_init(struct ufs_q *, int);
extern void ufs_thread_exit(struct ufs_q *);
extern void ufs_thread_suspend(struct ufs_q *);
extern void ufs_thread_continue(struct ufs_q *);
extern void ufs_thread_hlock(void *);
extern void ufs_delete_init(struct ufsvfs *, int);
extern void ufs_delete_drain_wait(struct ufsvfs *, int);
/*
* ufs lockfs stuff
*/
struct seg;
extern int ufs_quiesce(struct ulockfs *);
extern void ufs_lockfs_end(struct ulockfs *);
/*
* ufs acl stuff
*/
extern void si_cache_init(void);
extern void ufs_si_del(struct inode *);
extern void ufs_si_cache_flush(dev_t);
/*
* ufs directio stuff
*/
/*
* Takes no arguments. Declared with an explicit (void) prototype so the
* compiler rejects calls that pass arguments; an empty "()" list is an
* old-style declaration that disables argument checking.
*/
extern void ufs_directio_init(void);
int *);
#define DIRECTIO_FAILURE (0)
#define DIRECTIO_SUCCESS (1)
/*
* ufs extensions for PXFS
*/
/*
* prototypes to support the forced unmount
*/
/*
* extended attributes
*/
#endif /* defined(_KERNEL) && !defined(_BOOT) */
#ifdef __cplusplus
}
#endif
#endif /* _SYS_FS_UFS_INODE_H */