/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 *
 *	@(#)mp.h	10.37 (Sleepycat) 1/1/99
 */
1N/A
/*
 * Forward declarations and typedef names for the mpool structures.  The
 * structures themselves are defined further down in this file; declaring
 * the names here lets earlier definitions hold pointers to them.
 */
struct __bh;		typedef struct __bh BH;
struct __db_mpreg;	typedef struct __db_mpreg DB_MPREG;
struct __mpool;		typedef struct __mpool MPOOL;
struct __mpoolfile;	typedef struct __mpoolfile MPOOLFILE;
1N/A
/* Default mpool name. */
#define	DB_DEFAULT_MPOOL_FILE	"__db_mpool.share"

/*
 * We default to 256K (32 8K pages) if the user doesn't specify, and
 * require a minimum of 20K.
 *
 * DB_CACHESIZE_DEF is only defined here if it is not already defined, so
 * a different default can be supplied before this header is included.
 */
#ifndef	DB_CACHESIZE_DEF
#define	DB_CACHESIZE_DEF	(256 * 1024)
#endif
#define	DB_CACHESIZE_MIN	( 20 * 1024)

#define	INVALID			0	/* Invalid shared memory offset. */
1N/A
/*
 * There are three ways we do locking in the mpool code:
 *
 *	Locking a handle mutex to provide concurrency for DB_THREAD operations.
 *	Locking the region mutex to provide mutual exclusion while reading and
 *	    writing structures in the shared region.
 *	Locking buffer header mutexes during I/O.
 *
 * The first will not be further described here.  We use the shared mpool
 * region lock to provide mutual exclusion while reading/modifying all of
 * the data structures, including the buffer headers.  We use a per-buffer
 * header lock to wait on buffer I/O.  The order of locking is as follows:
 *
 * Searching for a buffer:
 *	Acquire the region lock.
 *	Find the buffer header.
 *	Increment the reference count (guarantee the buffer stays).
 *	While the BH_LOCKED flag is set (I/O is going on) {
 *		Release the region lock.
 *			Explicitly yield the processor if it's not the first
 *			pass through this loop; otherwise, we can simply spin
 *			because we'll just be switching between the two locks.
 *		Request the buffer lock.
 *		The I/O will complete...
 *		Acquire the buffer lock.
 *		Release the buffer lock.
 *		Acquire the region lock.
 *	}
 *	Return the buffer.
 *
 * Reading/writing a buffer:
 *	Acquire the region lock.
 *	Find/create the buffer header.
 *	If reading, increment the reference count (guarantee the buffer stays).
 *	Set the BH_LOCKED flag.
 *	Acquire the buffer lock (guaranteed not to block).
 *	Release the region lock.
 *	Do the I/O and/or initialize the buffer contents.
 *	Release the buffer lock.
 *		At this point, the buffer lock is available, but the logical
 *		operation (flagged by BH_LOCKED) is not yet completed.  For
 *		this reason, among others, threads checking the BH_LOCKED flag
 *		must loop around their test.
 *	Acquire the region lock.
 *	Clear the BH_LOCKED flag.
 *	Release the region lock.
 *	Return/discard the buffer.
 *
 * Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are not
 * reacquired when a region lock is reacquired because they couldn't have been
 * closed/discarded and because they never move in memory.
 */
/*
 * LOCKINIT --
 *	Initialize a mutex, but only when the handle has MP_LOCKHANDLE or
 *	MP_LOCKREGION set (tested through F_ISSET); otherwise do nothing.
 *
 *	dbmp	Pool handle; (dbmp)->reginfo.addr is passed to
 *		MUTEX_LOCK_OFFSET together with the mutex.
 *	mutexp	Mutex to initialize.
 *
 * The result of __db_mutex_init() is deliberately discarded.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement: a bare "if" here would capture an "else" that follows the
 * macro call in the caller.  Use it as a statement, with a trailing ';'.
 */
#define	LOCKINIT(dbmp, mutexp) do {					\
	if (F_ISSET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION))		\
		(void)__db_mutex_init(mutexp,				\
		    MUTEX_LOCK_OFFSET((dbmp)->reginfo.addr, mutexp));	\
} while (0)
1N/A
/*
 * LOCKHANDLE, UNLOCKHANDLE --
 *	Lock/unlock a handle mutex, but only when MP_LOCKHANDLE is set on
 *	the handle (tested through F_ISSET); otherwise do nothing.
 *
 *	dbmp	Pool handle; (dbmp)->reginfo.fd is passed to the mutex call.
 *	mutexp	Mutex to lock or unlock.
 *
 * The results of __db_mutex_lock()/__db_mutex_unlock() are deliberately
 * discarded.
 *
 * Each body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and cannot capture an "else" that follows the macro call.
 * Use them as statements, with a trailing ';'.
 */
#define	LOCKHANDLE(dbmp, mutexp) do {					\
	if (F_ISSET(dbmp, MP_LOCKHANDLE))				\
		(void)__db_mutex_lock(mutexp, (dbmp)->reginfo.fd);	\
} while (0)
#define	UNLOCKHANDLE(dbmp, mutexp) do {					\
	if (F_ISSET(dbmp, MP_LOCKHANDLE))				\
		(void)__db_mutex_unlock(mutexp, (dbmp)->reginfo.fd);	\
} while (0)
1N/A
/*
 * LOCKREGION, UNLOCKREGION --
 *	Lock/unlock the shared region mutex, but only when MP_LOCKREGION is
 *	set on the handle (tested through F_ISSET); otherwise do nothing.
 *
 *	dbmp	Pool handle.  (dbmp)->mp is cast to RLAYOUT * and the "lock"
 *		member of that structure is the mutex used;
 *		(dbmp)->reginfo.fd is passed to the mutex call.
 *
 * The results of __db_mutex_lock()/__db_mutex_unlock() are deliberately
 * discarded.
 *
 * Each body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and cannot capture an "else" that follows the macro call.
 * Use them as statements, with a trailing ';'.
 */
#define	LOCKREGION(dbmp) do {						\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_lock(&((RLAYOUT *)(dbmp)->mp)->lock,	\
		    (dbmp)->reginfo.fd);				\
} while (0)
#define	UNLOCKREGION(dbmp) do {						\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_unlock(&((RLAYOUT *)(dbmp)->mp)->lock,	\
		    (dbmp)->reginfo.fd);				\
} while (0)
1N/A
/*
 * LOCKBUFFER, UNLOCKBUFFER --
 *	Lock/unlock the mutex embedded in a buffer header.  Note that this
 *	is gated on MP_LOCKREGION, the same flag that gates the region lock;
 *	when it is not set these macros do nothing.
 *
 *	dbmp	Pool handle; (dbmp)->reginfo.fd is passed to the mutex call.
 *	bhp	Buffer header whose "mutex" member is locked or unlocked.
 *
 * The results of __db_mutex_lock()/__db_mutex_unlock() are deliberately
 * discarded.
 *
 * Each body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and cannot capture an "else" that follows the macro call.
 * Use them as statements, with a trailing ';'.
 */
#define	LOCKBUFFER(dbmp, bhp) do {					\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_lock(&(bhp)->mutex,			\
		    (dbmp)->reginfo.fd);				\
} while (0)
#define	UNLOCKBUFFER(dbmp, bhp) do {					\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_unlock(&(bhp)->mutex,			\
		    (dbmp)->reginfo.fd);				\
} while (0)
1N/A
/*
 * MP_PANIC_CHECK --
 *	Check for region catastrophic shutdown.
 *
 *	dbmp	Pool handle; (dbmp)->mp->rlayout.panic is tested.
 *
 * If the panic flag is non-zero this executes "return (DB_RUNRECOVERY)" in
 * the CALLING function, so it may only be used inside functions whose
 * return type can carry that value.  Otherwise it does nothing.
 *
 * The body is wrapped in do { } while (0) rather than bare braces so that
 * "MP_PANIC_CHECK(dbmp);" is a single statement: with bare braces the
 * caller's ';' becomes a separate empty statement, which is a syntax error
 * when an "else" follows.
 */
#define	MP_PANIC_CHECK(dbmp) do {					\
	if ((dbmp)->mp->rlayout.panic)					\
		return (DB_RUNRECOVERY);				\
} while (0)
1N/A
1N/A/*
1N/A * DB_MPOOL --
1N/A * Per-process memory pool structure.
1N/A */
1N/Astruct __db_mpool {
1N/A/* These fields need to be protected for multi-threaded support. */
1N/A db_mutex_t *mutexp; /* Structure lock. */
1N/A
1N/A /* List of pgin/pgout routines. */
1N/A LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
1N/A
1N/A /* List of DB_MPOOLFILE's. */
1N/A TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
1N/A
1N/A/* These fields are not protected. */
1N/A DB_ENV *dbenv; /* Reference to error information. */
1N/A REGINFO reginfo; /* Region information. */
1N/A
1N/A MPOOL *mp; /* Address of the shared MPOOL. */
1N/A
1N/A void *addr; /* Address of shalloc() region. */
1N/A
1N/A DB_HASHTAB *htab; /* Hash table of bucket headers. */
1N/A
1N/A#define MP_LOCKHANDLE 0x01 /* Threaded, lock handles and region. */
1N/A#define MP_LOCKREGION 0x02 /* Concurrent access, lock region. */
1N/A u_int32_t flags;
1N/A};
1N/A
1N/A/*
1N/A * DB_MPREG --
1N/A * DB_MPOOL registry of pgin/pgout functions.
1N/A */
1N/Astruct __db_mpreg {
1N/A LIST_ENTRY(__db_mpreg) q; /* Linked list. */
1N/A
1N/A int ftype; /* File type. */
1N/A /* Pgin, pgout routines. */
1N/A int (DB_CALLBACK *pgin) __P((db_pgno_t, void *, DBT *));
1N/A int (DB_CALLBACK *pgout) __P((db_pgno_t, void *, DBT *));
1N/A};
1N/A
1N/A/*
1N/A * DB_MPOOLFILE --
1N/A * Per-process DB_MPOOLFILE information.
1N/A */
1N/Astruct __db_mpoolfile {
1N/A/* These fields need to be protected for multi-threaded support. */
1N/A db_mutex_t *mutexp; /* Structure lock. */
1N/A
1N/A int fd; /* Underlying file descriptor. */
1N/A
1N/A u_int32_t ref; /* Reference count. */
1N/A
1N/A /*
1N/A * !!!
1N/A * This field is a special case -- it's protected by the region lock
1N/A * NOT the thread lock. The reason for this is that we always have
1N/A * the region lock immediately before or after we modify the field,
1N/A * and we don't want to use the structure lock to protect it because
1N/A * then I/O (which is done with the structure lock held because of
1N/A * the race between the seek and write of the file descriptor) will
1N/A * block any other put/get calls using this DB_MPOOLFILE structure.
1N/A */
1N/A u_int32_t pinref; /* Pinned block reference count. */
1N/A
1N/A/* These fields are not protected. */
1N/A TAILQ_ENTRY(__db_mpoolfile) q; /* Linked list of DB_MPOOLFILE's. */
1N/A
1N/A DB_MPOOL *dbmp; /* Overlying DB_MPOOL. */
1N/A MPOOLFILE *mfp; /* Underlying MPOOLFILE. */
1N/A
1N/A void *addr; /* Address of mmap'd region. */
1N/A size_t len; /* Length of mmap'd region. */
1N/A
1N/A/* These fields need to be protected for multi-threaded support. */
1N/A#define MP_READONLY 0x01 /* File is readonly. */
1N/A#define MP_UPGRADE 0x02 /* File descriptor is readwrite. */
1N/A#define MP_UPGRADE_FAIL 0x04 /* Upgrade wasn't possible. */
1N/A u_int32_t flags;
1N/A};
1N/A
1N/A/*
1N/A * MPOOL --
1N/A * Shared memory pool region. One of these is allocated in shared
1N/A * memory, and describes the pool.
1N/A */
1N/Astruct __mpool {
1N/A RLAYOUT rlayout; /* General region information. */
1N/A
1N/A SH_TAILQ_HEAD(__bhq) bhq; /* LRU list of buckets. */
1N/A SH_TAILQ_HEAD(__bhfq) bhfq; /* Free buckets. */
1N/A SH_TAILQ_HEAD(__mpfq) mpfq; /* List of MPOOLFILEs. */
1N/A
1N/A /*
1N/A * We make the assumption that the early pages of the file are far
1N/A * more likely to be retrieved than the later pages, which means
1N/A * that the top bits are more interesting for hashing since they're
1N/A * less likely to collide. On the other hand, since 512 4K pages
1N/A * represents a 2MB file, only the bottom 9 bits of the page number
1N/A * are likely to be set. We XOR in the offset in the MPOOL of the
1N/A * MPOOLFILE that backs this particular page, since that should also
1N/A * be unique for the page.
1N/A */
1N/A#define BUCKET(mp, mf_offset, pgno) \
1N/A (((pgno) ^ ((mf_offset) << 9)) % (mp)->htab_buckets)
1N/A
1N/A size_t htab; /* Hash table offset. */
1N/A size_t htab_buckets; /* Number of hash table entries. */
1N/A
1N/A DB_LSN lsn; /* Maximum checkpoint LSN. */
1N/A u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */
1N/A
1N/A DB_MPOOL_STAT stat; /* Global mpool statistics. */
1N/A
1N/A#define MP_LSN_RETRY 0x01 /* Retry all BH_WRITE buffers. */
1N/A u_int32_t flags;
1N/A};
1N/A
1N/A/*
1N/A * MPOOLFILE --
1N/A * Shared DB_MPOOLFILE information.
1N/A */
1N/Astruct __mpoolfile {
1N/A SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */
1N/A
1N/A u_int32_t ref; /* Reference count. */
1N/A
1N/A int ftype; /* File type. */
1N/A
1N/A int32_t lsn_off; /* Page's LSN offset. */
1N/A u_int32_t clear_len; /* Bytes to clear on page create. */
1N/A
1N/A size_t path_off; /* File name location. */
1N/A size_t fileid_off; /* File identification location. */
1N/A
1N/A size_t pgcookie_len; /* Pgin/pgout cookie length. */
1N/A size_t pgcookie_off; /* Pgin/pgout cookie location. */
1N/A
1N/A u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */
1N/A
1N/A db_pgno_t last_pgno; /* Last page in the file. */
1N/A db_pgno_t orig_last_pgno; /* Original last page in the file. */
1N/A
1N/A#define MP_CAN_MMAP 0x01 /* If the file can be mmap'd. */
1N/A#define MP_TEMP 0x02 /* Backing file is a temporary. */
1N/A u_int32_t flags;
1N/A
1N/A DB_MPOOL_FSTAT stat; /* Per-file mpool statistics. */
1N/A};
1N/A
1N/A/*
1N/A * BH --
1N/A * Buffer header.
1N/A */
1N/Astruct __bh {
1N/A db_mutex_t mutex; /* Structure lock. */
1N/A
1N/A u_int16_t ref; /* Reference count. */
1N/A
1N/A#define BH_CALLPGIN 0x001 /* Page needs to be reworked... */
1N/A#define BH_DIRTY 0x002 /* Page was modified. */
1N/A#define BH_DISCARD 0x004 /* Page is useless. */
1N/A#define BH_LOCKED 0x008 /* Page is locked (I/O in progress). */
1N/A#define BH_TRASH 0x010 /* Page is garbage. */
1N/A#define BH_WRITE 0x020 /* Page scheduled for writing. */
1N/A u_int16_t flags;
1N/A
1N/A SH_TAILQ_ENTRY q; /* LRU queue. */
1N/A SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */
1N/A
1N/A db_pgno_t pgno; /* Underlying MPOOLFILE page number. */
1N/A size_t mf_offset; /* Associated MPOOLFILE offset. */
1N/A
1N/A /*
1N/A * !!!
1N/A * This array must be size_t aligned -- the DB access methods put PAGE
1N/A * and other structures into it, and expect to be able to access them
1N/A * directly. (We guarantee size_t alignment in the db_mpool(3) manual
1N/A * page as well.)
1N/A */
1N/A u_int8_t buf[1]; /* Variable length data. */
1N/A};
1N/A
1N/A#include "mp_ext.h"