db_page.h revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998
* Sleepycat Software. All rights reserved.
*
* @(#)db_page.h 10.18 (Sleepycat) 12/2/98
*/
#ifndef _DB_PAGE_H_
#define _DB_PAGE_H_
/*
* DB page formats.
*
* This implementation requires that values within the following structures
* NOT be padded -- note, ANSI C permits random padding within structures.
* If your compiler pads randomly you can just forget ever making DB run on
* your system. In addition, no data type can require larger alignment than
* its own size, e.g., a 4-byte data element may not require 8-byte alignment.
*
* If a key/data item fits on a page, it's guaranteed to be small enough to
* fit into a db_indx_t, and storing it in one saves space.
*/
/*
* Page number 0 is the metadata page. Because that number is always taken,
* it also serves as the "no such page" marker: both names below expand to
* the same value.
*/
#define PGNO_METADATA 0 /* Metadata page number. */
#define PGNO_INVALID 0 /* Metadata page number, therefore illegal. */
/*
* When we create pages in mpool, we ask mpool to clear some number of bytes
* in the header. This number must be at least as big as the regular page
* headers and cover enough of the btree and hash meta-data pages to obliterate
* the magic and version numbers.
*/
#define DB_PAGE_CLEAR_LEN 32 /* Header bytes mpool clears on page create. */
/************************************************************************
BTREE METADATA PAGE LAYOUT
************************************************************************/
/*
* Btree metadata page layout:
*/
typedef struct _btmeta {
/*
* 0x01f selects the low five bits.
* NOTE(review): presumably the set of valid btree metadata flag bits --
* confirm against the flag definitions.
*/
#define BTM_MASK 0x01f
/* 48-67: Unique file ID. */
} BTMETA;
/************************************************************************
HASH METADATA PAGE LAYOUT
************************************************************************/
/*
* Hash metadata page layout:
*/
/* Hash Table Information */
typedef struct hashhdr { /* Disk resident portion */
/*
* NOTE(review): presumably the flag bit marking a hash table that permits
* duplicate data items -- confirm against the users of this flag.
*/
#define DB_HASH_DUP 0x01
/* 60-187: Spare pages for overflow */
/* 188-207: Unique file ID. */
/*
* Minimum page size is 256.
*/
} HASHHDR;
/************************************************************************
MAIN PAGE LAYOUT
************************************************************************/
/*
* +-----------------------------------+
* | lsn | pgno | prev pgno |
* +-----------------------------------+
* | next pgno | entries | hf offset |
* +-----------------------------------+
* | level | type | index |
* +-----------------------------------+
* | index | free --> |
* +-----------+-----------------------+
* | F R E E A R E A |
* +-----------------------------------+
* | <-- free | item |
* +-----------------------------------+
* | item | item | item |
* +-----------------------------------+
*
* sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be
* two-byte aligned.
*
* For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
* key for inp[1]'s data. All other types of pages only contain single items.
*/
typedef struct _db_page {
/*
* The btree levels are numbered from the leaf to the root, starting
* with 1, so the leaf is level 1, its parent is level 2, and so on.
* We maintain this level on all btree pages, but the only place that
* we actually need it is on the root page. It would not be difficult
* to hide the byte on the root page once it becomes an internal page,
* so we could get this byte back if we needed it for something else.
*/
/* Level number of a leaf page; each parent is one level higher. */
#define LEAFLEVEL 1
/*
* Largest level number.
* NOTE(review): presumably 255 because the level is kept in a single byte,
* as the comment above suggests -- confirm.
*/
#define MAXBTREELEVEL 255
#define P_INVALID 0 /* Invalid page type. */
} PAGE;
/* Element macros. */
/*
* !!!
* The next_pgno and prev_pgno fields are not maintained for btree and recno
* internal pages. It's a minor performance improvement, and more, it's
* hard to do when deleting internal pages, and it decreases the chance of
* deadlock during deletes and splits.
*
* !!!
* The btree and recno access methods need room on the root
* page to specify how many records are stored in the tree. (The alternative
* is to store the number of records in the meta-data page, which will create
* a second hot spot in trees being actively modified, or recalculate it from
* the BINTERNAL fields on each access.) Overload the prev_pgno field.
*/
#define RE_NREC(p) \
#define RE_NREC_ADJ(p, adj) \
#define RE_NREC_SET(p, num) \
/*
* Initialize a page.
*
* !!!
* Don't modify the page's LSN, code depends on it being unchanged after a
* P_INIT call.
*/
} while (0)
/* Page header length (offset to first index). */
/* First free byte. */
/* Free space on the page. */
/* Get a pointer to the bytes at a specific index. */
/************************************************************************
OVERFLOW PAGE LAYOUT
************************************************************************/
/*
* Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
* store a page number (the first page of the overflow item) and a length
* (the total length of the overflow item). The overflow item consists of
* some number of overflow pages, linked by the next_pgno field of the page.
* A next_pgno field of PGNO_INVALID flags the end of the overflow item.
*
* Overflow page overloads:
* The amount of overflow data stored on each page is stored in the
* hf_offset field.
*
* The implementation reference counts overflow items as it's possible
* for them to be promoted onto btree internal pages. The reference
* count is stored in the entries field.
*/
/* Maximum number of bytes that you can put on an overflow page. */
/************************************************************************
HASH PAGE LAYOUT
************************************************************************/
/* Each index references a group of bytes on the page. */
/*
* !!!
* Items on hash pages are (potentially) unaligned, so we can never cast the
* (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
* we do with B+tree on-page structures. Because we frequently want the type
* field, it requires no alignment, and it's in the same location in all three
* structures, there's a pair of macros.
*/
/*
* HPAGE_PTYPE --
* Return the type byte of the hash item that p points to. Only a single
* byte is read, through a u_int8_t pointer, so p needs no particular
* alignment.
*
* The argument is parenthesized so that an expression such as
* "base + off" is cast as a whole before it is dereferenced; without the
* parentheses only "base" would be cast and dereferenced, and "off" would
* be added to the byte that was read.
*/
#define HPAGE_PTYPE(p) (*(u_int8_t *)(p))
/*
* The first and second types are H_KEYDATA and H_DUPLICATE, represented
* by the HKEYDATA structure:
*
* +-----------------------------------+
* | type | key/data ... |
* +-----------------------------------+
*
* For duplicates, the data field encodes the duplicate elements, each one
* bracketed by its length:
*
* +---------------------------------------------------------------+
* | type | len1 | element1 | len1 | len2 | element2 | len2 |
* +---------------------------------------------------------------+
*
* Thus, by keeping track of the offset in the element, we can do both
* backward and forward traversal.
*/
typedef struct _hkeydata {
} HKEYDATA;
/*
* The length of any HKEYDATA item. Note that indx is an element index,
* not a PAIR index.
*/
/*
* Page space required to add a new HKEYDATA item to the page, with and
* without the index value.
*/
#define HKEYDATA_SIZE(len) \
#define HKEYDATA_PSIZE(len) \
/* Put a HKEYDATA item at the location referenced by a page entry. */
}
/*
* Macros that describe the page layout in terms of key-data pairs.
* The use of "pindex" indicates that the argument is the index
* expressed in pairs instead of individual elements.
*/
/*
* The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
*/
typedef struct _hoffpage {
} HOFFPAGE;
/*
* Page space required to add a new HOFFPAGE item to the page, with and
* without the index value.
*/
/* Size of the HOFFPAGE item by itself; no index slot is included. */
#define HOFFPAGE_SIZE (sizeof(HOFFPAGE))
/*
* The fourth type is H_OFFDUP represented by the HOFFDUP structure:
*/
typedef struct _hoffdup {
} HOFFDUP;
/*
* Page space required to add a new HOFFDUP item to the page, with and
* without the index value.
*/
/* Size of the HOFFDUP item by itself; no index slot is included. */
#define HOFFDUP_SIZE (sizeof(HOFFDUP))
/************************************************************************
BTREE PAGE LAYOUT
************************************************************************/
/* Each index references a group of bytes on the page. */
/*
* We have to store a deleted entry flag in the page. The reason is complex,
* but the simple version is that we can't delete on-page items referenced by
* a cursor -- the return order of subsequent insertions might be wrong. The
* delete flag is an overload of the top bit of the type byte.
*/
#define B_DELETE (0x80) /* Deleted-entry flag: top bit of the type byte. */
(t) = (type); \
if (deleted) \
B_DSET(t); \
}
/*
* The first type is B_KEYDATA, represented by the BKEYDATA structure:
*/
typedef struct _bkeydata {
} BKEYDATA;
/* Get a BKEYDATA item for a specific index. */
/*
* Page space required to add a new BKEYDATA item to the page, with and
* without the index value.
*/
#define BKEYDATA_SIZE(len) \
#define BKEYDATA_PSIZE(len) \
/*
* The second and third types are B_DUPLICATE and B_OVERFLOW, represented
* by the BOVERFLOW structure.
*/
typedef struct _boverflow {
} BOVERFLOW;
/* Get a BOVERFLOW item for a specific index. */
/*
* Page space required to add a new BOVERFLOW item to the page, with and
* without the index value.
*/
#define BOVERFLOW_SIZE \
#define BOVERFLOW_PSIZE \
(BOVERFLOW_SIZE + sizeof(db_indx_t))
/*
* Btree leaf and hash page layouts group indices in sets of two, one
* for the key and one for the data. Everything else does it in sets
* of one to save space. I use the following macros so that it's real
* obvious what's going on...
*/
#define O_INDX 1 /* Indices per item when items are stored singly. */
#define P_INDX 2 /* Indices per key/data pair (btree leaf, hash pages). */
/************************************************************************
BTREE INTERNAL PAGE LAYOUT
************************************************************************/
/*
* Btree internal entry.
*/
typedef struct _binternal {
} BINTERNAL;
/* Get a BINTERNAL item for a specific index. */
/*
* Page space required to add a new BINTERNAL item to the page, with and
* without the index value.
*/
#define BINTERNAL_SIZE(len) \
#define BINTERNAL_PSIZE(len) \
/************************************************************************
RECNO INTERNAL PAGE LAYOUT
************************************************************************/
/*
* The recno internal entry.
*
* XXX
* Why not fold this into the db_indx_t structure, it's fixed length?
*/
typedef struct _rinternal {
} RINTERNAL;
/* Get a RINTERNAL item for a specific index. */
/*
* Page space required to add a new RINTERNAL item to the page, with and
* without the index value.
*/
#define RINTERNAL_SIZE \
#define RINTERNAL_PSIZE \
(RINTERNAL_SIZE + sizeof(db_indx_t))
#endif /* _DB_PAGE_H_ */