/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998
* Sleepycat Software. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
* Keith Bostic. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#ifndef lint
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <errno.h>
#include <limits.h>
#include <string.h>
#endif
#include "db_int.h"
#include "db_page.h"
#include "btree.h"
/*
* __bam_split --
* Split a page.
*
* PUBLIC: int __bam_split __P((DBC *, void *));
*/
int
void *arg;
{
BTREE *t;
/*
* The locking protocol we use to avoid deadlock to acquire locks by
* walking down the tree, but we do it as lazily as possible, locking
* the root only as a last resort. We expect all stack pages to have
* been discarded before we're called; we discard all short-term locks.
*
* When __bam_split is first called, we know that a leaf page was too
* full for an insert. We don't know what leaf page it was, but we
* reacquire the leaf page, but this time get both the leaf page and
* its parent, locked. We then split the leaf page and see if the new
* internal key will fit into the parent page. If it will, we're done.
*
* If it won't, we discard our current locks and repeat the process,
* only this time acquiring the parent page and its parent, locked.
* This process repeats until we succeed in the split, splitting the
* root page as the final resort. The entire process then repeats,
* as necessary, until we split a leaf page.
*
* XXX
* A traditional method of speeding this up is to maintain a stack of
* the pages traversed in the original search. You can detect if the
* stack is correct by storing the page's LSN when it was searched and
* comparing that LSN with the current one when it's locked during the
* split. This would be an easy change for this code, but I have no
* numbers that indicate it's worthwhile.
*/
/*
* Acquire a page and its parent, locked.
*/
return (ret);
/*
* Split the page if it still needs it (it's possible another
* thread of control has already split the page). If we are
* guaranteed that two items will fit on the page, the split
* is no longer necessary.
*/
if (t->bt_ovflsize * 2 <=
return (0);
}
BT_STK_CLR(cp);
switch (ret) {
case 0:
/* Once we've split the leaf page, we're done. */
return (0);
/* Switch directions. */
break;
case DB_NEEDSPLIT:
/*
* It's possible to fail to split repeatedly, as other
* threads may be modifying the tree, or the page usage
* is sufficiently bad that we don't get enough space
* the first time.
*/
break;
default:
return (ret);
}
}
/* NOTREACHED */
}
/*
* __bam_root --
* Split the root page of a btree.
*/
static int
{
int ret;
/* Yeah, right. */
goto err;
}
/* Create new left and right pages for the split. */
goto err;
/* Split the page. */
goto err;
/* Log the change. */
if (DB_LOGGING(dbc)) {
&__a)) != 0)
goto err;
}
/* Clean up the new root page. */
goto err;
/* Adjust any cursors. Do it last so we don't have to undo it. */
/* Success -- write the real pages back to the store. */
return (0);
return (ret);
}
/*
* __bam_page --
* Split the non-root page of a btree.
*/
static int
{
int ret;
ret = -1;
/* Create new right page for the split. */
goto err;
/* Create new left page for the split. */
goto err;
/*
* Split right.
*
* aren't, so it's simpler to copy the data from the split page onto
* two new pages instead of copying half the data to the right page
* and compacting the left page in place. Since the left page can't
* change, we swap the original and the allocated left page after the
* split.
*/
goto err;
/*
* Fix up the previous pointer of any leaf page following the split
* page.
*
* !!!
* There are interesting deadlock situations here as we write-lock a
* page that's not in our direct ancestry. Consider a cursor walking
* through the leaf pages, that has the previous page read-locked and
* is waiting on a lock for the page we just split. It will deadlock
* here. If this is a problem, we can fail in the split; it's not a
* problem as the split will succeed after the cursor passes through
* the page we're splitting.
*/
goto err;
goto err;
}
/* Insert the new pages into the parent page. */
goto err;
/* Log the change. */
if (DB_LOGGING(dbc)) {
goto err;
}
/* Copy the allocated page into place. */
/* Finish the next-page link. */
/* Adjust any cursors. Do so last so we don't have to undo it. */
/* Success -- write the real pages back to the store. */
}
return (0);
if (ret == DB_NEEDSPLIT)
else
}
if (ret == DB_NEEDSPLIT)
else
if (ret == DB_NEEDSPLIT)
else
return (ret);
}
/*
* __bam_broot --
* Fix up the btree root page after it has been split.
*/
static int
{
int ret;
/*
* If the root page was a leaf page, change it into an internal page.
* We copy the key we split on (but not the key's data, in the case of
* a leaf page) to the new root page.
*/
/*
* The btree comparison code guarantees that the left-most key on any
* level of the tree is never used, so it doesn't need to be filled in.
*/
}
if ((ret =
return (ret);
case P_IBTREE:
/* Copy the first key of the child page onto the root page. */
}
return (ret);
/* Increment the overflow ref count. */
return (ret);
break;
case P_LBTREE:
/* Copy the first key of the child page onto the root page. */
case B_KEYDATA:
}
return (ret);
break;
case B_DUPLICATE:
case B_OVERFLOW:
}
return (ret);
/* Increment the overflow ref count. */
return (ret);
break;
default:
}
break;
default:
}
return (0);
}
/*
* __ram_root --
* Fix up the recno root page after it has been split.
*/
static int
{
int ret;
/* Initialize the page. */
/* Initialize the header. */
/* Insert the left and right keys, set the header information. */
return (ret);
return (ret);
return (0);
}
/*
* __bam_pinsert --
* Insert a new key into a parent page, completing the split.
*/
static int
{
BTREE *t;
int ret;
/* If handling record numbers, count records split to the right page. */
__bam_total(rchild) : 0;
/*
* Now we insert the new page's first key into the parent page, which
* completes the split. The parent points to a PAGE and a page index
* offset, where the new key goes ONE AFTER the index, because we split
* to the right.
*
* XXX
* Some btree algorithms replace the key for the old page as well as
* the new page. We don't, as there's no reason to believe that the
* first key on the old page is any better than the key we have, and,
* in the case of a key being placed at index 0 causing the split, the
* key is unavailable.
*/
/*
* Calculate the space needed on the parent page.
*
* Prefix trees: space hack used when inserting into BINTERNAL pages.
* Retain only what's needed to distinguish between the new entry and
* the LAST entry on the page to its left. If the keys compare equal,
* retain the entire key. We ignore overflow keys, and the entire key
* must be retained for the next-to-leftmost key on the leftmost page
* of each level, or the search will fail. Applicable ONLY to internal
* pages that have leaf pages as children. Further reduction of the
* key between pairs of internal pages loses too much information.
*/
case P_IBTREE:
return (DB_NEEDSPLIT);
/* Add a new record for the right page. */
return (ret);
/* Increment the overflow ref count. */
return (ret);
break;
case P_LBTREE:
case B_KEYDATA:
goto noprefix;
goto noprefix;
goto noprefix;
memset(&a, 0, sizeof(a));
memset(&b, 0, sizeof(b));
nbytes = n;
else
return (DB_NEEDSPLIT);
return (ret);
break;
case B_DUPLICATE:
case B_OVERFLOW:
return (DB_NEEDSPLIT);
return (ret);
/* Increment the overflow ref count. */
return (ret);
break;
default:
}
break;
case P_IRECNO:
case P_LRECNO:
return (DB_NEEDSPLIT);
/* Add a new record for the right page. */
return (ret);
break;
default:
}
/* Adjust the parent page's left page record count. */
/* Log the change. */
if (DB_LOGGING(dbc) &&
return (ret);
/* Update the left page count. */
else
}
return (0);
}
/*
* __bam_psplit --
* Do the real work of splitting the page.
*/
static int
{
/*
* If we're splitting the first (last) page on a level because we're
* inserting (appending) a key to it, it's likely that the data is
* sorted. Moving a single item to the new page is less work and can
* push the fill factor higher than normal. If we're wrong it's not
* a big deal, we'll just do the split the right way next time.
*/
off = 0;
if (off != 0)
goto sort;
/*
* Split the data to the left and right pages. Try not to split on
* an overflow key. (Overflow keys on internal pages will slow down
* searches.) Refuse to split in the middle of a set of duplicates.
*
* First, find the optimum place to split.
*
* It's possible to try and split past the last record on the page if
* there's a very large record at the end of the page. Make sure this
* doesn't happen by bounding the check at the next-to-last entry on
* the page.
*
* Note, we try and split half the data present on the page. This is
* because another process may have already split the page and left
* it half empty. We don't try and skip the split -- we don't know
* how much space we're going to need on the page, and we may need up
* to half the page for a big item, so there's no easy test to decide
* if we need to split or not. Besides, if two threads are inserting
* data into the same place in the database, we're probably going to
* need more space soon anyway.
*/
case P_IBTREE:
nbytes +=
else
break;
case P_LBTREE:
nbytes +=
else
nbytes += BOVERFLOW_SIZE;
++off;
nbytes +=
else
nbytes += BOVERFLOW_SIZE;
break;
case P_IRECNO:
nbytes += RINTERNAL_SIZE;
break;
case P_LRECNO:
break;
default:
}
/*
* Splitp is either at or just past the optimum split point. If
* it's a big key, try and find something close by that's not.
*/
else
isbigkey = 0;
if (isbigkey)
break;
}
continue;
break;
}
}
/*
* We can't split in the middle a set of duplicates. We know that
* no duplicate set can take up more than about 25% of the page,
* because that's the point where we push it off onto a duplicate
* page set. So, this loop can't be unbounded.
*/
break;
}
continue;
break;
}
}
/* We're going to split at splitp. */
return (ret);
return (ret);
return (0);
}
/*
* __bam_copy --
* Copy a set of records from one page to another.
*
* PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
*/
int
{
/*
* Copy the rest of the data to the right page. Nxt is the next
* offset placed on the target page.
*/
case P_IBTREE:
nbytes =
else
break;
case P_LBTREE:
/*
* If we're on a key and it's a duplicate, just copy
* the offset.
*/
continue;
}
/* FALLTHROUGH */
case P_LRECNO:
nbytes =
else
break;
case P_IRECNO:
break;
default:
}
}
return (0);
}