/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software. All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995, 1996
 *	Keith Bostic. All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Mike Olson.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)bt_search.c 10.25 (Sleepycat) 12/16/98";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#endif

#include "db_int.h"
#include "db_page.h"
#include "btree.h"

/*
 * __bam_search --
 *	Search a btree for a key.
 *
 * PUBLIC: int __bam_search __P((DBC *,
 * PUBLIC:    const DBT *, u_int32_t, int, db_recno_t *, int *));
 */
int
__bam_search(dbc, key, flags, stop, recnop, exactp)
	DBC *dbc;
	const DBT *key;
	u_int32_t flags;
	int stop, *exactp;
	db_recno_t *recnop;
{
	BTREE *t;
	CURSOR *cp;
	DB *dbp;
	DB_LOCK lock;
	PAGE *h;
	db_indx_t base, i, indx, lim;
	db_pgno_t pg;
	db_recno_t recno;
	int cmp, jump, ret, stack;

	dbp = dbc->dbp;
	cp = dbc->internal;
	t = dbp->internal;
	recno = 0;

	BT_STK_CLR(cp);

	/*
	 * There are several ways we search a btree. The flags argument
	 * specifies if we're acquiring read or write locks, if we position
	 * to the first or last item in a set of duplicates, if we return
	 * deleted items, and if we are locking pairs of pages. In addition,
	 * if we're modifying record numbers, we have to lock the entire tree
	 * regardless. See btree.h for more details.
	 *
	 * If write-locking pages, we need to know whether or not to acquire a
	 * write lock on a page before getting it. This depends on how deep
	 * the page is in the tree, which we don't know until we acquire the
	 * root page. So, if we need to lock the root page we may have to
	 * upgrade it later, because we won't get the correct lock initially.
	 *
	 * Retrieve the root page.
	 */
	pg = PGNO_ROOT;
	stack = F_ISSET(dbp, DB_BT_RECNUM) && LF_ISSET(S_STACK);
	if ((ret = __bam_lget(dbc,
	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
		return (ret);
	if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
		(void)__BT_LPUT(dbc, lock);
		return (ret);
	}

	/*
	 * Decide if we need to save this page; if we do, write lock it.
	 * We deliberately don't lock-couple on this call. If the tree
	 * is tiny, i.e., one page, and two threads are busily updating
	 * the root page, we're almost guaranteed deadlocks galore, as
	 * each one gets a read lock and then blocks the other's attempt
	 * for a write lock.
	 */
	if (!stack &&
	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
		(void)memp_fput(dbp->mpf, h, 0);
		(void)__BT_LPUT(dbc, lock);
		if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
			return (ret);
		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
			(void)__BT_LPUT(dbc, lock);
			return (ret);
		}
		stack = 1;
	}

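	/*
	 * Walk down the tree: binary search each page for the key and
	 * descend through internal pages until we reach a leaf or the
	 * requested stopping level.
	 */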
	for (;;) {
		/*
		 * Do a binary search on the current page. If we're searching
		 * a leaf page, we have to manipulate the indices in groups of
		 * two. If we're searching an internal page, they're an index
		 * per page item. If we find an exact match on a leaf page,
		 * we're done.
		 */
		jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
		for (base = 0,
		    lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) {
			indx = base + ((lim >> 1) * jump);
			if ((cmp =
			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) == 0) {
				if (TYPE(h) == P_LBTREE)
					goto match;
				goto next;
			}
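			/*
			 * The key sorts after the entry at indx: move base
			 * past that entry and shrink the range; otherwise
			 * leave base alone and let the loop halve lim.
			 */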
			if (cmp > 0) {
				base = indx + jump;
				--lim;
			}
		}

		/*
		 * No match found. Base is the smallest index greater than
		 * key and may be zero or a last + O_INDX index.
		 *
		 * If it's a leaf page, return base as the "found" value.
		 * Delete only deletes exact matches.
		 */
		if (TYPE(h) == P_LBTREE) {
			*exactp = 0;

			if (LF_ISSET(S_EXACT))
				goto notfound;

			/*
			 * !!!
			 * Possibly returning a deleted record -- DB_SET_RANGE,
			 * DB_KEYFIRST and DB_KEYLAST don't require an exact
			 * match, and we don't want to walk multiple pages here
			 * to find an undeleted record. This is handled in the
			 * __bam_c_search() routine.
			 */
			BT_STK_ENTER(cp, h, base, lock, ret);
			return (ret);
		}

		/*
		 * If it's not a leaf page, record the internal page (which is
		 * a parent page for the key). Decrement the base by 1 if it's
		 * non-zero so that if a split later occurs, the inserted page
		 * will be to the right of the saved page.
		 */
		indx = base > 0 ? base - O_INDX : base;

		/*
		 * If we're trying to calculate the record number, sum up
		 * all the record numbers on this page up to the indx point.
		 */
		if (recnop != NULL)
			for (i = 0; i < indx; ++i)
				recno += GET_BINTERNAL(h, i)->nrecs;

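		/*
		 * Descend to the child page referenced by the chosen
		 * internal entry at indx.
		 */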
next:		pg = GET_BINTERNAL(h, indx)->pgno;
		if (stack) {
			/* Return if this is the lowest page wanted. */
			if (LF_ISSET(S_PARENT) && stop == h->level) {
				BT_STK_ENTER(cp, h, indx, lock, ret);
				return (ret);
			}
			BT_STK_PUSH(cp, h, indx, lock, ret);
			if (ret != 0)
				goto err;

			if ((ret =
			    __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
				goto err;
		} else {
			/*
			 * Decide if we want to return a reference to the next
			 * page in the return stack. If so, lock it and never
			 * unlock it.
			 */
			if ((LF_ISSET(S_PARENT) &&
			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
			    (h->level - 1) == LEAFLEVEL)
				stack = 1;

			(void)memp_fput(dbp->mpf, h, 0);

			if ((ret =
			    __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ?
			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
				goto err;
		}
		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
			goto err;
	}
	/* NOTREACHED */

match:	*exactp = 1;

	/*
	 * If we're trying to calculate the record number, add in the
	 * offset on this page and correct for the fact that records
	 * in the tree are 0-based.
	 */
	if (recnop != NULL)
		*recnop = recno + (indx / P_INDX) + 1;

	/*
	 * If we got here, we know that we have a btree leaf page.
	 *
	 * If there are duplicates, go to the first/last one. This is
	 * safe because we know that we're not going to leave the page:
	 * all duplicate sets that are not on overflow pages exist on a
	 * single leaf page.
	 */
	if (LF_ISSET(S_DUPLAST))
		while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
		    h->inp[indx] == h->inp[indx + P_INDX])
			indx += P_INDX;
	else
		while (indx > 0 &&
		    h->inp[indx] == h->inp[indx - P_INDX])
			indx -= P_INDX;

	/*
	 * Now check if we are allowed to return deleted items; if not,
	 * find the next (or previous) non-deleted item.
	 */
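	/*
	 * The data item paired with the key at indx immediately follows
	 * it on the page, hence the indx + O_INDX references below.
	 */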
	if (LF_ISSET(S_DELNO)) {
		if (LF_ISSET(S_DUPLAST))
			while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) &&
			    indx > 0 &&
			    h->inp[indx] == h->inp[indx - P_INDX])
				indx -= P_INDX;
		else
			while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) &&
			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
			    h->inp[indx] == h->inp[indx + P_INDX])
				indx += P_INDX;

		if (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type))
			goto notfound;
	}

	BT_STK_ENTER(cp, h, indx, lock, ret);
	return (ret);

notfound:
	(void)memp_fput(dbp->mpf, h, 0);
	(void)__BT_LPUT(dbc, lock);
	ret = DB_NOTFOUND;

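	/*
	 * An error or missed key: discard any pages and locks still held
	 * on the search stack before returning.
	 */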
err:	if (cp->csp > cp->sp) {
		BT_STK_POP(cp);
		__bam_stkrel(dbc, 0);
	}
	return (ret);
}

/*
 * __bam_stkrel --
 *	Release all pages currently held in the stack.
 *
 * PUBLIC: int __bam_stkrel __P((DBC *, int));
 */
int
__bam_stkrel(dbc, nolocks)
	DBC *dbc;
	int nolocks;
{
	CURSOR *cp;
	DB *dbp;
	EPG *epg;

	dbp = dbc->dbp;
	cp = dbc->internal;

	/* Release inner pages first. */
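	/*
	 * When nolocks is set the locks are discarded outright; otherwise
	 * they are released through the transaction-aware path, which may
	 * hold locks owned by a transaction until it commits.
	 */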
	for (epg = cp->sp; epg <= cp->csp; ++epg) {
		if (epg->page != NULL)
			(void)memp_fput(dbp->mpf, epg->page, 0);
		if (epg->lock != LOCK_INVALID)
			if (nolocks)
				(void)__BT_LPUT(dbc, epg->lock);
			else
				(void)__BT_TLPUT(dbc, epg->lock);
	}

	/* Clear the stack; all pages have been released. */
	BT_STK_CLR(cp);

	return (0);
}

/*
 * __bam_stkgrow --
 *	Grow the stack.
 *
 * PUBLIC: int __bam_stkgrow __P((CURSOR *));
 */
int
__bam_stkgrow(cp)
	CURSOR *cp;
{
	EPG *p;
	size_t entries;
	int ret;

	entries = cp->esp - cp->sp;

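	/*
	 * Double the stack: allocate a new array twice the current size,
	 * copy the existing entries into it, and free the old array unless
	 * it is the cursor's built-in stack (cp->stack).
	 */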
	if ((ret = __os_calloc(entries * 2, sizeof(EPG), &p)) != 0)
		return (ret);
	memcpy(p, cp->sp, entries * sizeof(EPG));
	if (cp->sp != cp->stack)
		__os_free(cp->sp, entries * sizeof(EPG));
	cp->sp = p;
	cp->csp = p + entries;
	cp->esp = p + entries * 2;
	return (0);
}