db/db/db_region.c

/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *  Sleepycat Software.  All rights reserved.
 */
1N/A
1N/A#include "config.h"
1N/A
1N/A#ifndef lint
1N/Astatic const char sccsid[] = "@(#)db_region.c   10.53 (Sleepycat) 11/10/98";
1N/A#endif /* not lint */
1N/A
1N/A#ifndef NO_SYSTEM_INCLUDES
1N/A#include <sys/types.h>
1N/A
1N/A#include <errno.h>
1N/A#include <string.h>
1N/A#include <unistd.h>
1N/A#endif
1N/A
1N/A#include "db_int.h"
1N/A#include "common_ext.h"
1N/A
1N/Astatic int __db_growregion __P((REGINFO *, size_t));
1N/A
1N/A/*
1N/A * __db_rattach --
1N/A *  Optionally create and attach to a shared memory region.
1N/A *
1N/A * PUBLIC: int __db_rattach __P((REGINFO *));
1N/A */
1N/Aint
1N/A__db_rattach(infop)
1N/A    REGINFO *infop;
1N/A{
1N/A    RLAYOUT *rlp, rl;
1N/A    size_t grow_region, size;
1N/A    ssize_t nr, nw;
1N/A    u_int32_t flags, mbytes, bytes;
1N/A    u_int8_t *p;
1N/A    int malloc_possible, ret, retry_cnt;
1N/A
1N/A    grow_region = 0;
1N/A    malloc_possible = 1;
1N/A    ret = retry_cnt = 0;
1N/A
1N/A    /* Round off the requested size to the next page boundary. */
1N/A    DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);
1N/A
1N/A    /* Some architectures have hard limits on the maximum region size. */
1N/A#ifdef DB_REGIONSIZE_MAX
1N/A    if (infop->size > DB_REGIONSIZE_MAX) {
1N/A        __db_err(infop->dbenv, "__db_rattach: cache size too large");
1N/A        return (EINVAL);
1N/A    }
1N/A#endif
1N/A
1N/A    /* Intialize the return information in the REGINFO structure. */
1N/Aloop:   infop->addr = NULL;
1N/A    infop->fd = -1;
1N/A    infop->segid = INVALID_SEGID;
1N/A    if (infop->name != NULL) {
1N/A        __os_freestr(infop->name);
1N/A        infop->name = NULL;
1N/A    }
1N/A    F_CLR(infop, REGION_CANGROW | REGION_CREATED);
1N/A
1N/A#ifndef HAVE_SPINLOCKS
1N/A    /*
1N/A     * XXX
1N/A     * Lacking spinlocks, we must have a file descriptor for fcntl(2)
1N/A     * locking, which implies using mmap(2) to map in a regular file.
1N/A     * (Theoretically, we could probably get a file descriptor to lock
1N/A     * other types of shared regions, but I don't see any reason to
1N/A     * bother.)
1N/A     *
1N/A     * Since we may be using shared memory regions, e.g., shmget(2),
1N/A     * and not mmap of regular files, the backing file may be only a
1N/A     * few tens of bytes in length.  So, this depends on the ability
1N/A     * to fcntl lock file offsets much larger than the physical file.
1N/A     */
1N/A    malloc_possible = 0;
1N/A#endif
1N/A
1N/A#ifdef __hppa
1N/A    /*
1N/A     * XXX
1N/A     * HP-UX won't permit mutexes to live in anything but shared memory.
1N/A     * Instantiate a shared region file on that architecture, regardless.
1N/A     */
1N/A    malloc_possible = 0;
1N/A#endif
1N/A    /*
1N/A     * If a region is truly private, malloc the memory.  That's faster
1N/A     * than either anonymous memory or a shared file.
1N/A     */
1N/A    if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
1N/A        if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
1N/A            return (ret);
1N/A
1N/A        /*
1N/A         * It's sometimes significantly faster to page-fault in all of
1N/A         * the region's pages before we run the application, as we see
1N/A         * nasty side-effects when we page-fault while holding various
1N/A         * locks, i.e., the lock takes a long time to acquire because
1N/A         * of the underlying page fault, and the other threads convoy
1N/A         * behind the lock holder.
1N/A         */
1N/A        if (DB_GLOBAL(db_region_init))
1N/A            for (p = infop->addr;
1N/A                p < (u_int8_t *)infop->addr + infop->size;
1N/A                p += DB_VMPAGESIZE)
1N/A                p[0] = '\0';
1N/A
1N/A        F_SET(infop, REGION_CREATED | REGION_MALLOC);
1N/A        goto region_init;
1N/A    }
1N/A
1N/A    /*
1N/A     * Get the name of the region (creating the file if a temporary file
1N/A     * is being used).  The dbenv contains the current DB environment,
1N/A     * including naming information.  The path argument may be a file or
1N/A     * a directory.  If path is a directory, it must exist and file is the
1N/A     * file name to be created inside the directory.  If path is a file,
1N/A     * then file must be NULL.
1N/A     */
1N/A    if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
1N/A        infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
1N/A        return (ret);
1N/A    if (infop->fd != -1)
1N/A        F_SET(infop, REGION_CREATED);
1N/A
1N/A    /*
1N/A     * Try to create the file, if we have authority.  We have to make sure
1N/A     * that multiple threads/processes attempting to simultaneously create
1N/A     * the region are properly ordered, so we open it using DB_CREATE and
1N/A     * DB_EXCL, so two attempts to create the region will return failure in
1N/A     * one.
1N/A     */
1N/A    if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
1N/A        flags = infop->dbflags;
1N/A        LF_SET(DB_EXCL);
1N/A        if ((ret = __db_open(infop->name,
1N/A            flags, flags, infop->mode, &infop->fd)) == 0)
1N/A            F_SET(infop, REGION_CREATED);
1N/A        else
1N/A            if (ret != EEXIST)
1N/A                goto errmsg;
1N/A    }
1N/A
1N/A    /* If we couldn't create the file, try and open it. */
1N/A    if (infop->fd == -1) {
1N/A        flags = infop->dbflags;
1N/A        LF_CLR(DB_CREATE | DB_EXCL);
1N/A        if ((ret = __db_open(infop->name,
1N/A            flags, flags, infop->mode, &infop->fd)) != 0)
1N/A            goto errmsg;
1N/A    }
1N/A
1N/A    /*
1N/A     * There are three cases we support:
1N/A     *    1. Named anonymous memory (shmget(2)).
1N/A     *    2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
1N/A     *    3. Memory backed by a regular file (mmap(2)).
1N/A     *
1N/A     * We instantiate a backing file in all cases, which contains at least
1N/A     * the RLAYOUT structure, and in case #3, contains the actual region.
1N/A     * This is necessary for a couple of reasons:
1N/A     *
1N/A     * First, the mpool region uses temporary files to name regions, and
1N/A     * since you may have multiple regions in the same directory, we need
1N/A     * a filesystem name to ensure that they don't collide.
1N/A     *
1N/A     * Second, applications are allowed to forcibly remove regions, even
1N/A     * if they don't know anything about them other than the name.  If a
1N/A     * region is backed by anonymous memory, there has to be some way for
1N/A     * the application to find out that information, and, in some cases,
1N/A     * determine ID information for the anonymous memory.
1N/A     */
1N/A    if (F_ISSET(infop, REGION_CREATED)) {
1N/A        /*
1N/A         * If we're using anonymous memory to back this region, set
1N/A         * the flag.
1N/A         */
1N/A        if (DB_GLOBAL(db_region_anon))
1N/A            F_SET(infop, REGION_ANONYMOUS);
1N/A
1N/A        /*
1N/A         * If we're using a regular file to back a region we created,
1N/A         * grow it to the specified size.
1N/A         */
1N/A        if (!DB_GLOBAL(db_region_anon) &&
1N/A            (ret = __db_growregion(infop, infop->size)) != 0)
1N/A            goto err;
1N/A    } else {
1N/A        /*
1N/A         * If we're joining a region, figure out what it looks like.
1N/A         *
1N/A         * XXX
1N/A         * We have to figure out if the file is a regular file backing
1N/A         * a region that we want to map into our address space, or a
1N/A         * file with the information we need to find a shared anonymous
1N/A         * region that we want to map into our address space.
1N/A         *
1N/A         * All this noise is because some systems don't have a coherent
1N/A         * VM and buffer cache, and worse, if you mix operations on the
1N/A         * VM and buffer cache, half the time you hang the system.
1N/A         *
1N/A         * There are two possibilities.  If the file is the size of an
1N/A         * RLAYOUT structure, then we know that the real region is in
1N/A         * shared memory, because otherwise it would be bigger.  (As
1N/A         * the RLAYOUT structure size is smaller than a disk sector,
1N/A         * the only way it can be this size is if deliberately written
1N/A         * that way.)  In which case, retrieve the information we need
1N/A         * from the RLAYOUT structure and use it to acquire the shared
1N/A         * memory.
1N/A         *
1N/A         * If the structure is larger than an RLAYOUT structure, then
1N/A         * the file is backing the shared memory region, and we use
1N/A         * the current size of the file without reading any information
1N/A         * from the file itself so that we don't confuse the VM.
1N/A         *
1N/A         * And yes, this makes me want to take somebody and kill them,
1N/A         * but I can't think of any other solution.
1N/A         */
1N/A        if ((ret = __os_ioinfo(infop->name,
1N/A            infop->fd, &mbytes, &bytes, NULL)) != 0)
1N/A            goto errmsg;
1N/A        size = mbytes * MEGABYTE + bytes;
1N/A
1N/A        if (size <= sizeof(RLAYOUT)) {
1N/A            /*
1N/A             * If the size is too small, the read fails or the
1N/A             * valid flag is incorrect, assume it's because the
1N/A             * RLAYOUT information hasn't been written out yet,
1N/A             * and retry.
1N/A             */
1N/A            if (size < sizeof(RLAYOUT))
1N/A                goto retry;
1N/A            if ((ret =
1N/A                __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
1N/A                goto retry;
1N/A            if (rl.valid != DB_REGIONMAGIC)
1N/A                goto retry;
1N/A
1N/A            /* Copy the size, memory id and characteristics. */
1N/A            size = rl.size;
1N/A            infop->segid = rl.segid;
1N/A            if (F_ISSET(&rl, REGION_ANONYMOUS))
1N/A                F_SET(infop, REGION_ANONYMOUS);
1N/A        }
1N/A
1N/A        /*
1N/A         * If the region is larger than we think, that's okay, use the
1N/A         * current size.  If it's smaller than we think, and we were
1N/A         * just using the default size, that's okay, use the current
1N/A         * size.  If it's smaller than we think and we really care,
1N/A         * save the size and we'll catch that further down -- we can't
1N/A         * correct it here because we have to have a lock to grow the
1N/A         * region.
1N/A         */
1N/A        if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
1N/A            grow_region = infop->size;
1N/A        infop->size = size;
1N/A    }
1N/A
1N/A    /*
1N/A     * Map the region into our address space.  If we're creating it, the
1N/A     * underlying routines will make it the right size.
1N/A     *
1N/A     * There are at least two cases where we can "reasonably" fail when
1N/A     * we attempt to map in the region.  On Windows/95, closing the last
1N/A     * reference to a region causes it to be zeroed out.  On UNIX, when
1N/A     * using the shmget(2) interfaces, the region will no longer exist
1N/A     * if the system was rebooted.  In these cases, the underlying map call
1N/A     * returns EAGAIN, and we *remove* our file and try again.  There are
1N/A     * obvious races in doing this, but it should eventually settle down
1N/A     * to a winner and then things should proceed normally.
1N/A     */
1N/A    if ((ret = __db_mapregion(infop->name, infop)) != 0)
1N/A        if (ret == EAGAIN) {
1N/A            /*
1N/A             * Pretend we created the region even if we didn't so
1N/A             * that our error processing unlinks it.
1N/A             */
1N/A            F_SET(infop, REGION_CREATED);
1N/A            ret = 0;
1N/A            goto retry;
1N/A        } else
1N/A            goto err;
1N/A
1N/Aregion_init:
1N/A    /*
1N/A     * Initialize the common region information.
1N/A     *
1N/A     * !!!
1N/A     * We have to order the region creates so that two processes don't try
1N/A     * to simultaneously create the region.  This is handled by using the
1N/A     * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
1N/A     *
1N/A     * We also have to order region joins so that processes joining regions
1N/A     * never see inconsistent data.  We'd like to play permissions games
1N/A     * with the backing file, but we can't because WNT filesystems won't
1N/A     * open a file mode 0.
1N/A     */
1N/A    rlp = (RLAYOUT *)infop->addr;
1N/A    if (F_ISSET(infop, REGION_CREATED)) {
1N/A        /*
1N/A         * The process creating the region acquires a lock before it
1N/A         * sets the valid flag.  Any processes joining the region will
1N/A         * check the valid flag before acquiring the lock.
1N/A         *
1N/A         * Check the return of __db_mutex_init() and __db_mutex_lock(),
1N/A         * even though we don't usually check elsewhere.  This is the
1N/A         * first lock we initialize and acquire, and we have to know if
1N/A         * it fails.  (It CAN fail, e.g., SunOS, when using fcntl(2)
1N/A         * for locking, with an in-memory filesystem specified as the
1N/A         * database home.)
1N/A         */
1N/A        if ((ret = __db_mutex_init(&rlp->lock,
1N/A            MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
1N/A            (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
1N/A            goto err;
1N/A
1N/A        /* Initialize the remaining region information. */
1N/A        rlp->refcnt = 1;
1N/A        rlp->size = infop->size;
1N/A        db_version(&rlp->majver, &rlp->minver, &rlp->patch);
1N/A        rlp->panic = 0;
1N/A        rlp->segid = infop->segid;
1N/A        rlp->flags = 0;
1N/A        if (F_ISSET(infop, REGION_ANONYMOUS))
1N/A            F_SET(rlp, REGION_ANONYMOUS);
1N/A
1N/A        /*
1N/A         * Fill in the valid field last -- use a magic number, memory
1N/A         * may not be zero-filled, and we want to minimize the chance
1N/A         * for collision.
1N/A         */
1N/A        rlp->valid = DB_REGIONMAGIC;
1N/A
1N/A        /*
1N/A         * If the region is anonymous, write the RLAYOUT information
1N/A         * into the backing file so that future region join and unlink
1N/A         * calls can find it.
1N/A         *
1N/A         * XXX
1N/A         * We MUST do the seek before we do the write.  On Win95, while
1N/A         * closing the last reference to an anonymous shared region
1N/A         * doesn't discard the region, it does zero it out.  So, the
1N/A         * REGION_CREATED may be set, but the file may have already
1N/A         * been written and the file descriptor may be at the end of
1N/A         * the file.
1N/A         */
1N/A        if (F_ISSET(infop, REGION_ANONYMOUS)) {
1N/A            if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
1N/A                goto err;
1N/A            if ((ret =
1N/A                __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
1N/A                goto err;
1N/A        }
1N/A    } else {
1N/A        /* Check to see if the region has had catastrophic failure. */
1N/A        if (rlp->panic) {
1N/A            ret = DB_RUNRECOVERY;
1N/A            goto err;
1N/A        }
1N/A
1N/A        /*
1N/A         * Check the valid flag to ensure the region is initialized.
1N/A         * If the valid flag has not been set, the mutex may not have
1N/A         * been initialized, and an attempt to get it could lead to
1N/A         * random behavior.
1N/A         */
1N/A        if (rlp->valid != DB_REGIONMAGIC)
1N/A            goto retry;
1N/A
1N/A        /* Get the region lock. */
1N/A        (void)__db_mutex_lock(&rlp->lock, infop->fd);
1N/A
1N/A        /*
1N/A         * We now own the region.  There are a couple of things that
1N/A         * may have gone wrong, however.
1N/A         *
1N/A         * Problem #1: while we were waiting for the lock, the region
1N/A         * was deleted.  Detected by re-checking the valid flag, since
1N/A         * it's cleared by the delete region routines.
1N/A         */
1N/A        if (rlp->valid != DB_REGIONMAGIC) {
1N/A            (void)__db_mutex_unlock(&rlp->lock, infop->fd);
1N/A            goto retry;
1N/A        }
1N/A
1N/A        /*
1N/A         * Problem #3: when we checked the size of the file, it was
1N/A         * still growing as part of creation.  Detected by the fact
1N/A         * that infop->size isn't the same size as the region.
1N/A         */
1N/A        if (infop->size != rlp->size) {
1N/A            (void)__db_mutex_unlock(&rlp->lock, infop->fd);
1N/A            goto retry;
1N/A        }
1N/A
1N/A        /* Increment the reference count. */
1N/A        ++rlp->refcnt;
1N/A    }
1N/A
1N/A    /* Return the region in a locked condition. */
1N/A
1N/A    if (0) {
1N/Aerrmsg:     __db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
1N/A
1N/Aerr:
1N/Aretry:      /* Discard the region. */
1N/A        if (infop->addr != NULL) {
1N/A            (void)__db_unmapregion(infop);
1N/A            infop->addr = NULL;
1N/A        }
1N/A
1N/A        /* Discard the backing file. */
1N/A        if (infop->fd != -1) {
1N/A            (void)__os_close(infop->fd);
1N/A            infop->fd = -1;
1N/A
1N/A            if (F_ISSET(infop, REGION_CREATED))
1N/A                (void)__os_unlink(infop->name);
1N/A        }
1N/A
1N/A        /* Discard the name. */
1N/A        if (infop->name != NULL) {
1N/A            __os_freestr(infop->name);
1N/A            infop->name = NULL;
1N/A        }
1N/A
1N/A        /*
1N/A         * If we had a temporary error, wait a few seconds and
1N/A         * try again.
1N/A         */
1N/A        if (ret == 0) {
1N/A            if (++retry_cnt <= 3) {
1N/A                __os_sleep(retry_cnt * 2, 0);
1N/A                goto loop;
1N/A            }
1N/A            ret = EAGAIN;
1N/A        }
1N/A    }
1N/A
1N/A    /*
1N/A     * XXX
1N/A     * HP-UX won't permit mutexes to live in anything but shared memory.
1N/A     * Instantiate a shared region file on that architecture, regardless.
1N/A     *
1N/A     * XXX
1N/A     * There's a problem in cleaning this up on application exit, or on
1N/A     * application failure.  If an application opens a database without
1N/A     * an environment, we create a temporary backing mpool region for it.
1N/A     * That region is marked REGION_PRIVATE, but as HP-UX won't permit
1N/A     * mutexes to live in anything but shared memory, we instantiate a
1N/A     * real file plus a memory region of some form.  If the application
1N/A     * crashes, the necessary information to delete the backing file and
1N/A     * any system region (e.g., the shmget(2) segment ID) is no longer
1N/A     * available.  We can't completely fix the problem, but we try.
1N/A     *
1N/A     * The underlying UNIX __db_mapregion() code preferentially uses the
1N/A     * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
1N/A     * that are marked REGION_PRIVATE.  This means that we normally aren't
1N/A     * holding any system resources when we get here, in which case we can
1N/A     * delete the backing file.  This results in a short race, from the
1N/A     * __db_open() call above to here.
1N/A     *
1N/A     * If, for some reason, we are holding system resources when we get
1N/A     * here, we don't have any choice -- we can't delete the backing file
1N/A     * because we may need it to detach from the resources.  Set the
1N/A     * REGION_LASTDETACH flag, so that we do all necessary cleanup when
1N/A     * the application closes the region.
1N/A     */
1N/A    if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
1N/A        if (F_ISSET(infop, REGION_HOLDINGSYS))
1N/A            F_SET(infop, REGION_LASTDETACH);
1N/A        else {
1N/A            F_SET(infop, REGION_REMOVED);
1N/A            F_CLR(infop, REGION_CANGROW);
1N/A
1N/A            (void)__os_close(infop->fd);
1N/A            (void)__os_unlink(infop->name);
1N/A        }
1N/A
1N/A    return (ret);
1N/A}
1N/A
1N/A/*
1N/A * __db_rdetach --
1N/A *  De-attach from a shared memory region.
1N/A *
1N/A * PUBLIC: int __db_rdetach __P((REGINFO *));
1N/A */
1N/Aint
1N/A__db_rdetach(infop)
1N/A    REGINFO *infop;
1N/A{
1N/A    RLAYOUT *rlp;
1N/A    int detach, ret, t_ret;
1N/A
1N/A    ret = 0;
1N/A
1N/A    /*
1N/A     * If the region was removed when it was created, no further action
1N/A     * is required.
1N/A     */
1N/A    if (F_ISSET(infop, REGION_REMOVED))
1N/A        goto done;
1N/A    /*
1N/A     * If the region was created in memory returned by malloc, the only
1N/A     * action required is freeing the memory.
1N/A     */
1N/A    if (F_ISSET(infop, REGION_MALLOC)) {
1N/A        __os_free(infop->addr, 0);
1N/A        goto done;
1N/A    }
1N/A
1N/A    /* Otherwise, attach to the region and optionally delete it. */
1N/A    rlp = infop->addr;
1N/A
1N/A    /* Get the lock. */
1N/A    (void)__db_mutex_lock(&rlp->lock, infop->fd);
1N/A
1N/A    /* Decrement the reference count. */
1N/A    if (rlp->refcnt == 0)
1N/A        __db_err(infop->dbenv,
1N/A            "region rdetach: reference count went to zero!");
1N/A    else
1N/A        --rlp->refcnt;
1N/A
1N/A    /*
1N/A     * If we're going to remove the region, clear the valid flag so
1N/A     * that any region join that's blocked waiting for us will know
1N/A     * what happened.
1N/A     */
1N/A    detach = 0;
1N/A    if (F_ISSET(infop, REGION_LASTDETACH))
1N/A        if (rlp->refcnt == 0) {
1N/A            detach = 1;
1N/A            rlp->valid = 0;
1N/A        } else
1N/A            ret = EBUSY;
1N/A
1N/A    /* Release the lock. */
1N/A    (void)__db_mutex_unlock(&rlp->lock, infop->fd);
1N/A
1N/A    /* Close the backing file descriptor. */
1N/A    (void)__os_close(infop->fd);
1N/A    infop->fd = -1;
1N/A
1N/A    /* Discard our mapping of the region. */
1N/A    if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
1N/A        ret = t_ret;
1N/A
1N/A    /* Discard the region itself. */
1N/A    if (detach) {
1N/A        if ((t_ret =
1N/A            __db_unlinkregion(infop->name, infop) != 0) && ret == 0)
1N/A            ret = t_ret;
1N/A        if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0)
1N/A            ret = t_ret;
1N/A    }
1N/A
1N/Adone:   /* Discard the name. */
1N/A    if (infop->name != NULL) {
1N/A        __os_freestr(infop->name);
1N/A        infop->name = NULL;
1N/A    }
1N/A
1N/A    return (ret);
1N/A}
1N/A
1N/A/*
1N/A * __db_runlink --
1N/A *  Remove a region.
1N/A *
1N/A * PUBLIC: int __db_runlink __P((REGINFO *, int));
1N/A */
1N/Aint
1N/A__db_runlink(infop, force)
1N/A    REGINFO *infop;
1N/A    int force;
1N/A{
1N/A    RLAYOUT rl, *rlp;
1N/A    size_t size;
1N/A    ssize_t nr;
1N/A    u_int32_t mbytes, bytes;
1N/A    int fd, ret, t_ret;
1N/A    char *name;
1N/A
1N/A    /*
1N/A     * XXX
1N/A     * We assume that we've created a new REGINFO structure for this
1N/A     * call, not used one that was already initialized.  Regardless,
1N/A     * if anyone is planning to use it after we're done, they're going
1N/A     * to be sorely disappointed.
1N/A     *
1N/A     * If force isn't set, we attach to the region, set a flag to delete
1N/A     * the region on last close, and let the region delete code do the
1N/A     * work.
1N/A     */
1N/A    if (!force) {
1N/A        if ((ret = __db_rattach(infop)) != 0)
1N/A            return (ret);
1N/A
1N/A        rlp = (RLAYOUT *)infop->addr;
1N/A        (void)__db_mutex_unlock(&rlp->lock, infop->fd);
1N/A
1N/A        F_SET(infop, REGION_LASTDETACH);
1N/A
1N/A        return (__db_rdetach(infop));
1N/A    }
1N/A
1N/A    /*
1N/A     * Otherwise, we don't want to attach to the region.  We may have been
1N/A     * called to clean up if a process died leaving a region locked and/or
1N/A     * corrupted, which could cause the attach to hang.
1N/A     */
1N/A    if ((ret = __db_appname(infop->dbenv, infop->appname,
1N/A        infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
1N/A        return (ret);
1N/A
1N/A    /*
1N/A     * An underlying file is created for all regions other than private
1N/A     * (REGION_PRIVATE) ones, regardless of whether or not it's used to
1N/A     * back the region.  If that file doesn't exist, we're done.
1N/A     */
1N/A    if (__os_exists(name, NULL) != 0) {
1N/A        __os_freestr(name);
1N/A        return (0);
1N/A    }
1N/A
1N/A    /*
1N/A     * See the comments in __db_rattach -- figure out if this is a regular
1N/A     * file backing a region or if it's a regular file with information
1N/A     * about a region.
1N/A     */
1N/A    if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
1N/A        goto errmsg;
1N/A    if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
1N/A        goto errmsg;
1N/A    size = mbytes * MEGABYTE + bytes;
1N/A
1N/A    if (size <= sizeof(RLAYOUT)) {
1N/A        if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
1N/A            goto errmsg;
1N/A        if (rl.valid != DB_REGIONMAGIC) {
1N/A            __db_err(infop->dbenv,
1N/A                "%s: illegal region magic number", name);
1N/A            ret = EINVAL;
1N/A            goto err;
1N/A        }
1N/A
1N/A        /* Set the size, memory id and characteristics. */
1N/A        infop->size = rl.size;
1N/A        infop->segid = rl.segid;
1N/A        if (F_ISSET(&rl, REGION_ANONYMOUS))
1N/A            F_SET(infop, REGION_ANONYMOUS);
1N/A    } else {
1N/A        infop->size = size;
1N/A        infop->segid = INVALID_SEGID;
1N/A    }
1N/A
1N/A    /* Remove the underlying region. */
1N/A    ret = __db_unlinkregion(name, infop);
1N/A
1N/A    /*
1N/A     * Unlink the backing file.  Close the open file descriptor first,
1N/A     * because some architectures (e.g., Win32) won't unlink a file if
1N/A     * open file descriptors remain.
1N/A     */
1N/A    (void)__os_close(fd);
1N/A    if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
1N/A        ret = t_ret;
1N/A
1N/A    if (0) {
1N/Aerrmsg:     __db_err(infop->dbenv, "%s: %s", name, strerror(ret));
1N/Aerr:        (void)__os_close(fd);
1N/A    }
1N/A
1N/A    __os_freestr(name);
1N/A    return (ret);
1N/A}
1N/A
1N/A/*
1N/A * __db_rgrow --
1N/A *  Extend a region.
1N/A *
1N/A * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
1N/A */
1N/Aint
1N/A__db_rgrow(infop, new_size)
1N/A    REGINFO *infop;
1N/A    size_t new_size;
1N/A{
1N/A    RLAYOUT *rlp;
1N/A    size_t increment;
1N/A    int ret;
1N/A
1N/A    /*
1N/A     * !!!
1N/A     * This routine MUST be called with the region already locked.
1N/A     */
1N/A
1N/A    /* The underlying routines have flagged if this region can grow. */
1N/A    if (!F_ISSET(infop, REGION_CANGROW))
1N/A        return (EINVAL);
1N/A
1N/A    /*
1N/A     * Round off the requested size to the next page boundary, and
1N/A     * determine the additional space required.
1N/A     */
1N/A    rlp = (RLAYOUT *)infop->addr;
1N/A    DB_ROUNDOFF(new_size, DB_VMPAGESIZE);
1N/A    increment = new_size - rlp->size;
1N/A
1N/A    if ((ret = __db_growregion(infop, increment)) != 0)
1N/A        return (ret);
1N/A
1N/A    /* Update the on-disk region size. */
1N/A    rlp->size = new_size;
1N/A
1N/A    /* Detach from and reattach to the region. */
1N/A    return (__db_rreattach(infop, new_size));
1N/A}
1N/A
1N/A/*
1N/A * __db_growregion --
1N/A *  Grow a shared memory region.
1N/A */
1N/Astatic int
1N/A__db_growregion(infop, increment)
1N/A    REGINFO *infop;
1N/A    size_t increment;
1N/A{
1N/A    db_pgno_t pages;
1N/A    size_t i;
1N/A    ssize_t nr, nw;
1N/A    u_int32_t relative;
1N/A    int ret;
1N/A    char buf[DB_VMPAGESIZE];
1N/A
1N/A    /* Seek to the end of the region. */
1N/A    if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
1N/A        goto err;
1N/A
1N/A    /* Write nuls to the new bytes. */
1N/A    memset(buf, 0, sizeof(buf));
1N/A
1N/A    /*
1N/A     * Some systems require that all of the bytes of the region be
1N/A     * written before it can be mapped and accessed randomly, and
1N/A     * other systems don't zero out the pages.
1N/A     */
1N/A    if (__db_mapinit())
1N/A        /* Extend the region by writing each new page. */
1N/A        for (i = 0; i < increment; i += DB_VMPAGESIZE) {
1N/A            if ((ret =
1N/A                __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
1N/A                goto err;
1N/A            if (nw != sizeof(buf))
1N/A                goto eio;
1N/A        }
1N/A    else {
1N/A        /*
1N/A         * Extend the region by writing the last page.  If the region
1N/A         * is >4Gb, increment may be larger than the maximum possible
1N/A         * seek "relative" argument, as it's an unsigned 32-bit value.
1N/A         * Break the offset into pages of 1MB each so that we don't
1N/A         * overflow (2^20 + 2^32 is bigger than any memory I expect
1N/A         * to see for awhile).
1N/A         */
1N/A        pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
1N/A        relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
1N/A        if ((ret = __os_seek(infop->fd,
1N/A            MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
1N/A            goto err;
1N/A        if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
1N/A            goto err;
1N/A        if (nw != sizeof(buf))
1N/A            goto eio;
1N/A
1N/A        /*
1N/A         * It's sometimes significantly faster to page-fault in all of
1N/A         * the region's pages before we run the application, as we see
1N/A         * nasty side-effects when we page-fault while holding various
1N/A         * locks, i.e., the lock takes a long time to acquire because
1N/A         * of the underlying page fault, and the other threads convoy
1N/A         * behind the lock holder.
1N/A         *
1N/A         * We also use REGION_INIT to guarantee that there is enough
1N/A         * disk space for the region, so we also write a byte to each
1N/A         * page.  Reading the byte is insufficient as some systems
1N/A         * (e.g., Solaris) do not instantiate disk pages to satisfy
1N/A         * a read, and so we don't know if there is enough disk space
1N/A         * or not.
1N/A         */
1N/A        if (DB_GLOBAL(db_region_init)) {
1N/A            pages = increment / MEGABYTE;
1N/A            relative = increment % MEGABYTE;
1N/A            if ((ret = __os_seek(infop->fd,
1N/A                MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
1N/A                goto err;
1N/A
1N/A            /* Write a byte to each page. */
1N/A            for (i = 0; i < increment; i += DB_VMPAGESIZE) {
1N/A                if ((ret =
1N/A                    __os_write(infop->fd, buf, 1, &nr)) != 0)
1N/A                    goto err;
1N/A                if (nr != 1)
1N/A                    goto eio;
1N/A                if ((ret = __os_seek(infop->fd,
1N/A                    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
1N/A                    goto err;
1N/A            }
1N/A        }
1N/A    }
1N/A    return (0);
1N/A
1N/Aeio:    ret = EIO;
1N/Aerr:    __db_err(infop->dbenv, "region grow: %s", strerror(ret));
1N/A    return (ret);
1N/A}
1N/A
1N/A/*
1N/A * __db_rreattach --
1N/A *  Detach from and reattach to a region.
1N/A *
1N/A * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
1N/A */
1N/Aint
1N/A__db_rreattach(infop, new_size)
1N/A    REGINFO *infop;
1N/A    size_t new_size;
1N/A{
1N/A    int ret;
1N/A
1N/A#ifdef DIAGNOSTIC
1N/A    if (infop->name == NULL) {
1N/A        __db_err(infop->dbenv, "__db_rreattach: name was NULL");
1N/A        return (EINVAL);
1N/A    }
1N/A#endif
1N/A    /*
1N/A     * If we're growing an already mapped region, we have to unmap it
1N/A     * and get it back.  We have it locked, so nobody else can get in,
1N/A     * which makes it fairly straight-forward to do, as everybody else
1N/A     * is going to block while we do the unmap/remap.  NB: if we fail
1N/A     * to get it back, the pooch is genuinely screwed, because we can
1N/A     * never release the lock we're holding.
1N/A     *
1N/A     * Detach from the region.  We have to do this first so architectures
1N/A     * that don't permit a file to be mapped into different places in the
1N/A     * address space simultaneously, e.g., HP's PaRisc, will work.
1N/A     */
1N/A    if ((ret = __db_unmapregion(infop)) != 0)
1N/A        return (ret);
1N/A
1N/A    /* Update the caller's REGINFO size to the new map size. */
1N/A    infop->size = new_size;
1N/A
1N/A    /* Attach to the region. */
1N/A    ret = __db_mapregion(infop->name, infop);
1N/A
1N/A    return (ret);
1N/A}