/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
*/
/*
* Soft partitioning metadevice driver (md_sp).
*
* This file contains the primary operations of the soft partitioning
* metadevice driver. This includes all routines for normal operation
* (open/close/read/write). Please see mdvar.h for a definition of
* metadevice operations vector (md_ops_t). This driver is loosely
* based on the stripe driver (md_stripe).
*
 * All metadevice administration is done through the use of ioctls.
* As such, all administrative routines appear in sp_ioctl.c.
*
* Soft partitions are represented both in-core and in the metadb with a
* unit structure. The soft partition-specific information in the unit
* structure includes the following information:
* - Device information (md_dev64_t & md key) about the device on which
* the soft partition is built.
* - Soft partition status information.
* - The size of the soft partition and number of extents used to
* make up that size.
 *	- An array of extents which define virtual/physical offset
* mappings and lengths for each extent.
*
* Typical soft partition operation proceeds as follows:
* - The unit structure is fetched from the metadb and placed into
* an in-core array (as with other metadevices). This operation
* is performed via sp_build_incore( ) and takes place during
* "snarfing" (when all metadevices are brought in-core at
* once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified against the soft
* partition on-disk structures. Additionally, the soft partition
* status is checked (only soft partitions in the OK state may be
* opened).
* - Soft partition I/O is performed via sp_strategy( ) which relies on
* a support routine, sp_mapbuf( ), to do most of the work.
* sp_mapbuf( ) maps a buffer to a particular extent via a binary
* search of the extent array in the soft partition unit structure.
* Once a translation has been performed, the I/O is passed down
* to the next layer, which may be another metadevice or a physical
* disk. Since a soft partition may contain multiple, non-contiguous
* extents, a single I/O may have to be fragmented.
* - Soft partitions are closed using sp_close.
*
*/
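/*
 * As an illustration of the extent mapping described above (the
 * numbers here are hypothetical, not taken from any real
 * configuration): a soft partition built from two extents on one
 * underlying device,
 *
 *	extent	un_voff	un_poff	un_len	(units: 512-byte blocks)
 *	     0	      0	    100	    50
 *	     1	     50	    300	    50
 *
 * presents a contiguous 100-block metadevice whose virtual blocks
 * 0-49 live at physical blocks 100-149 and whose virtual blocks
 * 50-99 live at physical blocks 300-349.  sp_mapbuf( ) performs this
 * virtual-to-physical translation for every I/O.
 */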
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
md_ops_t sp_md_ops;
#ifndef lint
md_ops_t *md_interface_ops = &sp_md_ops;
#endif
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern int md_status;
extern major_t md_major;
extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_sp_daemon;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern md_krwlock_t md_unit_array_rw;
extern clock_t md_hz;
static kmem_cache_t *sp_parent_cache = NULL;
static kmem_cache_t *sp_child_cache = NULL;
static void sp_send_stat_ok(mp_unit_t *);
static void sp_send_stat_err(mp_unit_t *);
/*
* FUNCTION: sp_parent_constructor()
 * INPUT:	p	- ptr to parent save structure.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize parent save structure (kmem cache constructor).
*/
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
mutex_init(&((md_spps_t *)p)->ps_mx,
NULL, MUTEX_DEFAULT, NULL);
return (0);
}
static void
sp_parent_init(md_spps_t *ps)
{
bzero(ps, offsetof(md_spps_t, ps_mx));
}
/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
mutex_destroy(&((md_spps_t *)p)->ps_mx);
}
/*
* FUNCTION: sp_child_constructor()
 * INPUT:	p	- ptr to child save structure.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize child save structure (kmem cache constructor).
*/
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
bioinit(&((md_spcs_t *)p)->cs_buf);
return (0);
}
static void
sp_child_init(md_spcs_t *cs)
{
cs->cs_mdunit = 0;
cs->cs_ps = NULL;
md_bioreset(&cs->cs_buf);
}
/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
biofini(&((md_spcs_t *)p)->cs_buf);
}
/*
* FUNCTION: sp_run_queue()
* INPUT: none.
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: run the md_daemon to clean up memory pool.
*/
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
if (!(md_status & MD_GBL_DAEMONS_LIVE))
md_daemon(1, &md_done_daemon);
}
/*
* FUNCTION: sp_build_incore()
* INPUT: p - ptr to unit structure.
* snarfing - flag to tell us we are snarfing.
 * OUTPUT:	none.
* RETURNS: int - 0 (always).
* PURPOSE: place unit structure into in-core unit array (keyed from
* minor number).
*/
int
sp_build_incore(void *p, int snarfing)
{
mp_unit_t *un = (mp_unit_t *)p;
minor_t mnum;
set_t setno;
md_dev64_t tmpdev;
mnum = MD_SID(un);
if (MD_UNIT(mnum) != NULL)
return (0);
MD_STATUS(un) = 0;
if (snarfing) {
/*
* if we are snarfing, we get the device information
* from the metadb record (using the metadb key for
* that device).
*/
setno = MD_MIN2SET(mnum);
tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
un->un_key, MD_NOTRUST_DEVT);
un->un_dev = tmpdev;
}
/* place various information in the in-core data structures */
md_nblocks_set(mnum, un->c.un_total_blocks);
MD_UNIT(mnum) = un;
return (0);
}
/*
* FUNCTION: reset_sp()
* INPUT: un - unit structure to be reset/removed.
* mnum - minor number to be reset/removed.
* removing - flag to tell us if we are removing
 *			  permanently or just resetting in-core
* structures.
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: used to either simply reset in-core structures or to
* permanently remove metadevices from the metadb.
*/
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
sv_dev_t *sv;
mddb_recid_t vtoc_id;
/* clean up in-core structures */
md_destroy_unit_incore(mnum, &sp_md_ops);
md_nblocks_set(mnum, -1ULL);
MD_UNIT(mnum) = NULL;
/*
* Attempt release of minor node
*/
md_remove_minor_node(mnum);
if (!removing)
return;
/* we are removing the soft partition from the metadb */
/*
* Save off device information so we can get to
* it after we do the mddb_deleterec().
*/
sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
sv->setno = MD_MIN2SET(mnum);
sv->key = un->un_key;
vtoc_id = un->c.un_vtoc_id;
/*
* Remove self from the namespace
*/
if (un->c.un_revision & MD_FN_META_DEV) {
(void) md_rem_selfname(un->c.un_self_id);
}
/* Remove the unit structure */
mddb_deleterec_wrapper(un->c.un_record_id);
if (vtoc_id)
mddb_deleterec_wrapper(vtoc_id);
SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
/*
* remove the underlying device name from the metadb. if other
* soft partitions are built on this device, this will simply
* decrease the reference count for this device. otherwise the
* name record for this device will be removed from the metadb.
*/
md_rem_names(sv, 1);
kmem_free(sv, sizeof (sv_dev_t));
}
/*
* FUNCTION: sp_send_stat_msg
* INPUT: un - unit reference
* status - status to be sent to master node
* MD_SP_OK - soft-partition is now OK
 *			  MD_SP_ERR - soft-partition is now errored
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: send a soft-partition status change to the master node. If the
* message succeeds we simply return. If it fails we panic as the
* cluster-wide view of the metadevices is now inconsistent.
* CALLING CONTEXT:
* Blockable. No locks can be held.
*/
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
md_mn_msg_sp_setstat_t sp_msg;
md_mn_kresult_t *kres;
set_t setno = MD_UN2SET(un);
int rval;
const char *str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
int nretries = 0;
sp_msg.sp_setstat_mnum = MD_SID(un);
sp_msg.sp_setstat_status = status;
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
spss_msg:
rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
0, (char *)&sp_msg, sizeof (sp_msg), kres);
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
		/* If commd cannot be contacted, wait for it to reappear. */
if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
while (!md_mn_is_commd_present()) {
delay(md_hz);
}
/*
* commd is available again. Retry the message once.
* If it fails we panic as the system is in an
* unexpected state.
*/
if (nretries++ == 0)
goto spss_msg;
}
/*
* Panic as we are now in an inconsistent state.
*/
cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
md_shortname(MD_SID(un)), str);
}
kmem_free(kres, sizeof (md_mn_kresult_t));
}
/*
* FUNCTION: sp_finish_error
 * INPUT:	ps	- parent save structure for errored I/O.
* lock_held - set if the unit readerlock is held
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: report a driver error
*/
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
struct buf *pb = ps->ps_bp;
mdi_unit_t *ui = ps->ps_ui;
md_dev64_t un_dev; /* underlying device */
md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */
char *str;
un_dev = md_expldev(ps->ps_un->un_dev);
/* set error type */
if (pb->b_flags & B_READ) {
str = "read";
} else {
str = "write";
}
SPPS_FREE(sp_parent_cache, ps);
pb->b_flags |= B_ERROR;
md_kstat_done(ui, pb, 0);
if (lock_held) {
md_unit_readerexit(ui);
}
md_biodone(pb);
cmn_err(CE_WARN, "md: %s: %s error on %s",
md_shortname(md_getminor(md_dev)), str,
md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}
/*
* FUNCTION: sp_xmit_ok
 * INPUT:	dq	- daemon queue referencing the ps structure
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be MD_SP_OK.
* CALLING CONTEXT:
* Blockable. No unit lock held.
*/
static void
sp_xmit_ok(daemon_queue_t *dq)
{
md_spps_t *ps = (md_spps_t *)dq;
	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
sp_send_stat_msg(ps->ps_un, MD_SP_OK);
/*
	 * Successfully transmitted ok state to all nodes, now release this
* parent structure.
*/
SPPS_FREE(sp_parent_cache, ps);
}
/*
* FUNCTION: sp_xmit_error
* INPUT: dq - daemon queue referencing failing ps structure
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be MD_SP_ERR.
* CALLING CONTEXT:
* Blockable. No unit lock held.
*/
static void
sp_xmit_error(daemon_queue_t *dq)
{
md_spps_t *ps = (md_spps_t *)dq;
	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
sp_send_stat_msg(ps->ps_un, MD_SP_ERR);
/*
* Successfully transmitted error state to all nodes, now release this
* parent structure.
*/
SPPS_FREE(sp_parent_cache, ps);
}
static void
sp_send_stat_ok(mp_unit_t *un)
{
minor_t mnum = MD_SID(un);
md_spps_t *ps;
ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
sp_parent_init(ps);
ps->ps_un = un;
ps->ps_ui = MDI_UNIT(mnum);
daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
REQ_OLD);
}
static void
sp_send_stat_err(mp_unit_t *un)
{
minor_t mnum = MD_SID(un);
md_spps_t *ps;
ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
sp_parent_init(ps);
ps->ps_un = un;
ps->ps_ui = MDI_UNIT(mnum);
daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
REQ_OLD);
}
/*
* FUNCTION: sp_error()
 * INPUT:	ps	- parent save structure for errored I/O.
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: report a driver error.
* CALLING CONTEXT:
* Interrupt - non-blockable
*/
static void
sp_error(md_spps_t *ps)
{
set_t setno = MD_UN2SET(ps->ps_un);
/*
* Drop the mutex associated with this request before (potentially)
* enqueuing the free onto a separate thread. We have to release the
* mutex before destroying the parent structure.
*/
if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
if (MUTEX_HELD(&ps->ps_mx)) {
mutex_exit(&ps->ps_mx);
}
} else {
/*
* this should only ever happen if we are panicking,
* since DONTFREE is only set on the parent if panicstr
* is non-NULL.
*/
ASSERT(panicstr);
}
/*
* For a multi-owner set we need to send a message to the master so that
* all nodes get the errored status when we first encounter it. To avoid
* deadlocking when multiple soft-partitions encounter an error on one
* physical unit we drop the unit readerlock before enqueueing the
* request. That way we can service any messages that require a
* writerlock to be held. Additionally, to avoid deadlocking when at
* the bottom of a metadevice stack and a higher level mirror has
* multiple requests outstanding on this soft-part, we clone the ps
* that failed and pass the error back up the stack to release the
* reference that this i/o may have in the higher-level metadevice.
* The other nodes in the cluster just have to modify the soft-part
* status and we do not need to block the i/o completion for this.
*/
if (MD_MNSET_SETNO(setno)) {
md_spps_t *err_ps;
err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
sp_parent_init(err_ps);
err_ps->ps_un = ps->ps_un;
err_ps->ps_ui = ps->ps_ui;
md_unit_readerexit(ps->ps_ui);
daemon_request(&md_sp_daemon, sp_xmit_error,
(daemon_queue_t *)err_ps, REQ_OLD);
sp_finish_error(ps, 0);
return;
} else {
ps->ps_un->un_status = MD_SP_ERR;
}
/* Flag the error */
sp_finish_error(ps, 1);
}
/*
* FUNCTION: sp_mapbuf()
* INPUT: un - unit structure for soft partition we are doing
* I/O on.
* voff - virtual offset in soft partition to map.
* bcount - # of blocks in the I/O.
* OUTPUT: bp - translated buffer to be passed down to next layer.
 * RETURNS:	1 - request must be fragmented, more work to do,
 *		0 - request satisfied, no more work to do
 *		-1 - error
 * PURPOSE:	Map the virtual offset in the soft partition (passed
* in via voff) to the "physical" offset on whatever the soft
* partition is built on top of. We do this by doing a binary
* search of the extent array in the soft partition unit
* structure. Once the current extent is found, we do the
* translation, determine if the I/O will cross extent
* boundaries (if so, we have to fragment the I/O), then
* fill in the buf structure to be passed down to the next layer.
*/
static int
sp_mapbuf(
mp_unit_t *un,
sp_ext_offset_t voff,
sp_ext_length_t bcount,
buf_t *bp
)
{
int lo, mid, hi, found, more;
size_t new_bcount;
sp_ext_offset_t new_blkno;
sp_ext_offset_t new_offset;
sp_ext_offset_t ext_endblk;
md_dev64_t new_edev;
extern unsigned md_maxphys;
found = 0;
lo = 0;
hi = un->un_numexts - 1;
/*
* do a binary search to find the extent that contains the
* starting offset. after this loop, mid contains the index
* of the correct extent.
*/
while (lo <= hi && !found) {
mid = (lo + hi) / 2;
/* is the starting offset contained within the mid-ext? */
if (voff >= un->un_ext[mid].un_voff &&
voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
found = 1;
else if (voff < un->un_ext[mid].un_voff)
hi = mid - 1;
		else /* voff >= un->un_ext[mid].un_voff + un->un_ext[mid].un_len */
lo = mid + 1;
}
if (!found) {
cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
return (-1);
}
/* translate to underlying physical offset/device */
new_offset = voff - un->un_ext[mid].un_voff;
new_blkno = un->un_ext[mid].un_poff + new_offset;
new_edev = un->un_dev;
/* determine if we need to break the I/O into fragments */
ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
if (voff + btodb(bcount) > ext_endblk) {
new_bcount = dbtob(ext_endblk - voff);
more = 1;
} else {
new_bcount = bcount;
more = 0;
}
/* only break up the I/O if we're not built on another metadevice */
if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
new_bcount = md_maxphys;
more = 1;
}
if (bp != (buf_t *)NULL) {
/* do bp updates */
bp->b_bcount = new_bcount;
bp->b_lblkno = new_blkno;
bp->b_edev = md_dev64_to_dev(new_edev);
}
return (more);
}
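/*
 * A worked example of sp_mapbuf( ), using the hypothetical two-extent
 * configuration sketched in the file header comment (extent 0:
 * un_voff 0, un_poff 100, un_len 50; extent 1: un_voff 50,
 * un_poff 300, un_len 50):
 *
 *	more = sp_mapbuf(un, 40, dbtob(20), bp);
 *
 * The binary search selects extent 0, new_blkno is computed as
 * 100 + (40 - 0) = 140, and since 40 + 20 crosses the extent boundary
 * at block 50, b_bcount is trimmed to dbtob(10) and 1 is returned.
 * The caller advances to voff 50 with the remaining dbtob(10) bytes
 * and calls again; that fragment maps to block 300 in extent 1 and
 * the call returns 0, completing the request.
 */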
/*
* FUNCTION: sp_validate()
* INPUT: un - unit structure to be validated.
* OUTPUT: none.
* RETURNS: 0 - soft partition ok.
* -1 - error.
* PURPOSE: called on open to sanity check the soft partition. In
* order to open a soft partition:
* - it must have at least one extent
* - the extent info in core and on disk must match
* - it may not be in an intermediate state (which would
* imply that a two-phase commit was interrupted)
*
* If the extent checking fails (B_ERROR returned from the read
* strategy call) _and_ we're a multi-owner diskset, we send a
* message to the master so that all nodes inherit the same view
* of the soft partition.
* If we are checking a soft-part that is marked as in error, and
* we can actually read and validate the watermarks we send a
* message to clear the error to the master node.
*/
static int
sp_validate(mp_unit_t *un)
{
uint_t ext;
struct buf *buf;
sp_ext_length_t len;
mp_watermark_t *wm;
set_t setno;
int reset_error = 0;
setno = MD_UN2SET(un);
/* sanity check unit structure components ?? */
if (un->un_status != MD_SP_OK) {
if (un->un_status != MD_SP_ERR) {
cmn_err(CE_WARN, "md: %s: open failed, soft partition "
"status is %u.",
md_shortname(MD_SID(un)),
un->un_status);
return (-1);
} else {
cmn_err(CE_WARN, "md: %s: open of soft partition "
"in Errored state.",
md_shortname(MD_SID(un)));
reset_error = 1;
}
}
if (un->un_numexts == 0) {
cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
"not have any extents.", md_shortname(MD_SID(un)));
return (-1);
}
len = 0LL;
for (ext = 0; ext < un->un_numexts; ext++) {
/* tally extent lengths to check total size */
len += un->un_ext[ext].un_len;
/* allocate buffer for watermark */
buf = getrbuf(KM_SLEEP);
/* read watermark */
buf->b_flags = B_READ;
buf->b_edev = md_dev64_to_dev(un->un_dev);
buf->b_iodone = NULL;
buf->b_proc = NULL;
buf->b_bcount = sizeof (mp_watermark_t);
buf->b_lblkno = un->un_ext[ext].un_poff - 1;
buf->b_bufsize = sizeof (mp_watermark_t);
buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
KM_SLEEP);
/*
* make the call non-blocking so that it is not affected
* by a set take.
*/
md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
(void) biowait(buf);
if (buf->b_flags & B_ERROR) {
cmn_err(CE_WARN, "md: %s: open failed, could not "
"read watermark at block %llu for extent %u, "
"error %d.", md_shortname(MD_SID(un)),
buf->b_lblkno, ext, buf->b_error);
kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
freerbuf(buf);
/*
* If we're a multi-owner diskset we send a message
* indicating that this soft-part has an invalid
* extent to the master node. This ensures a consistent
* view of the soft-part across the cluster.
*/
if (MD_MNSET_SETNO(setno)) {
sp_send_stat_err(un);
}
return (-1);
}
wm = (mp_watermark_t *)buf->b_un.b_addr;
/* make sure the checksum is correct first */
if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
(uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
cmn_err(CE_WARN, "md: %s: open failed, watermark "
"at block %llu for extent %u does not have a "
"valid checksum 0x%08x.", md_shortname(MD_SID(un)),
buf->b_lblkno, ext, wm->wm_checksum);
kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
freerbuf(buf);
return (-1);
}
if (wm->wm_magic != MD_SP_MAGIC) {
cmn_err(CE_WARN, "md: %s: open failed, watermark "
"at block %llu for extent %u does not have a "
"valid watermark magic number, expected 0x%x, "
"found 0x%x.", md_shortname(MD_SID(un)),
buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
freerbuf(buf);
return (-1);
}
/* make sure sequence number matches the current extent */
if (wm->wm_seq != ext) {
cmn_err(CE_WARN, "md: %s: open failed, watermark "
"at block %llu for extent %u has invalid "
"sequence number %u.", md_shortname(MD_SID(un)),
buf->b_lblkno, ext, wm->wm_seq);
kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
freerbuf(buf);
return (-1);
}
/* make sure watermark length matches unit structure */
if (wm->wm_length != un->un_ext[ext].un_len) {
cmn_err(CE_WARN, "md: %s: open failed, watermark "
"at block %llu for extent %u has inconsistent "
"length, expected %llu, found %llu.",
md_shortname(MD_SID(un)), buf->b_lblkno,
ext, un->un_ext[ext].un_len,
(u_longlong_t)wm->wm_length);
kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
freerbuf(buf);
return (-1);
}
/*
* make sure the type is a valid soft partition and not
* a free extent or the end.
*/
if (wm->wm_type != EXTTYP_ALLOC) {
cmn_err(CE_WARN, "md: %s: open failed, watermark "
"at block %llu for extent %u is not marked "
"as in-use, type = %u.", md_shortname(MD_SID(un)),
buf->b_lblkno, ext, wm->wm_type);
kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
freerbuf(buf);
return (-1);
}
/* free up buffer */
kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
freerbuf(buf);
}
if (len != un->un_length) {
cmn_err(CE_WARN, "md: %s: open failed, computed length "
"%llu != expected length %llu.", md_shortname(MD_SID(un)),
len, un->un_length);
return (-1);
}
/*
* If we're a multi-owner set _and_ reset_error is set, we should clear
* the error condition on all nodes in the set. Use SP_SETSTAT2 with
* MD_SP_OK.
*/
if (MD_MNSET_SETNO(setno) && reset_error) {
sp_send_stat_ok(un);
}
return (0);
}
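/*
 * The on-disk layout checked by sp_validate( ), as a sketch: every
 * allocated extent is preceded by a one-block watermark, so the
 * watermark for extent i sits at physical block un_ext[i].un_poff - 1:
 *
 *	... | wm i | extent i data | wm i+1 | extent i+1 data | ...
 *
 * The unit is accepted only if each watermark carries a valid
 * checksum, the MD_SP_MAGIC magic number, a sequence number equal to
 * its extent index, a length matching the in-core extent, and the
 * EXTTYP_ALLOC type.
 */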
/*
* FUNCTION: sp_done()
* INPUT: child_buf - buffer attached to child save structure.
* this is the buffer on which I/O has just
* completed.
* OUTPUT: none.
* RETURNS: 0 - success.
* 1 - error.
* PURPOSE: called on I/O completion.
*/
static int
sp_done(struct buf *child_buf)
{
struct buf *parent_buf;
mdi_unit_t *ui;
md_spps_t *ps;
md_spcs_t *cs;
/* find the child save structure to which this buffer belongs */
cs = (md_spcs_t *)((caddr_t)child_buf -
(sizeof (md_spcs_t) - sizeof (buf_t)));
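	/*
	 * This arithmetic works because cs_buf is the final member of
	 * md_spcs_t: subtracting the offset of cs_buf from the buffer
	 * address recovers the enclosing child save structure.
	 * Assuming no trailing padding, it is equivalent to
	 *
	 *	cs = (md_spcs_t *)((caddr_t)child_buf -
	 *	    offsetof(md_spcs_t, cs_buf));
	 */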
/* now get the parent save structure */
ps = cs->cs_ps;
parent_buf = ps->ps_bp;
mutex_enter(&ps->ps_mx);
/* pass any errors back up to the parent */
if (child_buf->b_flags & B_ERROR) {
ps->ps_flags |= MD_SPPS_ERROR;
parent_buf->b_error = child_buf->b_error;
}
/* mapout, if needed */
if (child_buf->b_flags & B_REMAPPED)
bp_mapout(child_buf);
ps->ps_frags--;
if (ps->ps_frags != 0) {
/*
* if this parent has more children, we just free the
* child and return.
*/
kmem_cache_free(sp_child_cache, cs);
mutex_exit(&ps->ps_mx);
return (1);
}
/* there are no more children */
kmem_cache_free(sp_child_cache, cs);
if (ps->ps_flags & MD_SPPS_ERROR) {
sp_error(ps);
return (1);
}
ui = ps->ps_ui;
if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
mutex_exit(&ps->ps_mx);
} else {
/*
* this should only ever happen if we are panicking,
* since DONTFREE is only set on the parent if panicstr
* is non-NULL.
*/
ASSERT(panicstr);
}
SPPS_FREE(sp_parent_cache, ps);
md_kstat_done(ui, parent_buf, 0);
md_unit_readerexit(ui);
md_biodone(parent_buf);
return (0);
}
/*
* FUNCTION: md_sp_strategy()
* INPUT: parent_buf - parent buffer
* flag - flags
* private - private data
* OUTPUT: none.
* RETURNS: void.
* PURPOSE: Soft partitioning I/O strategy. Performs the main work
* needed to do I/O to a soft partition. The basic
* algorithm is as follows:
* - Allocate a child save structure to keep track
* of the I/O we are going to pass down.
* - Map the I/O to the correct extent in the soft
* partition (see sp_mapbuf()).
* - bioclone() the buffer and pass it down the
* stack using md_call_strategy.
* - If the I/O needs to split across extents,
* repeat the above steps until all fragments
* are finished.
*/
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
md_spps_t *ps;
md_spcs_t *cs;
int more;
mp_unit_t *un;
mdi_unit_t *ui;
size_t current_count;
off_t current_offset;
sp_ext_offset_t current_blkno;
buf_t *child_buf;
set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev));
int strat_flag = flag;
/*
	 * When doing I/O to a multi-owner metadevice, check if the set is
	 * halted.  We do this check without the needed lock held, for
	 * performance reasons.
	 * If an I/O just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
*/
if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
(MD_SET_HALTED | MD_SET_MNSET)) {
if ((flag & MD_STR_NOTTOP) == 0) {
mutex_enter(&md_mx);
/* Here we loop until the set is no longer halted */
while (md_set[setno].s_status & MD_SET_HALTED) {
cv_wait(&md_cv, &md_mx);
}
mutex_exit(&md_mx);
}
}
ui = MDI_UNIT(getminor(parent_buf->b_edev));
md_kstat_waitq_enter(ui);
un = (mp_unit_t *)md_unit_readerlock(ui);
if ((flag & MD_NOBLOCK) == 0) {
if (md_inc_iocount(setno) != 0) {
parent_buf->b_flags |= B_ERROR;
parent_buf->b_error = ENXIO;
parent_buf->b_resid = parent_buf->b_bcount;
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
biodone(parent_buf);
return;
}
} else {
md_inc_iocount_noblock(setno);
}
if (!(flag & MD_STR_NOTTOP)) {
if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
md_kstat_waitq_exit(ui);
return;
}
}
ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
sp_parent_init(ps);
/*
* Save essential information from the original buffhdr
* in the parent.
*/
ps->ps_un = un;
ps->ps_ui = ui;
ps->ps_bp = parent_buf;
ps->ps_addr = parent_buf->b_un.b_addr;
current_count = parent_buf->b_bcount;
current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
current_offset = 0;
/*
* if we are at the top and we are panicking,
* we don't free in order to save state.
*/
if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
ps->ps_flags |= MD_SPPS_DONTFREE;
md_kstat_waitq_to_runq(ui);
ps->ps_frags++;
/*
* Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
* metadevice.
*/
if (ui->ui_tstate & MD_ABR_CAP)
strat_flag |= MD_STR_ABR;
/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save structure for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, and clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
*/
do {
cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
sp_child_init(cs);
child_buf = &cs->cs_buf;
cs->cs_ps = ps;
more = sp_mapbuf(un, current_blkno, current_count, child_buf);
if (more == -1) {
parent_buf->b_flags |= B_ERROR;
parent_buf->b_error = EIO;
md_kstat_done(ui, parent_buf, 0);
md_unit_readerexit(ui);
md_biodone(parent_buf);
kmem_cache_free(sp_parent_cache, ps);
return;
}
child_buf = md_bioclone(parent_buf, current_offset,
child_buf->b_bcount, child_buf->b_edev,
child_buf->b_blkno, sp_done, child_buf,
KM_NOSLEEP);
/* calculate new offset, counts, etc... */
current_offset += child_buf->b_bcount;
current_count -= child_buf->b_bcount;
current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));
if (more) {
mutex_enter(&ps->ps_mx);
ps->ps_frags++;
mutex_exit(&ps->ps_mx);
}
md_call_strategy(child_buf, strat_flag, private);
} while (more);
if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
while (!(ps->ps_flags & MD_SPPS_DONE)) {
md_daemon(1, &md_done_daemon);
}
kmem_cache_free(sp_parent_cache, ps);
}
}
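/*
 * A note on the fragment accounting in md_sp_strategy( ): ps_frags is
 * primed to 1 before the loop and incremented under ps_mx for each
 * additional fragment issued; sp_done( ) decrements it as each child
 * completes and only tears down the parent (and biodone( )s the
 * original buf) once the count reaches zero.  A single-extent I/O
 * therefore never takes ps_mx inside the loop at all.
 */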
/*
* FUNCTION: sp_directed_read()
* INPUT: mnum - minor number
* vdr - vol_directed_rd_t from user
* mode - access mode for copying data out.
* OUTPUT: none.
* RETURNS: 0 - success
* Exxxxx - failure error-code
* PURPOSE: Construct the necessary sub-device i/o requests to perform the
* directed read as requested by the user. This is essentially the
* same as md_sp_strategy() with the exception being that the
* underlying 'md_call_strategy' is replaced with an ioctl call.
*/
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
md_spps_t *ps;
md_spcs_t *cs;
int more;
mp_unit_t *un;
mdi_unit_t *ui;
size_t current_count;
off_t current_offset;
sp_ext_offset_t current_blkno;
buf_t *child_buf, *parent_buf;
void *kbuffer;
vol_directed_rd_t cvdr;
caddr_t userbuf;
offset_t useroff;
int ret = 0;
ui = MDI_UNIT(mnum);
md_kstat_waitq_enter(ui);
bzero(&cvdr, sizeof (cvdr));
un = (mp_unit_t *)md_unit_readerlock(ui);
/*
* Construct a parent_buf header which reflects the user-supplied
* request.
*/
kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
if (kbuffer == NULL) {
vdr->vdr_flags |= DKV_DMR_ERROR;
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
return (ENOMEM);
}
parent_buf = getrbuf(KM_NOSLEEP);
if (parent_buf == NULL) {
vdr->vdr_flags |= DKV_DMR_ERROR;
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
kmem_free(kbuffer, vdr->vdr_nbytes);
return (ENOMEM);
}
parent_buf->b_un.b_addr = kbuffer;
parent_buf->b_flags = B_READ;
parent_buf->b_bcount = vdr->vdr_nbytes;
parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
parent_buf->b_edev = un->un_dev;
ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
sp_parent_init(ps);
/*
* Save essential information from the original buffhdr
* in the parent.
*/
ps->ps_un = un;
ps->ps_ui = ui;
ps->ps_bp = parent_buf;
ps->ps_addr = parent_buf->b_un.b_addr;
current_count = parent_buf->b_bcount;
current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
current_offset = 0;
md_kstat_waitq_to_runq(ui);
ps->ps_frags++;
vdr->vdr_bytesread = 0;
/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save structure for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, and clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
*/
do {
cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
sp_child_init(cs);
child_buf = &cs->cs_buf;
cs->cs_ps = ps;
more = sp_mapbuf(un, current_blkno, current_count, child_buf);
if (more == -1) {
ret = EIO;
vdr->vdr_flags |= DKV_DMR_SHORT;
kmem_cache_free(sp_child_cache, cs);
goto err_out;
}
cvdr.vdr_flags = vdr->vdr_flags;
cvdr.vdr_side = vdr->vdr_side;
cvdr.vdr_nbytes = child_buf->b_bcount;
cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
/* Work out where we are in the allocated buffer */
useroff = (offset_t)(uintptr_t)kbuffer;
useroff = useroff + (offset_t)current_offset;
cvdr.vdr_data = (void *)(uintptr_t)useroff;
child_buf = md_bioclone(parent_buf, current_offset,
child_buf->b_bcount, child_buf->b_edev,
child_buf->b_blkno, NULL,
child_buf, KM_NOSLEEP);
/* calculate new offset, counts, etc... */
current_offset += child_buf->b_bcount;
current_count -= child_buf->b_bcount;
current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));
if (more) {
mutex_enter(&ps->ps_mx);
ps->ps_frags++;
mutex_exit(&ps->ps_mx);
}
ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
(mode | FKIOCTL), NULL);
/*
* Free the child structure as we've finished with it.
* Normally this would be done by sp_done() but we're just
* using md_bioclone() to segment the transfer and we never
* issue a strategy request so the iodone will not be called.
*/
kmem_cache_free(sp_child_cache, cs);
if (ret == 0) {
/* copyout the returned data to vdr_data + offset */
userbuf = (caddr_t)kbuffer;
userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
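			/*
			 * Note that the two assignments above reduce to
			 * userbuf = (caddr_t)cvdr.vdr_data, i.e. the kernel
			 * address of this fragment's data within kbuffer.
			 */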
if (ddi_copyout(userbuf, vdr->vdr_data,
cvdr.vdr_bytesread, mode)) {
ret = EFAULT;
goto err_out;
}
vdr->vdr_bytesread += cvdr.vdr_bytesread;
} else {
goto err_out;
}
} while (more);
/*
* Update the user-supplied vol_directed_rd_t structure with the
* contents of the last issued child request.
*/
vdr->vdr_flags = cvdr.vdr_flags;
vdr->vdr_side = cvdr.vdr_side;
bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);
err_out:
if (ret != 0) {
vdr->vdr_flags |= DKV_DMR_ERROR;
}
if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
vdr->vdr_flags |= DKV_DMR_SHORT;
}
kmem_cache_free(sp_parent_cache, ps);
kmem_free(kbuffer, vdr->vdr_nbytes);
freerbuf(parent_buf);
md_unit_readerexit(ui);
return (ret);
}
/*
* FUNCTION: sp_snarf()
* INPUT: cmd - snarf cmd.
* setno - set number.
* OUTPUT: none.
* RETURNS: 1 - soft partitions were snarfed.
* 0 - no soft partitions were snarfed.
* PURPOSE: Snarf soft partition metadb records into their in-core
* structures. This routine is called at "snarf time" when
 *		md loads and gets all metadevice records into memory.
* The basic algorithm is simply to walk the soft partition
* records in the metadb and call the soft partitioning
* build_incore routine to set up the in-core structures.
*/
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
mp_unit_t *un;
mddb_recid_t recid;
int gotsomething;
int all_sp_gotten;
mddb_type_t rec_type;
mddb_de_ic_t *dep;
mddb_rb32_t *rbp;
mp_unit_t *big_un;
mp_unit32_od_t *small_un;
size_t newreqsize;
if (cmd == MD_SNARF_CLEANUP)
return (0);
all_sp_gotten = 1;
gotsomething = 0;
/* get the record type */
rec_type = (mddb_type_t)md_getshared_key(setno,
sp_md_ops.md_driver.md_drivername);
recid = mddb_makerecid(setno, 0);
/*
* walk soft partition records in the metadb and call
* sp_build_incore to build in-core structures.
*/
while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
/* if we've already gotten this record, go to the next one */
if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
continue;
dep = mddb_getrecdep(recid);
dep->de_flags = MDDB_F_SOFTPART;
rbp = dep->de_rb;
switch (rbp->rb_revision) {
case MDDB_REV_RB:
case MDDB_REV_RBFN:
if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
/*
				 * This means we have an old, small record
				 * that has not yet been converted.  Before
				 * we create an incore metadevice from it,
				 * we have to convert it to a big record.
*/
small_un =
(mp_unit32_od_t *)mddb_getrecaddr(recid);
newreqsize = sizeof (mp_unit_t) +
((small_un->un_numexts - 1) *
sizeof (struct mp_ext));
big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
KM_SLEEP);
softpart_convert((caddr_t)small_un,
(caddr_t)big_un, SMALL_2_BIG);
kmem_free(small_un, dep->de_reqsize);
dep->de_rb_userdata = big_un;
dep->de_reqsize = newreqsize;
rbp->rb_private |= MD_PRV_CONVD;
un = big_un;
} else {
/* Record has already been converted */
un = (mp_unit_t *)mddb_getrecaddr(recid);
}
un->c.un_revision &= ~MD_64BIT_META_DEV;
break;
case MDDB_REV_RB64:
case MDDB_REV_RB64FN:
/* Large device */
un = (mp_unit_t *)mddb_getrecaddr(recid);
un->c.un_revision |= MD_64BIT_META_DEV;
un->c.un_flag |= MD_EFILABEL;
break;
}
MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
/*
* Create minor node for snarfed entry.
*/
(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
if (MD_UNIT(MD_SID(un)) != NULL) {
/* unit is already in-core */
mddb_setrecprivate(recid, MD_PRV_PENDDEL);
continue;
}
all_sp_gotten = 0;
if (sp_build_incore((void *)un, 1) == 0) {
mddb_setrecprivate(recid, MD_PRV_GOTIT);
md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
gotsomething = 1;
}
}
if (!all_sp_gotten)
return (gotsomething);
/* double-check records */
recid = mddb_makerecid(setno, 0);
while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
mddb_setrecprivate(recid, MD_PRV_PENDDEL);
return (0);
}
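/*
 * A note on the record-sizing arithmetic used during conversion in
 * sp_snarf( ) (a sketch, not driver code): mp_unit_t declares a
 * one-element extent array as its final member and the remaining
 * extents are allocated inline, so a unit with N extents occupies
 *
 *	sizeof (mp_unit_t) + (N - 1) * sizeof (struct mp_ext)
 *
 * bytes.  A hypothetical helper expressing the same computation:
 *
 *	static size_t
 *	sp_unit_size(uint_t numexts)
 *	{
 *		return (sizeof (mp_unit_t) +
 *		    (numexts - 1) * sizeof (struct mp_ext));
 *	}
 */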
/*
* FUNCTION: sp_halt()
* INPUT: cmd - halt cmd.
* setno - set number.
* RETURNS: 0 - success.
* 1 - err.
* PURPOSE: Perform driver halt operations. As with stripe, we
* support MD_HALT_CHECK and MD_HALT_DOIT. The first
* does a check to see if halting can be done safely
* (no open soft partitions), the second cleans up and
* shuts down the driver.
*/
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
int i;
mdi_unit_t *ui;
minor_t mnum;
if (cmd == MD_HALT_CLOSE)
return (0);
if (cmd == MD_HALT_OPEN)
return (0);
if (cmd == MD_HALT_UNLOAD)
return (0);
if (cmd == MD_HALT_CHECK) {
for (i = 0; i < md_nunits; i++) {
mnum = MD_MKMIN(setno, i);
if ((ui = MDI_UNIT(mnum)) == NULL)
continue;
if (ui->ui_opsindex != sp_md_ops.md_selfindex)
continue;
if (md_unit_isopen(ui))
return (1);
}
return (0);
}
if (cmd != MD_HALT_DOIT)
return (1);
for (i = 0; i < md_nunits; i++) {
mnum = MD_MKMIN(setno, i);
if ((ui = MDI_UNIT(mnum)) == NULL)
continue;
if (ui->ui_opsindex != sp_md_ops.md_selfindex)
continue;
reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
}
return (0);
}
/*
* FUNCTION: sp_open_dev()
* INPUT: un - unit structure.
* oflags - open flags.
* OUTPUT: none.
* RETURNS: 0 - success.
* non-zero - err.
* PURPOSE: open underlying device via md_layered_open.
*/
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
minor_t mnum = MD_SID(un);
int err;
md_dev64_t tmpdev;
set_t setno = MD_MIN2SET(MD_SID(un));
side_t side = mddb_getsidenum(setno);
tmpdev = un->un_dev;
/*
* Do the open by device id if underlying is regular
*/
if ((md_getmajor(tmpdev) != md_major) &&
md_devid_found(setno, side, un->un_key) == 1) {
tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
}
err = md_layered_open(mnum, &tmpdev, oflags);
un->un_dev = tmpdev;
if (err)
return (ENXIO);
return (0);
}
/*
* FUNCTION: sp_open()
* INPUT: dev - device to open.
* flag - pass-through flag.
* otyp - pass-through open type.
* cred_p - credentials.
* md_oflags - open flags.
* OUTPUT: none.
* RETURNS: 0 - success.
* non-zero - err.
* PURPOSE: open a soft partition.
*/
/* ARGSUSED */
static int
sp_open(
dev_t *dev,
int flag,
int otyp,
cred_t *cred_p,
int md_oflags
)
{
minor_t mnum = getminor(*dev);
mdi_unit_t *ui = MDI_UNIT(mnum);
mp_unit_t *un;
int err = 0;
set_t setno;
/*
	 * When doing an open of a multi-owner metadevice, check to see if
	 * this node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the
* open (which involves I/O during sp_validate), so fail with ENXIO.
*/
setno = MD_MIN2SET(mnum);
if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
(MD_SET_MNSET | MD_SET_MN_START_RC)) {
return (ENXIO);
}
/* grab necessary locks */
un = (mp_unit_t *)md_unit_openclose_enter(ui);
setno = MD_UN2SET(un);
/* open underlying device, if necessary */
if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
if ((err = sp_open_dev(un, md_oflags)) != 0)
goto out;
if (MD_MNSET_SETNO(setno)) {
/* For probe, don't incur the overhead of validate */
if (!(md_oflags & MD_OFLG_PROBEDEV)) {
/*
* Don't call sp_validate while
* unit_openclose lock is held. So, actually
* open the device, drop openclose lock,
* call sp_validate, reacquire openclose lock,
* and close the device. If sp_validate
* succeeds, then device will be re-opened.
*/
if ((err = md_unit_incopen(mnum, flag,
otyp)) != 0)
goto out;
mutex_enter(&ui->ui_mx);
ui->ui_lock |= MD_UL_OPENINPROGRESS;
mutex_exit(&ui->ui_mx);
md_unit_openclose_exit(ui);
if (otyp != OTYP_LYR)
rw_exit(&md_unit_array_rw.lock);
err = sp_validate(un);
if (otyp != OTYP_LYR)
rw_enter(&md_unit_array_rw.lock,
RW_READER);
(void) md_unit_openclose_enter(ui);
(void) md_unit_decopen(mnum, otyp);
mutex_enter(&ui->ui_mx);
ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
cv_broadcast(&ui->ui_cv);
mutex_exit(&ui->ui_mx);
/*
* Should be in the same state as before
* the sp_validate.
*/
if (err != 0) {
/* close the device opened above */
md_layered_close(un->un_dev, md_oflags);
err = EIO;
goto out;
}
}
/*
* As we're a multi-owner metadevice we need to ensure
* that all nodes have the same idea of the status.
* sp_validate() will mark the device as errored (if
* it cannot read the watermark) or ok (if it was
* previously errored but the watermark is now valid).
* This code-path is only entered on the non-probe open
* so we will maintain the errored state during a probe
			 * call.  This means the sys-admin must run metarecover -m
* to reset the soft-partition error.
*/
} else {
/* For probe, don't incur the overhead of validate */
if (!(md_oflags & MD_OFLG_PROBEDEV) &&
(err = sp_validate(un)) != 0) {
/* close the device opened above */
md_layered_close(un->un_dev, md_oflags);
err = EIO;
goto out;
} else {
/*
* we succeeded in validating the on disk
* format versus the in core, so reset the
* status if it's in error
*/
if (un->un_status == MD_SP_ERR) {
un->un_status = MD_SP_OK;
}
}
}
}
/* count open */
if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
goto out;
out:
md_unit_openclose_exit(ui);
return (err);
}
/*
* FUNCTION: sp_close()
* INPUT: dev - device to close.
* flag - pass-through flag.
* otyp - pass-through type.
* cred_p - credentials.
* md_cflags - close flags.
* OUTPUT: none.
* RETURNS: 0 - success.
* non-zero - err.
 * PURPOSE:	close a soft partition.
*/
/* ARGSUSED */
static int
sp_close(
dev_t dev,
int flag,
int otyp,
cred_t *cred_p,
int md_cflags
)
{
minor_t mnum = getminor(dev);
mdi_unit_t *ui = MDI_UNIT(mnum);
mp_unit_t *un;
int err = 0;
/* grab necessary locks */
un = (mp_unit_t *)md_unit_openclose_enter(ui);
/* count closed */
if ((err = md_unit_decopen(mnum, otyp)) != 0)
goto out;
/* close devices, if necessary */
if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
md_layered_close(un->un_dev, md_cflags);
}
/*
	 * If this is a MN set and transient capabilities (e.g. ABR/DMR)
	 * are set, clear these capabilities if this is the last close in
	 * the cluster.
*/
if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
(ui->ui_tstate & MD_ABR_CAP)) {
md_unit_openclose_exit(ui);
mdmn_clear_all_capabilities(mnum);
return (0);
}
/* unlock, return success */
out:
md_unit_openclose_exit(ui);
return (err);
}
/* used in sp_dump routine */
static struct buf dumpbuf;
/*
* FUNCTION: sp_dump()
* INPUT: dev - device to dump to.
* addr - address to dump.
* blkno - blkno on device.
* nblk - number of blocks to dump.
* OUTPUT: none.
* RETURNS: result from bdev_dump.
* PURPOSE: This routine dumps memory to the disk. It assumes that
* the memory has already been mapped into mainbus space.
* It is called at disk interrupt priority when the system
* is in trouble.
* NOTE: this function is defined using 32-bit arguments,
* but soft partitioning is internally 64-bit. Arguments
 *		are cast where appropriate.
*/
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
mp_unit_t *un;
buf_t *bp;
sp_ext_length_t nb;
daddr_t mapblk;
int result;
int more;
int saveresult = 0;
/*
	 * We don't need to grab the unit lock, because nothing else is
	 * supposed to be happening.  Also, dump is not supposed to sleep.
*/
un = (mp_unit_t *)MD_UNIT(getminor(dev));
if ((diskaddr_t)blkno >= un->c.un_total_blocks)
return (EINVAL);
if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
return (EINVAL);
bp = &dumpbuf;
nb = (sp_ext_length_t)dbtob(nblk);
do {
bzero((caddr_t)bp, sizeof (*bp));
more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
nblk = (int)(btodb(bp->b_bcount));
mapblk = bp->b_blkno;
result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
if (result)
saveresult = result;
nb -= bp->b_bcount;
addr += bp->b_bcount;
blkno += nblk;
} while (more);
return (saveresult);
}
static int
sp_imp_set(
set_t setno
)
{
mddb_recid_t recid;
int gotsomething;
mddb_type_t rec_type;
mddb_de_ic_t *dep;
mddb_rb32_t *rbp;
mp_unit_t *un64;
mp_unit32_od_t *un32;
md_dev64_t self_devt;
minor_t *self_id; /* minor needs to be updated */
md_parent_t *parent_id; /* parent needs to be updated */
mddb_recid_t *record_id; /* record id needs to be updated */
gotsomething = 0;
rec_type = (mddb_type_t)md_getshared_key(setno,
sp_md_ops.md_driver.md_drivername);
recid = mddb_makerecid(setno, 0);
while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
continue;
dep = mddb_getrecdep(recid);
rbp = dep->de_rb;
switch (rbp->rb_revision) {
case MDDB_REV_RB:
case MDDB_REV_RBFN:
/*
* Small device
*/
un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
self_id = &(un32->c.un_self_id);
parent_id = &(un32->c.un_parent);
record_id = &(un32->c.un_record_id);
if (!md_update_minor(setno, mddb_getsidenum
(setno), un32->un_key))
goto out;
break;
case MDDB_REV_RB64:
case MDDB_REV_RB64FN:
un64 = (mp_unit_t *)mddb_getrecaddr(recid);
self_id = &(un64->c.un_self_id);
parent_id = &(un64->c.un_parent);
record_id = &(un64->c.un_record_id);
if (!md_update_minor(setno, mddb_getsidenum
(setno), un64->un_key))
goto out;
break;
}
/*
* If this is a top level and a friendly name metadevice,
* update its minor in the namespace.
*/
if ((*parent_id == MD_NO_PARENT) &&
((rbp->rb_revision == MDDB_REV_RBFN) ||
(rbp->rb_revision == MDDB_REV_RB64FN))) {
self_devt = md_makedevice(md_major, *self_id);
if (!md_update_top_device_minor(setno,
mddb_getsidenum(setno), self_devt))
goto out;
}
/*
		 * Update unit with the imported setno.
		 */
mddb_setrecprivate(recid, MD_PRV_GOTIT);
*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
if (*parent_id != MD_NO_PARENT)
*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
*record_id = MAKERECID(setno, DBID(*record_id));
gotsomething = 1;
}
out:
return (gotsomething);
}
static md_named_services_t sp_named_services[] = {
{NULL, 0}
};
md_ops_t sp_md_ops = {
sp_open, /* open */
sp_close, /* close */
md_sp_strategy, /* strategy */
NULL, /* print */
sp_dump, /* dump */
NULL, /* read */
NULL, /* write */
md_sp_ioctl, /* ioctl, */
sp_snarf, /* snarf */
sp_halt, /* halt */
NULL, /* aread */
NULL, /* awrite */
sp_imp_set, /* import set */
sp_named_services
};
static void
init_init()
{
sp_parent_cache = kmem_cache_create("md_softpart_parent",
sizeof (md_spps_t), 0, sp_parent_constructor,
sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
sp_child_cache = kmem_cache_create("md_softpart_child",
sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
sp_child_constructor, sp_child_destructor, sp_run_queue,
NULL, NULL, 0);
}
static void
fini_uninit()
{
kmem_cache_destroy(sp_parent_cache);
kmem_cache_destroy(sp_child_cache);
sp_parent_cache = sp_child_cache = NULL;
}
/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())