/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* NAME: raid_ioctl.c
*
* DESCRIPTION: RAID driver source file containing IOCTL operations.
*
* ROUTINES PROVIDED FOR EXTERNAL USE:
* raid_commit() - commits MD database updates for a RAID metadevice
* md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
*
* ROUTINES PROVIDED FOR INTERNAL USE:
* raid_getun() - Performs unit checking on a RAID metadevice
 * init_col_nextio() - normal backend when zeroing a column of a RAID metadevice.
 * init_col_int() - I/O interrupt while zeroing a column of a RAID metadevice.
* raid_init_columns() - Zero one or more columns of a RAID metadevice.
* raid_set() - used to create a RAID metadevice
* raid_get() - used to get the unit structure of a RAID metadevice
* raid_replace() - used to replace a component of a RAID metadevice
* raid_grow() - Concatenate to a RAID metadevice
* raid_change() - change dynamic values of a RAID metadevice
* raid_reset() - used to reset (clear / remove) a RAID metadevice
* raid_get_geom() - used to get the geometry of a RAID metadevice
* raid_get_vtoc() - used to get the VTOC on a RAID metadevice
* raid_set_vtoc() - used to set the VTOC on a RAID metadevice
* raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
* raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
* raid_getdevs() - return all devices within a RAID metadevice
* raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
*/
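/*
 * Illustrative call path (a sketch, not code from this file): a
 * userland utility such as metareplace(1M) reaches the handlers below
 * by opening the md administrative device and issuing one of the
 * MD_IOC* commands handled here, roughly:
 *
 *	fd = open("/dev/md/admin", O_RDWR);
 *	(void) ioctl(fd, MD_IOCREPLACE, &params);
 *
 * The md driver then dispatches RAID-specific commands to
 * raid_admin_ioctl() below.
 */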
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cred.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_raid.h>
#include <sys/lvm/md_convert.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
extern int md_status;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern md_ops_t raid_md_ops;
extern major_t md_major;
extern md_krwlock_t md_unit_array_rw;
extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_ff_daemonq;
extern int mdopen();
extern int mdclose();
extern void md_probe_one();
extern int md_init_probereq(md_probedev_impl_t *,
daemon_queue_t **);
extern md_resync_t md_cpr_resync;
extern void dump_mr_unit(mr_unit_t *);
typedef struct raid_ci {
DAEMON_QUEUE
struct raid_ci *ci_next;
mr_unit_t *ci_un;
int ci_col;
int ci_err;
int ci_flag;
size_t ci_zerosize;
diskaddr_t ci_blkno;
diskaddr_t ci_lastblk;
buf_t ci_buf;
} raid_ci_t;
/* values for the ci_flag */
#define COL_INITING (0x0001)
#define COL_INIT_DONE (0x0002)
#define COL_READY (0x0004)
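/*
 * A column being zeroed normally moves through these flag values:
 *
 *	COL_INITING -> COL_INIT_DONE -> COL_READY
 *
 * init_col_nextio() marks a column COL_INIT_DONE when its last zeroing
 * I/O completes; raid_init_columns() then initializes the column's
 * pre-write area and marks it COL_READY.
 */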
/*
* NAME: raid_getun
* DESCRIPTION: performs a lot of unit checking on a RAID metadevice
* PARAMETERS: minor_t mnum - minor device number for RAID unit
* md_error_t *mde - pointer to error reporting structure
 *		int flags - flags controlling the checks performed:
 *			STALE_OK - allow stale MD memory
 *			NO_OLD - unit must not exist
 *			NO_LOCK - no IOCTL lock needed
 *			WR_LOCK - write IOCTL lock needed
 *			RD_LOCK - read IOCTL lock needed
 *			ARRAY_WRITER - md_unit_array_rw writer lock needed
 *			ARRAY_READER - md_unit_array_rw reader lock needed
* IOLOCK *lock - pointer to IOCTL lock
*
* LOCKS: obtains unit reader or writer lock via IOLOCK
*
*/
static mr_unit_t *
raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
{
mr_unit_t *un;
mdi_unit_t *ui;
set_t setno = MD_MIN2SET(mnum);
if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
return (NULL);
}
if (!(flags & STALE_OK)) {
if (md_get_setstatus(setno) & MD_SET_STALE) {
(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
return (NULL);
}
}
ui = MDI_UNIT(mnum);
if (flags & NO_OLD) {
if (ui != NULL) {
(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
return (NULL);
}
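		/* unit must not exist and does not: return non-NULL sentinel */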
return ((mr_unit_t *)1);
}
if (ui == NULL) {
(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
return (NULL);
}
if (flags & ARRAY_WRITER)
md_array_writer(lock);
else if (flags & ARRAY_READER)
md_array_reader(lock);
if (!(flags & NO_LOCK)) {
if (flags & WR_LOCK) {
(void) md_ioctl_io_lock(lock, ui);
(void) md_ioctl_writerlock(lock, ui);
} else /* RD_LOCK */
(void) md_ioctl_readerlock(lock, ui);
}
un = (mr_unit_t *)MD_UNIT(mnum);
if (un->c.un_type != MD_METARAID) {
(void) mdmderror(mde, MDE_NOT_RAID, mnum);
return (NULL);
}
return (un);
}
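/*
 * Typical caller pattern for raid_getun(), as used by the ioctl
 * handlers in this file:
 *
 *	if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
 *		return (0);
 *
 * A NULL return means the unit failed validation; the handler still
 * returns 0 because the failure is reported to userland through the
 * md_error_t structure, not through the ioctl return value.
 */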
/*
* NAME: raid_commit
* DESCRIPTION: commits MD database updates for a RAID metadevice
* PARAMETERS: mr_unit_t *un - RAID unit to update in the MD database
* mddb_recid_t *extras - array of other record IDs to update
*
* LOCKS: assumes caller holds unit writer lock
*
*/
void
raid_commit(mr_unit_t *un, mddb_recid_t *extras)
{
mddb_recid_t *recids;
int ri = 0;
int nrecids = 0;
if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
return;
/* Count the extra recids */
if (extras != NULL) {
while (extras[nrecids] != 0) {
nrecids++;
}
}
/*
* Allocate space for two recids in addition to the extras:
* one for the unit structure, one for the null terminator.
*/
nrecids += 2;
recids = (mddb_recid_t *)
kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);
if (un != NULL) {
ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
recids[ri++] = un->c.un_record_id;
}
if (extras != NULL) {
while (*extras != 0) {
recids[ri++] = *extras;
extras++;
}
}
if (ri > 0) {
mddb_commitrecs_wrapper(recids);
}
kmem_free(recids, nrecids * sizeof (mddb_recid_t));
}
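/*
 * Example (from raid_change() below): the unit record is committed
 * together with the hot spare pool records just modified by passing a
 * zero-terminated recid array:
 *
 *	mddb_recid_t recids[3] = {0, 0, 0};
 *	...
 *	raid_commit(un, recids);
 *
 * Passing NULL for extras commits only the unit record itself.
 */
/*
 * NAME: raid_check_pw
 * DESCRIPTION: verifies the pre-write headers of a RAID metadevice;
 *		each column's header must name the same unit and its
 *		own column number.
 * PARAMETERS: mr_unit_t *un - RAID unit to check
 *
 * LOCKS: none
 *
 */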
static int
raid_check_pw(mr_unit_t *un)
{
buf_t bp;
char *buf;
mr_column_t *colptr;
minor_t mnum = MD_SID(un);
int i;
int err = 0;
minor_t unit;
buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
for (i = 0; i < un->un_totalcolumncnt; i++) {
md_dev64_t tmpdev;
colptr = &un->un_column[i];
tmpdev = colptr->un_dev;
/*
* Open by device id
* If this device is hotspared
* use the hotspare key
*/
tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
colptr->un_hs_key : colptr->un_orig_key);
if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
colptr->un_dev = tmpdev;
return (1);
}
colptr->un_dev = tmpdev;
bzero((caddr_t)&bp, sizeof (buf_t));
bp.b_back = &bp;
bp.b_forw = &bp;
bp.b_flags = B_READ | B_BUSY;
sema_init(&bp.b_io, 0, NULL,
SEMA_DEFAULT, NULL);
sema_init(&bp.b_sem, 0, NULL,
SEMA_DEFAULT, NULL);
bp.b_edev = md_dev64_to_dev(colptr->un_dev);
bp.b_lblkno = colptr->un_pwstart;
bp.b_bcount = DEV_BSIZE;
bp.b_bufsize = DEV_BSIZE;
bp.b_un.b_addr = (caddr_t)buf;
bp.b_offset = -1;
(void) md_call_strategy(&bp, 0, NULL);
if (biowait(&bp))
err = 1;
if (i == 0) {
if (un->c.un_revision & MD_64BIT_META_DEV) {
unit = ((raid_pwhdr_t *)buf)->rpw_unit;
} else {
unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
}
}
		/*
		 * Depending on whether this is a 64-bit or a 32-bit RAID,
		 * the pre-write headers have a different layout.
		 */
if (un->c.un_revision & MD_64BIT_META_DEV) {
if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
(((raid_pwhdr_t *)buf)->rpw_unit != unit))
err = 1;
} else {
if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
(((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
err = 1;
}
md_layered_close(colptr->un_dev, MD_OFLG_NULL);
if (err)
break;
}
kmem_free(buf, DEV_BSIZE);
return (err);
}
/*
* NAME: init_col_nextio
 * DESCRIPTION: normal backend process when zeroing a column of a RAID metadevice.
* PARAMETERS: raid_ci_t *cur - struct for column being zeroed
*
* LOCKS: assumes caller holds unit reader lock,
 *		periodically releases and reacquires unit reader lock,
* broadcasts on unit conditional variable (un_cv)
*
*/
#define INIT_RLS_CNT 10
static void
init_col_nextio(raid_ci_t *cur)
{
mr_unit_t *un;
un = cur->ci_un;
cur->ci_blkno += cur->ci_zerosize;
mutex_enter(&un->un_mx);
/* ===> update un_percent_done */
un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
mutex_exit(&un->un_mx);
	/*
	 * When growing a device, normal I/O is still going on.
	 * The init thread still holds the unit reader lock which
	 * prevents I/O from doing state changes.
	 * So every INIT_RLS_CNT init I/Os, we will release the
	 * unit reader lock.
	 *
	 * CAVEAT:
	 * We know we are in the middle of a grow operation and the
	 * unit cannot be grown or removed (through reset or halt)
	 * so the mr_unit_t structure will not move or disappear.
	 * In addition, we know that only one of the init I/Os
	 * can be in init_col_nextio at a time because they are
	 * placed on the md_done_daemon queue and md only processes
	 * one element of this queue at a time.  Finally, any
	 * code that needs to acquire the unit writer lock to change
	 * state is supposed to be on the md_mstr_daemon queue so
	 * it can be processing while we sit here waiting to get the
	 * unit reader lock back.
	 */
if (cur->ci_blkno < cur->ci_lastblk) {
/* truncate last chunk to end_addr if needed */
if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
cur->ci_zerosize = (size_t)
(cur->ci_lastblk - cur->ci_blkno);
}
/* set address and length for I/O bufs */
cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
cur->ci_buf.b_lblkno = cur->ci_blkno;
(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
return;
}
/* finished initializing this column */
mutex_enter(&un->un_mx);
cur->ci_flag = COL_INIT_DONE;
uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
mutex_exit(&un->un_mx);
cv_broadcast(&un->un_cv);
}
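/*
 * Chunk truncation example (illustrative numbers only): with
 * ci_lastblk = 1000, ci_blkno = 960 and ci_zerosize = 64, the next
 * write would run past the end of the column, so ci_zerosize is
 * trimmed to 1000 - 960 = 40 blocks and dbtob(40) bytes are written.
 */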
/*
* NAME: init_col_int
 * DESCRIPTION: I/O interrupt while zeroing a column of a RAID metadevice.
* PARAMETERS: buf_t *cb - I/O buffer for which interrupt occurred
*
* LOCKS: assumes caller holds unit reader or writer lock
*
*/
static int
init_col_int(buf_t *cb)
{
raid_ci_t *cur;
cur = (raid_ci_t *)cb->b_chain;
if (cb->b_flags & B_ERROR) {
mutex_enter(&cur->ci_un->un_mx);
cur->ci_err = EIO;
mutex_exit(&cur->ci_un->un_mx);
cv_broadcast(&cur->ci_un->un_cv);
return (1);
}
daemon_request(&md_done_daemon, init_col_nextio,
(daemon_queue_t *)cur, REQ_OLD);
return (1);
}
/*
* NAME: raid_init_columns
* DESCRIPTION: Zero one or more columns of a RAID metadevice.
* PARAMETERS: minor_t mnum - RAID unit minor identifier
*
* LOCKS: obtains and releases unit reader lock,
* obtains and releases unit writer lock,
* obtains and releases md_unit_array_rw write lock,
* obtains and releases unit mutex (un_mx) lock,
* waits on unit conditional variable (un_cv)
*
*/
static void
raid_init_columns(minor_t mnum)
{
mr_unit_t *un;
mdi_unit_t *ui;
raid_ci_t *ci_chain = NULL, *cur;
rus_state_t state;
caddr_t zero_addr;
diskaddr_t end_off;
size_t zerosize;
int err = 0;
int ix;
int colcnt = 0;
int col;
set_t setno = MD_MIN2SET(mnum);
/*
* Increment the raid resync count for cpr
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
md_cpr_resync.md_raid_resync++;
mutex_exit(&md_cpr_resync.md_resync_mutex);
	/*
	 * Initialization is a multi-step process.  The first step
	 * is to go through the unit structure and start each device
	 * in the init state writing zeros over the component.
	 * Next initialize the pre-write areas, so the device can be
	 * used if a metainit -k is done.  Then close the components.
	 *
	 * Once this is complete, set the state of each component being
	 * zeroed and set the correct state for the unit.
	 *
	 * Last, commit the records.
	 */
ui = MDI_UNIT(mnum);
un = md_unit_readerlock(ui);
/* check for active init on this column */
/* exiting is cpr safe */
if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
md_unit_readerexit(ui);
(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
/*
* Decrement the raid resync count for cpr
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
md_cpr_resync.md_raid_resync--;
mutex_exit(&md_cpr_resync.md_resync_mutex);
thread_exit();
}
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
MD_SID(un));
un->un_init_colcnt = 0;
un->un_init_iocnt = 0;
end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);
/* allocate zero-filled buffer */
zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);
for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
if (un->un_column[ix].un_devstate != RCS_INIT)
continue;
/* allocate new column init structure */
cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
ASSERT(cur != NULL);
un->un_init_colcnt++;
cur->ci_next = ci_chain;
ci_chain = cur;
cur->ci_un = un;
cur->ci_col = ix;
cur->ci_err = 0;
cur->ci_flag = COL_INITING;
cur->ci_zerosize = zerosize;
cur->ci_blkno = un->un_column[ix].un_pwstart;
cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
+ (un->un_segsize * un->un_segsincolumn);
/* initialize static buf fields */
cur->ci_buf.b_un.b_addr = zero_addr;
cur->ci_buf.b_chain = (buf_t *)cur;
cur->ci_buf.b_back = &cur->ci_buf;
cur->ci_buf.b_forw = &cur->ci_buf;
cur->ci_buf.b_iodone = init_col_int;
cur->ci_buf.b_flags = B_BUSY | B_WRITE;
cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
/* set address and length for I/O bufs */
cur->ci_buf.b_bufsize = dbtob(zerosize);
cur->ci_buf.b_bcount = dbtob(zerosize);
cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
cur->ci_buf.b_offset = -1;
if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
md_dev64_t tmpdev = un->un_column[ix].un_dev;
/*
* Open by device id
* If this column is hotspared then
* use the hotspare key
*/
tmpdev = md_resolve_bydevid(mnum, tmpdev,
HOTSPARED(un, ix) ?
un->un_column[ix].un_hs_key :
un->un_column[ix].un_orig_key);
if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
MD_OFLG_NULL)) == 0)
un->un_column[ix].un_devflags |=
MD_RAID_DEV_ISOPEN;
un->un_column[ix].un_dev = tmpdev;
}
if (cur->ci_err == 0)
md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
}
md_unit_readerexit(ui);
state = un->un_state;
colcnt = un->un_init_colcnt;
mutex_enter(&un->un_mx);
while (colcnt) {
cv_wait(&un->un_cv, &un->un_mx);
colcnt = 0;
for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
col = cur->ci_col;
if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
if (cur->ci_err)
err = cur->ci_err;
else if (cur->ci_flag == COL_INIT_DONE) {
(void) init_pw_area(un,
un->un_column[col].un_dev,
un->un_column[col].un_pwstart,
col);
cur->ci_flag = COL_READY;
}
} else {
colcnt++;
}
}
}
mutex_exit(&un->un_mx);
/* This prevents new opens */
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
(void) md_io_writerlock(ui);
un = (mr_unit_t *)md_unit_writerlock(ui);
while (ci_chain) {
cur = ci_chain;
/* take this element out of the chain */
ci_chain = cur->ci_next;
/* free this element */
sema_destroy(&cur->ci_buf.b_io);
sema_destroy(&cur->ci_buf.b_sem);
if (cur->ci_err)
raid_set_state(cur->ci_un, cur->ci_col,
RCS_INIT_ERRED, 0);
else
raid_set_state(cur->ci_un, cur->ci_col,
RCS_OKAY, 0);
kmem_free(cur, sizeof (raid_ci_t));
}
/* free the zeroed buffer */
kmem_free(zero_addr, dbtob(zerosize));
/* determine new unit state */
if (err == 0) {
if (state == RUS_INIT)
un->un_state = RUS_OKAY;
else {
un->c.un_total_blocks = un->un_grow_tb;
md_nblocks_set(mnum, un->c.un_total_blocks);
un->un_grow_tb = 0;
if (raid_state_cnt(un, RCS_OKAY) ==
un->un_totalcolumncnt)
un->un_state = RUS_OKAY;
}
	} else { /* error occurred */
if (state & RUS_INIT)
un->un_state = RUS_DOI;
}
uniqtime32(&un->un_timestamp);
MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
un->un_init_colcnt = 0;
un->un_init_iocnt = 0;
raid_commit(un, NULL);
md_unit_writerexit(ui);
(void) md_io_writerexit(ui);
rw_exit(&md_unit_array_rw.lock);
if (err) {
if (un->un_state & RUS_DOI) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
SVM_TAG_METADEVICE, setno, MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
SVM_TAG_METADEVICE, setno, MD_SID(un));
}
(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
/*
* Decrement the raid resync count for cpr
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
md_cpr_resync.md_raid_resync--;
mutex_exit(&md_cpr_resync.md_resync_mutex);
thread_exit();
/*NOTREACHED*/
}
static int
raid_init_unit(minor_t mnum, md_error_t *ep)
{
mdi_unit_t *ui;
mr_unit_t *un;
int rval, i;
set_t setno = MD_MIN2SET(mnum);
ui = MDI_UNIT(mnum);
if (md_get_setstatus(setno) & MD_SET_STALE)
return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
/* Don't start an init if the device is not available */
if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
}
if (raid_internal_open(mnum, (FREAD | FWRITE),
OTYP_LYR, MD_OFLG_ISINIT)) {
rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
goto out;
}
un = md_unit_readerlock(ui);
un->un_percent_done = 0;
md_unit_readerexit(ui);
/* start resync_unit thread */
(void) thread_create(NULL, 0, raid_init_columns,
(void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
return (0);
out:
un = md_unit_writerlock(ui);
MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
/* recover state */
for (i = 0; i < un->un_totalcolumncnt; i++)
if (COLUMN_STATE(un, i) == RCS_INIT)
raid_set_state(un, i, RCS_ERRED, 0);
if (un->un_state & RUS_INIT)
un->un_state = RUS_DOI;
raid_commit(un, NULL);
md_unit_writerexit(ui);
if (un->un_state & RUS_DOI) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
}
return (rval);
}
/*
* NAME: raid_regen
*
* DESCRIPTION: regenerate all the parity on the raid device. This
* routine starts a thread that will regenerate the
* parity on a raid device. If an I/O error occurs during
* this process the entire device is placed in error.
*
* PARAMETERS: md_set_params_t *msp - ioctl packet
*/
static void
regen_unit(minor_t mnum)
{
mdi_unit_t *ui = MDI_UNIT(mnum);
mr_unit_t *un = MD_UNIT(mnum);
buf_t buf, *bp;
caddr_t buffer;
int err = 0;
diskaddr_t total_segments;
diskaddr_t line;
size_t iosize;
/*
* Increment raid resync count for cpr
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
md_cpr_resync.md_raid_resync++;
mutex_exit(&md_cpr_resync.md_resync_mutex);
iosize = dbtob(un->un_segsize);
buffer = kmem_alloc(iosize, KM_SLEEP);
bp = &buf;
total_segments = un->un_segsincolumn;
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
MD_UN2SET(un), MD_SID(un));
un->un_percent_done = 0;
init_buf(bp, B_READ | B_BUSY, iosize);
for (line = 0; line < total_segments; line++) {
bp->b_lblkno = line *
((un->un_origcolumncnt - 1) * un->un_segsize);
bp->b_un.b_addr = buffer;
bp->b_bcount = iosize;
bp->b_iodone = NULL;
/*
* The following assignment is only correct because
* md_raid_strategy is fine when it's only a minor number
* and not a real dev_t. Yuck.
*/
bp->b_edev = mnum;
md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
if (biowait(bp)) {
err = 1;
break;
}
un->un_percent_done = (uint_t)((line * 1000) /
un->un_segsincolumn);
/* just to avoid rounding errors */
if (un->un_percent_done > 1000)
un->un_percent_done = 1000;
reset_buf(bp, B_READ | B_BUSY, iosize);
}
destroy_buf(bp);
kmem_free(buffer, iosize);
(void) md_io_writerlock(ui);
(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
(void) md_io_writerexit(ui);
un = md_unit_writerlock(ui);
if (!err &&
(raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
un->un_state = RUS_OKAY;
raid_commit(un, NULL);
md_unit_writerexit(ui);
if (err ||
raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
MD_UN2SET(un), MD_SID(un));
}
/*
* Decrement the raid resync count for cpr
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
md_cpr_resync.md_raid_resync--;
mutex_exit(&md_cpr_resync.md_resync_mutex);
thread_exit();
}
static int
raid_regen_unit(minor_t mnum, md_error_t *ep)
{
mdi_unit_t *ui;
mr_unit_t *un;
int i;
set_t setno = MD_MIN2SET(mnum);
ui = MDI_UNIT(mnum);
un = (mr_unit_t *)MD_UNIT(mnum);
if (md_get_setstatus(setno) & MD_SET_STALE)
return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
/* Don't start a regen if the device is not available */
if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
}
if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
(void) md_unit_writerlock(ui);
for (i = 0; i < un->un_totalcolumncnt; i++)
raid_set_state(un, i, RCS_ERRED, 0);
md_unit_writerexit(ui);
return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
}
/* start resync_unit thread */
(void) thread_create(NULL, 0, regen_unit,
(void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
return (0);
}
static int
raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
{
minor_t mnum = mrp->mnum;
mr_unit_t *un;
mdclrerror(&mrp->mde);
un = md_unit_readerlock(MDI_UNIT(mnum));
if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
md_unit_readerexit(MDI_UNIT(mnum));
return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
}
if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
(raid_state_cnt(un, RCS_RESYNC))) {
md_unit_readerexit(MDI_UNIT(mnum));
return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
}
if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
md_unit_readerexit(MDI_UNIT(mnum));
return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
}
if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
(! (un->un_state & RUS_OKAY))) {
md_unit_readerexit(MDI_UNIT(mnum));
return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
}
md_unit_readerexit(MDI_UNIT(mnum));
/* get locks and recheck to be sure something did not change */
if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
return (0);
if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
(! (un->un_state & RUS_OKAY))) {
return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
}
raid_set_state(un, 0, RCS_REGEN, 0);
raid_commit(un, NULL);
md_ioctl_droplocks(lock);
return (raid_regen_unit(mnum, &mrp->mde));
}
/*
* NAME: raid_set
* DESCRIPTION: used to create a RAID metadevice
* PARAMETERS: md_set_params_t *d - pointer to set data structure
* int mode - must be FWRITE
*
* LOCKS: none
*
*/
static int
raid_set(void *d, int mode)
{
minor_t mnum;
mr_unit_t *un;
mddb_recid_t mr_recid;
mddb_recid_t *recids;
mddb_type_t typ1;
int err;
set_t setno;
int num_recs;
int rid;
int col;
md_set_params_t *msp = d;
mnum = msp->mnum;
setno = MD_MIN2SET(mnum);
mdclrerror(&msp->mde);
if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
return (0);
typ1 = (mddb_type_t)md_getshared_key(setno,
raid_md_ops.md_driver.md_drivername);
/* create the db record for this mdstruct */
if (msp->options & MD_CRO_64BIT) {
#if defined(_ILP32)
return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
mr_recid = mddb_createrec(msp->size, typ1, 0,
MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
#endif
} else {
mr_recid = mddb_createrec(msp->size, typ1, 0,
MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
}
if (mr_recid < 0)
return (mddbstatus2error(&msp->mde,
(int)mr_recid, mnum, setno));
/* get the address of the mdstruct */
un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
/*
* It is okay that we muck with the mdstruct here,
* since no one else will know about the mdstruct
* until we commit it. If we crash, the record will
* be automatically purged, since we haven't
* committed it yet.
*/
/* copy in the user's mdstruct */
if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
msp->size, mode)) {
mddb_deleterec_wrapper(mr_recid);
return (EFAULT);
}
/* All 64 bit metadevices only support EFI labels. */
if (msp->options & MD_CRO_64BIT) {
un->c.un_flag |= MD_EFILABEL;
}
/*
* allocate the real recids array. since we may have to commit
* underlying metadevice records, we need an array of size:
* total number of components in raid + 3 (1 for the raid itself,
* one for the hotspare, one for the end marker).
*/
num_recs = un->un_totalcolumncnt + 3;
rid = 0;
recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
recids[rid++] = mr_recid;
MD_SID(un) = mnum;
MD_RECID(un) = recids[0];
MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
MD_PARENT(un) = MD_NO_PARENT;
un->un_resync_copysize = 0;
un->c.un_revision |= MD_FN_META_DEV;
if (UNIT_STATE(un) == RUS_INIT)
MD_STATUS(un) |= MD_UN_GROW_PENDING;
if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
mddb_deleterec_wrapper(mr_recid);
err = mderror(&msp->mde, MDE_RAID_INVALID);
goto out;
}
if (err = raid_build_incore(un, 0)) {
if (un->mr_ic) {
kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
un->un_totalcolumncnt);
kmem_free(un->mr_ic, sizeof (*un->mr_ic));
}
md_nblocks_set(mnum, -1ULL);
MD_UNIT(mnum) = NULL;
mddb_deleterec_wrapper(mr_recid);
goto out;
}
/*
* Update unit availability
*/
md_set[setno].s_un_avail--;
recids[rid] = 0;
if (un->un_hsp_id != -1) {
/* increment the reference count of the hot spare pool */
err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
&recids[rid], NULL, NULL, NULL);
if (err) {
md_nblocks_set(mnum, -1ULL);
MD_UNIT(mnum) = NULL;
mddb_deleterec_wrapper(mr_recid);
goto out;
}
rid++;
}
/*
* set the parent on any metadevice components.
* NOTE: currently soft partitions are the only metadevices
* which can appear within a RAID metadevice.
*/
for (col = 0; col < un->un_totalcolumncnt; col++) {
mr_column_t *mr_col = &un->un_column[col];
md_unit_t *comp_un;
if (md_getmajor(mr_col->un_dev) == md_major) {
comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
recids[rid++] = MD_RECID(comp_un);
md_set_parent(mr_col->un_dev, MD_SID(un));
}
}
/* set the end marker */
recids[rid] = 0;
mddb_commitrecs_wrapper(recids);
md_create_unit_incore(mnum, &raid_md_ops, 1);
SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
MD_SID(un));
out:
kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
if (err)
return (err);
/* only attempt to init a device that is in the init state */
if (UNIT_STATE(un) != RUS_INIT)
return (0);
return (raid_init_unit(mnum, &msp->mde));
}
/*
* NAME: raid_get
* DESCRIPTION: used to get the unit structure of a RAID metadevice
* PARAMETERS: md_i_get_t *migp - pointer to get data structure
* int mode - must be FREAD
* IOLOCK *lock - pointer to IOCTL lock
*
* LOCKS: obtains unit reader lock via IOLOCK
*
*/
static int
raid_get(
void *migp,
int mode,
IOLOCK *lock
)
{
minor_t mnum;
mr_unit_t *un;
md_i_get_t *migph = migp;
mnum = migph->id;
mdclrerror(&migph->mde);
if ((un = raid_getun(mnum, &migph->mde,
RD_LOCK, lock)) == NULL)
return (0);
if (migph->size == 0) {
migph->size = un->c.un_size;
return (0);
}
if (migph->size < un->c.un_size) {
return (EFAULT);
}
if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
un->c.un_size, mode))
return (EFAULT);
return (0);
}
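/*
 * raid_get() implements the usual two-step sizing protocol.  A
 * userland caller may pass size == 0 to learn the required buffer
 * size, then repeat the ioctl with a large enough buffer; a sketch
 * (not code from this file):
 *
 *	mig.size = 0;
 *	(void) ioctl(fd, MD_IOCGET, &mig);
 *	mig.mdp = (uintptr_t)malloc(mig.size);
 *	(void) ioctl(fd, MD_IOCGET, &mig);
 */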
/*
* NAME: raid_replace
* DESCRIPTION: used to replace a component of a RAID metadevice
* PARAMETERS: replace_params_t *mrp - pointer to replace data structure
* IOLOCK *lock - pointer to IOCTL lock
*
* LOCKS: obtains unit writer lock via IOLOCK (through raid_getun),
* obtains and releases md_unit_array_rw write lock
*
*/
static int
raid_replace(
replace_params_t *mrp,
IOLOCK *lock
)
{
minor_t mnum = mrp->mnum;
md_dev64_t odev = mrp->old_dev;
md_error_t *ep = &mrp->mde;
mr_unit_t *un;
rcs_state_t state;
int ix, col = -1;
int force = 0;
int err = 0;
replace_cmd_t cmd;
set_t setno;
side_t side;
mdkey_t devkey;
int nkeys;
mddb_recid_t extra_recids[3] = { 0, 0, 0 };
int extra_rids = 0;
md_error_t mde = mdnullerror;
sv_dev_t sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};
mdclrerror(ep);
setno = MD_MIN2SET(mnum);
side = mddb_getsidenum(setno);
un = md_unit_readerlock(MDI_UNIT(mnum));
if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
(raid_state_cnt(un, RCS_RESYNC) != 0)) {
md_unit_readerexit(MDI_UNIT(mnum));
return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
}
if (un->un_state & RUS_DOI) {
md_unit_readerexit(MDI_UNIT(mnum));
return (mdmderror(ep, MDE_RAID_DOI, mnum));
}
if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
(MD_STATUS(un) & MD_UN_GROW_PENDING)) {
md_unit_readerexit(MDI_UNIT(mnum));
return (mdmderror(ep, MDE_IN_USE, mnum));
}
md_unit_readerexit(MDI_UNIT(mnum));
/* get locks and recheck to be sure something did not change */
if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
return (0);
if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
return (mddeverror(ep, MDE_NAME_SPACE, odev));
}
for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
/*
* Try to resolve devt again if NODEV64
*/
if (tmpdevt == NODEV64) {
tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
un->un_column[ix].un_orig_key);
un->un_column[ix].un_orig_dev = tmpdevt;
}
if (un->un_column[ix].un_orig_dev == odev) {
col = ix;
break;
} else {
if (un->un_column[ix].un_orig_dev == NODEV64) {
/*
* Now we use the keys to match.
* If no key found, continue.
*/
if (nkeys == 0) {
continue;
}
if (un->un_column[ix].un_orig_key == devkey) {
if (nkeys > 1)
return (mddeverror(ep,
MDE_MULTNM, odev));
col = ix;
break;
}
}
}
}
if (col == -1)
return (mdcomperror(ep, MDE_CANT_FIND_COMP,
mnum, odev));
if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
(raid_state_cnt(un, RCS_RESYNC) != 0))
return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
if (un->un_state & RUS_DOI)
return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
un->un_column[col].un_dev));
if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
(MD_STATUS(un) & MD_UN_GROW_PENDING))
return (mdmderror(ep, MDE_IN_USE, mnum));
if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
force = 1;
if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
cmd = ENABLE_COMP;
if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
cmd = REPLACE_COMP;
if (un->un_state == RUS_LAST_ERRED) {
/* Must use -f force flag for unit in LAST_ERRED state */
if (!force)
return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));
/* Must use -f force flag on ERRED column first */
if (un->un_column[col].un_devstate != RCS_ERRED) {
for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
if (un->un_column[ix].un_devstate & RCS_ERRED)
return (mdcomperror(ep,
MDE_RAID_COMP_ERRED, mnum,
un->un_column[ix].un_dev));
}
}
/* must use -f force flag on LAST_ERRED columns next */
if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
(un->un_column[col].un_devstate != RCS_ERRED))
return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
mnum, un->un_column[col].un_dev));
}
if (un->un_state == RUS_ERRED) {
if (! (un->un_column[col].un_devstate &
(RCS_ERRED | RCS_INIT_ERRED)))
return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
			    mnum, un->un_column[col].un_dev));
}
ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));
state = un->un_column[col].un_devstate;
if (state & RCS_INIT_ERRED) {
MD_STATUS(un) |= MD_UN_GROW_PENDING;
un->un_percent_done = 0;
raid_set_state(un, col, RCS_INIT, 0);
} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
resync_request(mnum, col, 0, ep))
return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
if (cmd == REPLACE_COMP) {
md_dev64_t tmpdev = mrp->new_dev;
/*
* open the device by device id
*/
tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
tmpdev));
}
/*
* If it's a metadevice, make sure it gets reparented
*/
if (md_getmajor(tmpdev) == md_major) {
minor_t new_mnum = md_getminor(tmpdev);
md_unit_t *new_un = MD_UNIT(new_mnum);
md_set_parent(tmpdev, MD_SID(un));
extra_recids[extra_rids++] = MD_RECID(new_un);
}
mrp->new_dev = tmpdev;
un->un_column[col].un_orig_dev = tmpdev;
un->un_column[col].un_orig_key = mrp->new_key;
un->un_column[col].un_orig_pwstart = mrp->start_blk;
un->un_column[col].un_orig_devstart =
mrp->start_blk + un->un_pwsize;
/*
* If the old device was a metadevice, make sure to
* reset its parent.
*/
if (md_getmajor(odev) == md_major) {
minor_t old_mnum = md_getminor(odev);
md_unit_t *old_un = MD_UNIT(old_mnum);
md_reset_parent(odev);
extra_recids[extra_rids++] =
MD_RECID(old_un);
}
if (HOTSPARED(un, col)) {
md_layered_close(mrp->new_dev, MD_OFLG_NULL);
un->un_column[col].un_alt_dev = mrp->new_dev;
un->un_column[col].un_alt_pwstart = mrp->start_blk;
un->un_column[col].un_alt_devstart =
mrp->start_blk + un->un_pwsize;
un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
} else {
/*
* not hot spared. Close the old device and
* move the new device in.
*/
if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
md_layered_close(odev, MD_OFLG_NULL);
un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
un->un_column[col].un_dev = mrp->new_dev;
un->un_column[col].un_pwstart = mrp->start_blk;
un->un_column[col].un_devstart =
mrp->start_blk + un->un_pwsize;
if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
un->un_column[col].un_devflags |=
MD_RAID_REGEN_RESYNC;
}
}
/*
* If the old device is not a metadevice then
* save off the set number and key so that it
* can be removed from the namespace later.
*/
if (md_getmajor(odev) != md_major) {
sv.setno = setno;
sv.key = devkey;
}
}
if (cmd == ENABLE_COMP) {
md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
mdkey_t raidkey = un->un_column[col].un_orig_key;
		/*
		 * We trust the dev_t because we cannot determine the
		 * dev_t from the device id, since a new disk is in the
		 * same location.  Because this is a call from
		 * metareplace -e dx AND the device is SCSI, a new dev_t
		 * is not generated, so the dev_t from the mddb is used.
		 * Before enabling the device we check to make sure that
		 * multiple entries for the same device do not exist in
		 * the namespace.  If they do, we fail the ioctl.
		 * One of the many ways multiple entries in the namespace
		 * can occur is if one removed the failed component of a
		 * RAID metadevice and put in another disk that was part
		 * of another metadevice.  After reboot metadevadm would
		 * correctly update the device name for the metadevice
		 * whose component has moved.  However, the metadb now
		 * has two entries for the same name (ctds) that belong
		 * to different metadevices.  One is valid; the other is
		 * a ghost or "last known as" ctds.
		 */
tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
if (tmpdev == NODEV64)
tmpdev = md_getdevnum(setno, side, raidkey,
MD_TRUST_DEVT);
/*
* check for multiple entries in namespace for the
* same dev
*/
if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
&nkeys) != 0)
return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
		/*
		 * If the number of keys is greater than 1,
		 * we have an invalid namespace.  STOP and return.
		 */
if (nkeys > 1)
return (mddeverror(ep, MDE_MULTNM, tmpdev));
if (devkey != raidkey)
return (mdcomperror(ep, MDE_CANT_FIND_COMP,
mnum, tmpdev));
if (un->un_column[col].un_orig_dev == NODEV64)
un->un_column[col].un_orig_dev = tmpdev;
if (HOTSPARED(un, col)) {
un->un_column[col].un_alt_dev =
un->un_column[col].un_orig_dev;
un->un_column[col].un_alt_pwstart =
un->un_column[col].un_orig_pwstart;
un->un_column[col].un_alt_devstart =
un->un_column[col].un_orig_devstart;
un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
} else {
if (!(un->un_column[col].un_devflags &
MD_RAID_DEV_ISOPEN)) {
if (md_layered_open(mnum, &tmpdev,
MD_OFLG_NULL)) {
un->un_column[col].un_dev = tmpdev;
return (mdcomperror(ep,
MDE_COMP_OPEN_ERR, mnum, tmpdev));
}
ASSERT(tmpdev != NODEV64 &&
tmpdev != 0);
if ((md_getmajor(tmpdev) != md_major) &&
(md_devid_found(setno, side, raidkey)
== 1)) {
if (md_update_namespace_did(setno, side,
raidkey, &mde) != 0) {
cmn_err(CE_WARN,
"md: could not"
" update namespace\n");
}
}
un->un_column[col].un_dev =
un->un_column[col].un_orig_dev;
}
un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
}
}
if (mrp->has_label) {
un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
} else {
un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
}
raid_commit(un, extra_recids);
/* If the component has been replaced - clean up the name space */
if (sv.setno != MD_SET_BAD) {
md_rem_names(&sv, 1);
}
md_ioctl_droplocks(lock);
if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
setno, MD_SID(un));
} else {
SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
setno, MD_SID(un));
}
if (un->un_column[col].un_devstate & RCS_INIT)
err = raid_init_unit(mnum, ep);
else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
err = raid_resync_unit(mnum, ep);
mdclrerror(ep);
if (!err)
return (0);
	/*
	 * Be sure the state is already set by this time;
	 * fix the state and commit the record.
	 */
un = md_unit_writerlock(MDI_UNIT(mnum));
if (state & RCS_INIT_ERRED)
raid_set_state(un, col, state, 1);
else if (state & RCS_OKAY)
raid_set_state(un, col, RCS_ERRED, 0);
else
raid_set_state(un, col, state, 1);
raid_commit(un, NULL);
md_unit_writerexit(MDI_UNIT(mnum));
mdclrerror(ep);
return (0);
}
/*
* NAME: raid_set_sync
* DESCRIPTION: used to sync a component of a RAID metadevice
 * PARAMETERS: md_resync_ioctl_t *rip - pointer to resync data structure
* IOLOCK *lock - pointer to IOCTL lock
*
* LOCKS: obtains unit writer lock via IOLOCK (through raid_getun),
* obtains and releases md_unit_array_rw write lock
*
*/
static int
raid_set_sync(
md_resync_ioctl_t *rip,
IOLOCK *lock
)
{
minor_t mnum = rip->ri_mnum;
mr_unit_t *un;
int init = 0;
int resync = 0;
int regen = 0;
int ix;
int err;
mdclrerror(&rip->mde);
if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
return (0);
if (un->un_state & RUS_DOI)
return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));
if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));
/* This prevents new opens */
rip->ri_flags = 0;
if (un->un_state & RUS_REGEN)
regen++;
if (raid_state_cnt(un, RCS_RESYNC))
resync++;
if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
init++;
ASSERT(!(resync && init && regen));
md_ioctl_droplocks(lock);
rip->ri_percent_done = 0;
if (init) {
MD_STATUS(un) |= MD_UN_GROW_PENDING;
return (raid_init_unit(mnum, &rip->mde));
}
/*
* If resync is needed, it will call raid_internal_open forcing
* replay before the open completes.
* Otherwise, call raid_internal_open directly to force
* replay to complete during boot (metasync -r).
* NOTE: the unit writer lock must remain held while setting
* MD_UN_RESYNC_ACTIVE but must be released before
* calling raid_resync_unit or raid_internal_open.
*/
if (resync) {
ASSERT(resync < 2);
un = md_unit_writerlock(MDI_UNIT(mnum));
MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
/* Must release unit writer lock for resync */
/*
* correctly setup the devices before trying to start the
* resync operation.
*/
		for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
if (un->un_column[ix].un_devstate & RCS_RESYNC) {
if ((un->un_column[ix].un_devflags &
MD_RAID_COPY_RESYNC) &&
HOTSPARED(un, ix)) {
un->un_column[ix].un_alt_dev =
un->un_column[ix].un_orig_dev;
un->un_column[ix].un_alt_devstart =
un->un_column[ix].un_orig_devstart;
un->un_column[ix].un_alt_pwstart =
un->un_column[ix].un_orig_pwstart;
}
break;
}
}
ASSERT(un->un_column[ix].un_devflags &
(MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
rip->ri_percent_done = 0;
un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
(void) resync_request(mnum, ix, 0, NULL);
md_unit_writerexit(MDI_UNIT(mnum));
err = raid_resync_unit(mnum, &rip->mde);
return (err);
}
if (regen) {
err = raid_regen_unit(mnum, &rip->mde);
return (err);
}
	/* The unit requires no work, so just force replay of the device */
if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
return (mdmderror(&rip->mde,
MDE_RAID_OPEN_FAILURE, mnum));
(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
return (0);
}
/*
* NAME: raid_get_resync
* DESCRIPTION: used to check resync status on a component of a RAID metadevice
 * PARAMETERS: md_resync_ioctl_t *rip - pointer to resync data structure
* IOLOCK *lock - pointer to IOCTL lock
*
 * LOCKS: obtains unit reader lock via IOLOCK (through raid_getun)
*
*/
static int
raid_get_resync(
md_resync_ioctl_t *rip,
IOLOCK *lock
)
{
minor_t mnum = rip->ri_mnum;
mr_unit_t *un;
u_longlong_t percent;
int cnt;
int ix;
uint64_t d;
mdclrerror(&rip->mde);
if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
return (0);
rip->ri_flags = 0;
if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
d = un->un_segsincolumn;
percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
if (percent > 1000)
percent = 1000; /* can't go over 100% */
rip->ri_percent_done = (int)percent;
rip->ri_flags |= MD_RI_INPROGRESS;
}
if (UNIT_STATE(un) & RUS_INIT) {
d = un->un_segsize * un->un_segsincolumn *
un->un_totalcolumncnt;
percent =
d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
if (percent > 1000)
percent = 1000; /* can't go over 100% */
rip->ri_percent_done = (int)percent;
rip->ri_flags |= MD_GROW_INPROGRESS;
} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
percent =
d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
if (percent > 1000)
percent = 1000;
rip->ri_percent_done = (int)percent;
rip->ri_flags |= MD_GROW_INPROGRESS;
}
if (un->un_state & RUS_REGEN)
rip->ri_percent_done = un->un_percent_done;
cnt = 0;
for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
switch (un->un_column[ix].un_devstate) {
case RCS_INIT:
case RCS_ERRED:
case RCS_LAST_ERRED:
cnt++;
break;
default:
break;
}
}
d = un->un_totalcolumncnt;
rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
return (0);
}
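/*
 * Note on units: ri_percent_done above is reported in tenths of a
 * percent.  For example, a resync at line 512 of 2048 segments yields
 * (1000 * 512) / 2048 = 250, i.e. 25.0% done.  ri_percent_dirty, by
 * contrast, is a plain percentage of the columns needing attention.
 */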
/*
* NAME: raid_grow
* DESCRIPTION: Concatenate to a RAID metadevice
* PARAMETERS: md_grow_params_t *mgp
* - pointer to IOCGROW data structure
* int mode - must be FWRITE
* IOLOCK *lockp - IOCTL read/write and unit_array_rw lock
*
* LOCKS: obtains unit writer lock via IOLOCK (through raid_getun),
* obtains and releases md_unit_array_rw write lock
*
*/
static int
raid_grow(void *mgp, int mode, IOLOCK *lock)
{
minor_t mnum;
mr_unit_t *un, *new_un;
mdi_unit_t *ui;
mddb_type_t typ1;
mddb_recid_t mr_recid;
mddb_recid_t old_vtoc = 0;
mddb_recid_t *recids;
md_create_rec_option_t options;
int err;
int col, i;
int64_t tb, atb;
u_longlong_t unrev;
int tc;
int rval = 0;
set_t setno;
mr_column_ic_t *mrc;
int num_recs, rid;
md_grow_params_t *mgph = mgp;
mnum = mgph->mnum;
mdclrerror(&mgph->mde);
ui = MDI_UNIT(mnum);
un = md_unit_readerlock(ui);
if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
md_unit_readerexit(ui);
return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
}
if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
md_unit_readerexit(ui);
return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
}
if (UNIT_STATE(un) & RUS_LAST_ERRED) {
md_unit_readerexit(ui);
return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
}
if (UNIT_STATE(un) & RUS_DOI) {
md_unit_readerexit(ui);
return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
}
if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
md_unit_readerexit(ui);
return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
}
md_unit_readerexit(ui);
if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
NULL)
return (0);
if (MD_STATUS(un) & MD_UN_GROW_PENDING)
return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
if (un->c.un_size >= mgph->size)
return (EINVAL);
if (UNIT_STATE(un) & RUS_LAST_ERRED)
return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
if (UNIT_STATE(un) & RUS_DOI)
return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
setno = MD_MIN2SET(mnum);
typ1 = (mddb_type_t)md_getshared_key(setno,
raid_md_ops.md_driver.md_drivername);
/*
* Preserve the friendly name nature of the device that is
* growing.
*/
options = MD_CRO_RAID;
if (un->c.un_revision & MD_FN_META_DEV)
options |= MD_CRO_FN;
if (mgph->options & MD_CRO_64BIT) {
#if defined(_ILP32)
return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
mr_recid = mddb_createrec(mgph->size, typ1, 0,
MD_CRO_64BIT | options, setno);
#endif
} else {
mr_recid = mddb_createrec(mgph->size, typ1, 0,
MD_CRO_32BIT | options, setno);
}
if (mr_recid < 0) {
rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
mnum, setno);
return (rval);
}
/* get the address of the new unit */
new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
/*
* It is okay that we muck with the new unit here,
* since no one else will know about the unit struct
* until we commit it. If we crash, the record will
* be automatically purged, since we haven't
* committed it yet and the old unit struct will be found.
*/
/* copy in the user's unit struct */
err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
mgph->size, mode);
if (err) {
mddb_deleterec_wrapper(mr_recid);
return (EFAULT);
}
/* make sure columns are being added */
if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
mddb_deleterec_wrapper(mr_recid);
return (EINVAL);
}
	/*
	 * Save a few of the new unit struct's fields
	 * before they get clobbered.
	 */
tc = new_un->un_totalcolumncnt;
tb = new_un->c.un_total_blocks;
atb = new_un->c.un_actual_tb;
unrev = new_un->c.un_revision;
/*
* Copy the old unit struct (static stuff)
* into new unit struct
*/
bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);
/*
* Restore a few of the new unit struct values.
*/
new_un->un_totalcolumncnt = tc;
new_un->c.un_actual_tb = atb;
new_un->un_grow_tb = tb;
new_un->c.un_revision = unrev;
new_un->c.un_record_id = mr_recid;
new_un->c.un_size = mgph->size;
ASSERT(new_un->mr_ic == un->mr_ic);
/*
* Save old column slots
*/
mrc = un->un_column_ic;
/*
* Allocate new column slot
*/
new_un->un_column_ic = (mr_column_ic_t *)
kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
KM_SLEEP);
/*
* Restore old column slots
* Free the old column slots
*/
bcopy(mrc, new_un->un_column_ic,
sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
/* All 64 bit metadevices only support EFI labels. */
if (mgph->options & MD_CRO_64BIT) {
new_un->c.un_flag |= MD_EFILABEL;
/*
* If the device was previously smaller than a terabyte,
* and had a vtoc record attached to it, we remove the
* vtoc record, because the layout has changed completely.
*/
if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
(un->c.un_vtoc_id != 0)) {
old_vtoc = un->c.un_vtoc_id;
new_un->c.un_vtoc_id =
md_vtoc_to_efi_record(old_vtoc, setno);
}
}
/*
* allocate the real recids array. since we may have to commit
* underlying metadevice records, we need an array of size:
 * total number of new components being attached + 2 (one for the
* raid itself, one for the end marker).
*/
num_recs = new_un->un_totalcolumncnt + 2;
rid = 0;
recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
recids[rid++] = mr_recid;
for (col = un->un_totalcolumncnt;
(col < new_un->un_totalcolumncnt); col++) {
mr_column_t *mr_col = &new_un->un_column[col];
md_unit_t *comp_un;
if (raid_build_pw_reservation(new_un, col) != 0) {
/* release pwslots already allocated by grow */
for (i = un->un_totalcolumncnt; i < col; i++) {
raid_free_pw_reservation(new_un, i);
}
kmem_free(new_un->un_column_ic,
sizeof (mr_column_ic_t) *
new_un->un_totalcolumncnt);
kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
kmem_free(recids, num_recs * sizeof (mddb_recid_t));
mddb_deleterec_wrapper(mr_recid);
return (EINVAL);
}
/*
* set parent on metadevices being added.
* NOTE: currently soft partitions are the only metadevices
* which can appear within a RAID metadevice.
*/
if (md_getmajor(mr_col->un_dev) == md_major) {
comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
recids[rid++] = MD_RECID(comp_un);
md_set_parent(mr_col->un_dev, MD_SID(new_un));
}
new_un->un_column[col].un_devflags = 0;
}
/* set end marker */
recids[rid] = 0;
/* commit new unit struct */
mddb_commitrecs_wrapper(recids);
/* delete old unit struct */
mddb_deleterec_wrapper(un->c.un_record_id);
/* place new unit in in-core array */
md_nblocks_set(mnum, new_un->c.un_total_blocks);
MD_UNIT(mnum) = new_un;
	/*
	 * If old_vtoc has a non-zero value, we know:
	 * - This unit crossed the one terabyte boundary as it grew,
	 * - There was a vtoc record for the unit,
	 * - This vtoc record is no longer needed, because
	 *   a new efi record has been created for this unit.
	 */
if (old_vtoc != 0) {
mddb_deleterec_wrapper(old_vtoc);
}
/* free recids */
kmem_free(recids, num_recs * sizeof (mddb_recid_t));
SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
MD_UN2SET(new_un), MD_SID(new_un));
MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
	/*
	 * Since md_ioctl_writerlock acquires the unit writer lock
	 * and open/close acquires the unit reader lock, it is necessary
	 * to drop the unit writer lock and then reacquire it as needed
	 * later.
	 */
md_unit_writerexit(ui);
if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
MD_UN2SET(new_un), MD_SID(new_un));
return (rval);
}
(void) md_unit_writerlock(ui);
for (i = 0; i < new_un->un_totalcolumncnt; i++) {
if (new_un->un_column[i].un_devstate & RCS_OKAY)
(void) init_pw_area(new_un, new_un->un_column[i].un_dev,
new_un->un_column[i].un_pwstart, i);
}
md_unit_writerexit(ui);
(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
(void) md_unit_writerlock(ui);
/* create a background thread to initialize the columns */
md_ioctl_droplocks(lock);
return (raid_init_unit(mnum, &mgph->mde));
}
/*
* NAME: raid_reset
* DESCRIPTION: used to reset (clear / remove) a RAID metadevice
* PARAMETERS: md_i_reset_t *mirp - pointer to reset data structure
*
* LOCKS: obtains and releases md_unit_array_rw write lock
*
*/
static int
raid_reset(md_i_reset_t *mirp)
{
minor_t mnum = mirp->mnum;
mr_unit_t *un;
mdi_unit_t *ui;
set_t setno = MD_MIN2SET(mnum);
mdclrerror(&mirp->mde);
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
/*
* NOTE: need to get md_unit_writerlock to avoid conflict
* with raid_init thread.
*/
if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
NULL) {
rw_exit(&md_unit_array_rw.lock);
return (0);
}
ui = MDI_UNIT(mnum);
if (MD_HAS_PARENT(MD_PARENT(un))) {
rw_exit(&md_unit_array_rw.lock);
return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
}
un = (mr_unit_t *)md_unit_openclose_enter(ui);
if (md_unit_isopen(MDI_UNIT(mnum))) {
md_unit_openclose_exit(ui);
rw_exit(&md_unit_array_rw.lock);
return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
}
md_unit_openclose_exit(ui);
if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
rw_exit(&md_unit_array_rw.lock);
return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
}
reset_raid(un, mnum, 1);
/*
* Update unit availability
*/
md_set[setno].s_un_avail++;
/*
* If MN set, reset s_un_next so all nodes can have
* the same view of the next available slot when
* nodes are -w and -j
*/
if (MD_MNSET_SETNO(setno)) {
(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
}
rw_exit(&md_unit_array_rw.lock);
return (0);
}
/*
* NAME: raid_get_geom
* DESCRIPTION: used to get the geometry of a RAID metadevice
* PARAMETERS: mr_unit_t *un - RAID unit to get the geometry for
* struct dk_geom *gp - pointer to geometry data structure
*
* LOCKS: none
*
*/
static int
raid_get_geom(
mr_unit_t *un,
struct dk_geom *geomp
)
{
md_get_geom((md_unit_t *)un, geomp);
return (0);
}
/*
* NAME: raid_get_vtoc
* DESCRIPTION: used to get the VTOC on a RAID metadevice
* PARAMETERS: mr_unit_t *un - RAID unit to get the VTOC from
* struct vtoc *vtocp - pointer to VTOC data structure
*
* LOCKS: none
*
*/
static int
raid_get_vtoc(
mr_unit_t *un,
struct vtoc *vtocp
)
{
md_get_vtoc((md_unit_t *)un, vtocp);
return (0);
}
/*
* NAME: raid_set_vtoc
* DESCRIPTION: used to set the VTOC on a RAID metadevice
* PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on
* struct vtoc *vtocp - pointer to VTOC data structure
*
* LOCKS: none
*
*/
static int
raid_set_vtoc(
mr_unit_t *un,
struct vtoc *vtocp
)
{
return (md_set_vtoc((md_unit_t *)un, vtocp));
}
/*
* NAME: raid_get_extvtoc
* DESCRIPTION: used to get the extended VTOC on a RAID metadevice
* PARAMETERS: mr_unit_t *un - RAID unit to get the VTOC from
* struct extvtoc *vtocp - pointer to extended VTOC data structure
*
* LOCKS: none
*
*/
static int
raid_get_extvtoc(
mr_unit_t *un,
struct extvtoc *vtocp
)
{
md_get_extvtoc((md_unit_t *)un, vtocp);
return (0);
}
/*
* NAME: raid_set_extvtoc
* DESCRIPTION: used to set the extended VTOC on a RAID metadevice
* PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on
* struct extvtoc *vtocp - pointer to extended VTOC data structure
*
* LOCKS: none
*
*/
static int
raid_set_extvtoc(
mr_unit_t *un,
struct extvtoc *vtocp
)
{
return (md_set_extvtoc((md_unit_t *)un, vtocp));
}
/*
* NAME: raid_get_cgapart
* DESCRIPTION: used to get the dk_map on a RAID metadevice
 * PARAMETERS: mr_unit_t *un - RAID unit to get the dk_map from
 * struct dk_map *dkmapp - pointer to dk_map data structure
*
* LOCKS: none
*
*/
static int
raid_get_cgapart(
mr_unit_t *un,
struct dk_map *dkmapp
)
{
md_get_cgapart((md_unit_t *)un, dkmapp);
return (0);
}
/*
* NAME: raid_getdevs
* DESCRIPTION: return all devices within a RAID metadevice
* PARAMETERS: md_getdevs_params_t *mgdp
* - pointer to getdevs IOCTL data structure
* int mode - should be FREAD
* IOLOCK *lockp - IOCTL read/write lock
*
* LOCKS: obtains unit reader lock via IOLOCK
*
*/
static int
raid_getdevs(
void *mgdp,
int mode,
IOLOCK *lock
)
{
minor_t mnum;
mr_unit_t *un;
md_dev64_t *udevs;
int i, cnt;
md_dev64_t unit_dev;
md_getdevs_params_t *mgdph = mgdp;
mnum = mgdph->mnum;
/* check out unit */
mdclrerror(&mgdph->mde);
if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
return (0);
udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
if (cnt < mgdph->cnt) {
unit_dev = un->un_column[i].un_orig_dev;
if (md_getmajor(unit_dev) != md_major) {
if ((unit_dev = md_xlate_mini_2_targ
(unit_dev)) == NODEV64)
return (ENODEV);
}
if (ddi_copyout((caddr_t)&unit_dev,
(caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
return (EFAULT);
}
if (HOTSPARED(un, i)) {
cnt++;
if (cnt >= mgdph->cnt)
continue;
unit_dev = un->un_column[i].un_dev;
if (md_getmajor(unit_dev) != md_major) {
if ((unit_dev = md_xlate_mini_2_targ
(unit_dev)) == NODEV64)
return (ENODEV);
}
if (ddi_copyout((caddr_t)&unit_dev,
(caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
return (EFAULT);
}
}
mgdph->cnt = cnt;
return (0);
}
/*
* NAME: raid_change
* DESCRIPTION: used to change the following dynamic values:
* the hot spare pool
* in the unit structure of a RAID metadevice
 * PARAMETERS: md_raid_params_t *mrp - pointer to change data structure
* IOLOCK *lock - pointer to IOCTL lock
*
* LOCKS: obtains unit writer lock via IOLOCK (through raid_getun)
*
*/
static int
raid_change(
md_raid_params_t *mrp,
IOLOCK *lock
)
{
minor_t mnum = mrp->mnum;
mr_unit_t *un;
int ix;
mddb_recid_t recids[3] = {0, 0, 0};
int err;
int irecid;
int inc_new_hsp = 0;
mdclrerror(&mrp->mde);
if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
return (0);
if (!mrp->params.change_hsp_id)
return (0);
/* verify that no hotspare is in use */
for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
if (HOTSPARED(un, ix)) {
return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
}
}
/* replace the hot spare pool */
irecid = 0;
if (mrp->params.hsp_id != -1) {
/* increment the reference count of the new hsp */
err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
&recids[0], NULL, NULL, NULL);
if (err) {
return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
mrp->params.hsp_id));
}
inc_new_hsp = 1;
irecid++;
}
if (un->un_hsp_id != -1) {
/* decrement the reference count of the old hsp */
err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
&recids[irecid], NULL, NULL, NULL);
if (err) {
err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
mrp->params.hsp_id);
if (inc_new_hsp) {
(void) md_hot_spare_ifc(HSP_DECREF,
mrp->params.hsp_id, 0, 0,
&recids[0], NULL, NULL, NULL);
/*
* Don't need to commit the record,
* because it wasn't committed before
*/
}
return (err);
}
}
un->un_hsp_id = mrp->params.hsp_id;
raid_commit(un, recids);
SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
MD_UN2SET(un), MD_SID(un));
/* Now trigger hot spare processing in case one is needed. */
if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
(void) raid_hotspares();
return (0);
}
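/*
 * Usage sketch (illustrative only): detaching the hot spare pool of a
 * RAID metadevice through MD_IOCCHANGE.  Field names follow the
 * md_raid_params_t usage above; an hsp_id of -1 means "no pool".  The
 * fd is assumed to be the md administrative device opened by a 32-bit
 * client, and error handling is elided.
 *
 *	md_raid_params_t mrp;
 *
 *	(void) memset(&mrp, 0, sizeof (mrp));
 *	mrp.mnum = mnum;
 *	mrp.params.change_hsp_id = 1;
 *	mrp.params.hsp_id = -1;
 *	(void) ioctl(fd, MD_IOCCHANGE, &mrp);
 */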
/*
* NAME: raid_admin_ioctl
* DESCRIPTION: IOCTL operations unique to metadevices and RAID
* PARAMETERS: int cmd - IOCTL command to be executed
* void *data - pointer to IOCTL data structure
* int mode - either FREAD or FWRITE
* IOLOCK *lockp - IOCTL read/write lock
*
* LOCKS: none
*
*/
static int
raid_admin_ioctl(
int cmd,
void *data,
int mode,
IOLOCK *lockp
)
{
size_t sz = 0;
void *d = NULL;
int err = 0;
/* We can only handle 32-bit clients for internal commands */
if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
return (EINVAL);
}
/* dispatch ioctl */
switch (cmd) {
case MD_IOCSET:
{
if (! (mode & FWRITE))
return (EACCES);
sz = sizeof (md_set_params_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_set(d, mode);
break;
}
case MD_IOCGET:
{
if (! (mode & FREAD))
return (EACCES);
sz = sizeof (md_i_get_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_get(d, mode, lockp);
break;
}
case MD_IOCREPLACE:
{
if (! (mode & FWRITE))
return (EACCES);
sz = sizeof (replace_params_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_replace((replace_params_t *)d, lockp);
break;
}
case MD_IOCSETSYNC:
{
if (! (mode & FWRITE))
return (EACCES);
sz = sizeof (md_resync_ioctl_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
break;
}
case MD_IOCGETSYNC:
{
if (! (mode & FREAD))
return (EACCES);
sz = sizeof (md_resync_ioctl_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
break;
}
case MD_IOCGROW:
{
if (! (mode & FWRITE))
return (EACCES);
sz = sizeof (md_grow_params_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_grow(d, mode, lockp);
break;
}
case MD_IOCCHANGE:
{
if (! (mode & FWRITE))
return (EACCES);
sz = sizeof (md_raid_params_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_change((md_raid_params_t *)d, lockp);
break;
}
case MD_IOCRESET:
{
if (! (mode & FWRITE))
return (EACCES);
sz = sizeof (md_i_reset_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_reset((md_i_reset_t *)d);
break;
}
case MD_IOCGET_DEVS:
{
if (! (mode & FREAD))
return (EACCES);
sz = sizeof (md_getdevs_params_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_getdevs(d, mode, lockp);
break;
}
case MD_IOCSETREGEN:
{
if (! (mode & FWRITE))
return (EACCES);
sz = sizeof (md_regen_param_t);
d = kmem_alloc(sz, KM_SLEEP);
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
break;
}
err = raid_regen((md_regen_param_t *)d, lockp);
break;
}
case MD_IOCPROBE_DEV:
{
md_probedev_impl_t *p = NULL;
md_probedev_t *ph = NULL;
daemon_queue_t *hdr = NULL;
int i;
size_t sz1 = 0;
if (! (mode & FREAD))
return (EACCES);
sz = sizeof (md_probedev_t);
d = kmem_alloc(sz, KM_SLEEP);
/* now copy in the data */
if (ddi_copyin(data, d, sz, mode)) {
err = EFAULT;
goto free_mem;
}
		/*
		 * Sanity-check the args: nmdevs must be at least one
		 * and the test name must contain the keyword "probe".
		 */
p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
p->probe_sema = NULL;
p->probe_mx = NULL;
p->probe.mnum_list = (uint64_t)NULL;
ph = (md_probedev_t *)d;
p->probe.nmdevs = ph->nmdevs;
(void) strcpy(p->probe.test_name, ph->test_name);
bcopy(&ph->md_driver, &(p->probe.md_driver),
sizeof (md_driver_t));
if ((p->probe.nmdevs < 1) ||
(strstr(p->probe.test_name, "probe") == NULL)) {
err = EINVAL;
goto free_mem;
}
sz1 = sizeof (minor_t) * p->probe.nmdevs;
p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
KM_SLEEP);
if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
(caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
err = EFAULT;
goto free_mem;
}
		if ((err = md_init_probereq(p, &hdr)) != 0)
			goto free_mem;
/*
* put the request on the queue and wait.
*/
daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
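		/*
		 * Drop the ioctl lock while the probes run:
		 * IOLOCK_RETURN() releases the lock taken on entry so
		 * the daemon queue can make progress, and
		 * md_ioctl_lock_enter() below re-acquires it before
		 * the shared cleanup path runs.
		 */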
(void) IOLOCK_RETURN(0, lockp);
/* wait for the events to occur */
for (i = 0; i < p->probe.nmdevs; i++) {
sema_p(PROBE_SEMA(p));
}
while (md_ioctl_lock_enter() == EINTR)
;
		/*
		 * Clean up.  The hdr list is freed in the probe
		 * routines, so it is already NULL by the time we get
		 * here and needs no freeing below.
		 */
free_mem:
if (p) {
if (p->probe_sema != NULL) {
sema_destroy(PROBE_SEMA(p));
kmem_free(p->probe_sema, sizeof (ksema_t));
}
if (p->probe_mx != NULL) {
mutex_destroy(PROBE_MX(p));
kmem_free(p->probe_mx, sizeof (kmutex_t));
}
if (p->probe.mnum_list)
kmem_free((caddr_t)(uintptr_t)
p->probe.mnum_list, sz1);
kmem_free(p, sizeof (md_probedev_impl_t));
}
break;
}
default:
return (ENOTTY);
}
/*
* copyout and free any args
*/
if (sz != 0) {
if (err == 0) {
if (ddi_copyout(d, data, sz, mode) != 0) {
err = EFAULT;
}
}
kmem_free(d, sz);
}
return (err);
}
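/*
 * Handler-pattern sketch (illustrative only): every case above
 * follows the same access-check/copyin/dispatch contract, with the
 * copyout of results and the freeing of the scratch buffer
 * centralized after the switch.  A hypothetical new command would be
 * wired up the same way (MD_IOCEXAMPLE, md_example_params_t and
 * raid_example() are invented names):
 *
 *	case MD_IOCEXAMPLE:
 *	{
 *		if (! (mode & FWRITE))
 *			return (EACCES);
 *		sz = sizeof (md_example_params_t);
 *		d = kmem_alloc(sz, KM_SLEEP);
 *		if (ddi_copyin(data, d, sz, mode)) {
 *			err = EFAULT;
 *			break;
 *		}
 *		err = raid_example(d, lockp);
 *		break;
 *	}
 */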
/*
* NAME: md_raid_ioctl
* DESCRIPTION: RAID metadevice IOCTL operations entry point.
 * PARAMETERS:	dev_t dev - RAID device identifier
* int cmd - IOCTL command to be executed
* void *data - pointer to IOCTL data structure
* int mode - either FREAD or FWRITE
* IOLOCK *lockp - IOCTL read/write lock
*
* LOCKS: none
*
*/
int
md_raid_ioctl(
dev_t dev,
int cmd,
void *data,
int mode,
IOLOCK *lockp
)
{
minor_t mnum = getminor(dev);
mr_unit_t *un;
int err = 0;
/* handle admin ioctls */
if (mnum == MD_ADM_MINOR)
return (raid_admin_ioctl(cmd, data, mode, lockp));
/* check unit */
if ((MD_MIN2SET(mnum) >= md_nsets) ||
(MD_MIN2UNIT(mnum) >= md_nunits) ||
((un = MD_UNIT(mnum)) == NULL))
return (ENXIO);
/* is this a supported ioctl? */
err = md_check_ioctl_against_unit(cmd, un->c);
if (err != 0) {
return (err);
}
/* dispatch ioctl */
switch (cmd) {
case DKIOCINFO:
{
struct dk_cinfo *p;
if (! (mode & FREAD))
return (EACCES);
p = kmem_alloc(sizeof (*p), KM_SLEEP);
get_info(p, mnum);
if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
err = EFAULT;
kmem_free(p, sizeof (*p));
return (err);
}
case DKIOCGMEDIAINFO:
{
struct dk_minfo p;
if (! (mode & FREAD))
return (EACCES);
get_minfo(&p, mnum);
if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
err = EFAULT;
return (err);
}
case DKIOCGGEOM:
{
struct dk_geom *p;
if (! (mode & FREAD))
return (EACCES);
p = kmem_alloc(sizeof (*p), KM_SLEEP);
if ((err = raid_get_geom(un, p)) == 0) {
if (ddi_copyout((caddr_t)p, data, sizeof (*p),
mode) != 0)
err = EFAULT;
}
kmem_free(p, sizeof (*p));
return (err);
}
case DKIOCGVTOC:
{
struct vtoc vtoc;
if (! (mode & FREAD))
return (EACCES);
if ((err = raid_get_vtoc(un, &vtoc)) != 0) {
return (err);
}
if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
if (ddi_copyout(&vtoc, data, sizeof (vtoc), mode))
err = EFAULT;
}
#ifdef _SYSCALL32
else {
struct vtoc32 vtoc32;
vtoctovtoc32(vtoc, vtoc32);
if (ddi_copyout(&vtoc32, data, sizeof (vtoc32), mode))
err = EFAULT;
}
#endif /* _SYSCALL32 */
return (err);
}
case DKIOCSVTOC:
{
struct vtoc vtoc;
if (! (mode & FWRITE))
return (EACCES);
if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
if (ddi_copyin(data, &vtoc, sizeof (vtoc), mode)) {
err = EFAULT;
}
}
#ifdef _SYSCALL32
else {
struct vtoc32 vtoc32;
if (ddi_copyin(data, &vtoc32, sizeof (vtoc32), mode)) {
err = EFAULT;
} else {
vtoc32tovtoc(vtoc32, vtoc);
}
}
#endif /* _SYSCALL32 */
if (err == 0)
err = raid_set_vtoc(un, &vtoc);
return (err);
}
case DKIOCGEXTVTOC:
{
struct extvtoc extvtoc;
if (! (mode & FREAD))
return (EACCES);
if ((err = raid_get_extvtoc(un, &extvtoc)) != 0) {
return (err);
}
if (ddi_copyout(&extvtoc, data, sizeof (extvtoc), mode))
err = EFAULT;
return (err);
}
case DKIOCSEXTVTOC:
{
struct extvtoc extvtoc;
if (! (mode & FWRITE))
return (EACCES);
if (ddi_copyin(data, &extvtoc, sizeof (extvtoc), mode)) {
err = EFAULT;
}
if (err == 0)
err = raid_set_extvtoc(un, &extvtoc);
return (err);
}
case DKIOCGAPART:
{
struct dk_map dmp;
if ((err = raid_get_cgapart(un, &dmp)) != 0) {
return (err);
}
if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
mode) != 0)
err = EFAULT;
}
#ifdef _SYSCALL32
else {
struct dk_map32 dmp32;
dmp32.dkl_cylno = dmp.dkl_cylno;
dmp32.dkl_nblk = dmp.dkl_nblk;
if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
mode) != 0)
err = EFAULT;
}
#endif /* _SYSCALL32 */
return (err);
}
case DKIOCGETEFI:
{
		/*
		 * This one can be handled centrally; there is no need
		 * to duplicate the code in every metadevice type.
		 */
return (md_dkiocgetefi(mnum, data, mode));
}
case DKIOCSETEFI:
{
		/*
		 * This one can be handled centrally; there is no need
		 * to duplicate the code in every metadevice type.
		 */
return (md_dkiocsetefi(mnum, data, mode));
}
case DKIOCPARTITION:
{
return (md_dkiocpartition(mnum, data, mode));
}
default:
return (ENOTTY);
}
}
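/*
 * Usage sketch (illustrative only): the DKIOC* cases above let a RAID
 * metadevice answer the same disk ioctls as a physical slice, so
 * existing tools work unchanged.  For example, fetching the geometry
 * (the device path and error handling are illustrative assumptions):
 *
 *	struct dk_geom geom;
 *	int fd;
 *
 *	fd = open("/dev/md/rdsk/d10", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, DKIOCGGEOM, &geom) == 0)
 *		(void) printf("%u cyls, %u heads, %u sects\n",
 *		    geom.dkg_ncyl, geom.dkg_nhead, geom.dkg_nsect);
 */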
/*
* rename/exchange named service entry points and support functions follow.
* Most functions are handled generically, except for raid-specific locking
* and checking
*/
/*
* NAME: raid_may_renexch_self
* DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
* PARAMETERS: mr_unit_t *un - unit struct of raid unit to be renamed
* mdi_unit_t *ui - in-core unit struct of same raid unit
* md_rentxn_t *rtxnp - rename transaction state
*
* LOCKS: none
*
*/
static int
raid_may_renexch_self(
mr_unit_t *un,
mdi_unit_t *ui,
md_rentxn_t *rtxnp)
{
minor_t from_min;
minor_t to_min;
bool_t toplevel;
bool_t related;
from_min = rtxnp->from.mnum;
to_min = rtxnp->to.mnum;
if (!un || !ui) {
(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
from_min);
return (EINVAL);
}
ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
return (EINVAL);
}
if (MD_PARENT(un) == MD_MULTI_PARENT) {
(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
return (EINVAL);
}
toplevel = !MD_HAS_PARENT(MD_PARENT(un));
/* we're related if trying to swap with our parent */
related = (!toplevel) && (MD_PARENT(un) == to_min);
switch (rtxnp->op) {
case MDRNOP_EXCHANGE:
if (!related) {
(void) mdmderror(&rtxnp->mde,
MDE_RENAME_TARGET_UNRELATED, to_min);
return (EINVAL);
}
break;
case MDRNOP_RENAME:
		/*
		 * If "from" is top-level and open, a consumer is
		 * holding its md_dev64_t, so renaming it now would
		 * change the device identity underneath them.
		 */
if (toplevel && md_unit_isopen(ui)) {
(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
from_min);
return (EBUSY);
}
break;
default:
(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
from_min);
return (EINVAL);
}
return (0); /* ok */
}
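/*
 * Relationship sketch for the checks above (unit names are
 * illustrative): with a soft partition d0 built on top of raid d10,
 *
 *	exchange d10 <-> d0  (its parent)	allowed: related
 *	exchange d10 <-> d20 (unrelated unit)	refused: EINVAL
 *
 * and a rename of d10 itself is refused with EBUSY only when d10 is
 * top-level (has no parent) and is currently open.
 */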
/*
* NAME: raid_rename_check
* DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
* PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
* raid device for rename transaction
* md_rentxn_t *rtxnp - rename transaction state
*
* LOCKS: none
*
*/
intptr_t
raid_rename_check(
md_rendelta_t *delta,
md_rentxn_t *rtxnp)
{
int err = 0;
int column;
mr_unit_t *un;
ASSERT(delta);
ASSERT(rtxnp);
ASSERT(delta->unp);
ASSERT(delta->uip);
if (!delta || !rtxnp || !delta->unp || !delta->uip) {
(void) mdsyserror(&rtxnp->mde, EINVAL);
return (EINVAL);
}
un = (mr_unit_t *)delta->unp;
for (column = 0; column < un->un_totalcolumncnt; column++) {
rcs_state_t colstate;
colstate = un->un_column[column].un_devstate;
if (colstate & RCS_LAST_ERRED) {
(void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
md_getminor(delta->dev));
return (EINVAL);
}
if (colstate & RCS_INIT_ERRED) {
(void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
md_getminor(delta->dev));
return (EINVAL);
}
/* How did we get this far before detecting this? */
if (colstate & RCS_RESYNC) {
(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
md_getminor(delta->dev));
return (EBUSY);
}
if (colstate & RCS_ERRED) {
(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
md_getminor(delta->dev));
return (EINVAL);
}
if (!(colstate & RCS_OKAY)) {
(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
md_getminor(delta->dev));
return (EINVAL);
}
if (HOTSPARED(un, column)) {
(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
md_getminor(delta->dev));
return (EINVAL);
}
}
/* self does additional checks */
if (delta->old_role == MDRR_SELF) {
err = raid_may_renexch_self((mr_unit_t *)delta->unp,
delta->uip, rtxnp);
}
return (err);
}
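/*
 * Column-state gating summary for the loop above:
 *
 *	RCS_LAST_ERRED		-> EINVAL (MDE_RAID_LAST_ERRED)
 *	RCS_INIT_ERRED		-> EINVAL (MDE_RAID_DOI)
 *	RCS_RESYNC		-> EBUSY  (MDE_RENAME_BUSY)
 *	RCS_ERRED		-> EINVAL (MDE_RAID_NOT_OKAY)
 *	!RCS_OKAY or hotspared	-> EINVAL (MDE_RAID_NOT_OKAY)
 *
 * Only a unit whose columns are all RCS_OKAY and unspared passes, and
 * it then gets the extra self checks when its role is MDRR_SELF.
 */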
/*
* NAME: raid_rename_lock
* DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
* PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
* raid device for rename transaction
* md_rentxn_t *rtxnp - rename transaction state
*
* LOCKS: io and unit locks (taken explicitly *not* via ioctl wrappers)
*
*/
intptr_t
raid_rename_lock(
md_rendelta_t *delta,
md_rentxn_t *rtxnp)
{
minor_t mnum;
ASSERT(delta);
ASSERT(rtxnp);
mnum = md_getminor(delta->dev);
if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
return (0);
}
ASSERT(delta->uip);
if (!delta->uip) {
(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
return (ENODEV);
}
ASSERT(delta->unp);
if (!delta->unp) {
return (ENODEV);
}
ASSERT(!IO_WRITER_HELD(delta->unp));
(void) md_io_writerlock(delta->uip);
ASSERT(IO_WRITER_HELD(delta->unp));
ASSERT(!UNIT_WRITER_HELD(delta->unp));
(void) md_unit_writerlock(delta->uip);
ASSERT(UNIT_WRITER_HELD(delta->unp));
return (0);
}
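/*
 * Lock-ordering sketch: acquisition here and release in
 * raid_rename_unlock() below are strictly nested, which the ASSERTs
 * in both routines verify:
 *
 *	md_io_writerlock(uip);		1. block new I/O
 *	md_unit_writerlock(uip);	2. freeze the unit structure
 *	... rename/exchange deltas are applied ...
 *	md_unit_writerexit(uip);	3. thaw the unit
 *	md_io_writerexit(uip);		4. allow I/O again
 */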
/*
* NAME: raid_rename_unlock
* DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
* PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
* raid device for rename transaction
* md_rentxn_t *rtxnp - rename transaction state
*
* LOCKS: drops io and unit locks
*
*/
/* ARGSUSED */
void
raid_rename_unlock(
md_rendelta_t *delta,
md_rentxn_t *rtxnp)
{
mr_unit_t *un = (mr_unit_t *)delta->unp;
minor_t mnum = MD_SID(un);
int col;
ASSERT(delta);
ASSERT(delta->unp);
ASSERT(delta->uip);
ASSERT(UNIT_WRITER_HELD(delta->unp));
md_unit_writerexit(delta->uip);
ASSERT(!UNIT_WRITER_HELD(delta->unp));
if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
goto out;
}
if (raid_internal_open(mnum, (FREAD | FWRITE),
OTYP_LYR, MD_OFLG_ISINIT) == 0) {
for (col = 0; col < un->un_totalcolumncnt; col++) {
if (un->un_column[col].un_devstate & RCS_OKAY)
(void) init_pw_area(un,
un->un_column[col].un_dev,
un->un_column[col].un_pwstart, col);
}
(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
}
out:
ASSERT(IO_WRITER_HELD(delta->unp));
md_io_writerexit(delta->uip);
ASSERT(!IO_WRITER_HELD(delta->unp));
}
/* end of rename/exchange named service and support functions */