/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Driver for Virtual Disk.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/t_lock.h>
#include <sys/dkio.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/vtoc.h>
#include <sys/open.h>
#include <sys/file.h>
#include <vm/page.h>
#include <sys/callb.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/door.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/lvm/md_hotspares.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/ddi.h>
#include <sys/proc.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/md_basic.h>
/*
* The machine-specific clock tick rate (Hz) is kept here.
*/
extern clock_t md_hz;
/*
* Externs.
*/
extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern major_t md_major;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern md_set_io_t md_set_io[];
extern md_ops_t **md_ops;
extern md_ops_t *md_opslist;
extern ddi_modhandle_t *md_mods;
extern dev_info_t *md_devinfo;
extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern md_krwlock_t hsp_rwlp;
extern md_krwlock_t ni_rwlp;
extern int md_num_daemons;
extern int md_status;
extern int md_ioctl_cnt;
extern int md_mtioctl_cnt;
extern struct metatransops metatransops;
extern md_event_queue_t *md_event_queue;
extern md_resync_t md_cpr_resync;
extern int md_done_daemon_threads;
extern int md_ff_daemon_threads;
extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep);
extern void mddb_setexit(mddb_set_t *s);
extern void *lookup_entry(struct nm_next_hdr *, set_t,
side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr *get_first_record(set_t, int, int);
extern dev_t getrootdev(void);
struct mdq_anchor md_done_daemon; /* done request queue */
struct mdq_anchor md_mstr_daemon; /* mirror error, WOW requests */
struct mdq_anchor md_mhs_daemon; /* mirror hotspare requests queue */
struct mdq_anchor md_hs_daemon; /* raid hotspare requests queue */
struct mdq_anchor md_ff_daemonq; /* failfast request queue */
struct mdq_anchor md_mirror_daemon; /* mirror owner queue */
struct mdq_anchor md_mirror_io_daemon; /* mirror owner i/o queue */
struct mdq_anchor md_mirror_rs_daemon; /* mirror resync done queue */
struct mdq_anchor md_sp_daemon; /* soft-part error daemon queue */
struct mdq_anchor md_mto_daemon; /* mirror timeout daemon queue */
int md_done_daemon_threads = 1; /* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1; /* threads for md_mstr_daemon requestq */
int md_mhs_daemon_threads = 1; /* threads for md_mhs_daemon requestq */
int md_hs_daemon_threads = 1; /* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3; /* threads for md_ff_daemon requestq */
int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
int md_sp_daemon_threads = 1; /* threads for md_sp_daemon requestq */
int md_mto_daemon_threads = 1; /* threads for md_mto_daemon requestq */
#ifdef DEBUG
/* Flag to switch on debug messages */
int md_release_reacquire_debug = 0; /* debug flag */
#endif
/*
*
* md_daemon_queues is a table of pointers to the request queues, together
* with the number of threads associated with each queue.
* When the number of threads is set to 1, requests on that queue are
* executed sequentially.
* The thread counts for all of the queues are defined as global
* variables to enable kernel tuning.
*
*/
#define MD_DAEMON_QUEUES 11
md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
{&md_done_daemon, &md_done_daemon_threads},
{&md_mstr_daemon, &md_mstr_daemon_threads},
{&md_hs_daemon, &md_hs_daemon_threads},
{&md_ff_daemonq, &md_ff_daemon_threads},
{&md_mirror_daemon, &md_mirror_daemon_threads},
{&md_mirror_io_daemon, &md_mirror_daemon_threads},
{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
{&md_sp_daemon, &md_sp_daemon_threads},
{&md_mhs_daemon, &md_mhs_daemon_threads},
{&md_mto_daemon, &md_mto_daemon_threads},
{0, 0}
};
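/*
* Illustrative tuning sketch (not part of the original source): because the
* per-queue thread counts above are plain global variables, they can be set
* from /etc/system before the md module loads, e.g. (values hypothetical):
*
*	set md:md_ff_daemon_threads = 5
*	set md:md_done_daemon_threads = 2
*
* Leaving a count at 1 preserves sequential execution on that queue.
*/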
/*
* Number of times a message is retried before issuing a warning to the operator
*/
#define MD_MN_WARN_INTVL 10
/*
* Setting the retry count to one (pre-decremented) so that we actually do
* no retries when committing/deleting a mddb rec. The underlying disk
* driver does several retries to check whether the disk is really dead or
* not, so there is no reason for us to retry on top of the driver's
* retries.
*/
uint_t md_retry_cnt = 1; /* global so it can be patched */
/*
* How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
* Again, made patchable here should it prove useful.
*/
uint_t md_send_retry_limit = 30;
/*
* Bug # 1212146
* Before this change the user had to pass in a short-aligned buffer because
* of problems in some underlying device drivers. This problem seems to have
* been corrected in the underlying drivers, so we default to not requiring
* any alignment. If the user needs to check for a specific alignment,
* md_uio_alignment_mask may be set in /etc/system to accomplish this. To
* get the behavior from before this fix, set md_uio_alignment_mask to 1;
* to check for word alignment, set it to 3; for double-word alignment, set
* it to 7; and so on.
*
* [The other part of the fix is in function md_chk_uio()]
*/
static int md_uio_alignment_mask = 0;
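/*
* Hedged example of the tuning described above (the specific value is
* hypothetical). To restore the pre-fix behavior of requiring short
* (2-byte) aligned uio buffers, add the following to /etc/system:
*
*	set md:md_uio_alignment_mask = 1
*
* md_chk_uio() is assumed to reject a buffer whose address overlaps the
* mask bits; 3 checks word alignment, 7 double-word alignment, etc.
*/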
/*
* for md_dev64_t translation
*/
struct md_xlate_table *md_tuple_table;
struct md_xlate_major_table *md_major_tuple_table;
int md_tuple_length;
uint_t md_majortab_len;
/* Function declarations */
static int md_create_probe_rqlist(md_probedev_impl_t *plist,
daemon_queue_t **hdr, intptr_t (*probe_test)());
/*
* manipulate global status
*/
void
md_set_status(int bits)
{
mutex_enter(&md_mx);
md_status |= bits;
mutex_exit(&md_mx);
}
void
md_clr_status(int bits)
{
mutex_enter(&md_mx);
md_status &= ~bits;
mutex_exit(&md_mx);
}
int
md_get_status()
{
int result;
mutex_enter(&md_mx);
result = md_status;
mutex_exit(&md_mx);
return (result);
}
void
md_set_setstatus(set_t setno, int bits)
{
ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
mutex_enter(&md_mx);
md_set[setno].s_status |= bits;
mutex_exit(&md_mx);
}
void
md_clr_setstatus(set_t setno, int bits)
{
ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
mutex_enter(&md_mx);
md_set[setno].s_status &= ~bits;
mutex_exit(&md_mx);
}
uint_t
md_get_setstatus(set_t setno)
{
uint_t result;
ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
mutex_enter(&md_mx);
result = md_set[setno].s_status;
mutex_exit(&md_mx);
return (result);
}
/*
* md_unit_readerlock_common:
* -------------------------
* Mark the given unit as having a reader reference. Block (on ui_cv) until
* any writer references have been released.
*
* Input:
*	ui		unit reference
*	lock_held	0 => ui_mx needs to be grabbed
*			1 => ui_mx already held
* Output:
*	mm_unit_t corresponding to unit structure
*	ui->ui_readercnt incremented
*/
static void *
md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
{
uint_t flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
if (!lock_held)
mutex_enter(&ui->ui_mx);
while (ui->ui_lock & flag) {
if (panicstr) {
if (ui->ui_lock & MD_UL_WRITER)
panic("md: writer lock is held");
break;
}
cv_wait(&ui->ui_cv, &ui->ui_mx);
}
ui->ui_readercnt++;
if (!lock_held)
mutex_exit(&ui->ui_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void *
md_unit_readerlock(mdi_unit_t *ui)
{
return (md_unit_readerlock_common(ui, 0));
}
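/*
* Hedged usage sketch (not from the original file): the reader/writer pairs
* are used to bracket access to a unit structure, e.g. in a hypothetical
* caller:
*
*	mdi_unit_t	*ui = MDI_UNIT(mnum);
*	md_unit_t	*un = (md_unit_t *)md_unit_readerlock(ui);
*	... read-only access to *un ...
*	md_unit_readerexit(ui);
*
* A modifying caller uses md_unit_writerlock()/md_unit_writerexit() instead
* and holds exclusive access once all readers have drained.
*/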
/*
* md_unit_writerlock_common:
* -------------------------
* Acquire a unique writer reference. Causes previous readers to drain.
* Blocks if a writer reference already exists or if a previous
* reader/writer dropped the lock to allow a ksend_message to be
* dispatched.
*
* Input:
*	ui		unit reference
*	lock_held	0 => grab ui_mx
*			1 => ui_mx already held on entry
* Output:
*	mm_unit_t reference
*/
static void *
md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
{
uint_t flag = MD_UL_WRITER;
if (panicstr)
panic("md: writer lock not allowed");
if (!lock_held)
mutex_enter(&ui->ui_mx);
while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
ui->ui_wanabecnt++;
ui->ui_lock |= MD_UL_WANABEWRITER;
cv_wait(&ui->ui_cv, &ui->ui_mx);
if (--ui->ui_wanabecnt == 0)
ui->ui_lock &= ~MD_UL_WANABEWRITER;
}
ui->ui_lock |= MD_UL_WRITER;
ui->ui_owner = curthread;
if (!lock_held)
mutex_exit(&ui->ui_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void *
md_unit_writerlock(mdi_unit_t *ui)
{
return (md_unit_writerlock_common(ui, 0));
}
/*
* md_unit_readerexit_common:
* -------------------------
* Release the readerlock for the specified unit. If the reader count reaches
* zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
*
* Input:
* ui unit reference
* lock_held 0 => ui_mx needs to be acquired
* 1 => ui_mx already held
*/
static void
md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
{
if (!lock_held)
mutex_enter(&ui->ui_mx);
ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
ASSERT(ui->ui_readercnt != 0);
ui->ui_readercnt--;
if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
cv_broadcast(&ui->ui_cv);
if (!lock_held)
mutex_exit(&ui->ui_mx);
}
void
md_unit_readerexit(mdi_unit_t *ui)
{
md_unit_readerexit_common(ui, 0);
}
/*
* md_unit_writerexit_common:
* -------------------------
* Release the writerlock currently held on the unit. Wake any threads waiting
* on becoming reader or writer (MD_UL_WANABEWRITER set).
*
* Input:
* ui unit reference
* lock_held 0 => ui_mx to be acquired
* 1 => ui_mx already held
*/
static void
md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
{
if (!lock_held)
mutex_enter(&ui->ui_mx);
ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
ASSERT(ui->ui_readercnt == 0);
ui->ui_lock &= ~MD_UL_WRITER;
ui->ui_owner = NULL;
cv_broadcast(&ui->ui_cv);
if (!lock_held)
mutex_exit(&ui->ui_mx);
}
void
md_unit_writerexit(mdi_unit_t *ui)
{
md_unit_writerexit_common(ui, 0);
}
void *
md_io_readerlock(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
ASSERT(io); /* checks case where no io lock allocated */
mutex_enter(&io->io_mx);
while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
if (panicstr) {
if (io->io_lock & MD_UL_WRITER)
panic("md: writer lock is held");
break;
}
cv_wait(&io->io_cv, &io->io_mx);
}
io->io_readercnt++;
mutex_exit(&io->io_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void *
md_io_writerlock(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
ASSERT(io); /* checks case where no io lock allocated */
if (panicstr)
panic("md: writer lock not allowed");
mutex_enter(&io->io_mx);
while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
io->io_wanabecnt++;
io->io_lock |= MD_UL_WANABEWRITER;
cv_wait(&io->io_cv, &io->io_mx);
if (--io->io_wanabecnt == 0)
io->io_lock &= ~MD_UL_WANABEWRITER;
}
io->io_lock |= MD_UL_WRITER;
io->io_owner = curthread;
mutex_exit(&io->io_mx);
return (MD_UNIT(ui->ui_link.ln_id));
}
void
md_io_readerexit(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
mutex_enter(&io->io_mx);
ASSERT((io->io_lock & MD_UL_WRITER) == 0);
ASSERT(io->io_readercnt != 0);
io->io_readercnt--;
if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
cv_broadcast(&io->io_cv);
}
mutex_exit(&io->io_mx);
}
void
md_io_writerexit(mdi_unit_t *ui)
{
md_io_lock_t *io = ui->ui_io_lock;
mutex_enter(&io->io_mx);
ASSERT((io->io_lock & MD_UL_WRITER) != 0);
ASSERT(io->io_readercnt == 0);
io->io_lock &= ~MD_UL_WRITER;
io->io_owner = NULL;
cv_broadcast(&io->io_cv);
mutex_exit(&io->io_mx);
}
/*
* Attempt to grab that set of locks defined as global.
* A mask containing the set of global locks that are owned upon
* entry is input. Any additional global locks are then grabbed.
* This keeps the caller from having to know the set of global
* locks.
*/
static int
md_global_lock_enter(int global_locks_owned_mask)
{
/*
* The current implementation has been verified by inspection
* and test to be deadlock free. If another global lock is
* added, changing the algorithm used by this function should
* be considered. With more than 2 locks it is difficult to
* guarantee that locks are being acquired in the correct order.
* The safe approach would be to drop all of the locks that are
* owned at function entry and then reacquire all of the locks
* in the order defined by the lock hierarchy.
*/
mutex_enter(&md_mx);
if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
while ((md_mtioctl_cnt != 0) ||
(md_status & MD_GBL_IOCTL_LOCK)) {
if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
mutex_exit(&md_mx);
return (EINTR);
}
}
md_status |= MD_GBL_IOCTL_LOCK;
md_ioctl_cnt++;
}
if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
while (md_status & MD_GBL_HS_LOCK) {
if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
md_status &= ~MD_GBL_IOCTL_LOCK;
mutex_exit(&md_mx);
return (EINTR);
}
}
md_status |= MD_GBL_HS_LOCK;
}
mutex_exit(&md_mx);
return (0);
}
/*
* Release the set of global locks that were grabbed in md_global_lock_enter
* that were not already owned by the calling thread. The set of previously
* owned global locks is passed in as a mask parameter.
*/
static int
md_global_lock_exit(int global_locks_owned_mask, int code,
int flags, mdi_unit_t *ui)
{
mutex_enter(&md_mx);
/* If MT ioctl decrement mt_ioctl_cnt */
if ((flags & MD_MT_IOCTL)) {
md_mtioctl_cnt--;
} else {
if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
/* clear the lock and decrement count */
ASSERT(md_ioctl_cnt == 1);
md_ioctl_cnt--;
md_status &= ~MD_GBL_IOCTL_LOCK;
}
if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
md_status &= ~MD_GBL_HS_LOCK;
}
if (flags & MD_READER_HELD)
md_unit_readerexit(ui);
if (flags & MD_WRITER_HELD)
md_unit_writerexit(ui);
if (flags & MD_IO_HELD)
md_io_writerexit(ui);
if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
rw_exit(&md_unit_array_rw.lock);
}
cv_broadcast(&md_cv);
mutex_exit(&md_mx);
return (code);
}
/*
* The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make
* use of the md_global_lock_{enter|exit} functions to avoid duplication
* of code. They rely upon the fact that the locks that are specified in
* the input mask are not acquired or freed. If this algorithm changes
* as described in the block comment at the beginning of md_global_lock_enter
* then it will be necessary to change these 2 functions. Otherwise these
* functions will be grabbing and holding global locks unnecessarily.
*/
int
md_ioctl_lock_enter(void)
{
/* grab only the ioctl lock */
return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
}
/*
* If md_ioctl_lock_exit is being called at the end of an ioctl before
* returning to user space, then ioctl_end is set to 1.
* Otherwise, the ioctl lock is being dropped in the middle of handling
* an ioctl and will be reacquired before the end of the ioctl.
* Do not attempt to process the MN diskset mddb parse flags unless
* ioctl_end is true - otherwise a deadlock situation could arise.
*/
int
md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
{
int ret_val;
uint_t status;
mddb_set_t *s;
int i;
int ii;
int err;
md_mn_msg_mddb_parse_t *mddb_parse_msg;
md_mn_kresult_t *kresult;
mddb_lb_t *lbp;
int rval = 1;
int flag;
/* release only the ioctl lock */
ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
/*
* If md_ioctl_lock_exit is being called with a possible lock held
* (ioctl_end is 0), then don't check the MN disksets since the
* call to mddb_setenter may cause a lock ordering deadlock.
*/
if (!ioctl_end)
return (ret_val);
/*
* Walk through disksets to see if there is a MN diskset that
* has messages that need to be sent. Set must be snarfed and
* be a MN diskset in order to be checked.
*
* In a MN diskset, this routine may send messages to the
* rpc.mdcommd in order to have the slave nodes re-parse parts
* of the mddb. Messages can only be sent with no locks held,
* so if mddb change occurred while the ioctl lock is held, this
* routine must send the messages.
*/
for (i = 1; i < md_nsets; i++) {
status = md_get_setstatus(i);
/* Set must be snarfed and be a MN diskset */
if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
(MD_SET_SNARFED | MD_SET_MNSET))
continue;
/* Grab set lock so that set can't change */
if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
continue;
lbp = s->s_lbp;
/* Re-get set status now that lock is held */
status = md_get_setstatus(i);
/*
* If MN parsing block flag is set - continue to next set.
*
* If s_mn_parseflags_sending is non-zero, then another thread
* is already currently sending a parse message, so just
* release the set mutex. If this ioctl had caused an mddb
* change that results in a parse message to be generated,
* the thread that is currently sending a parse message would
* generate the additional parse message.
*
* If s_mn_parseflags_sending is zero then loop until
* s_mn_parseflags is 0 (until there are no more
* messages to send).
* While s_mn_parseflags is non-zero,
* put snapshot of parse_flags in s_mn_parseflags_sending
* set s_mn_parseflags to zero
* release set mutex
* send message
* re-grab set mutex
* set s_mn_parseflags_sending to zero
*
* If set is STALE, send message with NO_LOG flag so that
* rpc.mdcommd won't attempt to log message to non-writeable
* replica.
*/
mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
KM_SLEEP);
while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
(s->s_mn_parseflags & MDDB_PARSE_MASK) &&
(!(status & MD_SET_MNPARSE_BLK))) {
/* Grab snapshot of parse flags */
s->s_mn_parseflags_sending = s->s_mn_parseflags;
s->s_mn_parseflags = 0;
mutex_exit(&md_set[(s)->s_setno].s_dbmx);
/*
* Send the message to the slaves to re-parse
* the indicated portions of the mddb. Send the status
* of the 50 mddbs in this set so that slaves know
* which mddbs that the master node thinks are 'good'.
* Otherwise, slave may reparse, but from wrong
* replica.
*/
mddb_parse_msg->msg_parse_flags =
s->s_mn_parseflags_sending;
for (ii = 0; ii < MDDB_NLB; ii++) {
mddb_parse_msg->msg_lb_flags[ii] =
lbp->lb_locators[ii].l_flags;
}
kresult = kmem_zalloc(sizeof (md_mn_kresult_t),
KM_SLEEP);
while (rval != 0) {
flag = 0;
if (status & MD_SET_STALE)
flag |= MD_MSGF_NO_LOG;
rval = mdmn_ksend_message(s->s_setno,
MD_MN_MSG_MDDB_PARSE, flag, 0,
(char *)mddb_parse_msg,
sizeof (md_mn_msg_mddb_parse_t), kresult);
/* if the node hasn't yet joined, it's Ok. */
if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
(kresult->kmmr_comm_state !=
MDMNE_NOT_JOINED)) {
mdmn_ksend_show_error(rval, kresult,
"MD_MN_MSG_MDDB_PARSE");
cmn_err(CE_WARN, "md_ioctl_lock_exit: "
"Unable to send mddb update "
"message to other nodes in "
"diskset %s\n", s->s_setname);
rval = 1;
}
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
/*
* Re-grab mutex to clear sending field and to
* see if another parse message needs to be generated.
*/
mutex_enter(&md_set[(s)->s_setno].s_dbmx);
s->s_mn_parseflags_sending = 0;
}
kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
mutex_exit(&md_set[(s)->s_setno].s_dbmx);
}
return (ret_val);
}
/*
* Called when in an ioctl and need readerlock.
*/
void *
md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
{
ASSERT(lock != NULL);
lock->l_ui = ui;
lock->l_flags |= MD_READER_HELD;
return (md_unit_readerlock_common(ui, 0));
}
/*
* Called when in an ioctl and need writerlock.
*/
void *
md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
{
ASSERT(lock != NULL);
lock->l_ui = ui;
lock->l_flags |= MD_WRITER_HELD;
return (md_unit_writerlock_common(ui, 0));
}
void *
md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
{
ASSERT(lock != NULL);
lock->l_ui = ui;
lock->l_flags |= MD_IO_HELD;
return (md_io_writerlock(ui));
}
void
md_ioctl_readerexit(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags &= ~MD_READER_HELD;
md_unit_readerexit(lock->l_ui);
}
void
md_ioctl_writerexit(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags &= ~MD_WRITER_HELD;
md_unit_writerexit(lock->l_ui);
}
void
md_ioctl_io_exit(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags &= ~MD_IO_HELD;
md_io_writerexit(lock->l_ui);
}
/*
* md_ioctl_releaselocks:
* --------------------
* Release the unit locks that are held and stop subsequent
* md_unit_reader/writerlock calls from progressing. This allows the caller
* to send messages across the cluster when running in a multinode
* environment.
* ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
* allowed to progress as normal. This is required as these typically are
* invoked by the message handler that may be called while a unit lock is
* marked as released.
*
* On entry:
* variety of unit locks may be held including ioctl lock
*
* On exit:
* locks released and unit structure updated to prevent subsequent reader/
* writer locks being acquired until md_ioctl_reacquirelocks is called
*/
void
md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
{
/* This actually releases the locks. */
(void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
}
/*
* md_ioctl_reacquirelocks:
* ----------------------
* Reacquire the locks that were held when md_ioctl_releaselocks
* was called.
*
* On entry:
* No unit locks held
* On exit:
* locks held that were held at md_ioctl_releaselocks time including
* the ioctl lock.
*/
void
md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
{
if (flags & MD_MT_IOCTL) {
mutex_enter(&md_mx);
md_mtioctl_cnt++;
mutex_exit(&md_mx);
} else {
while (md_ioctl_lock_enter() == EINTR)
;
}
if (flags & MD_ARRAY_WRITER) {
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
} else if (flags & MD_ARRAY_READER) {
rw_enter(&md_unit_array_rw.lock, RW_READER);
}
if (ui != (mdi_unit_t *)NULL) {
if (flags & MD_IO_HELD) {
(void) md_io_writerlock(ui);
}
mutex_enter(&ui->ui_mx);
if (flags & MD_READER_HELD) {
(void) md_unit_readerlock_common(ui, 1);
} else if (flags & MD_WRITER_HELD) {
(void) md_unit_writerlock_common(ui, 1);
}
/* Wake up any blocked readerlock() calls */
cv_broadcast(&ui->ui_cv);
mutex_exit(&ui->ui_mx);
}
}
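/*
* Hedged sketch of the release/reacquire protocol (handler and message names
* hypothetical): a multinode ioctl that must dispatch a ksend_message drops
* its locks first and then restores them:
*
*	md_ioctl_releaselocks(0, flags, ui);
*	rval = mdmn_ksend_message(setno, msgtype, 0, 0, data, size, kres);
*	md_ioctl_reacquirelocks(flags, ui);
*
* The same flags value must be passed to both calls so that exactly the
* locks held at release time are reacquired.
*/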
void
md_ioctl_droplocks(IOLOCK *lock)
{
mdi_unit_t *ui;
int flags;
ASSERT(lock != NULL);
ui = lock->l_ui;
flags = lock->l_flags;
if (flags & MD_READER_HELD) {
lock->l_flags &= ~MD_READER_HELD;
md_unit_readerexit(ui);
}
if (flags & MD_WRITER_HELD) {
lock->l_flags &= ~MD_WRITER_HELD;
md_unit_writerexit(ui);
}
if (flags & MD_IO_HELD) {
lock->l_flags &= ~MD_IO_HELD;
md_io_writerexit(ui);
}
if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
rw_exit(&md_unit_array_rw.lock);
}
}
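/*
* Hedged sketch of the IOLOCK pattern (the handler below is hypothetical):
* an ioctl routine records what it holds in the IOLOCK and releases it all
* in one place:
*
*	static int
*	example_ioctl(mdi_unit_t *ui, IOLOCK *lock)
*	{
*		md_unit_t *un = (md_unit_t *)md_ioctl_readerlock(lock, ui);
*		int err = 0;
*		... inspect *un, set err ...
*		md_ioctl_droplocks(lock);
*		return (err);
*	}
*/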
void
md_array_writer(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags |= MD_ARRAY_WRITER;
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
}
void
md_array_reader(IOLOCK *lock)
{
ASSERT(lock != NULL);
lock->l_flags |= MD_ARRAY_READER;
rw_enter(&md_unit_array_rw.lock, RW_READER);
}
/*
* Called when in an ioctl and need opencloselock.
* Sets flags in lockp for READER_HELD.
*/
void *
md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
{
void *un;
ASSERT(lockp != NULL);
mutex_enter(&ui->ui_mx);
while (ui->ui_lock & MD_UL_OPENORCLOSE)
cv_wait(&ui->ui_cv, &ui->ui_mx);
ui->ui_lock |= MD_UL_OPENORCLOSE;
/* Maintain mutex across the readerlock call */
lockp->l_ui = ui;
lockp->l_flags |= MD_READER_HELD;
un = md_unit_readerlock_common(ui, 1);
mutex_exit(&ui->ui_mx);
return (un);
}
/*
* Clears reader lock using md_ioctl instead of md_unit
* and updates lockp.
*/
void
md_ioctl_openclose_exit(IOLOCK *lockp)
{
mdi_unit_t *ui;
ASSERT(lockp != NULL);
ui = lockp->l_ui;
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
md_ioctl_readerexit(lockp);
mutex_enter(&ui->ui_mx);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
mutex_exit(&ui->ui_mx);
}
/*
* Clears reader lock using md_ioctl instead of md_unit
* and updates lockp.
* Does not acquire or release the ui_mx lock since the calling
* routine has already acquired this lock.
*/
void
md_ioctl_openclose_exit_lh(IOLOCK *lockp)
{
mdi_unit_t *ui;
ASSERT(lockp != NULL);
ui = lockp->l_ui;
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
lockp->l_flags &= ~MD_READER_HELD;
md_unit_readerexit_common(lockp->l_ui, 1);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
}
void *
md_unit_openclose_enter(mdi_unit_t *ui)
{
void *un;
mutex_enter(&ui->ui_mx);
while (ui->ui_lock & (MD_UL_OPENORCLOSE))
cv_wait(&ui->ui_cv, &ui->ui_mx);
ui->ui_lock |= MD_UL_OPENORCLOSE;
/* Maintain mutex across the readerlock call */
un = md_unit_readerlock_common(ui, 1);
mutex_exit(&ui->ui_mx);
return (un);
}
void
md_unit_openclose_exit(mdi_unit_t *ui)
{
md_unit_readerexit(ui);
mutex_enter(&ui->ui_mx);
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
mutex_exit(&ui->ui_mx);
}
/*
* Drop the openclose and readerlocks without acquiring or
* releasing the ui_mx lock since the calling routine has
* already acquired this lock.
*/
void
md_unit_openclose_exit_lh(mdi_unit_t *ui)
{
md_unit_readerexit_common(ui, 1);
ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
ui->ui_lock &= ~MD_UL_OPENORCLOSE;
cv_broadcast(&ui->ui_cv);
}
int
md_unit_isopen(
mdi_unit_t *ui
)
{
int isopen;
/* check status */
mutex_enter(&ui->ui_mx);
isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
mutex_exit(&ui->ui_mx);
return (isopen);
}
int
md_unit_incopen(
minor_t mnum,
int flag,
int otyp
)
{
mdi_unit_t *ui = MDI_UNIT(mnum);
int err = 0;
/* check type and flags */
ASSERT(ui != NULL);
mutex_enter(&ui->ui_mx);
if ((otyp < 0) || (otyp >= OTYPCNT)) {
err = EINVAL;
goto out;
}
if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
(ui->ui_lock & MD_UL_EXCL)) {
err = EBUSY;
goto out;
}
/* count and flag open */
ui->ui_ocnt[otyp]++;
ui->ui_lock |= MD_UL_OPEN;
if (flag & FEXCL)
ui->ui_lock |= MD_UL_EXCL;
/* setup kstat, return success */
mutex_exit(&ui->ui_mx);
md_kstat_init(mnum);
return (0);
/* return error */
out:
mutex_exit(&ui->ui_mx);
return (err);
}
int
md_unit_decopen(
minor_t mnum,
int otyp
)
{
mdi_unit_t *ui = MDI_UNIT(mnum);
int err = 0;
unsigned i;
/* check type and flags */
ASSERT(ui != NULL);
mutex_enter(&ui->ui_mx);
if ((otyp < 0) || (otyp >= OTYPCNT)) {
err = EINVAL;
goto out;
} else if (ui->ui_ocnt[otyp] == 0) {
err = ENXIO;
goto out;
}
/* count and flag closed */
if (otyp == OTYP_LYR)
ui->ui_ocnt[otyp]--;
else
ui->ui_ocnt[otyp] = 0;
ui->ui_lock &= ~MD_UL_OPEN;
for (i = 0; (i < OTYPCNT); ++i)
if (ui->ui_ocnt[i] != 0)
ui->ui_lock |= MD_UL_OPEN;
if (! (ui->ui_lock & MD_UL_OPEN))
ui->ui_lock &= ~MD_UL_EXCL;
/* teardown kstat, return success */
if (! (ui->ui_lock & MD_UL_OPEN)) {
/*
* We have a race condition inherited from specfs between
* open() and close() calls. This results in the kstat
* for a pending I/O being torn down, and then a panic.
* To avoid this, only tear the kstat down if there are
* no other readers on this device.
*/
if (ui->ui_readercnt > 1) {
mutex_exit(&ui->ui_mx);
} else {
mutex_exit(&ui->ui_mx);
md_kstat_destroy(mnum);
}
return (0);
}
/* return error */
out:
mutex_exit(&ui->ui_mx);
return (err);
}
md_dev64_t
md_xlate_targ_2_mini(md_dev64_t targ_devt)
{
dev32_t mini_32_devt, targ_32_devt;
int i;
/*
* Check to see if we're in an upgrade situation.
* If we are not in an upgrade, just return the input device.
*/
if (!MD_UPGRADE)
return (targ_devt);
targ_32_devt = md_cmpldev(targ_devt);
i = 0;
while (i != md_tuple_length) {
if (md_tuple_table[i].targ_devt == targ_32_devt) {
mini_32_devt = md_tuple_table[i].mini_devt;
return (md_expldev((md_dev64_t)mini_32_devt));
}
i++;
}
return (NODEV64);
}
md_dev64_t
md_xlate_mini_2_targ(md_dev64_t mini_devt)
{
dev32_t mini_32_devt, targ_32_devt;
int i;
if (!MD_UPGRADE)
return (mini_devt);
mini_32_devt = md_cmpldev(mini_devt);
i = 0;
while (i != md_tuple_length) {
if (md_tuple_table[i].mini_devt == mini_32_devt) {
targ_32_devt = md_tuple_table[i].targ_devt;
return (md_expldev((md_dev64_t)targ_32_devt));
}
i++;
}
return (NODEV64);
}
void
md_xlate_free(int size)
{
kmem_free(md_tuple_table, size);
}
char *
md_targ_major_to_name(major_t maj)
{
char *drv_name = NULL;
int i;
if (!MD_UPGRADE)
return (ddi_major_to_name(maj));
for (i = 0; i < md_majortab_len; i++) {
if (md_major_tuple_table[i].targ_maj == maj) {
drv_name = md_major_tuple_table[i].drv_name;
break;
}
}
return (drv_name);
}
major_t
md_targ_name_to_major(char *drv_name)
{
major_t maj;
int i;
maj = md_getmajor(NODEV64);
if (!MD_UPGRADE)
return (ddi_name_to_major(drv_name));
for (i = 0; i < md_majortab_len; i++) {
if ((strcmp(md_major_tuple_table[i].drv_name,
drv_name)) == 0) {
maj = md_major_tuple_table[i].targ_maj;
break;
}
}
return (maj);
}
void
md_majortab_free()
{
size_t sz;
int i;
for (i = 0; i < md_majortab_len; i++) {
freestr(md_major_tuple_table[i].drv_name);
}
sz = md_majortab_len * sizeof (struct md_xlate_major_table);
kmem_free(md_major_tuple_table, sz);
}
/* Returns a pointer to a function which returns an intptr_t */
intptr_t (*
md_get_named_service(md_dev64_t dev, int modindex, char *name,
intptr_t (*Default)()))()
{
mdi_unit_t *ui;
md_named_services_t *sp;
int i;
/*
* Return the first named service found.
* Use this path when it is known that there is only
* one named service possible (e.g., hotspare interface)
*/
if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL) {
continue;
}
sp = md_ops[i]->md_services;
if (sp == NULL)
continue;
while (sp->md_service != NULL) {
if (strcmp(name, sp->md_name) == 0)
return (sp->md_service);
sp++;
}
}
return (Default);
}
/*
* Return the named service for the given modindex.
* This is used if there are multiple possible named services
* and each one needs to be called (e.g., poke hotspares)
*/
if (dev == NODEV64) {
if (modindex >= MD_NOPS)
return (Default);
if (md_ops[modindex] == NULL)
return (Default);
sp = md_ops[modindex]->md_services;
if (sp == NULL)
return (Default);
while (sp->md_service != NULL) {
if (strcmp(name, sp->md_name) == 0)
return (sp->md_service);
sp++;
}
return (Default);
}
/*
* Return the named service for this md_dev64_t
*/
if (md_getmajor(dev) != md_major)
return (Default);
if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
(MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
return (NULL);
if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
return (NULL);
sp = md_ops[ui->ui_opsindex]->md_services;
if (sp == NULL)
return (Default);
while (sp->md_service != NULL) {
if (strcmp(name, sp->md_name) == 0)
return (sp->md_service);
sp++;
}
return (Default);
}
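/*
* Hedged usage sketch: callers look a service up once and cache the result,
* falling back to the supplied default when a module does not provide it.
* The service name below is real (see md_hot_spare_ifc()); the surrounding
* lines are illustrative only:
*
*	intptr_t (*svc)() = md_get_named_service(NODEV64, ANY_SERVICE,
*	    "hot spare interface", 0);
*	if (svc != NULL)
*		err = (*svc)(cmd, id, size, labeled, hs_id, key, dev, sblock);
*/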
/*
* CPR (checkpoint/resume) callback routine for the md daemons
*/
boolean_t
callb_md_cpr(void *arg, int code)
{
callb_cpr_t *cp = (callb_cpr_t *)arg;
int ret = 0; /* assume success */
clock_t delta;
mutex_enter(cp->cc_lockp);
switch (code) {
case CB_CODE_CPR_CHKPT:
/*
* Check for active resync threads
*/
mutex_enter(&md_cpr_resync.md_resync_mutex);
if ((md_cpr_resync.md_mirror_resync > 0) ||
(md_cpr_resync.md_raid_resync > 0)) {
mutex_exit(&md_cpr_resync.md_resync_mutex);
cmn_err(CE_WARN, "There are Solaris Volume Manager "
"synchronization threads running.");
cmn_err(CE_WARN, "Please try system suspension at "
"a later time.");
ret = -1;
break;
}
mutex_exit(&md_cpr_resync.md_resync_mutex);
cp->cc_events |= CALLB_CPR_START;
delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
while (!(cp->cc_events & CALLB_CPR_SAFE))
/* cv_reltimedwait() returns -1 if it times out. */
if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
break;
break;
case CB_CODE_CPR_RESUME:
cp->cc_events &= ~CALLB_CPR_START;
cv_signal(&cp->cc_stop_cv);
break;
}
mutex_exit(cp->cc_lockp);
return (ret != -1);
}
void
md_daemon(int pass_thru, mdq_anchor_t *anchor)
{
daemon_queue_t *dq;
callb_cpr_t cprinfo;
if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
return;
/*
* Register cpr callback
*/
CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
/*CONSTCOND*/
while (1) {
mutex_enter(&anchor->a_mx);
while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
if (pass_thru) {
/*
* CALLB_CPR_EXIT Will do
* mutex_exit(&anchor->a_mx)
*/
CALLB_CPR_EXIT(&cprinfo);
return;
}
if (md_get_status() & MD_GBL_DAEMONS_DIE) {
mutex_exit(&anchor->a_mx);
mutex_enter(&md_mx);
md_num_daemons--;
mutex_exit(&md_mx);
/*
* CALLB_CPR_EXIT will do
* mutex_exit(&anchor->a_mx)
*/
mutex_enter(&anchor->a_mx);
CALLB_CPR_EXIT(&cprinfo);
thread_exit();
}
CALLB_CPR_SAFE_BEGIN(&cprinfo);
cv_wait(&anchor->a_cv, &anchor->a_mx);
CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
}
dq->dq_prev->dq_next = dq->dq_next;
dq->dq_next->dq_prev = dq->dq_prev;
dq->dq_prev = dq->dq_next = NULL;
anchor->dq.qlen--;
mutex_exit(&anchor->a_mx);
(*(dq->dq_call))(dq);
}
/*NOTREACHED*/
}
/*
* daemon_request:
*
* Adds a request to the appropriate request queue, which is anchored by
* *anchor.
* The request is the first element of a doubly linked circular list.
* When the request is a single element, the forward and backward
* pointers MUST point to the element itself.
*/
void
daemon_request(mdq_anchor_t *anchor, void (*func)(),
daemon_queue_t *request, callstyle_t style)
{
daemon_queue_t *rqtp;
int i = 0;
rqtp = request;
if (style == REQ_OLD) {
ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
/* set it to the new style */
rqtp->dq_prev = rqtp->dq_next = rqtp;
}
ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
/* scan the list and add the function to each element */
do {
rqtp->dq_call = func;
i++;
rqtp = rqtp->dq_next;
} while (rqtp != request);
/* save pointer to tail of the request list */
rqtp = request->dq_prev;
mutex_enter(&anchor->a_mx);
/* stats */
anchor->dq.qlen += i;
anchor->dq.treqs += i;
anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
anchor->dq.qlen : anchor->dq.maxq_len;
/* now add the list to request queue */
request->dq_prev = anchor->dq.dq_prev;
rqtp->dq_next = &anchor->dq;
anchor->dq.dq_prev->dq_next = request;
anchor->dq.dq_prev = rqtp;
cv_broadcast(&anchor->a_cv);
mutex_exit(&anchor->a_mx);
}
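/*
* Hedged sketch of queueing a request (struct and handler names are
* hypothetical): a request embeds a daemon_queue_t as its first member so
* the daemon can link it and later invoke dq_call on it:
*
*	typedef struct example_req {
*		daemon_queue_t	dq;		(must be first)
*		int		payload;
*	} example_req_t;
*
*	static void
*	example_handler(daemon_queue_t *dqp)
*	{
*		example_req_t *rp = (example_req_t *)dqp;
*		... process rp->payload, then free/recycle rp ...
*	}
*
*	daemon_request(&md_done_daemon, example_handler, &req.dq, REQ_OLD);
*
* With REQ_OLD the dq_next/dq_prev pointers must be NULL on entry; the
* routine converts the element to a single-entry circular list itself.
*/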
void
mddb_commitrec_wrapper(mddb_recid_t recid)
{
int sent_log = 0;
uint_t retry = md_retry_cnt;
set_t setno;
while (mddb_commitrec(recid)) {
if (! sent_log) {
cmn_err(CE_WARN,
"md: state database commit failed");
sent_log = 1;
}
delay(md_hz);
/*
* Setting the retry count to one (pre-decremented) so that we
* actually do no retries when committing/deleting a mddb rec.
* The underlying disk driver does several retries to check
* whether the disk is really dead or not, so there is no
* reason for us to retry on top of the driver's retries.
*/
if (--retry == 0) {
setno = mddb_getsetnum(recid);
if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
panic(
"md: Panic due to lack of DiskSuite state\n"
" database replicas. Fewer than 50%% of "
"the total were available,\n so panic to "
"ensure data integrity.");
} else {
panic("md: state database problem");
}
/*NOTREACHED*/
}
}
}
void
mddb_commitrecs_wrapper(mddb_recid_t *recids)
{
int sent_log = 0;
uint_t retry = md_retry_cnt;
set_t setno;
while (mddb_commitrecs(recids)) {
if (! sent_log) {
cmn_err(CE_WARN,
"md: state database commit failed");
sent_log = 1;
}
delay(md_hz);
/*
* Setting the retry count to one (pre-decremented) so that we
* actually do no retries when committing/deleting a mddb rec.
* The underlying disk driver does several retries to check
* whether the disk is really dead or not, so there is no
* reason for us to retry on top of the driver's retries.
*/
if (--retry == 0) {
/*
* since all the records are part of the same set
* use the first one to get setno
*/
setno = mddb_getsetnum(*recids);
if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
panic(
"md: Panic due to lack of DiskSuite state\n"
" database replicas. Fewer than 50%% of "
"the total were available,\n so panic to "
"ensure data integrity.");
} else {
panic("md: state database problem");
}
/*NOTREACHED*/
}
}
}
void
mddb_deleterec_wrapper(mddb_recid_t recid)
{
int sent_log = 0;
uint_t retry = md_retry_cnt;
set_t setno;
while (mddb_deleterec(recid)) {
if (! sent_log) {
cmn_err(CE_WARN,
"md: state database delete failed");
sent_log = 1;
}
delay(md_hz);
/*
* Setting the retry count to one (pre-decremented) so that we
* actually do no retries when committing/deleting a mddb rec.
* The underlying disk driver does several retries to check
* whether the disk is really dead or not, so there is no
* reason for us to retry on top of the driver's retries.
*/
if (--retry == 0) {
setno = mddb_getsetnum(recid);
if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
panic(
"md: Panic due to lack of DiskSuite state\n"
" database replicas. Fewer than 50%% of "
"the total were available,\n so panic to "
"ensure data integrity.");
} else {
panic("md: state database problem");
}
/*NOTREACHED*/
}
}
}
/*
* md_holdset_enter is called in order to hold the set in its
* current state (loaded, unloaded, snarfed, unsnarfed, etc)
* until md_holdset_exit is called. This is used by the mirror
* code to mark the set as HOLD so that the set won't be
* unloaded while hotspares are being allocated in check_4_hotspares.
* The original fix to the mirror code to hold the set was to call
* md_haltsnarf_enter, but this will block all ioctls and ioctls
* must work for a MN diskset while hotspares are allocated.
*/
void
md_holdset_enter(set_t setno)
{
mutex_enter(&md_mx);
while (md_set[setno].s_status & MD_SET_HOLD)
cv_wait(&md_cv, &md_mx);
md_set[setno].s_status |= MD_SET_HOLD;
mutex_exit(&md_mx);
}
void
md_holdset_exit(set_t setno)
{
mutex_enter(&md_mx);
md_set[setno].s_status &= ~MD_SET_HOLD;
cv_broadcast(&md_cv);
mutex_exit(&md_mx);
}
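/*
* Hedged usage sketch: a daemon that must keep the set state stable (for
* example while allocating hotspares, as described above) brackets its
* work with the hold:
*
*	md_holdset_enter(setno);
*	... allocate hotspares; the set cannot be unloaded here ...
*	md_holdset_exit(setno);
*
* Unlike md_haltsnarf_enter(), this does not block ioctls, which must keep
* working in a MN diskset while the hold is in effect.
*/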
/*
* Returns a 0 if this thread marked the set as HOLD (success),
* returns a -1 if set was already marked HOLD (failure).
* Used by the release_set code to see if set is marked HOLD.
* HOLD is set by a daemon when hotspares are being allocated
* to mirror units.
*/
int
md_holdset_testandenter(set_t setno)
{
mutex_enter(&md_mx);
if (md_set[setno].s_status & MD_SET_HOLD) {
mutex_exit(&md_mx);
return (-1);
}
md_set[setno].s_status |= MD_SET_HOLD;
mutex_exit(&md_mx);
return (0);
}
void
md_haltsnarf_enter(set_t setno)
{
mutex_enter(&md_mx);
while (md_set[setno].s_status & MD_SET_SNARFING)
cv_wait(&md_cv, &md_mx);
md_set[setno].s_status |= MD_SET_SNARFING;
mutex_exit(&md_mx);
}
void
md_haltsnarf_exit(set_t setno)
{
mutex_enter(&md_mx);
md_set[setno].s_status &= ~MD_SET_SNARFING;
cv_broadcast(&md_cv);
mutex_exit(&md_mx);
}
void
md_haltsnarf_wait(set_t setno)
{
mutex_enter(&md_mx);
while (md_set[setno].s_status & MD_SET_SNARFING)
cv_wait(&md_cv, &md_mx);
mutex_exit(&md_mx);
}
/*
* ASSUMED that the md_unit_array_rw WRITER lock is held.
*/
int
md_halt_set(set_t setno, enum md_haltcmd cmd)
{
int i, err;
if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
return (0);
}
if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
for (--i; i > 0; --i) {
if (md_ops[i] == NULL)
continue;
(void) (*(md_ops[i]->md_halt))
(MD_HALT_OPEN, setno);
}
return (EBUSY);
}
}
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
(void) (*(md_ops[i]->md_halt))
(MD_HALT_OPEN, setno);
}
return (EBUSY);
}
}
}
if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt failed for %s, error %d",
md_ops[i]->md_driver.md_drivername, err);
}
/*
* Unload the devid namespace if it is loaded
*/
md_unload_namespace(setno, NM_DEVID);
md_unload_namespace(setno, 0L);
md_clr_setstatus(setno, MD_SET_SNARFED);
}
return (0);
}
int
md_halt(int global_locks_owned_mask)
{
set_t i, j;
int err;
int init_queues;
md_requestq_entry_t *rqp;
md_ops_t **pops, *ops, *lops;
ddi_modhandle_t mod;
char *name;
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
/*
* Grab all of the global locks that are not already owned to
* ensure that there isn't another thread trying to access a
* global resource while the halt is in progress
*/
if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
return (EINTR);
for (i = 0; i < md_nsets; i++)
md_haltsnarf_enter(i);
/*
* Kill the daemon threads.
*/
init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
md_clr_status(MD_GBL_DAEMONS_LIVE);
md_set_status(MD_GBL_DAEMONS_DIE);
rqp = &md_daemon_queues[0];
i = 0;
while (!NULL_REQUESTQ_ENTRY(rqp)) {
cv_broadcast(&rqp->dispq_headp->a_cv);
rqp = &md_daemon_queues[++i];
}
mutex_enter(&md_mx);
while (md_num_daemons != 0) {
mutex_exit(&md_mx);
delay(md_hz);
mutex_enter(&md_mx);
}
mutex_exit(&md_mx);
md_clr_status(MD_GBL_DAEMONS_DIE);
for (i = 0; i < md_nsets; i++)
/*
* Only call into md_halt_set if s_un / s_ui are both set.
* If they are NULL this set hasn't been accessed, so it's
* pointless performing the call.
*/
if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
if (md_halt_set(i, MD_HALT_CHECK)) {
if (md_start_daemons(init_queues))
cmn_err(CE_WARN,
"md: restart of daemon threads "
"failed");
for (j = 0; j < md_nsets; j++)
md_haltsnarf_exit(j);
return (md_global_lock_exit(
global_locks_owned_mask, EBUSY,
MD_ARRAY_WRITER, NULL));
}
}
/*
* if we get here we are going to do it
*/
for (i = 0; i < md_nsets; i++) {
/*
* Only call into md_halt_set if s_un / s_ui are both set.
* If they are NULL this set hasn't been accessed, so it's
* pointless performing the call.
*/
if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
err = md_halt_set(i, MD_HALT_DOIT);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt failed set %u, error %d",
(unsigned)i, err);
}
}
/*
* Issue a halt unload to each module to indicate that it
* is about to be unloaded. Each module is called once; the set
* argument has no meaning at this point in time.
*/
for (i = 0; i < MD_NOPS; i++) {
if (md_ops[i] == NULL)
continue;
err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt failed for %s, error %d",
md_ops[i]->md_driver.md_drivername, err);
}
/* ddi_modclose the submodules */
for (i = 0; i < MD_NOPS; i++) {
/* skip if not open */
if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
continue;
/* find and unlink from md_opslist */
ops = md_ops[i];
mod = md_mods[i];
pops = &md_opslist;
for (lops = *pops; lops;
pops = &lops->md_next, lops = *pops) {
if (lops == ops) {
*pops = ops->md_next;
ops->md_next = NULL;
break;
}
}
/* uninitialize */
name = ops->md_driver.md_drivername;
md_ops[i] = NULL;
md_mods[i] = NULL;
ops->md_selfindex = 0;
ops->md_driver.md_drivername[0] = '\0';
rw_destroy(&ops->md_link_rw.lock);
/* close */
err = ddi_modclose(mod);
if (err != 0)
cmn_err(CE_NOTE,
"md: halt close failed for %s, error %d",
name ? name : "UNKNOWN", err);
}
/* Unload the database */
mddb_unload();
md_set_status(MD_GBL_HALTED); /* we are ready to be unloaded */
for (i = 0; i < md_nsets; i++)
md_haltsnarf_exit(i);
return (md_global_lock_exit(global_locks_owned_mask, 0,
MD_ARRAY_WRITER, NULL));
}
/*
* md_layered_open() is an internal routine only for SVM modules, so the
* input device is an md_dev64_t; all SVM modules work internally with that
* device type. The ddi routines, on the other hand, work with dev_t, so if
* we call any ddi routines from here we first have to convert the device
* into a dev_t.
*/
int
md_layered_open(
minor_t mnum,
md_dev64_t *dev,
int md_oflags
)
{
int flag = (FREAD | FWRITE);
cred_t *cred_p = kcred;
major_t major;
int err;
dev_t ddi_dev = md_dev64_to_dev(*dev);
if (ddi_dev == NODEV)
return (ENODEV);
major = getmajor(ddi_dev);
/* metadevice */
if (major == md_major) {
mdi_unit_t *ui;
/* open underlying driver */
mnum = getminor(ddi_dev);
ui = MDI_UNIT(mnum);
if (md_ops[ui->ui_opsindex]->md_open != NULL) {
int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
flag, OTYP_LYR, cred_p, md_oflags);
/*
* As open() may change the device,
* send this info back to the caller.
*/
*dev = md_expldev(ddi_dev);
return (ret);
}
/* or do it ourselves */
(void) md_unit_openclose_enter(ui);
err = md_unit_incopen(mnum, flag, OTYP_LYR);
md_unit_openclose_exit(ui);
/* convert our ddi_dev back to the dev we were given */
*dev = md_expldev(ddi_dev);
return (err);
}
/*
* Open regular device, since open() may change dev_t give new dev_t
* back to the caller.
*/
err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
*dev = md_expldev(ddi_dev);
return (err);
}
/*
* md_layered_close() is an internal routine only for SVM modules, so the
* input device is an md_dev64_t; all SVM modules work internally with that
* device type. The ddi routines, on the other hand, work with dev_t, so if
* we call any ddi routines from here we first have to convert the device
* into a dev_t.
*/
void
md_layered_close(
md_dev64_t dev,
int md_cflags
)
{
int flag = (FREAD | FWRITE);
cred_t *cred_p = kcred;
dev_t ddi_dev = md_dev64_to_dev(dev);
major_t major = getmajor(ddi_dev);
minor_t mnum = getminor(ddi_dev);
/* metadevice */
if (major == md_major) {
mdi_unit_t *ui = MDI_UNIT(mnum);
/* close underlying driver */
if (md_ops[ui->ui_opsindex]->md_close != NULL) {
(*md_ops[ui->ui_opsindex]->md_close)
(ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
return;
}
/* or do it ourselves */
(void) md_unit_openclose_enter(ui);
(void) md_unit_decopen(mnum, OTYP_LYR);
md_unit_openclose_exit(ui);
return;
}
/* close regular device */
(void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
}
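/*
* Hedged sketch of the layered open/close pairing (caller context and the
* MD_OFLG_NULL flag usage are illustrative): because md_layered_open() may
* change the device, the possibly-updated md_dev64_t must be used both for
* I/O and for the matching close:
*
*	md_dev64_t tmpdev = un->un_dev;
*	if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL) == 0) {
*		... issue I/O against tmpdev ...
*		md_layered_close(tmpdev, MD_OFLG_NULL);
*	}
*/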
/*
* saves a little code in mdstrategy
*/
int
errdone(mdi_unit_t *ui, struct buf *bp, int err)
{
if ((bp->b_error = err) != 0)
bp->b_flags |= B_ERROR;
else
bp->b_resid = bp->b_bcount;
md_unit_readerexit(ui);
md_biodone(bp);
return (1);
}
static int md_write_label = 0;
int
md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
{
diskaddr_t endblk;
set_t setno = MD_UN2SET(un);
if ((md_get_setstatus(setno) & MD_SET_STALE) &&
(! (bp->b_flags & B_READ)))
return (errdone(ui, bp, EROFS));
/*
* Check early for an unreasonable block number.
*
* b_blkno is defined as a daddr_t, which is typedef'd to a long.
* A problem occurs if b_blkno has bit 31 set and un_total_blocks
* doesn't: b_blkno is then compared as a negative number, which is
* always less than a positive one.
*/
if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
return (errdone(ui, bp, EINVAL));
if (bp->b_lblkno == un->c.un_total_blocks)
return (errdone(ui, bp, 0));
/*
* make sure we don't clobber any labels
*/
if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
(un->c.un_flag & MD_LABELED) && (! md_write_label)) {
cmn_err(CE_NOTE, "md: %s: write to label",
md_shortname(getminor(bp->b_edev)));
return (errdone(ui, bp, EINVAL));
}
bp->b_resid = 0;
endblk = (diskaddr_t)(bp->b_lblkno +
howmany(bp->b_bcount, DEV_BSIZE) - 1);
if (endblk > (un->c.un_total_blocks - 1)) {
bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
endblk = un->c.un_total_blocks - 1;
bp->b_bcount -= bp->b_resid;
}
return (0);
}
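/*
* Worked example of the trimming above (numbers hypothetical): with
* un_total_blocks = 100, b_lblkno = 98 and b_bcount = 3 * DEV_BSIZE,
* endblk computes to 100 and is trimmed back to 99; b_resid becomes
* dbtob(1) (one block's worth of bytes) and b_bcount is reduced so the
* transfer stops at the last block of the metadevice.
*/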
/*
* init_requestq: initializes a request queue and creates its threads.
* return value	= 0 : invalid num_threads
*		= n : n is the number of threads created.
*/
int
init_requestq(
md_requestq_entry_t *rq, /* request queue info */
void (*threadfn)(), /* function to start the thread */
caddr_t threadfn_args, /* args to the function */
int pri, /* thread priority */
int init_queue) /* flag to init queues */
{
struct mdq_anchor *rqhead;
int i;
int num_threads;
num_threads = *(rq->num_threadsp);
rqhead = rq->dispq_headp;
if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
return (0);
if (init_queue) {
rqhead->dq.maxq_len = 0;
rqhead->dq.treqs = 0;
rqhead->dq.dq_next = &rqhead->dq;
rqhead->dq.dq_prev = &rqhead->dq;
cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
}
for (i = 0; i < num_threads; i++) {
(void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
TS_RUN, pri);
}
return (i);
}
static void
start_daemon(struct mdq_anchor *q)
{
md_daemon(0, q);
ASSERT(0);
}
/*
* Creates all the md daemons.
* Global:
* md_num_daemons is set to number of daemons.
* MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
*
* Return value: 0 success
* 1 failure
*/
int
md_start_daemons(int init_queue)
{
md_requestq_entry_t *rqp;
int cnt;
int i;
int retval = 0;
if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
return (retval);
}
md_clr_status(MD_GBL_DAEMONS_DIE);
rqp = &md_daemon_queues[0];
i = 0;
while (!NULL_REQUESTQ_ENTRY(rqp)) {
cnt = init_requestq(rqp, start_daemon,
(caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
if (cnt && cnt != *rqp->num_threadsp) {
retval = 1;
break;
}
/*
* initialize variables
*/
md_num_daemons += cnt;
rqp = &md_daemon_queues[++i];
}
md_set_status(MD_GBL_DAEMONS_LIVE);
return (retval);
}
int
md_loadsubmod(set_t setno, char *name, int drvrid)
{
ddi_modhandle_t mod;
md_ops_t **pops, *ops;
int i, err;
/*
* See if the submodule is already ddi_modopen'ed. If not, i is the
* index of the next empty slot.
*/
for (i = 0; md_ops[i] != NULL; i++) {
if (strncmp(name, md_ops[i]->md_driver.md_drivername,
MD_DRIVERNAMELEN) == 0)
return (i);
if (i == (MD_NOPS - 1))
return (-1);
}
if (drvrid < 0) {
/* Do not try to add any records to the DB when stale. */
if (md_get_setstatus(setno) & MD_SET_STALE)
return (-1);
drvrid = md_setshared_name(setno, name, 0L);
}
if (drvrid < 0)
return (-1);
/* open and import the md_ops of the submodules */
mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
if (mod == NULL) {
cmn_err(CE_WARN, "md_loadsubmod: "
"unable to ddi_modopen %s, error %d\n", name, err);
return (-1);
}
pops = ddi_modsym(mod, "md_interface_ops", &err);
if (pops == NULL) {
cmn_err(CE_WARN, "md_loadsubmod: "
"unable to import md_interface_ops from %s, error %d\n",
name, err);
(void) ddi_modclose(mod);
return (-1);
}
/* ddi_modsym returns pointer to md_interface_ops in submod */
ops = *pops;
/* initialize */
ops->md_selfindex = i;
rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
(void) strncpy(ops->md_driver.md_drivername, name,
MD_DRIVERNAMELEN);
/* plumb */
md_ops[i] = ops;
md_mods[i] = mod;
ops->md_next = md_opslist;
md_opslist = ops;
/* return index */
return (i);
}
int
md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
{
int i;
int modindex;
char *name = driver->md_drivername;
set_t setno = driver->md_setno;
int drvid;
int local_dont_load;
if (setno >= md_nsets)
return (-1);
for (i = 0; name[i] != 0; i++)
if (i == (MD_DRIVERNAMELEN -1))
return (-1);
/*
* If set is STALE, set local_dont_load to 1 since no records
* should be added to DB when stale.
*/
if (md_get_setstatus(setno) & MD_SET_STALE) {
local_dont_load = 1;
} else {
local_dont_load = dont_load;
}
/*
* Single thread ioctl module binding with respect to
* similar code executed in md_loadsubmod that is called
* from md_snarf_db_set (which is where that path does
* its md_haltsnarf_enter call).
*/
md_haltsnarf_enter(setno);
/* See if the submodule is already ddi_modopened. */
for (i = 0; md_ops[i] != NULL; i++) {
if (strncmp(name, md_ops[i]->md_driver.md_drivername,
MD_DRIVERNAMELEN) == 0) {
if (! local_dont_load &&
(md_getshared_key(setno, name) == MD_KEYBAD)) {
if (md_setshared_name(setno, name, 0L)
== MD_KEYBAD) {
if (!db_notrequired)
goto err;
}
}
md_haltsnarf_exit(setno);
return (i);
}
if (i == (MD_NOPS -1))
break;
}
if (local_dont_load)
goto err;
drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
/* ddi_modopen the submodule */
modindex = md_loadsubmod(setno, name, drvid);
if (modindex < 0)
goto err;
if (md_ops[modindex]->md_snarf != NULL)
(*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
md_haltsnarf_exit(setno);
return (modindex);
err: md_haltsnarf_exit(setno);
return (-1);
}
void
md_call_strategy(buf_t *bp, int flags, void *private)
{
mdi_unit_t *ui;
if (mdv_strategy_tstpnt)
if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
return;
if (getmajor(bp->b_edev) != md_major) {
(void) bdev_strategy(bp);
return;
}
flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
ui = MDI_UNIT(getminor(bp->b_edev));
ASSERT(ui != NULL);
(*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
}
/*
* md_call_ioctl:
* -------------
* Issue the specified ioctl to the device associated with the given md_dev64_t
*
* Arguments:
* dev - underlying device [md_dev64_t]
* cmd - ioctl to perform
* data - arguments / result location
* mode - read/write/layered ioctl
* lockp - lock reference
*
* Returns:
* 0 success
* !=0 Failure (error code)
*/
int
md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
{
dev_t device = md_dev64_to_dev(dev);
int rval;
mdi_unit_t *ui;
/*
* See if device is a metadevice. If not call cdev_ioctl(), otherwise
* call the ioctl entry-point in the metadevice.
*/
if (md_getmajor(dev) != md_major) {
int rv;
rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
ddi_get_cred(), &rv);
} else {
ui = MDI_UNIT(md_getminor(dev));
ASSERT(ui != NULL);
rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
mode, lockp);
}
return (rval);
}
void
md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
{
md_link_t *next;
md_link_t **pprev;
rw_enter(rw, RW_WRITER);
next = *head;
pprev = head;
while (next) {
if ((next->ln_setno == setno) && (next->ln_id == id)) {
*pprev = next->ln_next;
rw_exit(rw);
return;
}
pprev = &next->ln_next;
next = next->ln_next;
}
rw_exit(rw);
}
int
md_dev_exists(md_dev64_t dev)
{
if (dev == NODEV64)
return (0);
if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
return (1);
if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
(MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
return (0);
if (MDI_UNIT(md_getminor(dev)) != NULL)
return (1);
return (0);
}
md_parent_t
md_get_parent(md_dev64_t dev)
{
md_unit_t *un;
mdi_unit_t *ui;
md_parent_t parent;
if (md_getmajor(dev) != md_major)
return (MD_NO_PARENT);
ui = MDI_UNIT(md_getminor(dev));
un = (md_unit_t *)md_unit_readerlock(ui);
parent = un->c.un_parent;
md_unit_readerexit(ui);
return (parent);
}
void
md_set_parent(md_dev64_t dev, md_parent_t parent)
{
md_unit_t *un;
mdi_unit_t *ui;
if (md_getmajor(dev) != md_major)
return;
ui = MDI_UNIT(md_getminor(dev));
un = (md_unit_t *)md_unit_readerlock(ui);
un->c.un_parent = parent;
md_unit_readerexit(ui);
}
void
md_reset_parent(md_dev64_t dev)
{
md_unit_t *un;
mdi_unit_t *ui;
if (md_getmajor(dev) != md_major)
return;
ui = MDI_UNIT(md_getminor(dev));
un = (md_unit_t *)md_unit_readerlock(ui);
un->c.un_parent = MD_NO_PARENT;
md_unit_readerexit(ui);
}
static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
int
md_hot_spare_ifc(
hs_cmds_t cmd,
mddb_recid_t id,
u_longlong_t size,
int labeled,
mddb_recid_t *hs_id,
mdkey_t *key,
md_dev64_t *dev,
diskaddr_t *sblock)
{
int err;
/*
* RW lock on hot_spare_interface. We don't want it to change from
* underneath us. If hot_spare_interface is NULL we're going to
* need to set it. So we need to upgrade to a WRITER lock. If that
* doesn't work, we drop the lock and reenter as WRITER. This leaves
* a small hole during which hot_spare_interface could be modified
* so we check it for NULL again. What a pain. Then if still null
* load from md_get_named_service.
*/
rw_enter(&hsp_rwlp.lock, RW_READER);
if (hot_spare_interface == NULL) {
if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
rw_exit(&hsp_rwlp.lock);
rw_enter(&hsp_rwlp.lock, RW_WRITER);
if (hot_spare_interface != NULL) {
err = ((*hot_spare_interface)
(cmd, id, size, labeled, hs_id, key, dev,
sblock));
rw_exit(&hsp_rwlp.lock);
return (err);
}
}
hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
"hot spare interface", 0);
rw_downgrade(&hsp_rwlp.lock);
}
if (hot_spare_interface == NULL) {
cmn_err(CE_WARN, "md: no hotspare interface");
rw_exit(&hsp_rwlp.lock);
return (0);
}
err = ((*hot_spare_interface)
(cmd, id, size, labeled, hs_id, key, dev, sblock));
rw_exit(&hsp_rwlp.lock);
return (err);
}
void
md_clear_hot_spare_interface()
{
rw_enter(&hsp_rwlp.lock, RW_WRITER);
hot_spare_interface = NULL;
rw_exit(&hsp_rwlp.lock);
}
static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
int
md_notify_interface(
md_event_cmds_t cmd,
md_tags_t tag,
set_t set,
md_dev64_t dev,
md_event_type_t event
)
{
int err;
if (md_event_queue == NULL)
return (0);
rw_enter(&ni_rwlp.lock, RW_READER);
if (notify_interface == NULL) {
if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
rw_exit(&ni_rwlp.lock);
rw_enter(&ni_rwlp.lock, RW_WRITER);
if (notify_interface != NULL) {
err = ((*notify_interface)
(cmd, tag, set, dev, event));
rw_exit(&ni_rwlp.lock);
return (err);
}
}
notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
"notify interface", 0);
rw_downgrade(&ni_rwlp.lock);
}
if (notify_interface == NULL) {
cmn_err(CE_WARN, "md: no notify interface");
rw_exit(&ni_rwlp.lock);
return (0);
}
err = ((*notify_interface)(cmd, tag, set, dev, event));
rw_exit(&ni_rwlp.lock);
return (err);
}
char *
obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
{
char *setname;
char name[MD_MAX_CTDLEN];
minor_t mnum = md_getminor(dev);
major_t maj = md_getmajor(dev);
int rtn = 0;
/*
* Verify that the passed dev_t refers to a valid metadevice.
* If it doesn't we can make no assumptions as to what the device
* name is. Return NULL in these cases.
*/
if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
(MD_MIN2SET(mnum) >= md_nsets)) {
return (NULL);
}
setname = NULL;
name[0] = '\0';
switch (tag) {
case SVM_TAG_HSP:
if (setno == 0) {
rtn = snprintf(name, sizeof (name), "hsp%u",
(unsigned)MD_MIN2UNIT(mnum));
} else {
setname = mddb_getsetname(setno);
if (setname != NULL) {
rtn = snprintf(name, sizeof (name), "%s/hsp%u",
setname, (unsigned)MD_MIN2UNIT(mnum));
}
}
break;
case SVM_TAG_DRIVE:
(void) sprintf(name, "drive");
break;
case SVM_TAG_HOST:
(void) sprintf(name, "host");
break;
case SVM_TAG_SET:
rtn = snprintf(name, sizeof (name), "%s",
mddb_getsetname(setno));
if ((name[0] == '\0') || (rtn >= sizeof (name))) {
(void) sprintf(name, "diskset");
rtn = 0;
}
break;
default:
rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
break;
}
/* Check if we got any rubbish from any of the snprintf calls */
if ((name[0] == '\0') || (rtn >= sizeof (name))) {
return (NULL);
}
return (md_strdup(name));
}
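/*
 * Illustrative sketch, not part of the original driver: the returned
 * string is md_strdup()ed, so a caller owns it and must release it with
 * freestr(). SVM_TAG_METADEVICE is assumed from sys/sysevent/svm.h.
 *
 *	char *nm = obj2devname(SVM_TAG_METADEVICE, setno, dev);
 *	if (nm != NULL) {
 *		cmn_err(CE_NOTE, "md: device %s", nm);
 *		freestr(nm);
 *	}
 */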
/* Sysevent subclass and mdnotify event type pairs */
struct node {
char *se_ev;
md_event_type_t md_ev;
};
/*
* Table must be sorted in case-sensitive ascending order of
* the sysevent subclass values
*/
static struct node ev_table[] = {
{ ESC_SVM_ADD, EQ_ADD },
{ ESC_SVM_ATTACH, EQ_ATTACH },
{ ESC_SVM_ATTACHING, EQ_ATTACHING },
{ ESC_SVM_CHANGE, EQ_CHANGE },
{ ESC_SVM_CREATE, EQ_CREATE },
{ ESC_SVM_DELETE, EQ_DELETE },
{ ESC_SVM_DETACH, EQ_DETACH },
{ ESC_SVM_DETACHING, EQ_DETACHING },
{ ESC_SVM_DRIVE_ADD, EQ_DRIVE_ADD },
{ ESC_SVM_DRIVE_DELETE, EQ_DRIVE_DELETE },
{ ESC_SVM_ENABLE, EQ_ENABLE },
{ ESC_SVM_ERRED, EQ_ERRED },
{ ESC_SVM_EXCHANGE, EQ_EXCHANGE },
{ ESC_SVM_GROW, EQ_GROW },
{ ESC_SVM_HS_CHANGED, EQ_HS_CHANGED },
{ ESC_SVM_HS_FREED, EQ_HS_FREED },
{ ESC_SVM_HOST_ADD, EQ_HOST_ADD },
{ ESC_SVM_HOST_DELETE, EQ_HOST_DELETE },
{ ESC_SVM_HOTSPARED, EQ_HOTSPARED },
{ ESC_SVM_INIT_FAILED, EQ_INIT_FAILED },
{ ESC_SVM_INIT_FATAL, EQ_INIT_FATAL },
{ ESC_SVM_INIT_START, EQ_INIT_START },
{ ESC_SVM_INIT_SUCCESS, EQ_INIT_SUCCESS },
{ ESC_SVM_IOERR, EQ_IOERR },
{ ESC_SVM_LASTERRED, EQ_LASTERRED },
{ ESC_SVM_MEDIATOR_ADD, EQ_MEDIATOR_ADD },
{ ESC_SVM_MEDIATOR_DELETE, EQ_MEDIATOR_DELETE },
{ ESC_SVM_OFFLINE, EQ_OFFLINE },
{ ESC_SVM_OK, EQ_OK },
{ ESC_SVM_ONLINE, EQ_ONLINE },
{ ESC_SVM_OPEN_FAIL, EQ_OPEN_FAIL },
{ ESC_SVM_REGEN_DONE, EQ_REGEN_DONE },
{ ESC_SVM_REGEN_FAILED, EQ_REGEN_FAILED },
{ ESC_SVM_REGEN_START, EQ_REGEN_START },
{ ESC_SVM_RELEASE, EQ_RELEASE },
{ ESC_SVM_REMOVE, EQ_REMOVE },
{ ESC_SVM_RENAME_DST, EQ_RENAME_DST },
{ ESC_SVM_RENAME_SRC, EQ_RENAME_SRC },
{ ESC_SVM_REPLACE, EQ_REPLACE },
{ ESC_SVM_RESYNC_DONE, EQ_RESYNC_DONE },
{ ESC_SVM_RESYNC_FAILED, EQ_RESYNC_FAILED },
{ ESC_SVM_RESYNC_START, EQ_RESYNC_START },
{ ESC_SVM_RESYNC_SUCCESS, EQ_RESYNC_SUCCESS },
{ ESC_SVM_TAKEOVER, EQ_TAKEOVER }
};
static md_tags_t md_tags[] = {
TAG_UNK,
TAG_METADEVICE,
TAG_UNK,
TAG_UNK,
TAG_UNK,
TAG_UNK,
TAG_REPLICA,
TAG_HSP,
TAG_HS,
TAG_SET,
TAG_DRIVE,
TAG_HOST,
TAG_MEDIATOR
};
md_event_type_t
ev_get(char *subclass)
{
int high, mid, low, p;
low = 0;
high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
while (low <= high) {
mid = (high + low) / 2;
p = strcmp(subclass, ev_table[mid].se_ev);
if (p == 0) {
return (ev_table[mid].md_ev);
} else if (p < 0) {
high = mid - 1;
} else {
low = mid + 1;
}
}
return (EQ_EMPTY);
}
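/*
 * Illustrative sketch, not part of the original driver: ev_get() is a
 * binary search over ev_table, which is why the table above must stay
 * sorted by subclass string. For example:
 *
 *	ev_get(ESC_SVM_CREATE);		(returns EQ_CREATE)
 *	ev_get("no.such.subclass");	(returns EQ_EMPTY)
 */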
/*
* Log mdnotify event
*/
void
do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
{
md_event_type_t ev_type;
md_tags_t md_tag;
/* Translate sysevent into mdnotify event */
ev_type = ev_get(se_subclass);
if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
md_tag = TAG_UNK;
} else {
md_tag = md_tags[tag];
}
NOTIFY_MD(md_tag, setno, devid, ev_type);
}
/*
* Log SVM sys events
*/
void
svm_gen_sysevent(
char *se_class,
char *se_subclass,
uint32_t tag,
set_t setno,
md_dev64_t devid
)
{
nvlist_t *attr_list;
sysevent_id_t eid;
int err = DDI_SUCCESS;
char *devname;
extern dev_info_t *md_devinfo;
/* Raise the mdnotify event before anything else */
do_mdnotify(se_subclass, tag, setno, devid);
if (md_devinfo == NULL) {
return;
}
err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
if (err == DDI_SUCCESS) {
/* Add the version number */
err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
(uint32_t)SVM_VERSION);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the tag attribute */
err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the set number attribute */
err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the device id attribute */
err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
if (err != DDI_SUCCESS) {
goto fail;
}
/* Add the device name attribute */
devname = obj2devname(tag, setno, devid);
if (devname != NULL) {
err = nvlist_add_string(attr_list, SVM_DEV_NAME,
devname);
freestr(devname);
} else {
err = nvlist_add_string(attr_list, SVM_DEV_NAME,
"unspecified");
}
if (err != DDI_SUCCESS) {
goto fail;
}
/* Attempt to post event */
err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
se_subclass, attr_list, &eid, DDI_SLEEP);
nvlist_free(attr_list);
if (err != DDI_SUCCESS) {
cmn_err(CE_WARN, "Failed to log event for %s, %s,"
" err=%x", se_class, se_subclass, err);
}
}
return;
fail:
nvlist_free(attr_list);
cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
se_class, se_subclass, err);
}
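/*
 * Illustrative sketch, not part of the original driver: a typical call
 * raising a config sysevent for a metadevice in the local set, assuming
 * the EC_SVM_CONFIG class and ESC_SVM_CREATE subclass definitions from
 * sys/sysevent/eventdefs.h:
 *
 *	svm_gen_sysevent(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE,
 *	    MD_LOCAL_SET, md_makedevice(md_major, mnum));
 */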
void
md_clear_named_service()
{
rw_enter(&ni_rwlp.lock, RW_WRITER);
notify_interface = NULL;
rw_exit(&ni_rwlp.lock);
}
void
md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
{
mdi_unit_t *ui;
set_t setno = MD_MIN2SET(mnum);
ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
ui->ui_opsindex = ops->md_selfindex;
/* initialize all the incore condition variables */
mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
if (alloc_lock) {
ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
MUTEX_DEFAULT, NULL);
ui->ui_io_lock->io_list_front = NULL;
ui->ui_io_lock->io_list_back = NULL;
}
if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
MDI_VOIDUNIT(mnum) = (void *) ui;
rw_exit(&md_unit_array_rw.lock);
} else
MDI_VOIDUNIT(mnum) = (void *) ui;
rw_enter(&ops->md_link_rw.lock, RW_WRITER);
ui->ui_link.ln_next = ops->md_head;
ui->ui_link.ln_setno = setno;
ui->ui_link.ln_id = mnum;
ops->md_head = &ui->ui_link;
/* set up the unavailable field */
#if defined(_ILP32)
if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
ui->ui_tstate |= MD_64MD_ON_32KERNEL;
cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
"metadevices are not accessible on a 32 bit kernel",
mnum);
}
#endif
rw_exit(&ops->md_link_rw.lock);
}
void
md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
{
mdi_unit_t *ui;
/*
* ASSUMPTION: md_unit_array_rw WRITER lock is held.
*/
ui = MDI_UNIT(mnum);
if (ui == NULL)
return;
md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
&ops->md_head);
/* destroy the io lock if one is being used */
if (ui->ui_io_lock) {
mutex_destroy(&ui->ui_io_lock->io_mx);
cv_destroy(&ui->ui_io_lock->io_cv);
kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
}
/* teardown kstat */
md_kstat_destroy(mnum);
/* destroy all the incore condition variables */
mutex_destroy(&ui->ui_mx);
cv_destroy(&ui->ui_cv);
kmem_free(ui, sizeof (mdi_unit_t));
MDI_VOIDUNIT(mnum) = (void *) NULL;
}
void
md_rem_names(sv_dev_t *sv, int nsv)
{
int i, s;
int max_sides;
if (nsv == 0)
return;
/* All entries removed are in the same diskset */
if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
max_sides = MD_MNMAXSIDES;
else
max_sides = MD_MAXSIDES;
for (i = 0; i < nsv; i++)
for (s = 0; s < max_sides; s++)
(void) md_remdevname(sv[i].setno, s, sv[i].key);
}
/*
* Checking user args before we get into physio - returns 0 for ok, else errno
* We do a lot of checking against illegal arguments here because some of the
* real disk drivers don't like certain kinds of arguments. (e.g. xy doesn't
* like an odd-address user buffer.) Those drivers capture bad arguments in
* xxread and xxwrite. But since the meta-driver calls their strategy routines
* directly, two bad scenarios might happen:
* 1. the real strategy doesn't like it and panics.
* 2. the real strategy doesn't like it and sets B_ERROR.
*
* The second case is no better than the first one, since the meta-driver
* will treat it as a media error and offline the mirror metapartition.
* (Too bad there is no way to tell what error it is.)
*
*/
int
md_chk_uio(struct uio *uio)
{
int i;
struct iovec *iov;
/*
* Check for negative or not block-aligned offset
*/
if ((uio->uio_loffset < 0) ||
((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
return (EINVAL);
}
iov = uio->uio_iov;
i = uio->uio_iovcnt;
while (i--) {
if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
return (EINVAL);
/*
* Bug # 1212146
* The default is to not check alignment, but we can now check
* for a larger number of alignments if desired.
*/
if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
return (EINVAL);
iov++;
}
return (0);
}
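/*
 * Illustrative sketch, not part of the original driver: a request whose
 * offset or length is not a multiple of DEV_BSIZE (512) is rejected
 * here instead of reaching the underlying strategy routine:
 *
 *	uio->uio_loffset = 513;		(not block aligned)
 *	error = md_chk_uio(uio);	(returns EINVAL)
 */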
char *
md_shortname(
minor_t mnum
)
{
static char buf[MAXPATHLEN];
char *devname;
char *invalid = " (Invalid minor number %u) ";
char *metaname;
mdc_unit_t *un;
side_t side;
set_t setno = MD_MIN2SET(mnum);
unit_t unit = MD_MIN2UNIT(mnum);
if ((un = MD_UNIT(mnum)) == NULL) {
(void) snprintf(buf, sizeof (buf), invalid, mnum);
return (buf);
}
/*
* If unit is not a friendly name unit, derive the name from the
* minor number.
*/
if ((un->un_revision & MD_FN_META_DEV) == 0) {
/* This is a traditional metadevice */
if (setno == MD_LOCAL_SET) {
(void) snprintf(buf, sizeof (buf), "d%u",
(unsigned)unit);
} else {
(void) snprintf(buf, sizeof (buf), "%s/d%u",
mddb_getsetname(setno), (unsigned)unit);
}
return (buf);
}
/*
* It is a friendly name metadevice, so we need to get its name.
*/
side = mddb_getsidenum(setno);
devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
if (md_getdevname(setno, side, MD_KEYWILD,
md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
/*
* md_getdevname has given us either /dev/md/dsk/<metaname>
* or /dev/md/<setname>/dsk/<metaname> depending on whether
* or not we are in the local set. Thus, we'll pull the
* metaname from this string.
*/
if ((metaname = strrchr(devname, '/')) == NULL) {
(void) snprintf(buf, sizeof (buf), invalid, mnum);
goto out;
}
metaname++; /* move past slash */
if (setno == MD_LOCAL_SET) {
/* No set name. */
(void) snprintf(buf, sizeof (buf), "%s", metaname);
} else {
/* Include setname */
(void) snprintf(buf, sizeof (buf), "%s/%s",
mddb_getsetname(setno), metaname);
}
} else {
/* We couldn't find the name. */
(void) snprintf(buf, sizeof (buf), invalid, mnum);
}
out:
kmem_free(devname, MAXPATHLEN);
return (buf);
}
char *
md_devname(
set_t setno,
md_dev64_t dev,
char *buf,
size_t size
)
{
static char mybuf[MD_MAX_CTDLEN];
int err;
if (buf == NULL) {
buf = mybuf;
size = sizeof (mybuf);
} else {
ASSERT(size >= MD_MAX_CTDLEN);
}
err = md_getdevname_common(setno, mddb_getsidenum(setno),
0, dev, buf, size, MD_NOWAIT_LOCK);
if (err) {
if (err == ENOENT) {
(void) sprintf(buf, "(Unavailable)");
} else {
(void) sprintf(buf, "(%u.%u)",
md_getmajor(dev), md_getminor(dev));
}
}
return (buf);
}
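/*
 * Illustrative sketch, not part of the original driver: callers either
 * hand in a buffer of at least MD_MAX_CTDLEN bytes or pass NULL to
 * borrow the shared static buffer:
 *
 *	char buf[MD_MAX_CTDLEN];
 *	(void) md_devname(setno, dev, buf, sizeof (buf));
 */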
void
md_minphys(buf_t *pb)
{
extern unsigned md_maxbcount;
if (pb->b_bcount > md_maxbcount)
pb->b_bcount = md_maxbcount;
}
void
md_bioinit(struct buf *bp)
{
ASSERT(bp);
bioinit(bp);
bp->b_back = bp;
bp->b_forw = bp;
bp->b_flags = B_BUSY; /* initialize flags */
}
void
md_bioreset(struct buf *bp)
{
ASSERT(bp);
bioreset(bp);
bp->b_back = bp;
bp->b_forw = bp;
bp->b_flags = B_BUSY; /* initialize flags */
}
/*
* md_bioclone is needed as long as the real bioclone only takes a daddr_t
* as block number.
* We simply call bioclone with all input parameters but blkno, and set the
* correct blkno afterwards.
* Caveat Emptor: bp_mem must not be NULL!
*/
buf_t *
md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
{
(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
bp_mem->b_lblkno = blkno;
return (bp_mem);
}
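/*
 * Illustrative sketch, not part of the original driver: cloning a parent
 * buf for a child I/O at a diskaddr_t block address; child_done is a
 * hypothetical iodone routine, and bp_mem (here &cb) must not be NULL:
 *
 *	buf_t cb;
 *	md_bioinit(&cb);
 *	(void) md_bioclone(pb, 0, pb->b_bcount, pb->b_edev, blkno,
 *	    child_done, &cb, KM_NOSLEEP);
 */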
/*
* kstat stuff
*/
void
md_kstat_init_ui(
minor_t mnum,
mdi_unit_t *ui
)
{
if ((ui != NULL) && (ui->ui_kstat == NULL)) {
set_t setno = MD_MIN2SET(mnum);
unit_t unit = MD_MIN2UNIT(mnum);
char module[KSTAT_STRLEN];
char *p = module;
if (setno != MD_LOCAL_SET) {
char buf[64];
char *s = buf;
char *e = module + sizeof (module) - 4;
(void) sprintf(buf, "%u", setno);
while ((p < e) && (*s != '\0'))
*p++ = *s++;
*p++ = '/';
}
*p++ = 'm';
*p++ = 'd';
*p = '\0';
if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
ui->ui_kstat->ks_lock = &ui->ui_mx;
kstat_install(ui->ui_kstat);
}
}
}
void
md_kstat_init(
minor_t mnum
)
{
md_kstat_init_ui(mnum, MDI_UNIT(mnum));
}
void
md_kstat_destroy_ui(
mdi_unit_t *ui
)
{
/*
* The kstat_delete() interface has its own locking mechanism and
* does not allow the kstat lock (ks_lock) to be held across the call.
* Note: ks_lock == ui_mx from the md_kstat_init_ui().
*/
if ((ui != NULL) && (ui->ui_kstat != NULL)) {
kstat_delete(ui->ui_kstat);
ui->ui_kstat = NULL;
}
}
void
md_kstat_destroy(
minor_t mnum
)
{
md_kstat_destroy_ui(MDI_UNIT(mnum));
}
/*
* In the following routines, ui_mx is held before checking the
* validity of ui_kstat. This is done to make sure that we don't trip over
* a NULL ui_kstat.
*/
void
md_kstat_waitq_enter(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_waitq_to_runq(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_waitq_exit(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_runq_enter(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_runq_exit(
mdi_unit_t *ui
)
{
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL)
kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
mutex_exit(&ui->ui_mx);
}
void
md_kstat_done(
mdi_unit_t *ui,
buf_t *bp,
int war
)
{
size_t n_done;
/* check for end of device */
if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
n_done = bp->b_bcount;
} else if (bp->b_bcount < bp->b_resid) {
n_done = 0;
} else {
n_done = bp->b_bcount - bp->b_resid;
}
/* do accounting */
mutex_enter(&ui->ui_mx);
if (ui->ui_kstat != NULL) {
if ((! war) && (bp->b_flags & B_READ)) {
KSTAT_IO_PTR(ui->ui_kstat)->reads++;
KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
} else {
KSTAT_IO_PTR(ui->ui_kstat)->writes++;
KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
}
kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
}
mutex_exit(&ui->ui_mx);
}
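/*
 * Illustrative sketch, not part of the original driver: the usual
 * accounting lifecycle of one I/O against a unit's kstat:
 *
 *	md_kstat_waitq_enter(ui);	(request queued)
 *	md_kstat_waitq_to_runq(ui);	(dispatched to the device)
 *	(... I/O completes ...)
 *	md_kstat_done(ui, bp, 0);	(bytes counted, run queue exited)
 */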
pid_t
md_getpid()
{
pid_t valuep;
if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
ASSERT(0);
return ((pid_t)0);
} else {
ASSERT(valuep);
return (valuep);
}
}
proc_t *
md_getproc()
{
proc_t *valuep;
if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
ASSERT(0);
return ((proc_t *)NULL);
} else {
ASSERT(valuep);
return (valuep);
}
}
extern kmutex_t pidlock;
/*
* This checks to see if a process/pid pair is still running. For the
* diskset lock, when both pid and proc are zero the lock is not
* currently held.
*/
int
md_checkpid(pid_t pid, proc_t *proc)
{
int retval = 1;
if (pid == 0 && proc == NULL)
return (0);
mutex_enter(&pidlock);
if (prfind(pid) != proc)
retval = 0;
mutex_exit(&pidlock);
return (retval);
}
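/*
 * Illustrative sketch, not part of the original driver: recording a lock
 * holder with md_getpid()/md_getproc() and later validating it; the
 * lock structure and its fields are hypothetical:
 *
 *	lockp->l_pid = md_getpid();
 *	lockp->l_proc = md_getproc();
 *	(later)
 *	if (!md_checkpid(lockp->l_pid, lockp->l_proc))
 *		(holder has exited; the lock may be broken)
 */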
/*
* NAME: md_init_probereq
*
* DESCRIPTION: initializes a probe request. Parcels out the mnums such that
* they can be dispatched to multiple daemon threads.
*
* PARAMETERS: struct md_probedev_impl *p pointer to the ioctl input
*
* RETURN VALUE: Returns errno
*
*/
int
md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
{
int err = 0;
int modindx;
intptr_t (*probe_test)();
/*
* Initialize the semaphores and mutex
* for the request
*/
p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
probe_test = md_get_named_service(NODEV64, modindx,
p->probe.test_name, 0);
if (probe_test == NULL) {
err = EINVAL;
goto err_out;
}
err = md_create_probe_rqlist(p, hdrpp, probe_test);
err_out:
return (err);
}
/*
* NAME: md_probe_one
*
* DESCRIPTION: Generic routine for probing disks. This is called from the
* daemon.
*
* PARAMETERS: probe_req_t *reqp pointer to the probe request structure.
*
*/
void
md_probe_one(probe_req_t *reqp)
{
mdi_unit_t *ui;
md_probedev_impl_t *p;
int err = 0;
set_t setno;
p = (md_probedev_impl_t *)reqp->private_handle;
/*
* Validate the unit while holding the global ioctl lock, then
* obtain the unit_writerlock. Once the writerlock has been obtained
* we can release the global lock. As long as we hold one of these
* locks this will prevent a metaclear operation from being performed
* on the metadevice because metaclear takes the readerlock (via
* openclose lock).
* To avoid a potential deadlock with the probe_fcn() causing i/o to
* be issued to the writerlock'd metadevice we only grab the writerlock
* if the unit is not an SVM root device.
*/
while (md_ioctl_lock_enter() == EINTR)
;
setno = MD_MIN2SET(reqp->mnum);
ui = MDI_UNIT(reqp->mnum);
if (ui != NULL) {
int writer_grabbed;
dev_t svm_root;
if ((setno == MD_LOCAL_SET) && root_is_svm) {
svm_root = getrootdev();
if (getminor(svm_root) == reqp->mnum) {
writer_grabbed = 0;
} else {
writer_grabbed = 1;
(void) md_unit_writerlock_common(ui, 0);
}
} else {
writer_grabbed = 1;
(void) md_unit_writerlock_common(ui, 0);
}
(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
err = (*reqp->probe_fcn)(ui, reqp->mnum);
if (writer_grabbed) {
md_unit_writerexit(ui);
}
} else {
(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
}
/* update the info in the probe structure */
mutex_enter(PROBE_MX(p));
if (err != 0) {
cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
reqp->mnum);
(void) mdsyserror(&(p->probe.mde), err);
}
mutex_exit(PROBE_MX(p));
sema_v(PROBE_SEMA(p));
kmem_free(reqp, sizeof (probe_req_t));
}
char *
md_strdup(char *cp)
{
char *new_cp = NULL;
new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
return (strcpy(new_cp, cp));
}
void
freestr(char *cp)
{
kmem_free(cp, strlen(cp) + 1);
}
/*
* Validate the list and skip invalid devices. Then create
* a doubly linked circular list of devices to probe.
* The hdr points to the head and tail of this list.
*/
static int
md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
intptr_t (*probe_test)())
{
int i, err, nodevcnt;
probe_req_t *tp;
daemon_queue_t *hp;
minor_t mnum;
nodevcnt = 0;
hp = NULL;
for (i = 0; i < plist->probe.nmdevs; i++) {
mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
if (MDI_UNIT(mnum) == NULL) {
cmn_err(CE_WARN, "md: Cannot probe %s since it does "
"not exist", md_shortname(mnum));
nodevcnt++;
continue;
}
tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
tp->mnum = mnum;
tp->private_handle = (void *)plist;
tp->probe_fcn = probe_test;
if (hp == NULL) {
hp = (daemon_queue_t *)tp;
hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
} else {
tp->dq.dq_next = hp;
tp->dq.dq_prev = hp->dq_prev;
hp->dq_prev->dq_next = (daemon_queue_t *)tp;
hp->dq_prev = (daemon_queue_t *)tp;
}
}
*hdr = hp;
if (nodevcnt > 0)
plist->probe.nmdevs -= nodevcnt;
/*
* If there are no devices to be probed because they were
* incorrect, then return an error.
*/
err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
return (err);
}
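/*
 * Illustrative sketch, not part of the original driver: the initiator
 * builds the request list, hands the entries to daemon threads, and
 * then waits for one sema_v() per device from md_probe_one():
 *
 *	if ((err = md_init_probereq(p, &hdr)) == 0) {
 *		(dispatch each entry on hdr to a daemon thread)
 *		for (i = 0; i < p->probe.nmdevs; i++)
 *			sema_p(PROBE_SEMA(p));
 *	}
 */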
/*
* This routine increments the I/O count for set I/O operations. This
* value is used to determine if an I/O can be done. If a release is in
* progress this will return an error and cause the I/O to be errored.
*/
int
md_inc_iocount(set_t setno)
{
int rc = 0;
if (setno == 0)
return (0);
mutex_enter(&md_set_io[setno].md_io_mx);
if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
rc = EIO;
goto out;
}
ASSERT(md_set_io[setno].io_cnt >= 0);
md_set_io[setno].io_cnt++;
out: mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
void
md_inc_iocount_noblock(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_cnt++;
mutex_exit(&md_set_io[setno].md_io_mx);
}
void
md_dec_iocount(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_cnt--;
ASSERT(md_set_io[setno].io_cnt >= 0);
if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
(md_set_io[setno].io_cnt == 0))
cv_broadcast(&md_set_io[setno].md_io_cv);
mutex_exit(&md_set_io[setno].md_io_mx);
}
int
md_isblock_setio(set_t setno)
{
int rc = 0;
if (setno == 0)
return (0);
mutex_enter(&md_set_io[setno].md_io_mx);
if (md_set_io[setno].io_state & MD_SET_RELEASE)
rc = 1;
mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
int
md_block_setio(set_t setno)
{
int rc = 0;
if (setno == 0)
return (1);
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_state = MD_SET_RELEASE;
while (md_set_io[setno].io_cnt > 0) {
cv_wait(&md_set_io[setno].md_io_cv,
&md_set_io[setno].md_io_mx);
}
rc = 1;
ASSERT(md_set_io[setno].io_cnt == 0);
mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
void
md_clearblock_setio(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_state = MD_SET_ACTIVE;
mutex_exit(&md_set_io[setno].md_io_mx);
}
void
md_unblock_setio(set_t setno)
{
if (setno == 0)
return;
mutex_enter(&md_set_io[setno].md_io_mx);
#ifdef DEBUG
if (md_set_io[setno].io_cnt != 0) {
cmn_err(CE_NOTE, "set %d count was %ld at take",
setno, md_set_io[setno].io_cnt);
}
#endif /* DEBUG */
md_set_io[setno].io_state = MD_SET_ACTIVE;
md_set_io[setno].io_cnt = 0;
mutex_exit(&md_set_io[setno].md_io_mx);
}
/*
* Test and set version of the md_block_setio.
* Set the io_state to keep new I/O from being issued.
* If there is I/O currently in progress, then set io_state to active
* and return failure. Otherwise, return a 1 for success.
*
* Used in a MN diskset since the commd must be suspended before
* this node can attempt to withdraw from a diskset. But, with commd
* suspended, I/O may have been issued that can never finish until
* commd is resumed (allocation of hotspare, etc). So, if I/O is
* outstanding after diskset io_state is marked RELEASE, then set diskset
* io_state back to ACTIVE and return failure.
*/
int
md_tas_block_setio(set_t setno)
{
int rc;
if (setno == 0)
return (1);
mutex_enter(&md_set_io[setno].md_io_mx);
md_set_io[setno].io_state = MD_SET_RELEASE;
if (md_set_io[setno].io_cnt > 0) {
md_set_io[setno].io_state = MD_SET_ACTIVE;
rc = 0;
} else {
rc = 1;
}
mutex_exit(&md_set_io[setno].md_io_mx);
return (rc);
}
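/*
 * Illustrative sketch, not part of the original driver: a withdraw
 * sequence built on the gates above:
 *
 *	if (md_tas_block_setio(setno) == 0)
 *		return (EBUSY);		(I/O still outstanding)
 *	(... perform the release; new I/O now gets EIO ...)
 *	md_clearblock_setio(setno);	(reopen the gate)
 */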
void
md_biodone(struct buf *pb)
{
minor_t mnum;
set_t setno;
mdi_unit_t *ui;
mnum = getminor(pb->b_edev);
setno = MD_MIN2SET(mnum);
if (setno == 0) {
biodone(pb);
return;
}
#ifdef DEBUG
ui = MDI_UNIT(mnum);
if (!md_unit_isopen(ui))
cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
#endif /* DEBUG */
/*
* Handle the local diskset
*/
if (md_set_io[setno].io_cnt > 0)
md_dec_iocount(setno);
#ifdef DEBUG
/*
* This is being done after the lock is dropped so there
* are cases where it may be invalid. It is advisory.
*/
if (md_set_io[setno].io_state & MD_SET_RELEASE) {
/* Only display this error once for this metadevice */
if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
cmn_err(CE_NOTE,
"I/O to %s attempted during set RELEASE\n",
md_shortname(mnum));
ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
}
}
#endif /* DEBUG */
biodone(pb);
}
/*
* Driver special private devt handling routine
* INPUT: md_dev64_t
* OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
*/
dev_t
md_dev64_to_dev(md_dev64_t dev)
{
major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
minor_t minor = (minor_t)(dev & MAXMIN64);
return (makedevice(major, minor));
}
/*
* Driver private makedevice routine
* INPUT: major_t major, minor_t minor
* OUTPUT: md_dev64_t, the same layout on a 32 bit or a 64 bit kernel.
*/
md_dev64_t
md_makedevice(major_t major, minor_t minor)
{
return (((md_dev64_t)major << NBITSMINOR64) | minor);
}
/*
* Driver private devt md_getmajor routine
* INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device
* OUTPUT: the appropriate major number
*/
major_t
md_getmajor(md_dev64_t dev)
{
major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
if (major == 0) {
/* Here we were given a 32bit dev */
major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
}
return (major);
}
/*
* Driver private devt md_getminor routine
* INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device
* OUTPUT: the appropriate minor number
*/
minor_t
md_getminor(md_dev64_t dev)
{
minor_t minor;
major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
if (major == 0) {
/* Here we were given a 32bit dev */
minor = (minor_t)(dev & MAXMIN32);
} else {
minor = (minor_t)(dev & MAXMIN64);
}
return (minor);
}
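/*
 * Illustrative sketch, not part of the original driver: the helpers
 * above round-trip a major/minor pair through the 64-bit container
 * regardless of the kernel data model:
 *
 *	md_dev64_t d64 = md_makedevice(md_major, mnum);
 *	ASSERT(md_getmajor(d64) == md_major);
 *	ASSERT(md_getminor(d64) == mnum);
 *	dev_t dev = md_dev64_to_dev(d64);	(native dev_t for DDI use)
 */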
int
md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
{
/*
* If the metadevice is an old style device, it has a vtoc,
* in that case all reading EFI ioctls are not applicable.
* If the metadevice has an EFI label, reading vtoc and geom ioctls
* are not supposed to work.
*/
switch (cmd) {
case DKIOCGGEOM:
case DKIOCGAPART:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
break;
case DKIOCGVTOC:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
/* if > 1 TB but < 2TB return overflow */
if (c.un_revision & MD_64BIT_META_DEV) {
return (EOVERFLOW);
}
break;
case DKIOCGEXTVTOC:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
break;
case DKIOCGETEFI:
case DKIOCPARTITION:
if ((c.un_flag & MD_EFILABEL) == 0) {
return (ENOTSUP);
}
break;
case DKIOCSETEFI:
/* setting an EFI label should always be ok */
return (0);
case DKIOCSVTOC:
/* if > 2 TB then fail */
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
/* if > 1 TB but < 2TB return overflow */
if (c.un_revision & MD_64BIT_META_DEV) {
return (EOVERFLOW);
}
break;
case DKIOCSEXTVTOC:
if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
return (ENOTSUP);
}
break;
}
return (0);
}
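/*
 * Illustrative sketch, not part of the original driver: gating a label
 * ioctl before doing any real work:
 *
 *	if ((err = md_check_ioctl_against_unit(cmd, un->c)) != 0)
 *		return (err);	(ENOTSUP/EOVERFLOW on label mismatch)
 */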
/*
* md_vtoc_to_efi_record()
* Input: record id of the vtoc record
* Output: record id of the efi record
* Function:
* - reads the volume name from the vtoc record
* - converts the volume name to a format, libefi understands
* - creates a new record of size MD_EFI_PARTNAME_BYTES
* - stores the volname in that record,
* - commits that record
* - returns the recid of the efi record.
* Caveat Emptor:
* The calling routine must do something like
* - un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
* - commit(un)
* - delete(vtoc_recid)
* in order to keep the mddb consistent in case of a panic in the middle.
* Errors:
* - returns 0 on any error
*/
mddb_recid_t
md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
{
struct vtoc *vtoc;
ushort_t *v;
mddb_recid_t efi_recid;
int i;
if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
return (0);
}
vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
MD_CRO_32BIT, setno);
if (efi_recid < 0) {
return (0);
}
v = (ushort_t *)mddb_getrecaddr(efi_recid);
/* This for loop reads, converts, and writes */
for (i = 0; i < LEN_DKL_VVOL; i++) {
v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
}
/* commit the new record */
mddb_commitrec_wrapper(efi_recid);
return (efi_recid);
}
/*
* Send a kernel message.
* The caller has to provide an allocated result structure.
* If the door handler disappears we retry, emitting warnings every so often.
*
* The recipient argument is almost always unused, and is therefore typically
* set to zero, as zero is an invalid cluster nodeid. The exceptions are the
* marking and clearing of the DRL from a node that is not currently the
* owner. In these cases, the recipient argument will be the nodeid of the
* mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner
* nodes will not receive these messages.
*
* For the case where md_mn_is_commd_present() is false, we rely on the
* "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for
* kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0.
*/
int
mdmn_ksend_message(
set_t setno,
md_mn_msgtype_t type,
uint_t flags,
md_mn_nodeid_t recipient,
char *data,
int size,
md_mn_kresult_t *result)
{
door_arg_t da;
md_mn_kmsg_t *kmsg;
uint_t send_try_cnt = 0;
uint_t retry_noise_cnt = 0;
int rval;
k_sigset_t oldmask, newmask;
if (size > MDMN_MAX_KMSG_DATA)
return (ENOMEM);
kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
kmsg->kmsg_flags = flags;
kmsg->kmsg_setno = setno;
kmsg->kmsg_recipient = recipient;
kmsg->kmsg_type = type;
kmsg->kmsg_size = size;
bcopy(data, &(kmsg->kmsg_data), size);
/*
* Wait for the door handle to be established.
*/
while (mdmn_door_did == -1) {
if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
cmn_err(CE_WARN, "door handle not yet ready. "
"Check if /usr/lib/lvm/mddoors is running");
}
delay(md_hz);
}
/*
* If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
* do not fail if the user process receives a signal while we're
* active in the door interface.
*/
if (flags & MD_MSGF_BLK_SIGNAL) {
sigfillset(&newmask);
sigreplace(&newmask, &oldmask);
}
/*
* If message failed with an RPC_FAILURE when rpc.mdcommd had
* been gracefully shut down (md_mn_is_commd_present returns FALSE)
* then don't retry the message anymore. If message
* failed due to any other reason, then retry up to MD_MN_WARN_INTVL
* times which should allow a shutting down system time to
* notify the kernel of a graceful shutdown of rpc.mdcommd.
*
* Caller of this routine will need to check the md_mn_commd_present
* flag and the failure error in order to determine whether to panic
* or not. If md_mn_commd_present is set to 0 and failure error
* is RPC_FAILURE, the calling routine should not panic since the
* system is in the process of being shutdown.
*
*/
retry_noise_cnt = send_try_cnt = 0;
while (md_mn_is_commd_present_lite()) {
/*
* data_ptr and data_size are initialized here because on
* return from the upcall, they contain data duplicated from
* rbuf and rsize. This causes subsequent upcalls to fail.
*/
da.data_ptr = (char *)(kmsg);
da.data_size = sizeof (md_mn_kmsg_t);
da.desc_ptr = NULL;
da.desc_num = 0;
da.rbuf = (char *)result;
da.rsize = sizeof (*result);
while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
NULL, SIZE_MAX, 0)) != 0) {
if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
if (rval == EAGAIN) {
cmn_err(CE_WARN,
"md: door_upcall failed. "
"Check if mddoors is running.");
} else if (rval == EINTR) {
cmn_err(CE_WARN,
"md: door_upcall failed. "
"Check if rpc.mdcommd is running.");
} else {
cmn_err(CE_WARN,
"md: door_upcall failed. "
"Returned %d",
rval);
}
}
if (++send_try_cnt >= md_send_retry_limit)
break;
delay(md_hz);
/*
* data_ptr and data_size are re-initialized here
* because on return from the upcall, they contain
* data duplicated from rbuf and rsize. This causes
* subsequent upcalls to fail.
*/
da.data_ptr = (char *)(kmsg);
da.data_size = sizeof (md_mn_kmsg_t);
da.desc_ptr = NULL;
da.desc_num = 0;
da.rbuf = (char *)result;
da.rsize = sizeof (*result);
}
/*
* If:
* - the send succeeded (MDMNE_ACK)
* - we had an MDMNE_RPC_FAIL and commd is now gone
* (note: since the outer loop is commd-dependent,
* checking MDMNE_RPC_FAIL here is meaningless)
* - we were told not to retry
* - we exceeded the RPC failure send limit
* punch out of the outer loop prior to the delay()
*/
if (result->kmmr_comm_state == MDMNE_ACK ||
(flags & MD_MSGF_KSEND_NORETRY) ||
(++send_try_cnt % md_send_retry_limit) == 0 ||
!md_mn_is_commd_present())
break;
delay(md_hz);
}
if (flags & MD_MSGF_BLK_SIGNAL) {
sigreplace(&oldmask, (k_sigset_t *)NULL);
}
kmem_free(kmsg, sizeof (md_mn_kmsg_t));
return (0);
}
/*
* Called to propagate the capability of a metadevice to all nodes in the set.
*
* On entry, lockp is set if the function has been called from within an ioctl.
*
* IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock, is called in this
* routine to enable other mdioctls to enter the kernel while this
* thread of execution waits on the completion of mdmn_ksend_message. When
* the message is completed the thread continues and md_ioctl_lock must be
* reacquired. Even though md_ioctl_lock is interruptible, we choose to
* ignore EINTR as we must not return without acquiring md_ioctl_lock.
*/
int
mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
{
md_mn_msg_setcap_t msg;
md_mn_kresult_t *kres;
mdi_unit_t *ui = MDI_UNIT(mnum);
int ret;
k_sigset_t oldmask, newmask;
(void) strncpy((char *)&msg.msg_setcap_driver,
md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
msg.msg_setcap_mnum = mnum;
msg.msg_setcap_set = vc.vc_set;
if (lockp)
IOLOCK_RETURN_RELEASE(0, lockp);
kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
/*
* Mask signals for the mdmd_ksend_message call. This keeps the door
* interface from failing if the user process receives a signal while
* in mdmn_ksend_message.
*/
sigfillset(&newmask);
sigreplace(&newmask, &oldmask);
ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
kres));
sigreplace(&oldmask, (k_sigset_t *)NULL);
if (!MDMN_KSEND_MSG_OK(ret, kres)) {
mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
ret = EIO;
}
kmem_free(kres, sizeof (md_mn_kresult_t));
if (lockp) {
IOLOCK_RETURN_REACQUIRE(lockp);
}
return (ret);
}
/*
* Called to clear all of the transient capabilities for a metadevice when it is
* not open on any node in the cluster.
* Called from close for mirror and sp.
*/
void
mdmn_clear_all_capabilities(minor_t mnum)
{
md_isopen_t clumsg;
int ret;
md_mn_kresult_t *kresult;
volcap_t vc;
k_sigset_t oldmask, newmask;
clumsg.dev = md_makedevice(md_major, mnum);
clumsg.mde = mdnullerror;
/*
* The check open message doesn't have to be logged, nor should the
* result be stored in the MCT. We want an up-to-date state.
*/
kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
/*
* Mask signals for the mdmd_ksend_message call. This keeps the door
* interface from failing if the user process receives a signal while
* in mdmn_ksend_message.
*/
sigfillset(&newmask);
sigreplace(&newmask, &oldmask);
ret = mdmn_ksend_message(MD_MIN2SET(mnum),
MD_MN_MSG_CLU_CHECK,
MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
(char *)&clumsg, sizeof (clumsg), kresult);
sigreplace(&oldmask, (k_sigset_t *)NULL);
if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
/*
* Not open on any node, clear all capabilities, eg ABR and
* DMR
*/
vc.vc_set = 0;
(void) mdmn_send_capability_message(mnum, vc, NULL);
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
}
/*
* mdmn_ksend_show_error:
* ---------------------
* Called to display the error contents of a failing mdmn_ksend_message() result
*
* Input:
* rv - return value from mdmn_ksend_message()
* kres - pointer to result structure filled in by mdmn_ksend_message
* s - Informative message to identify failing condition (e.g.
* "Ownership change") This string will be displayed with
* cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
* administrator
*/
void
mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
{
if (rv == 0) {
cmn_err(CE_WARN, "%s *FAILED*", s);
cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
" = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
kres->kmmr_failing_node);
} else {
cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
}
}
/*
* Callback routine for resync thread. If requested to suspend we mark the
* commd as not being present.
*/
boolean_t
callb_md_mrs_cpr(void *arg, int code)
{
callb_cpr_t *cp = (callb_cpr_t *)arg;
int ret = 0; /* assume success */
clock_t delta;
mutex_enter(cp->cc_lockp);
switch (code) {
case CB_CODE_CPR_CHKPT:
/*
* Mark the rpc.mdcommd as no longer present. We are trying to
* suspend the system and so we should expect RPC failures to
* occur.
*/
md_mn_clear_commd_present();
cp->cc_events |= CALLB_CPR_START;
delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
while (!(cp->cc_events & CALLB_CPR_SAFE))
/* cv_reltimedwait() returns -1 if it times out. */
if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
break;
break;
case CB_CODE_CPR_RESUME:
cp->cc_events &= ~CALLB_CPR_START;
cv_signal(&cp->cc_stop_cv);
break;
}
mutex_exit(cp->cc_lockp);
return (ret != -1);
}
void
md_rem_hspname(set_t setno, mdkey_t n_key)
{
int s;
int max_sides;
/* All entries removed are in the same diskset */
if (md_get_setstatus(setno) & MD_SET_MNSET)
max_sides = MD_MNMAXSIDES;
else
max_sides = MD_MAXSIDES;
for (s = 0; s < max_sides; s++)
(void) md_remdevname(setno, s, n_key);
}
int
md_rem_selfname(minor_t selfid)
{
int s;
set_t setno = MD_MIN2SET(selfid);
int max_sides;
md_dev64_t dev;
struct nm_next_hdr *nh;
struct nm_name *n;
mdkey_t key;
/*
* Get the key since the remove routine expects it
*/
dev = md_makedevice(md_major, selfid);
if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
return (ENOENT);
}
if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
MD_KEYWILD, dev, 0L)) == NULL) {
return (ENOENT);
}
/* All entries removed are in the same diskset */
key = n->n_key;
if (md_get_setstatus(setno) & MD_SET_MNSET)
max_sides = MD_MNMAXSIDES;
else
max_sides = MD_MAXSIDES;
for (s = 0; s < max_sides; s++)
(void) md_remdevname(setno, s, key);
return (0);
}
void
md_upd_set_unnext(set_t setno, unit_t un)
{
if (un < md_set[setno].s_un_next) {
md_set[setno].s_un_next = un;
}
}
struct hot_spare_pool *
find_hot_spare_pool(set_t setno, int hsp_id)
{
hot_spare_pool_t *hsp;
hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
while (hsp != NULL) {
if (hsp->hsp_self_id == hsp_id)
return (hsp);
hsp = hsp->hsp_next;
}
return ((hot_spare_pool_t *)0);
}
/*
* md_create_taskq:
*
* Create a kernel taskq for the given set/unit combination. This is typically
* used to complete a RR_CLEAN request when the callee is unable to obtain the
* mutex / condvar access required to update the DRL safely.
*/
void *
md_create_taskq(set_t setno, minor_t mnum)
{
char name[20];
ddi_taskq_t *tqp;
(void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum));
tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
return ((void *)tqp);
}