zvol.c revision a6e57bd4c7a2bf9cc33be939d674d4c7d3e67cce
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* ZFS volume emulation driver.
*
* Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
* Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/dsk/<pool_name>/<dataset_name>
 *	/dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
* These links are created by the ZFS-specific devfsadm link generator.
* Volumes are persistent through reboot. No user command needs to be
* run before opening and using a device.
*/
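/*
 * A minimal user-space sketch of how a consumer reaches a volume through
 * the /dev/zvol links described above.  The pool/dataset name "tank/vol0"
 * and the 512-byte read are illustrative assumptions, not part of this
 * driver.
 */
#if 0	/* illustrative sketch only */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
open_zvol_example(void)
{
	char buf[512];
	/* raw (character) device node created by the devfsadm link generator */
	int fd = open("/dev/zvol/rdsk/tank/vol0", O_RDONLY);

	if (fd == -1) {
		perror("open");
		return (-1);
	}
	/* read the first sector of the volume */
	if (pread(fd, buf, sizeof (buf), 0) != (ssize_t)sizeof (buf))
		perror("pread");
	(void) close(fd);
	return (0);
}
#endif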
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/efi_partition.h>
#include <sys/byteorder.h>
#include <sys/pathname.h>
#include <sys/zfs_ioctl.h>
#include <sys/refcount.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include "zfs_namecheck.h"
static void *zvol_state;
#define ZVOL_DUMPSIZE "dumpsize"
/*
* This lock protects the zvol_state structure from being modified
* while it's being used, e.g. an open that comes in before a create
* finishes. It also protects temporary opens of the dataset so that,
* e.g., an open doesn't get a spurious EBUSY.
*/
static kmutex_t zvol_state_lock;
static uint32_t zvol_minors;
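/*
 * A minimal user-space sketch of the locking pattern described above:
 * one lock covers both the lookup and the state change, so an open that
 * races with a create or remove either sees a fully constructed entry or
 * none at all.  The names (zv_table, zv_open) and the table size are
 * hypothetical, not the driver's real soft-state.
 */
#if 0	/* illustrative sketch only */
#include <errno.h>
#include <pthread.h>
#include <stddef.h>

#define	ZV_MAX	16

struct zv {
	int	open_count;
};

static pthread_mutex_t	zv_lock = PTHREAD_MUTEX_INITIALIZER;
static struct zv	*zv_table[ZV_MAX];	/* indexed by minor number */

static int
zv_open(int minor)
{
	int error = 0;

	(void) pthread_mutex_lock(&zv_lock);
	if (zv_table[minor] == NULL)
		error = ENXIO;		/* create hasn't finished yet */
	else
		zv_table[minor]->open_count++;
	(void) pthread_mutex_unlock(&zv_lock);
	return (error);
}
#endif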
typedef struct zvol_extent {
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_stride;	/* extent stride */
	uint64_t	ze_size;	/* number of blocks in extent */
} zvol_extent_t;
/*
 * The list of extents associated with the dump device
 */
typedef struct zvol_ext_list {
	zvol_extent_t		zl_extents[NUM_EXTENTS];
	struct zvol_ext_list	*zl_next;
} zvol_ext_list_t;
/*
* The in-core state of each volume.
*/
typedef struct zvol_state {
} zvol_state_t;
/*
* zvol specific flags
*/
#define ZVOL_RDONLY 0x1
#define ZVOL_DUMPIFIED 0x2
#define ZVOL_EXCL 0x4
/*
* zvol maximum transfer in one DMU tx.
*/
extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
static void
{
/* Notify specfs to invalidate the cached size */
}
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (EINVAL);

	if (volsize % blocksize != 0)
		return (EINVAL);

#ifdef _ILP32
	if (volsize - 1 > SPA_MAXOFFSET_T)
		return (EOVERFLOW);
#endif
	return (0);
}

int
zvol_check_volblocksize(uint64_t volblocksize)
{
	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (EDOM);

	return (0);
}
static void
{
if (newval)
else
}
int
{
int error;
if (error)
return (error);
if (error == 0) {
}
return (error);
}
/*
* Find a free minor number.
*/
static minor_t
zvol_minor_alloc(void)
{
return (minor);
return (0);
}
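/*
 * A minimal sketch of the allocation strategy used above: scan for the
 * first unused minor number, skipping 0, which is reserved for the
 * control device.  The table argument is a hypothetical stand-in for the
 * driver's soft-state array.
 */
#if 0	/* illustrative sketch only */
static int
minor_alloc_example(void *table[], int nminors)
{
	int m;

	/* minor 0 is the control device, so start scanning at 1 */
	for (m = 1; m < nminors; m++) {
		if (table[m] == NULL)
			return (m);	/* first free slot */
	}
	return (0);			/* 0 means "none available" */
}
#endif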
static zvol_state_t *
zvol_minor_lookup(const char *name)
{
continue;
break;
}
return (zv);
}
void
{
}
/* extent mapping arg */
struct maparg {
int ma_gang;
};
/*ARGSUSED*/
static int
{
/* If there is an error, then keep trying to make progress */
return (ERESTART);
#ifdef ZFS_DEBUG
} else {
}
} else {
}
}
}
fill++;
}
}
#endif
return (0);
/* Abort immediately if we have encountered gang blocks */
if (BP_IS_GANG(bp)) {
return (EINTR);
}
/* first time? */
return (0);
}
/* second block in this extent */
return (0);
/*
* the block we allocated has the same
* stride
*/
return (0);
}
}
/*
* dtrace -n 'zfs-dprintf
* /stringof(arg0) == "zvol.c"/
* {
* printf("%s: %s", stringof(arg1), stringof(arg3))
* } '
*/
dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n",
/* start a new extent */
KM_SLEEP);
} else {
}
return (0);
}
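/*
 * A minimal sketch of the extent-building idea above: consecutive blocks
 * whose on-disk addresses advance by a constant stride are folded into a
 * single (start, stride, count) extent, and a block that breaks the
 * stride starts a new extent.  The struct and function names here are
 * illustrative assumptions, not the driver's types.
 */
#if 0	/* illustrative sketch only */
#include <stdint.h>
#include <stddef.h>

struct ext {
	uint64_t	start;	/* address of the first block */
	uint64_t	stride;	/* distance between successive blocks */
	uint64_t	count;	/* number of blocks in the extent */
};

static size_t
coalesce_extents(const uint64_t *addr, size_t n, struct ext *ext, size_t max)
{
	size_t i, next = 0;

	for (i = 0; i < n; i++) {
		if (next > 0) {
			struct ext *e = &ext[next - 1];

			if (e->count == 1) {
				/* second block in the extent fixes the stride */
				e->stride = addr[i] - e->start;
				e->count = 2;
				continue;
			}
			if (addr[i] == e->start + e->count * e->stride) {
				/* same stride: extend the current extent */
				e->count++;
				continue;
			}
		}
		if (next == max)
			break;
		/* start a new extent */
		ext[next].start = addr[i];
		ext[next].stride = 0;
		ext[next].count = 1;
		next++;
	}
	return (next);
}
#endif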
/* ARGSUSED */
void
{
int error;
/*
* These properties must be removed from the list so the generic
* property setting step won't apply to them.
*/
zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
(void) nvlist_remove_all(nvprops,
DMU_OT_NONE, 0, tx);
DMU_OT_NONE, 0, tx);
}
/*
* Replay a TX_WRITE ZIL transaction that didn't get committed
* after a system failure
*/
static int
{
int error;
if (byteswap)
if (error) {
} else {
}
return (error);
}
/* ARGSUSED */
static int
{
return (ENOTSUP);
}
/*
* Callback vectors for replaying records.
* Only TX_WRITE is needed for zvol.
*/
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
zvol_replay_err, /* TX_CREATE */
zvol_replay_err, /* TX_MKDIR */
zvol_replay_err, /* TX_MKXATTR */
zvol_replay_err, /* TX_SYMLINK */
zvol_replay_err, /* TX_REMOVE */
zvol_replay_err, /* TX_RMDIR */
zvol_replay_err, /* TX_LINK */
zvol_replay_err, /* TX_RENAME */
zvol_replay_write, /* TX_WRITE */
zvol_replay_err, /* TX_TRUNCATE */
zvol_replay_err, /* TX_SETATTR */
zvol_replay_err, /* TX_ACL */
};
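/*
 * A minimal sketch of how a replay vector like the one above is used:
 * the log-record type indexes a table of handlers, and unsupported types
 * fall through to an error handler.  The enum, typedef and function
 * names are illustrative assumptions, not the ZIL's actual interfaces.
 */
#if 0	/* illustrative sketch only */
#include <errno.h>

enum { REC_INVALID, REC_WRITE, REC_MAX };

typedef int (replay_fn_t)(void *arg, void *record, int byteswap);

static int
replay_err(void *arg, void *record, int byteswap)
{
	return (ENOTSUP);		/* transaction type not supported */
}

static int
replay_write(void *arg, void *record, int byteswap)
{
	/* re-apply the logged write to the backing object */
	return (0);
}

static replay_fn_t *replay_vector[REC_MAX] = {
	replay_err,	/* REC_INVALID */
	replay_write,	/* REC_WRITE */
};

static int
replay_record(void *arg, void *record, int type, int byteswap)
{
	if (type < 0 || type >= REC_MAX)
		return (EINVAL);
	return (replay_vector[type](arg, record, byteswap));
}
#endif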
/*
* reconstruct dva that gets us to the desired offset (offset
* is in bytes)
*/
int
{
int idx;
return (EIO);
idx = 0;
/* we've reached the end of this array */
return (-1);
idx = 0;
} else {
ze++;
idx++;
}
}
return (0);
}
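/*
 * A minimal sketch of the lookup described above: walk the extents and
 * translate a byte offset into the address of the block that holds it.
 * It reuses the illustrative struct ext from the earlier sketch; the
 * block size and function name are likewise assumptions.
 */
#if 0	/* illustrative sketch only */
static int
offset_to_block(const struct ext *ext, size_t next, uint64_t blksz,
    uint64_t offset, uint64_t *addrp)
{
	uint64_t blk = offset / blksz;	/* which logical block */
	size_t i;

	for (i = 0; i < next; i++) {
		if (blk < ext[i].count) {
			*addrp = ext[i].start + blk * ext[i].stride;
			return (0);
		}
		blk -= ext[i].count;	/* skip past this extent */
	}
	return (-1);			/* offset beyond the mapping */
}
#endif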
static void
{
}
}
}
int
{
int err;
/*
* We currently don't support dump devices when the pool
* is so fragmented that our allocation has resulted in
* gang blocks.
*/
return (EFRAGS);
}
while (ze) {
} else {
ze++;
}
}
return (EIO);
}
return (0);
}
/*
* Create a minor node (plus a whole lot more) for the specified volume.
*/
int
{
int ds_mode = DS_MODE_OWNER;
char *devpath;
int error;
return (EEXIST);
}
if (error) {
return (error);
}
if (error) {
return (error);
}
	/*
	 * If there's an existing /dev/zvol symlink, try to use the
	 * same minor number we used last time.
	 */
if (error == 0) {
if (error == 0) {
}
}
}
/*
* If we found a minor but it's already in use, we must pick a new one.
*/
minor = 0;
if (minor == 0)
minor = zvol_minor_alloc();
if (minor == 0) {
return (ENXIO);
}
return (EAGAIN);
}
(char *)name);
return (EAGAIN);
}
return (EAGAIN);
}
/* get and cache the blocksize */
/* XXX this should handle the possible i/o error */
zvol_minors++;
return (0);
}
/*
* Remove minor node for the specified volume.
*/
int
zvol_remove_minor(const char *name)
{
char namebuf[30];
return (ENXIO);
}
if (zv->zv_total_opens != 0) {
return (EBUSY);
}
zvol_minors--;
return (0);
}
int
{
void *data;
/* Check the space usage before attempting to allocate the space */
return (ENOSPC);
/* Free old extents if they exist */
/* allocate the blocks by writing each one */
while (resid != 0) {
int error;
if (error) {
return (error);
}
}
return (0);
}
int
{
int error;
if (error) {
return (error);
}
if (error == 0)
/*
* If we are using a faked-up state (zv_minor == 0) then don't
* try to update the in-core zvol state.
*/
}
return (error);
}
int
{
int error;
zvol_state_t state = { 0 };
/*
* If we are doing a "zfs clone -o volsize=", then the
* minor node won't exist yet.
*/
if (error != 0)
goto out;
}
doi.doi_data_block_size)) != 0)
goto out;
goto out;
}
/*
* Reinitialize the dump area to the new size. If we
	 * failed to resize the dump area then restore it back to
	 * its original size.
*/
(error = dumpvp_resize()) != 0) {
}
}
out:
return (error);
}
int
{
int error;
return (ENXIO);
}
return (EROFS);
}
if (error) {
} else {
volblocksize, 0, tx);
}
return (error);
}
/*ARGSUSED*/
int
{
if (minor == 0) /* This is the control device */
return (0);
return (ENXIO);
}
return (EROFS);
}
return (EBUSY);
}
if (zv->zv_total_opens != 0) {
return (EBUSY);
}
}
zv->zv_total_opens++;
}
return (0);
}
/*ARGSUSED*/
int
{
if (minor == 0) /* This is the control device */
return (0);
return (ENXIO);
}
}
/*
* If the open count is zero, this is a spurious close.
* That indicates a bug in the kernel / DDI framework.
*/
/*
* You may get multiple opens, but only one close.
*/
zv->zv_total_opens--;
return (0);
}
static void
{
}
/*
* Get data to generate a TX_WRITE intent log record.
*/
static int
{
int error;
/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
* log record (immediate); for large writes it's cheaper to
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
/*
* Lock the range of the block to ensure that when the data is
* written out and its checksum is being calculated that no other
* thread can change the block.
*/
if (error == 0)
/*
* If we get EINPROGRESS, then we need to wait for a
* write IO initiated by dmu_sync() to complete before
* we can release this dbuf. We will finish everything
* up in the zvol_get_done() callback.
*/
if (error == EINPROGRESS)
return (0);
return (error);
}
/*
* zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
*
* We store data in the log buffers if it's small enough.
* Otherwise we will later flush the data out via dmu_sync().
*/
static void
{
lr_write_t *lr;
while (len) {
itx->itx_wr_state =
}
}
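/*
 * A minimal sketch of the immediate-vs-indirect decision described above:
 * small writes are copied straight into the log record, while larger
 * writes are logged by reference and synced later via dmu_sync().  The
 * threshold name and value are illustrative assumptions, not the ZIL's
 * actual policy knobs.
 */
#if 0	/* illustrative sketch only */
#include <stdint.h>

enum wr_state { WR_IMMEDIATE, WR_INDIRECT };

#define	IMMEDIATE_MAX	32768	/* hypothetical cutoff, in bytes */

static enum wr_state
choose_write_state(uint64_t len)
{
	/* small payloads are cheap to embed; large ones are written once, later */
	return (len <= IMMEDIATE_MAX ? WR_IMMEDIATE : WR_INDIRECT);
}
#endif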
int
{
int direction;
int c;
int numerrors = 0;
for (c = 0; c < vd->vdev_children; c++) {
numerrors++;
break;
}
}
if (!vdev_writeable(vd))
return (EIO);
if (ddi_in_panic() || isdump) {
return (EIO);
} else {
direction));
}
}
int
{
int error;
/* restrict requests to multiples of the system block size */
return (EINVAL);
return (EIO);
return (error);
}
int
{
char *addr;
int error = 0;
return (0);
}
return (0);
}
return (0);
}
return (EIO);
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
*/
if (is_dump) {
/* can't straddle a block boundary */
addr, 0);
} else if (reading) {
} else {
if (error) {
} else {
}
}
if (error) {
/* convert checksum errors into IO errors */
break;
}
}
return (0);
}
/*
* Set the buffer count to the zvol maximum transfer.
* Using our own routine instead of the default minphys()
* means that for larger writes we write bigger buffers on X86
* (128K instead of 56K) and flush the disk write cache less often
* (every zvol_maxphys - currently 1MB) instead of minphys (currently
* 56K on X86 and 128K on sparc).
*/
void
zvol_minphys(struct buf *bp)
{
	if (bp->b_bcount > zvol_maxphys)
		bp->b_bcount = zvol_maxphys;
}
int
{
int error = 0;
if (minor == 0) /* This is the control device */
return (ENXIO);
return (ENXIO);
/* dump should know better than to write here */
return (EIO);
}
while (resid) {
/* can't straddle a block boundary */
if (error)
break;
}
return (error);
}
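/*
 * A minimal sketch of the chunking done in the loop above: each pass
 * handles at most the bytes remaining in the current volume block, so a
 * single transfer never straddles a block boundary.  The function name
 * is illustrative.
 */
#if 0	/* illustrative sketch only */
#include <stdint.h>

static uint64_t
chunk_bytes(uint64_t off, uint64_t resid, uint64_t volblocksize)
{
	uint64_t in_block = volblocksize - (off % volblocksize);

	/* take what's left in this block, or the rest of the request */
	return (resid < in_block ? resid : in_block);
}
#endif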
/*ARGSUSED*/
int
{
int error = 0;
if (minor == 0) /* This is the control device */
return (ENXIO);
return (ENXIO);
return (EIO);
/* don't read past the end */
if (error) {
/* convert checksum errors into IO errors */
break;
}
}
return (error);
}
/*ARGSUSED*/
int
{
int error = 0;
if (minor == 0) /* This is the control device */
return (ENXIO);
return (ENXIO);
return (EIO);
zvol_minphys, uio);
return (error);
}
if (error) {
break;
}
if (error == 0)
if (error)
break;
}
return (error);
}
int
{
int length;
char *ptr;
return (EFAULT);
/*
* Some clients may attempt to request a PMBR for the
* zvol. Currently this interface will return EINVAL to
* such requests. These requests could be supported by
* adding a check for lba == 0 and consing up an appropriate
* PMBR.
*/
return (EINVAL);
flag))
return (EFAULT);
}
return (EFAULT);
return (0);
}
/*
* Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
*/
/*ARGSUSED*/
int
{
struct dk_callback *dkc;
int error = 0;
return (ENXIO);
}
switch (cmd) {
case DKIOCINFO:
return (error);
case DKIOCGMEDIAINFO:
return (error);
case DKIOCGETEFI:
{
return (error);
}
case DKIOCFLUSHWRITECACHE:
error = 0;
}
break;
case DKIOCGGEOM:
case DKIOCGVTOC:
/*
* commands using these (like prtvtoc) expect ENOTSUP
* since we're emulating an EFI label
*/
break;
case DKIOCDUMPINIT:
break;
case DKIOCDUMPFINI:
break;
default:
break;
}
return (error);
}
int
zvol_busy(void)
{
return (zvol_minors != 0);
}
void
zvol_init(void)
{
}
void
zvol_fini(void)
{
}
static boolean_t
{
char *devpath;
int error;
return (ret);
}
static int
{
int error = 0;
if (error) {
return (error);
}
/*
* If we are resizing the dump device then we only need to
* update the refreservation to match the newly updated
* zvolsize. Otherwise, we save off the original state of the
	 * zvol so that we can restore it if the zvol is ever undumpified.
*/
if (resize) {
} else {
}
/* Truncate the file */
if (!error)
ZVOL_OBJ, 0, DMU_OBJECT_END);
if (error)
return (error);
/*
	 * We only need to update the zvol's properties if we are initializing
* the dump area for the first time.
*/
if (!resize) {
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
ZIO_COMPRESS_OFF) == 0);
ZIO_CHECKSUM_OFF) == 0);
if (error)
return (error);
}
/* Allocate the space for the dump */
return (error);
}
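/*
 * A minimal sketch of the property setup described above: when the zvol
 * is first dumpified, a single nvlist carrying refreservation=0,
 * compression=off and checksum=off is handed to zfs_set_prop_nvlist().
 * The literal property-name strings below stand in for the
 * zfs_prop_to_name() lookups used by the real code.
 */
#if 0	/* illustrative sketch only */
#include <libnvpair.h>

static int
build_dumpify_props(nvlist_t **nvp)
{
	nvlist_t *nv;

	if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0)
		return (-1);
	/* values mirror the VERIFY'd nvlist_add_uint64() calls above */
	(void) nvlist_add_uint64(nv, "refreservation", 0);
	(void) nvlist_add_uint64(nv, "compression", 2);	/* ZIO_COMPRESS_OFF */
	(void) nvlist_add_uint64(nv, "checksum", 2);	/* ZIO_CHECKSUM_OFF */
	*nvp = nv;
	return (0);
}
#endif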
static int
{
int error = 0;
return (EROFS);
/*
* We do not support swap devices acting as dump devices.
*/
if (zvol_is_swap(zv))
return (ENOTSUP);
(void) zvol_dump_fini(zv);
return (error);
}
}
/*
* Build up our lba mapping.
*/
if (error) {
(void) zvol_dump_fini(zv);
return (error);
}
if (error) {
(void) zvol_dump_fini(zv);
return (error);
}
if (error) {
(void) zvol_dump_fini(zv);
return (error);
}
return (0);
}
static int
{
int error = 0;
/*
* Attempt to restore the zvol back to its pre-dumpified state.
* This is a best-effort attempt as it's possible that not all
* of these properties were initialized during the dumpify process
	 * (i.e., an error occurred during zvol_dump_init).
*/
if (error) {
return (error);
}
(void) nvlist_add_uint64(nv,
(void) nvlist_add_uint64(nv,
(void) nvlist_add_uint64(nv,
return (0);
}