/* xdf.c revision a6e5dd18ce1ac2c94bbb09fbb01867dc6effa694 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* xdf.c - Xen Virtual Block Device Driver
* TODO:
* - support alternate block size (currently only DEV_BSIZE supported)
* - revalidate geometry for removable devices
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include "xdf.h"
#define FLUSH_DISKCACHE 0x1
#define WRITE_BARRIER 0x2
#define USE_WRITE_BARRIER(vdp) \
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define USE_FLUSH_DISKCACHE(vdp) \
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
#define IS_FLUSH_DISKCACHE(bp)	/* 'vdp' must be in scope at the call site */ \
	(!((bp)->b_flags & B_READ) && USE_FLUSH_DISKCACHE(vdp) && (bp)->b_bcount == 0)
static void *vbd_ss;
static kmem_cache_t *xdf_vreq_cache;
static kmem_cache_t *xdf_gs_cache;
static int xdf_maxphys = XB_MAXPHYS;
int xdfdebug = 0;
extern int do_polled_io;
int xdf_barrier_flush_disable = 0;
/*
* dev_ops and cb_ops entrypoints
*/
static int xdf_strategy(struct buf *);
static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
	caddr_t, int *);
/*
* misc private functions
*/
static int xdf_suspend(dev_info_t *);
static int xdf_resume(dev_info_t *);
static int xdf_start_connect(xdf_t *);
static int xdf_start_disconnect(xdf_t *);
static int xdf_post_connect(xdf_t *);
static void xdf_post_disconnect(xdf_t *);
static void xdf_iostart(xdf_t *);
static int xdf_drain_io(xdf_t *);
static int xdf_dmacallback(caddr_t);
static void xdf_timeout_handler(void *);
static void unexpectedie(xdf_t *);
NULL,
};
struct dev_ops xdf_devops = {
DEVO_REV, /* devo_rev */
0, /* devo_refcnt */
xdf_getinfo, /* devo_getinfo */
nulldev, /* devo_identify */
nulldev, /* devo_probe */
xdf_attach, /* devo_attach */
xdf_detach, /* devo_detach */
xdf_reset, /* devo_reset */
	&xdf_cbops,		/* devo_cb_ops */
	(struct bus_ops *)NULL	/* devo_bus_ops */
};
static struct modldrv xdf_modldrv = {
	&mod_driverops,		/* Type of module. This one is a driver */
"virtual block driver %I%", /* short description */
&xdf_devops /* driver specific ops */
};
static struct modlinkage xdf_modlinkage = {
	MODREV_1, &xdf_modldrv, NULL
};
/*
* I/O buffer DMA attributes
 * Make sure one DMA window contains at most BLKIF_MAX_SEGMENTS_PER_REQUEST
 * segments (see the illustrative note after this structure)
*/
static ddi_dma_attr_t xb_dma_attr = {
(uint64_t)0, /* lowest address */
XB_BSIZE, /* min transfer */
BLKIF_MAX_SEGMENTS_PER_REQUEST, /* maximum number of segments */
XB_BSIZE, /* granularity */
0, /* flags (reserved) */
};
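/*
 * Illustrative note (an assumption for clarity, with a made-up macro name):
 * since each blkif segment carries at most one 4K page, the DMA attributes
 * above bound a single request/DMA window to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST pages of data.
 */
#define XDF_MAX_REQ_XFER_SKETCH	(BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGESIZE)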
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0, DDI_NEVERSWAP_ACC, DDI_STRICTORDER_ACC
};
/* callbacks from common label */
static int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
	void *);
static int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1, xdf_lb_rdwr, xdf_lb_getinfo
};
int
_init(void)
{
int rc;
}
}
return (rc);
}
int
_fini(void)
{
	int err;

	if ((err = mod_remove(&xdf_modlinkage)) != 0)
		return (err);
	kmem_cache_destroy(xdf_vreq_cache);
	kmem_cache_destroy(xdf_gs_cache);
	ddi_soft_state_fini(&vbd_ss);
	return (0);
}
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&xdf_modlinkage, modinfop));
}
/*ARGSUSED*/
static int
{
int instance;
switch (cmd) {
case DDI_INFO_DEVT2DEVINFO:
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
case DDI_INFO_DEVT2INSTANCE:
return (DDI_SUCCESS);
default:
return (DDI_FAILURE);
}
}
static int
{
/*
* xdf dynamic properties are device specific and size oriented.
* Requests issued under conditions where size is valid are passed
* to ddi_prop_op_nblocks with the size information, otherwise the
* request is passed to ddi_prop_op.
*/
/* do cv_wait until connected or failed */
goto out;
}
out:
lengthp));
}
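/*
 * A minimal sketch of the size-oriented dispatch described in the comment
 * above: when the partition size is known, the request is handed to
 * ddi_prop_op_nblocks() so the dynamic size properties can be derived from
 * it; otherwise it falls back to the generic ddi_prop_op().  This is an
 * illustration only, not the driver's actual prop_op entry point; the
 * helper name and the size_valid/nblocks parameters are assumptions.
 */
static int
xdf_prop_dispatch_sketch(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
    int mod_flags, char *name, caddr_t valuep, int *lengthp,
    boolean_t size_valid, uint64_t nblocks)
{
	if (size_valid)
		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp, nblocks));
	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
	    lengthp));
}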
static int
{
int instance;
"xdfdebug", 0);
switch (cmd) {
case DDI_ATTACH:
break;
case DDI_RESUME:
return (xdf_resume(devi));
default:
return (DDI_FAILURE);
}
return (DDI_FAILURE);
goto errout1;
}
!= DDI_SUCCESS) {
goto errout2;
}
goto errout2;
}
/*
* create kstat for iostat(1M)
*/
} else {
goto errout3;
}
/*
* driver handles kernel-issued IOCTLs
*/
goto errout4;
}
/*
* create default device minor nodes: non-removable disk
* we will adjust minor nodes after we are connected w/ backend
*/
goto errout5;
}
/*
* We ship with cache-enabled disks
*/
/* Watch backend XenbusState change */
xdf_oe_change) != DDI_SUCCESS) {
goto errout6;
}
(void) xdf_start_disconnect(vdp);
goto errout7;
}
return (DDI_SUCCESS);
return (DDI_FAILURE);
}
static int
{
int instance;
switch (cmd) {
case DDI_PM_SUSPEND:
break;
case DDI_SUSPEND:
return (xdf_suspend(devi));
case DDI_DETACH:
break;
default:
return (DDI_FAILURE);
}
return (DDI_FAILURE);
return (DDI_FAILURE);
}
return (DDI_FAILURE);
}
if (vdp->xdf_timeout_id != 0)
/* we'll support backend running in domU later */
#ifdef DOMU_BACKEND
#endif
return (DDI_SUCCESS);
}
static int
{
int instance;
if (xdfdebug & SUSRES_DBG)
return (DDI_FAILURE);
/* change status to stop further I/O requests */
/* make sure no more I/O responses left in the ring buffer */
(void) xdf_drain_io(vdp);
/*
 * no need to tear down the ring buffer here;
 * it will simply be re-initialized during resume when
 * we call xvdi_alloc_ring
*/
}
if (xdfdebug & SUSRES_DBG)
xen_printf("xdf_suspend: SUCCESS\n");
return (DDI_SUCCESS);
}
/*ARGSUSED*/
static int
{
int instance;
if (xdfdebug & SUSRES_DBG)
return (DDI_FAILURE);
return (DDI_FAILURE);
}
return (DDI_FAILURE);
}
if (xdfdebug & SUSRES_DBG)
xen_printf("xdf_resume: done\n");
return (DDI_SUCCESS);
}
/*ARGSUSED*/
static int
{
int instance;
return (DDI_FAILURE);
/*
* wait for any outstanding I/O to complete
*/
(void) xdf_drain_io(vdp);
return (DDI_SUCCESS);
}
static int
{
int part;
diskaddr_t p_blkct = 0;
return (ENXIO);
/* do cv_wait until connected or failed */
return (ENXIO);
}
return (EROFS);
}
return (EBUSY);
}
/* are we the first one to open this node? */
return (EBUSY);
}
/* force a re-validation */
if (firstopen)
/*
* check size
*/
return (ENXIO);
}
return (0);
}
/*ARGSUSED*/
static int
{
int part;
return (ENXIO);
return (ENXIO);
}
} else {
}
return (0);
}
static int
{
int part;
return (0);
}
return (0);
}
/*
* starting beyond partition
*/
return (0);
}
/* Legacy: don't set the error flag in this case */
return (0);
}
/*
* adjust for partial transfer
*/
}
} else {
}
if (do_polled_io)
(void) xdf_drain_io(vdp);
return (0);
}
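/*
 * A minimal sketch of the partial-transfer adjustment mentioned in
 * xdf_strategy() above: when a request starts inside the partition but runs
 * past its end, the out-of-range part is recorded in b_resid and b_bcount
 * is trimmed so only the in-range blocks are transferred.  The helper name
 * and parameters are assumptions made for illustration.
 */
static void
xdf_trim_to_partition_sketch(struct buf *bp, diskaddr_t blkno,
    diskaddr_t p_blkct)
{
	diskaddr_t nblks = bp->b_bcount >> DEV_BSHIFT;

	if (blkno + nblks > p_blkct) {
		/* adjust for partial transfer */
		bp->b_resid = (size_t)(blkno + nblks - p_blkct) << DEV_BSHIFT;
		bp->b_bcount -= bp->b_resid;
	}
}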
/*ARGSUSED*/
static int
{
int part;
return (ENXIO);
return (ENXIO);
return (ENXIO);
return (EINVAL);
}
/*ARGSUSED*/
static int
{
int part;
return (ENXIO);
return (ENXIO);
return (ENXIO);
return (ENOSPC);
return (EINVAL);
}
/*ARGSUSED*/
static int
{
int part;
return (ENXIO);
return (ENXIO);
return (ENXIO);
return (ENOSPC);
return (EINVAL);
}
/*ARGSUSED*/
static int
{
int part;
return (ENXIO);
return (ENXIO);
return (ENXIO);
return (ENOSPC);
return (EINVAL);
}
static int
{
int err = 0;
int part;
return (ENXIO);
return (ENXIO);
return (ENXIO);
return (EINVAL);
}
} else {
}
return (err);
}
/*ARGSUSED*/
static int
int *rvalp)
{
int instance;
int part;
return (ENXIO);
return (ENXIO);
switch (cmd) {
case DKIOCGMEDIAINFO: {
struct dk_minfo media_info;
return (EFAULT);
} else {
return (0);
}
}
case DKIOCINFO: {
/* controller information */
else
/* unit information */
return (EFAULT);
else
return (0);
}
case DKIOCSTATE: {
mode) != 0)
return (EFAULT);
return (0);
}
/*
* is media removable?
*/
case DKIOCREMOVABLE: {
return (EFAULT);
return (0);
}
case DKIOCG_PHYGEOM:
case DKIOCG_VIRTGEOM:
case DKIOCGGEOM:
case DKIOCSGEOM:
case DKIOCGAPART:
case DKIOCGVTOC:
case DKIOCSVTOC:
case DKIOCPARTINFO:
case DKIOCGETEFI:
case DKIOCSETEFI:
case DKIOCPARTITION: {
int rc;
return (rc);
}
case DKIOCGETWCE:
return (EFAULT);
return (0);
case DKIOCSETWCE:
return (EFAULT);
return (0);
case DKIOCFLUSHWRITECACHE: {
int rc;
if (vdp->xdf_flush_supported) {
} else {
return (ENOTTY);
}
/* need to return 0 after calling callback */
rc = 0;
}
return (rc);
}
default:
return (ENOTTY);
}
}
/*
* xdf interrupt handler
*/
static uint_t
{
int bioerr = 0;
extern int do_polled_io;
return (DDI_INTR_UNCLAIMED);
}
/*
* complete all requests which have a response
*/
/*
* XXPV - close connection to the backend and restart
*/
if (status != BLKIF_RSP_OKAY) {
}
}
if (!do_polled_io)
return (DDI_INTR_CLAIMED);
}
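/*
 * A minimal sketch of the response-draining loop described in xdf_intr()
 * above: consume every response the backend has produced and treat anything
 * other than BLKIF_RSP_OKAY as an I/O error on the associated request.
 * Assumes the xvdi ring helpers used elsewhere in this file, with
 * xvdi_ring_get_response() returning NULL when the ring is empty; the
 * helper name is an assumption.
 */
static void
xdf_consume_responses_sketch(xendev_ring_t *xbr)
{
	blkif_response_t *bret;

	while ((bret = xvdi_ring_get_response(xbr)) != NULL) {
		if (bret->status != BLKIF_RSP_OKAY) {
			/* flag an error on the buf identified by bret->id */
			continue;
		}
		/* normal completion: biodone() the buf identified by bret->id */
	}
}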
int xdf_fbrewrites; /* how many times was our flush block rewritten */
/*
* Snarf new data if our flush block was re-written
*/
static void
{
int nblks;
return; /* write was a flush write */
}
if (mapin)
}
}
static void
{
if (bioerr)
return;
}
/*
 * return values of xdf_prepare_rreq()
 * used in xdf_iostart()
 */
#define XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
#define XF_COMP		1 /* no more I/O left in buf */
static void
{
int retval;
int rreqready = 0;
/*
* populate the ring request(s)
*
* loop until there is no buf to transfer or no free slot
* available in I/O ring
*/
for (;;) {
break;
/* active buf queue empty? */
break;
/* try to grab a vreq for this bp */
break;
break;
/* get next blkif_request in the ring */
break;
/* populate blkif_request with this buf */
rreqready++;
/* finish this bp, switch to next one */
}
}
/*
* Send the request(s) to the backend
*/
if (rreqready) {
if (xvdi_ring_push_request(xbr)) {
"sent request(s) to backend\n"));
}
}
}
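/*
 * A minimal sketch of the "send to backend" step at the end of
 * xdf_iostart() above: push the newly built requests onto the shared ring
 * and, when the push indicates the other end needs a kick, signal the
 * backend over its event channel.  Assumes xvdi_ring_push_request()
 * returns nonzero when notification is required; the helper name is an
 * assumption.
 */
static void
xdf_kick_backend_sketch(dev_info_t *dip, xendev_ring_t *xbr)
{
	if (xvdi_ring_push_request(xbr))
		xvdi_notify_oe(dip);
}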
/*
* populate a single blkif_request_t w/ a buf
*/
static int
{
int rval;
int seg = 0;
if (isread)
else {
switch (vreq->v_flush_diskcache) {
case FLUSH_DISKCACHE:
return (XF_COMP);
case WRITE_BARRIER:
break;
default:
else
break;
}
}
/*
 * loop until all segments are populated or there are no more DMA cookies in the buf
*/
for (;;) {
/*
* Each segment of a blkif request can transfer up to
* one 4K page of data.
*/
seg++;
if (--ndmacs) {
continue;
}
/* last DMA window */
else
rval = XF_PARTIAL;
break;
}
return (rval);
}
static int
{
if (xdfdebug & SUSRES_DBG)
xen_printf("xdf_drain_io: start\n");
goto out;
rval = 0;
}
if (!xvdi_ring_has_incomp_request(xbr))
goto out;
(void) HYPERVISOR_yield();
/*
* file-backed devices can be slow
*/
}
out:
if (xdfdebug & SUSRES_DBG)
return (rval);
}
/* ARGSUSED5 */
static int
{
int err = 0;
return (ENXIO);
return (EINVAL);
else
} else {
}
return (err);
}
/*
* synthetic geometry
*/
#define XDF_NSECTS 256
#define XDF_NHEADS 16
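/*
 * A minimal sketch of how the synthetic geometry above maps onto a device
 * capacity: the head and sector counts are fixed, so the cylinder count is
 * simply the block count divided by (heads * sectors).  Assumes a
 * cmlb_geom_t output structure as used by the common label code; the
 * helper name is an assumption.
 */
static void
xdf_synth_geom_sketch(diskaddr_t nblocks, cmlb_geom_t *geomp)
{
	geomp->g_nhead = XDF_NHEADS;
	geomp->g_nsect = XDF_NSECTS;
	geomp->g_secsize = XB_BSIZE;
	geomp->g_ncyl = nblocks / (XDF_NHEADS * XDF_NSECTS);
	geomp->g_acyl = 0;
	geomp->g_capacity = nblocks;
	geomp->g_intrlv = 0;
	geomp->g_rpm = 7200;
}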
static int
{
return (ENXIO);
return (0);
}
static int
{
return (ENXIO);
return (0);
}
/*
* No real HBA, no geometry available from it
*/
/*ARGSUSED*/
static int
{
return (EINVAL);
}
static int
{
return (ENXIO);
else
return (0);
}
/* ARGSUSED3 */
static int
{
switch (cmd) {
case TG_GETPHYGEOM:
case TG_GETVIRTGEOM:
case TG_GETCAPACITY:
case TG_GETBLOCKSIZE:
return (0);
case TG_GETATTR:
default:
return (ENOTTY);
}
}
/*
* Kick-off connect process
* Status should be XD_UNKNOWN or XD_CLOSED
* On success, status will be changed to XD_INIT
* On error, status won't be changed
*/
static int
{
char *xsnode;
int rv;
goto errout;
goto errout;
}
DDI_SUCCESS) {
goto errout1;
}
DDI_SUCCESS) {
goto errout2;
}
/*
* Write into xenstore the info needed by backend
*/
"failed to get xenstore node path",
goto fail_trans;
}
if (xenbus_transaction_start(&xbt)) {
goto fail_trans;
}
goto abort_trans;
}
xvdi_get_evtchn(dip))) {
goto abort_trans;
}
"failed to switch state to XenbusStateInitialised",
goto abort_trans;
}
/* kick-off connect process */
goto trans_retry;
goto fail_trans;
}
return (DDI_SUCCESS);
return (DDI_FAILURE);
}
/*
* Kick-off disconnect process
* Status won't be changed
*/
static int
{
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
int
{
/*
 * Get a DEV_BSIZE-aligned buffer
*/
return (DDI_FAILURE);
return (DDI_SUCCESS);
}
/*
* Finish other initialization after we've connected to backend
* Status should be XD_INIT before calling this routine
* On success, status should be changed to XD_READY
* On error, status should stay XD_INIT
*/
static int
{
int rv;
char *type;
char *barrier;
/*
* Determine if feature barrier is supported by backend
*/
} else {
vdp->xdf_feature_barrier = 0;
}
/* probe backend */
return (DDI_FAILURE);
}
/* fix disk type */
return (DDI_FAILURE);
}
/*
 * We've created all the minor nodes via cmlb_attach() using default
 * values in xdf_attach() to make it possible to block in xdf_open(),
 * in case anyone (say, the booting thread) ever tries to open it
 * before we are connected to the backend.  Now that we are almost
 * connected, refresh all those minor nodes with the latest info we've
 * got from the backend.
 *
 * Don't do this when xdf is already opened by someone (which could
 * happen during resume), because cmlb_attach() would invalidate the
 * label info and confuse those who have already opened the node.
*/
/* re-init cmlb w/ latest info we got from backend */
return (DDI_FAILURE);
}
}
/* mark vbd is ready for I/O */
/*
* If backend has feature-barrier, see if it supports disk
* cache flush op.
*/
vdp->xdf_flush_supported = 0;
if (vdp->xdf_feature_barrier) {
/*
* Pretend we already know flush is supported so probe
* will attempt the correct op.
*/
} else {
vdp->xdf_flush_supported = 0;
/*
* If the other end does not support the cache flush op
* then we must use a barrier-write to force disk
* cache flushing. Barrier writes require that a data
* block actually be written.
* Cache a block to barrier-write when we are
* asked to perform a flush.
* XXX - would it be better to just copy 1 block
* (512 bytes) from whatever write we did last
* and rewrite that block?
*/
return (DDI_FAILURE);
}
}
return (DDI_SUCCESS);
}
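/*
 * A minimal sketch of the flush-strategy choice described in
 * xdf_post_connect() above: prefer the explicit disk-cache flush op
 * (FLUSH_DISKCACHE) when the backend honors it, fall back to a barrier
 * write (WRITE_BARRIER) of the cached flush block when only
 * feature-barrier is available, and report that no reliable flush exists
 * otherwise.  The helper name and the 0 "no flush" return value are
 * assumptions for illustration.
 */
static int
xdf_flush_method_sketch(xdf_t *vdp)
{
	if (USE_FLUSH_DISKCACHE(vdp))
		return (FLUSH_DISKCACHE);
	if (USE_WRITE_BARRIER(vdp))
		return (WRITE_BARRIER);
	return (0);
}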
/*
* Finish other uninitialization after we've disconnected from backend
* when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
*/
static void
{
}
/*ARGSUSED*/
static void
{
int status;
return;
}
switch (new_state) {
case XenbusStateInitialising:
/*
* backend recovered from a previous failure,
* kick-off connect process again
*/
" failed to start reconnecting to backend",
}
break;
case XenbusStateConnected:
/* finish final init after connect */
(void) xdf_start_disconnect(vdp);
break;
case XenbusStateClosing:
break;
} else {
}
}
(void) xdf_start_disconnect(vdp);
break;
case XenbusStateClosed:
/* first check if BE closed unexpectedly */
}
#ifdef DOMU_BACKEND
#endif
} else {
}
}
/* notify anybody waiting for oe state change */
/* interface is closed successfully, remove all minor nodes */
}
}
/* check if a partition is open; partition == -1 means check all partitions on the disk */
static boolean_t
{
int i;
if (partition == -1)
else
for (i = 0; i < OTYPCNT; i++) {
}
return (rval);
}
/*
 * xdf_check_state_transition() checks whether a XenbusState change is a
 * valid transition.
 * The new state is written by the backend domain, or by running
 * xenstore-write to change it manually in dom0.
*/
static int
{
int status;
int stcheck;
#define STOK 0 /* need further process */
switch (status) {
case XD_UNKNOWN:
if ((oestate == XenbusStateUnknown) ||
(oestate == XenbusStateConnected))
else if ((oestate == XenbusStateInitialising) ||
(oestate == XenbusStateInitWait) ||
break;
case XD_INIT:
if (oestate == XenbusStateUnknown)
else if ((oestate == XenbusStateInitialising) ||
(oestate == XenbusStateInitWait) ||
break;
case XD_READY:
if ((oestate == XenbusStateUnknown) ||
(oestate == XenbusStateInitialising) ||
(oestate == XenbusStateInitWait) ||
else if (oestate == XenbusStateConnected)
break;
case XD_CLOSING:
if ((oestate == XenbusStateUnknown) ||
(oestate == XenbusStateInitialising) ||
(oestate == XenbusStateInitWait) ||
(oestate == XenbusStateInitialised) ||
(oestate == XenbusStateConnected))
else if (oestate == XenbusStateClosing)
break;
case XD_CLOSED:
if ((oestate == XenbusStateUnknown) ||
(oestate == XenbusStateConnected))
else if ((oestate == XenbusStateInitWait) ||
(oestate == XenbusStateInitialised) ||
(oestate == XenbusStateClosing) ||
(oestate == XenbusStateClosed))
break;
case XD_SUSPEND:
default:
}
return (DDI_SUCCESS);
"state change to %d!, when status is %d",
return (DDI_FAILURE);
}
static int
{
break;
break;
}
return (vdp->xdf_status);
}
/*
*
 * Note: we register only one callback function with the grant table
 * subsystem, since we have only one 'struct gnttab_free_callback' in xdf_t.
*/
static int
{
return (DDI_DMA_CALLBACK_DONE);
}
static uint_t
{
return (DDI_INTR_CLAIMED);
}
static void
xdf_timeout_handler(void *arg)
{
vdp->xdf_timeout_id = 0;
/* new timeout thread could be re-scheduled */
}
/*
* Alloc a vreq for this bp
* bp->av_back contains the pointer to the vreq upon return
*/
static v_req_t *
{
if (vdp->xdf_timeout_id == 0)
/* restart I/O after one second */
return (NULL);
}
/* init of other fields in vreq is up to the caller */
return (vreq);
}
static void
{
goto done;
case VREQ_DMAWIN_DONE:
case VREQ_GS_ALLOCED:
case VREQ_DMABUF_BOUND:
/*FALLTHRU*/
case VREQ_DMAMEM_ALLOCED:
if (!ALIGNED_XFER(bp)) {
}
/*FALLTHRU*/
case VREQ_MEMDMAHDL_ALLOCED:
if (!ALIGNED_XFER(bp))
/*FALLTHRU*/
case VREQ_DMAHDL_ALLOCED:
break;
default:
break;
}
done:
}
/*
 * Initialize the DMA and grant table resources for the buf
*/
static int
{
int rc;
case VREQ_INIT:
if (IS_FLUSH_DISKCACHE(bp)) {
"xdf@%s: get ge_slotfailed\n",
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
/* See if we wrote new data to our flush block */
/*FALLTHRU*/
case VREQ_INIT_DONE:
/*
* alloc DMA handle
*/
if (rc != DDI_SUCCESS) {
return (DDI_FAILURE);
}
/*FALLTHRU*/
case VREQ_DMAHDL_ALLOCED:
/*
* alloc dma handle for 512-byte aligned buf
*/
if (!ALIGNED_XFER(bp)) {
/*
* XXPV: we need to temporarily enlarge the seg
 * boundary and s/g length to work around CR6381968
*/
if (rc != DDI_SUCCESS) {
"handle alloc failed\n",
return (DDI_FAILURE);
}
}
/*FALLTHRU*/
case VREQ_MEMDMAHDL_ALLOCED:
/*
* alloc 512-byte aligned buf
*/
if (!ALIGNED_XFER(bp)) {
if (rc != DDI_SUCCESS) {
"xdf@%s: DMA mem allocation failed\n",
return (DDI_FAILURE);
}
}
/*FALLTHRU*/
case VREQ_DMAMEM_ALLOCED:
/*
* dma bind
*/
if (ALIGNED_XFER(bp)) {
} else {
}
/* get num of dma windows */
if (rc == DDI_DMA_PARTIAL_MAP) {
} else {
ndws = 1;
}
} else {
return (DDI_FAILURE);
}
/*FALLTHRU*/
case VREQ_DMABUF_BOUND:
/*
* get ge_slot, callback is set upon failure from gs_get(),
* if not set previously
*/
return (DDI_FAILURE);
}
break;
case VREQ_GS_ALLOCED:
/* nothing needs to be done */
break;
case VREQ_DMAWIN_DONE:
/*
* move to the next dma window
*/
/* get a ge_slot for this DMA window */
return (DDI_FAILURE);
}
break;
default:
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
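/*
 * Summary of the vreq state progression driven by vreq_setup() above
 * (inferred from the case labels and the FALLTHRU chain, so treat the
 * ordering as an assumption):
 *	VREQ_INIT -> VREQ_INIT_DONE -> VREQ_DMAHDL_ALLOCED ->
 *	VREQ_MEMDMAHDL_ALLOCED -> VREQ_DMAMEM_ALLOCED -> VREQ_DMABUF_BOUND ->
 *	VREQ_GS_ALLOCED -> VREQ_DMAWIN_DONE
 * Unaligned transfers take the extra "memdmahdl"/"dmamem" steps to stage
 * I/O through a 512-byte aligned bounce buffer; vreq_free() unwinds the
 * same states in reverse.
 */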
static ge_slot_t *
{
/* first, try to alloc the GTEs needed by this slot */
(void (*)(void *))xdf_dmacallback,
(void *)vdp,
}
return (NULL);
}
if (vdp->xdf_timeout_id == 0)
/* restart I/O after one second */
return (NULL);
}
/* init gs_slot */
return (gs);
}
static void
{
int i;
/* release all grant table entry resources used in this slot */
}
static grant_ref_t
{
return (gr);
}
static void
{
}
/* free up all grant table entries */
/*
 * move bp back to the active list in order;
* vreq_busy is updated in vreq_free()
*/
} else {
/* move to the head of list */
}
}
}
static void
{
}