vds.c revision 445b4c2ed2d52ef648ae6b36e4f5e14ff3d234af
2N/A * The contents of this file are subject to the terms of the 2N/A * Common Development and Distribution License (the "License"). 2N/A * You may not use this file except in compliance with the License. 2N/A * See the License for the specific language governing permissions 2N/A * and limitations under the License. 2N/A * When distributing Covered Code, include this CDDL HEADER in each 2N/A * If applicable, add the following below this CDDL HEADER, with the 2N/A * fields enclosed by brackets "[]" replaced with your own identifying 2N/A * information: Portions Copyright [yyyy] [name of copyright owner] 2N/A * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 2N/A * Use is subject to license terms. 2N/A#
pragma ident "%Z%%M% %I% %E% SMI" 2N/A * Virtual disk server 2N/A/* Virtual disk server initialization flags */ 2N/A/* Virtual disk server tunable parameters */ 2N/A/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */ 2N/A/* Virtual disk initialization flags */ 2N/A * unfortunately, this convention does not appear to be codified. 2N/A/* Return a cpp token as a string */ 2N/A * Print a message prefixed with the current function name to the message log 2N/A * (and optionally to the console for verbose boots); these macros use cpp's 2N/A * concatenation of string literals and C99 variable-length-argument-list 2N/A/* Return a pointer to the "i"th vdisk dring element */ 2N/A/* Return the virtual disk client's type as a string (for use in messages) */ 2N/A "unsupported client")))
2N/A * Specification of an MD node passed to the MDEG to filter any 2N/A * 'vport' nodes that do not belong to the specified node. This 2N/A * template is copied for each vds instance and filled in with 2N/A * the appropriate 'cfg-handle' value before being passed to the MDEG. 2N/A * Matching criteria passed to the MDEG to register interest 2N/A * in changes to 'virtual-device-port' nodes identified by their 2N/A/* Debugging macros */ 2N/A PRN(
"dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \
2N/A default:
str =
"unknown";
break;
default:
sstr =
"unknown";
break;
default:
estr =
"unknown";
break;
PR1(
"(%x/%x/%x) message : (%s/%s/%s)",
* Soft state structure for a vds instance * Types of descriptor-processing tasks * Structure describing the task for processing a descriptor struct vd *
vd;
/* vd instance task is for */ int index;
/* dring elem index for task */ struct buf buf;
/* buf(9s) for I/O request */ * Soft state structure for a virtual disk instance struct vtoc vtoc;
/* synthetic for slice type */ int cmd;
/* corresponding ioctl cmd */ const char *
cmd_name;
/* ioctl cmd name */ void *
arg;
/* ioctl cmd argument */ /* convert input vd_buf to output ioctl_arg */ /* convert input ioctl_arg to output vd_buf */ * Supported protocol version pairs, from highest (newest) to lowest (oldest) * Each supported major version should appear only once, paired with (and only * with) its highest supported minor version number (as the protocol requires * supporting all lower minor version numbers as well) return (
EINVAL);
/* no service for trivial requests */ PR1(
"%s %lu bytes at block %lu",
/* Map memory exported by client */ PR0(
"ldc_mem_map() returned err %d ",
status);
PR0(
"ldc_mem_acquire() returned err %d ",
status);
/* Start the block I/O */ return (
EINPROGRESS);
/* will complete on completionq */ /* Clean up after error */ PR0(
"ldc_mem_release() returned err %d ",
rv);
PR0(
"ldc_mem_unmap() returned err %d ",
status);
PR0(
"ldc_write() returned errno %d",
status);
PR0(
"ldc_write() performed only partial write");
* Reset the state of the connection with a client, if needed; reset the LDC * transport as well, if needed. This function should only be called from the * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur. * Let any asynchronous I/O complete before possibly pulling the rug * out from under it; defer checking vd->reset_ldc, as one of the * asynchronous tasks might set it PR0(
"ldc_mem_dring_unmap() returned errno %d",
status);
/* Free the staging buffer for msgs */ /* Free the inband message buffer */ PR0(
"taking down LDC channel");
/* Allocate the staging buffer */ PR0(
"vd_mark_in_reset: marking vd in reset\n");
PR0(
"cannot schedule task to recv msg\n");
/* Acquire the element */ PR0(
"ldc_mem_dring_acquire() returned errno %d",
/* Set the element's status and mark it done */ /* Perhaps client timed out waiting for I/O... */ PR0(
"element %u no longer \"accepted\"",
idx);
/* Release the element */ PR0(
"ldc_mem_dring_release() returned errno %d",
/* Wait for the I/O to complete */ PR0(
"ldc_mem_release() returned errno %d copying to " /* Unmap the memory, even if in reset */ PR0(
"ldc_mem_unmap() returned errno %d copying to client",
/* Update the dring element for a dring client */ * If a transport error occurred, arrange to "nack" the message when * the final task in the descriptor element range completes * Only the final task for a range of elements will respond to and * Send the "ack" or "nack" back to the client; if sending the message * via LDC fails, arrange to reset both the connection state and LDC PR0(
"initiating full reset");
PR0(
"ldi_ioctl(DKIOCGVTOC) returned error %d",
status);
PR0(
"vds_efi_alloc_and_read returned error %d",
status);
/* Get data from client and convert, if necessary */ PR1(
"Getting \"arg\" data from client");
PR0(
"ldc_mem_copy() returned errno %d " "copying from client",
status);
/* Convert client's data, if necessary */ else /* convert client vdisk operation data to ioctl data */ * Handle single-slice block devices internally; otherwise, have the * real driver perform the ioctl() PR0(
"%s set rval = %d, which is not being returned to client",
/* Convert data and send to client, if necessary */ PR1(
"Sending \"arg\" data to client");
/* Convert ioctl data to vdisk operation data, if necessary */ PR0(
"ldc_mem_copy() returned errno %d " /* Command (no-copy) operations */ /* "Get" (copy-out) operations */ /* "Set" (copy-in) operations */ * Determine ioctl corresponding to caller's "operation" and * validate caller's "nbytes" /* LDC memory operations require 8-byte multiples */ PR0(
"%s: Expected at least nbytes = %lu, " PR0(
"%s: Expected nbytes = %lu, got %lu",
/* update disk information */ PR0(
"vd_read_vtoc return error %d",
rc);
/* the most common failure is that no devid is available */ * Save the buffer size here for use in deallocation. * The actual number of bytes copied is returned in * the 'nbytes' field of the request structure. /* LDC memory operations require 8-byte multiples */ PR0(
"ldc_mem_copy() returned errno %d copying to client",
* Define the supported operations once the functions for performing them have * Process a task specifying a client I/O request /* Find the requested operation */ /* Handle client using absolute disk offsets */ PR0(
"Invalid \"slice\" %u (max %u) for virtual disk",
/* Start the operation */ PR0(
"operation : %s returned status %d",
return (0);
/* but request completed */ PR0(
"Unexpected return of EINPROGRESS " "with no I/O completion handler");
return (0);
/* but request completed */ /* Queue a task to complete the operation */ /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */ PR1(
"Operation in progress");
return (
EINPROGRESS);
/* completion handler will finish request */ * Return true if the "type", "subtype", and "env" fields of the "tag" first * argument match the corresponding remaining arguments; otherwise, return false * Check whether the major/minor version specified in "ver_msg" is supported * If the major versions match, adjust the minor version, if * necessary, down to the highest value supported by this * server and return true so this message will get "ack"ed; * the client should also support all minor versions lower PR0(
"Adjusting minor version from %u to %u",
* If the message contains a higher major version number, set * the message's major/minor versions to the current values * and return false, so this message will get "nack"ed with * these values, and the client will potentially try again * with the same or a lower version * Otherwise, the message's major version is less than the * current major version, so continue the loop to the next * (lower) supported version * No common version was found; "ground" the version pair in the * message to terminate negotiation * Process a version message from a client. vds expects to receive version * messages from clients seeking service, but never issues version messages * itself; therefore, vds can ACK or NACK client version messages, but does * not expect to receive version-message ACKs or NACKs (and will treat such return (
ENOMSG);
/* not a version message */ PR0(
"Expected %lu-byte version message; " PR0(
"Expected device class %u (disk); received %u",
* We're talking to the expected kind of client; set our device class * for "ack/nack" back to the client * Check whether the (valid) version message specifies a version * supported by this server. If the version is not supported, return * EBADMSG so the message will get "nack"ed; vds_supported_version() * will have updated the message with a supported version for the * A version has been agreed upon; use the client's SID for * communication on this channel now * When multiple versions are supported, this function should store * the negotiated major and minor version values in the "vd" data * structure to govern further communication; in particular, note that * the client might have specified a lower minor version for the * agreed major version than specifed in the vds_version[] array. The * following assertions should help remind future maintainers to make * the appropriate changes to support multiple versions. PR0(
"Using major version %u, minor version %u",
PR0(
"Message is not an attribute message");
PR0(
"Expected %lu-byte attribute message; " PR0(
"Received maximum transfer size of 0 from client");
PR0(
"Client requested unsupported transfer mode");
/* Success: valid message and transfer mode */ * The vd_dring_inband_msg_t contains one cookie; need room * for up to n-1 more cookies, where "n" is the number of full * pages plus possibly one partial page required to cover * "max_xfer_sz". Add room for one more cookie if * "max_xfer_sz" isn't an integral multiple of the page size. * Must first get the maximum transfer size in bytes. * Set the maximum expected message length to * accommodate in-band-descriptor messages with all * Initialize the data structure for processing in-band I/O /* Return the device's block size and max transfer size to the client */ PR0(
"Message is not a register-dring message");
PR0(
"Expected at least %lu-byte register-dring message; " PR0(
"Expected %lu-byte register-dring message; " PR0(
"A dring was previously registered; only support one");
PR0(
"reg_msg->num_descriptors = %u; must be <= %u (%s)",
* In addition to fixing the assertion in the success case * below, supporting drings which require more than one * "cookie" requires increasing the value of vd->max_msglen * somewhere in the code path prior to receiving the message * which results in calling this function. Note that without * making this change, the larger message size required to * accommodate multiple cookies cannot be successfully * received, so this function will not even get called. * Gracefully accommodating more dring cookies might * reasonably demand exchanging an additional attribute or * making a minor protocol adjustment PR0(
"ldc_mem_dring_map() returned errno %d",
status);
* To remove the need for this assertion, must call * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a * successful call to ldc_mem_dring_map() PR0(
"ldc_mem_dring_info() returned errno %d",
status);
PR0(
"ldc_mem_dring_unmap() returned errno %d",
status);
PR0(
"Descriptor ring virtual address is NULL");
/* Initialize for valid message and mapped dring */ PR1(
"descriptor size = %u, dring length = %u",
* Allocate and initialize a "shadow" array of data structures for * tasks to process I/O requests in dring elements PR0(
"ldc_mem_alloc_handle() returned err %d ",
status);
PR0(
"Message is not an unregister-dring message");
PR0(
"Expected %lu-byte unregister-dring message; " PR0(
"Expected dring ident %lu; received %lu",
PR0(
"Message is not an RDX message");
PR0(
"Expected %lu-byte RDX message; received %lu bytes",
PR0(
"Valid RDX message");
PR0(
"Received seq_num %lu; expected %lu",
PR0(
"initiating soft reset");
* Return the expected size of an inband-descriptor message with all the * cookies it claims to include return ((
sizeof (*
msg)) +
* Process an in-band descriptor message: used with clients like OBP, with * which vds exchanges descriptors within VIO message payloads, rather than * operating on them within a descriptor ring PR1(
"Message is not an in-band-descriptor message");
PR0(
"Expected at least %lu-byte descriptor message; " PR0(
"Expected %lu-byte descriptor message; " * Valid message: Set up the in-band descriptor task and process the * request. Arrange to acknowledge the client's message, unless an * error processing the descriptor task results in setting PR1(
"Valid in-band-descriptor message");
* The task request is now the payload of the message * that was just copied into the body of the task. /* Accept the updated dring element */ PR0(
"ldc_mem_dring_acquire() returned errno %d",
status);
PR0(
"descriptor %u not ready",
idx);
PR0(
"ldc_mem_dring_release() returned errno %d",
status);
/* Initialize a task and process the accepted element */ PR1(
"Processing dring element %u",
idx);
/* duplicate msg buf for cookies etc. */ * Arrange to acknowledge the client's message, unless an error * processing one of the dring elements results in setting * Process the dring elements in the range * If some, but not all, operations of a multi-element range are in * progress, wait for other operations to complete before returning * (which will result in "ack" or "nack" of the message). Note that * all outstanding operations will need to complete, not just the ones * corresponding to the current range of dring elements; howevever, as * this situation is an error case, performance is less critical. PR1(
"Message is not a dring-data message");
PR0(
"Expected %lu-byte dring message; received %lu bytes",
PR0(
"Expected dring ident %lu; received ident %lu",
PR0(
"\"start_idx\" = %u; must be less than %u",
PR0(
"\"end_idx\" = %u; must be >= 0 and less than %u",
/* Valid message; process range of updated dring elements */ PR1(
"Processing descriptor range, start = %u, end = %u",
PR1(
"ldc_read() attempt %d", (
retry +
1));
PR1(
"ldc_read() returned 0 and no message read");
* Validate session ID up front, since it applies to all messages PR0(
"Expected SID %u, received %u",
vd->
sid,
* Process the received message based on connection state /* Version negotiated, move to that state */ /* Attributes exchanged, move to that state */ /* Ready to receive in-band descriptors */ /* One dring negotiated, move to that state */ ASSERT(
"Unsupported transfer mode");
PR0(
"Unsupported transfer mode");
/* Ready to receive data */ * If another register-dring message is received, stay in * dring state in case the client sends RDX; although the * protocol allows multiple drings, this server does not * support using more than one * Acknowledge an unregister-dring message, but reset the * connection anyway: Although the protocol allows * unregistering drings, this server cannot serve a vdisk * Typically expect dring-data messages, so handle * Acknowledge an unregister-dring message, but reset * the connection anyway: Although the protocol * allows unregistering drings, this server cannot * serve a vdisk without its only dring ASSERT(
"Unsupported transfer mode");
PR0(
"Unsupported transfer mode");
ASSERT(
"Invalid client connection state");
PR0(
"Invalid client connection state");
* Check that the message is at least big enough for a "tag", so that * message processing can proceed based on tag-specified message type PR0(
"Received short (%lu-byte) message",
msglen);
/* Can't "nack" short message, so drop the big hammer */ PR0(
"initiating full reset");
/* "ack" valid, successfully-processed messages */ /* The completion handler will "ack" or "nack" the message */ PR0(
"Received unexpected message");
/* "nack" invalid messages */ /* "nack" failed messages */ /* An LDC error probably occurred, so try resetting it */ /* Send the "ack" or "nack" to the client */ /* Arrange to reset the connection for nack'ed or failed messages */ PR0(
"initiating %s reset",
PR2(
"New task to receive incoming message(s)");
* Receive and process a message * check if channel is UP - else break out of loop PR0(
"channel not up (status=%d), exiting recv loop\n",
/* check if max_msglen changed */ PR0(
"max_msglen changed 0x%lx to 0x%lx bytes\n",
PR0(
"initiating soft reset (ECONNRESET)\n");
/* Probably an LDC failure; arrange to reset it */ PR0(
"initiating full reset (status=0x%x)",
status);
PR0(
"LDC_EVT_DOWN: LDC channel went down");
PR0(
"cannot schedule task to recv msg\n");
PR0(
"LDC_EVT_RESET: LDC channel was reset");
PR0(
"scheduling full reset");
PR0(
"cannot schedule task to recv msg\n");
PR0(
"channel already reset, ignoring...\n");
PR0(
"doing ldc up...\n");
PR0(
"EVT_UP: LDC is up\nResetting client connection state");
PR0(
"initiating soft reset");
PR0(
"cannot schedule task to recv msg\n");
PR1(
"New data available");
/* Queue a task to receive the new data */ PR0(
"cannot schedule task to recv msg\n");
/* the real work happens below */ PR0(
"No action required for DDI_SUSPEND");
PR0(
"Unrecognized \"cmd\"");
/* Do no detach when serving any vdisks */ PR0(
"Not detaching because serving vdisks");
* At this point, vdisk_size is set to the size of partition 2 but * this does not represent the size of the disk because partition 2 * may not cover the entire disk and its size does not include reserved * blocks. So we update vdisk_size to be the size of the entire disk. PR0(
"ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d",
/* Set full-disk parameters */ /* Move dev number and LDI handle to entire-disk-slice array elements */ /* Initialize device numbers for remaining slices and open them */ * Skip the entire-disk slice, as it's already open and its * Construct the device number for the current slice * Open all slices of the disk to serve them to the client. * Slices are opened exclusively to prevent other threads or * processes in the service domain from performing I/O to * slices being accessed by a client. Failure to open a slice * results in vds not serving this disk, as the client could * attempt (and should be able) to access any slice immediately. * Any slices successfully opened before a failure will get * closed by vds_destroy_vd() as a result of the error returned * We need to do the open with FNDELAY so that opening an empty PR0(
"Opening device major %u, minor %u = slice %u",
PR0(
"ldi_open_by_dev() returned errno %d " /* vds_destroy_vd() will close any open slices */ * We need to open with FNDELAY so that opening an empty partition * nslices must be updated now so that vds_destroy_vd() will close * the slice we have just opened in case of an error. /* Get device number and size of backing device */ PRN(
"ldi_get_dev() returned errno %d for %s",
/* Verify backing device supports dk_cinfo, dk_geom, and vtoc */ PRN(
"ldi_ioctl(DKIOCINFO) returned errno %d for %s",
PRN(
"slice %u >= maximum slice %u for %s",
PRN(
"vd_read_vtoc returned errno %d for %s",
PRN(
"ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
/* Store the device's max transfer size for return to the client */ /* Determine if backing device is a pseudo device */ return (0);
/* ...and we're done */ /* If slice is entire-disk slice, initialize for full disk */ /* Otherwise, we have a non-entire slice of a device */ /* Initialize dk_geom structure for single-slice device */ /* Initialize vtoc structure for single-slice device */ PRN(
"No memory for virtual disk");
*
vdp =
vd;
/* assign here so vds_destroy_vd() can cleanup later */ /* Open vdisk and initialize parameters */ PR0(
"vdisk_type = %s, pseudo = %s, nslices = %u",
PRN(
"Could not get iblock cookie.");
/* Create start and completion task queues for the vdisk */ PRN(
"Could not create task queue");
PRN(
"Could not create task queue");
vd->
enabled =
1;
/* before callback can dispatch to startq */ PR0(
"ldc_reg_callback() returned errno %d",
status);
/* Allocate the inband task memory handle */ PR0(
"ldc_mem_alloc_handle() returned err %d ",
status);
/* Add the successfully-initialized vdisk to the server's table */ PRN(
"Error adding vdisk ID %lu to table",
id);
/* Allocate the staging buffer */ /* store initial state */ /* Free all dring_task memory handles */ * Destroy the state associated with a virtual disk PR0(
"Destroying vdisk state");
/* Disable queuing requests for the vdisk */ /* Drain and destroy start queue (*before* destroying completionq) */ /* Drain and destroy completion queue (*before* shutting down LDC) */ /* Free the inband task memory handle */ /* close LDC channel - retry on EAGAIN */ PR0(
"Timed out closing channel");
* Closing the LDC channel has failed. Ideally we should * fail here but there is no Zeus level infrastructure * to handle this. The MD has already been changed and * we have to do the close. So we try to do as much /* Free the staging buffer for msgs */ /* Free the inband message buffer */ /* Close any open backing-device slices */ /* Finally, free the vdisk structure itself */ /* Look for channel endpoint child(ren) of the vdisk MD node */ /* Get the "id" value for the first channel endpoint node */ PRN(
"No \"%s\" property found for \"%s\" of vdisk",
PRN(
"Using ID of first of multiple channels for this vdisk");
PRN(
"Invalid node count in Machine Description subtree");
PR0(
"Adding vdisk ID %lu",
id);
PRN(
"Error getting LDC ID for vdisk %lu",
id);
PRN(
"Failed to add vdisk ID %lu",
id);
PRN(
"Unable to get \"%s\" property from vdisk's MD node",
PR0(
"Removing vdisk ID %lu",
id);
PRN(
"No vdisk entry found for vdisk ID %lu",
id);
/* Validate that vdisk ID has not changed */ PRN(
"Error getting previous vdisk \"%s\" property",
PRN(
"Not changing vdisk: ID changed from %lu to %lu",
/* Validate that LDC ID has not changed */ PRN(
"Error getting LDC ID for vdisk %lu",
prev_id);
PRN(
"Error getting LDC ID for vdisk %lu",
curr_id);
PRN(
"Not changing vdisk: " /* Determine whether device path has changed */ PRN(
"Error getting previous vdisk \"%s\"",
return;
/* no relevant (supported) change */ /* Remove old state, which will close vdisk and reset */ /* Re-initialize vdisk with new state */ * The "cfg-handle" property of a vds node in an MD contains the MD's * notion of "instance", or unique identifier, for that node; OBP * stores the value of the "cfg-handle" MD property as the value of * the "reg" property on the node in the device tree it builds from * the MD and passes to Solaris. Thus, we look up the devinfo node's * "reg" property value to uniquely identify this device instance when * registering with the MD event-generation framework. If the "reg" * property cannot be found, the device tree state is presumably so * broken that there is no point in continuing. /* Get the MD instance for later MDEG registration */ PRN(
"Could not allocate state for instance %u",
instance);
PRN(
"ldi_ident_from_dip() returned errno %d",
status);
/* Register for MD updates */ /* initialize the complete prop spec structure */ PRN(
"Unable to register for MD updates");
/* Prevent auto-detaching so driver is available whenever MD changes */ PRN(
"failed to set \"%s\" property for instance %u",
PR0(
"No action required for DDI_RESUME");
"virtual disk server v%I%",
/* Fill in the bit-mask of server-supported operations */