/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/*
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright 2016 Tegile Systems, Inc. All rights reserved.
* Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
*/
/*
* blkdev driver for NVMe compliant storage devices
*
* This driver was written to conform to version 1.0e of the NVMe specification.
* It may work with newer versions, but that is completely untested and disabled
* by default.
*
* The driver has only been tested on x86 systems and will not work on big-
* endian systems without changes to the code accessing registers and data
* structures used by the hardware.
*
*
* Interrupt Usage:
*
* The driver will use a FIXED interrupt while configuring the device as the
* specification requires. Later in the attach process it will switch to MSI-X
* or MSI if supported. The driver wants to have one interrupt vector per CPU,
 * but it will work correctly if fewer are available. Interrupts can be shared
 * by queues; the interrupt handler will iterate through the I/O queue array in
 * steps of n_intr_cnt. Usually only the admin queue will share an interrupt
* with one I/O queue. The interrupt handler will retrieve completed commands
* from all queues sharing an interrupt vector and will post them to a taskq
* for completion processing.
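 *
 * For example, with 4 interrupt vectors and 4 I/O queues, vector 0 serves
 * both the admin queue (index 0) and I/O queue 4, while vectors 1-3 each
 * serve one of I/O queues 1-3 (illustrative numbers, not a requirement).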
*
*
* Command Processing:
*
* NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up
* to 65536 I/O commands. The driver will configure one I/O queue pair per
* available interrupt vector, with the queue length usually much smaller than
* the maximum of 65536. If the hardware doesn't provide enough queues, fewer
* interrupt vectors will be used.
*
* Additionally the hardware provides a single special admin queue pair that can
* hold up to 4096 admin commands.
*
* From the hardware perspective both queues of a queue pair are independent,
* but they share some driver state: the command array (holding pointers to
* commands currently being processed by the hardware) and the active command
* counter. Access to the submission side of a queue pair and the shared state
* is protected by nq_mutex. The completion side of a queue pair does not need
* that protection apart from its access to the shared state; it is called only
* in the interrupt handler which does not run concurrently for the same
* interrupt vector.
*
* When a command is submitted to a queue pair the active command counter is
* incremented and a pointer to the command is stored in the command array. The
* array index is used as command identifier (CID) in the submission queue
* entry. Some commands may take a very long time to complete, and if the queue
* wraps around in that time a submission may find the next array slot to still
* be used by a long-running command. In this case the array is sequentially
* searched for the next free slot. The length of the command array is the same
* as the configured queue length.
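 *
 * For example, with a queue length of 4, if the array wraps while slot 1
 * is still held by a long-running command, a new submission will skip to
 * slot 2 and use 2 as its CID (illustrative numbers only).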
*
*
* Namespace Support:
*
 * NVMe devices can have multiple namespaces, each being an independent data
* store. The driver supports multiple namespaces and creates a blkdev interface
* for each namespace found. Namespaces can have various attributes to support
 * thin provisioning and protection information. This driver supports neither
 * feature and ignores namespaces that have these attributes.
*
*
* Blkdev Interface:
*
* This driver uses blkdev to do all the heavy lifting involved with presenting
* a disk device to the system. As a result, the processing of I/O requests is
* relatively simple as blkdev takes care of partitioning, boundary checks, DMA
* setup, and splitting of transfers into manageable chunks.
*
* I/O requests coming in from blkdev are turned into NVM commands and posted to
* an I/O queue. The queue is selected by taking the CPU id modulo the number of
* queues. There is currently no timeout handling of I/O commands.
*
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
* format back to blkdev as physical block size to support partition and block
* alignment. The devid is composed using the device vendor ID, model number,
* serial number, and the namespace ID.
*
*
* Error Handling:
*
* Error handling is currently limited to detecting fatal hardware errors,
* either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off,
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
*
* The hardware has a limit for outstanding asynchronous event requests. Before
* this limit is known the driver assumes it is at least 1 and posts a single
* asynchronous request. Later when the limit is known more asynchronous event
* requests are posted to allow quicker reception of error information. When an
* asynchronous event is posted by the hardware the driver will parse the error
* status fields and log information or fault the device, depending on the
* severity of the asynchronous event. The asynchronous event request is then
* reused and posted to the admin queue again.
*
* On command completion the command status is checked for errors. In case of
* errors indicating a driver bug the driver panics. Almost all other error
* status values just cause EIO to be returned.
*
* Command timeouts are currently detected for all admin commands except
* asynchronous event requests. If a command times out and the hardware appears
* to be healthy the driver attempts to abort the command. If this fails the
* driver assumes the device to be dead, fences it off, and calls FMA to retire
* it. In general admin commands are issued at attach time only. No timeout
* handling of normal I/O commands is presently done.
*
* In some cases it may be possible that the ABORT command times out, too. In
* that case the device is also declared dead and fenced off.
*
*
* Quiesce / Fast Reboot:
*
* The driver currently does not support fast reboot. A quiesce(9E) entry point
* is still provided which is used to send a shutdown notification to the
* device.
*
*
* Driver Configuration:
*
 * The following driver properties can be changed to control some aspects of the
 * driver's operation (see the sample nvme.conf fragment below):
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not treat any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-queue-len: the maximum length of the I/O queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
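 *
 * A hypothetical /kernel/drv/nvme.conf fragment setting some of these
 * properties (values for illustration only) might look like this:
 *
 *	strict-version=0;
 *	admin-queue-len=256;
 *	io-queue-len=1024;
 *	async-event-limit=4;
 *	volatile-write-cache-enable=1;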
*
*
* TODO:
* - figure out sane default for I/O queue depth reported to blkdev
* - polled I/O support to support kernel core dumping
* - FMA handling of media errors
* - support for devices supporting very large I/O requests using chained PRPs
* - support for querying log pages from user space
* - support for configuring hardware parameters like interrupt coalescing
* - support for media formatting and hard partitioning into namespaces
* - support for big-endian systems
* - support for fast reboot
*/
#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/blkdev.h>
#include <sys/archsystm.h>

#include "nvme_reg.h"
#include "nvme_var.h"
/* NVMe spec version supported */
static const int nvme_version_major = 1;
static const int nvme_version_minor = 0;
/* tunable for admin command timeout in seconds, default is 1s */
static volatile int nvme_admin_cmd_timeout = 1;
static int nvme_quiesce(dev_info_t *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static int nvme_admin_cmd(nvme_cmd_t *, int);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);
static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);
static void nvme_abort_cmd(nvme_cmd_t *);
static int nvme_async_event(nvme_t *);
static boolean_t nvme_set_features(nvme_t *, uint8_t, uint32_t,
    uint32_t *);
static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);
static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;
/*
* DMA attributes for queue DMA memory
*
* Queue DMA memory must be page aligned. The maximum length of a queue is
* 65536 entries, and an entry can be 64 bytes long.
*/
static ddi_dma_attr_t nvme_queue_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= 1,
	.dma_attr_granular	= 1,
	.dma_attr_flags		= 0,
};
/*
* DMA attributes for transfers using Physical Region Page (PRP) entries
*
* A PRP entry describes one page of DMA memory using the page size specified
* in the controller configuration's memory page size register (CC.MPS). It uses
* a 64bit base address aligned to this page size. There is no limitation on
* chaining PRPs together for arbitrarily large DMA transfers.
*/
static ddi_dma_attr_t nvme_prp_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
.dma_attr_addr_hi = 0xffffffffffffffffULL,
.dma_attr_count_max = 0xfff,
.dma_attr_align = 0x1000,
.dma_attr_burstsizes = 0x7ff,
.dma_attr_minxfer = 0x1000,
.dma_attr_maxxfer = 0x1000,
.dma_attr_seg = 0xfff,
.dma_attr_sgllen = -1,
.dma_attr_granular = 1,
.dma_attr_flags = 0,
};
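
/*
 * Illustrative sketch, not used by the driver: how many PRP entries a
 * transfer described by (address, length) needs under the 4k page size
 * assumed by the attributes above. The first PRP entry may point into the
 * middle of a page; all others must be page-aligned. The helper name is
 * hypothetical.
 */
static inline uint32_t
nvme_prp_count_sketch(uint64_t addr, uint32_t len)
{
	const uint32_t pgsz = 0x1000;
	uint32_t first = pgsz - (addr & (pgsz - 1));	/* bytes in 1st page */

	if (len <= first)
		return (1);

	/* first partial page, then one entry per remaining (partial) page */
	return (1 + (len - first + pgsz - 1) / pgsz);
}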
/*
 * DMA attributes for transfers using Scatter/Gather Lists (SGLs)
 *
* A SGL entry describes a chunk of DMA memory using a 64bit base address and a
* 32bit length field. SGL Segment and SGL Last Segment entries require the
* length to be a multiple of 16 bytes.
*/
static ddi_dma_attr_t nvme_sgl_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
.dma_attr_addr_hi = 0xffffffffffffffffULL,
.dma_attr_count_max = 0xffffffffUL,
.dma_attr_align = 1,
.dma_attr_burstsizes = 0x7ff,
.dma_attr_minxfer = 0x10,
.dma_attr_maxxfer = 0xfffffffffULL,
.dma_attr_seg = 0xffffffffffffffffULL,
.dma_attr_sgllen = -1,
.dma_attr_granular = 0x10,
.dma_attr_flags = 0
};
static ddi_device_acc_attr_t nvme_reg_acc_attr = {
	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
};
static struct dev_ops nvme_dev_ops = {
	.devo_rev	= DEVO_REV,
	.devo_refcnt	= 0,
	.devo_getinfo	= ddi_no_info,
	.devo_identify	= nulldev,
	.devo_probe	= nulldev,
	.devo_attach	= nvme_attach,
	.devo_detach	= nvme_detach,
	.devo_reset	= nodev,
	.devo_cb_ops	= NULL,
	.devo_bus_ops	= NULL,
	.devo_power	= NULL,
	.devo_quiesce	= nvme_quiesce,
};
static struct modldrv nvme_modldrv = {
	.drv_modops	= &mod_driverops,
	.drv_linkinfo	= "NVMe v1.0e",
	.drv_dev_ops	= &nvme_dev_ops
};
static struct modlinkage nvme_modlinkage = {
	.ml_rev		= MODREV_1,
	.ml_linkage	= { &nvme_modldrv, NULL }
};
static bd_ops_t nvme_bd_ops = {
	.o_version	= BD_OPS_VERSION_0,
	.o_drive_info	= nvme_bd_driveinfo,
	.o_media_info	= nvme_bd_mediainfo,
	.o_devid_init	= nvme_bd_devid,
	.o_sync_cache	= nvme_bd_sync,
	.o_read		= nvme_bd_read,
	.o_write	= nvme_bd_write,
};
int
_init(void)
{
	int error;

	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
	if (error != DDI_SUCCESS)
		return (error);

	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
	bd_mod_init(&nvme_dev_ops);

	error = mod_install(&nvme_modlinkage);
	if (error != DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		bd_mod_fini(&nvme_dev_ops);
	}
	return (error);
}
int
_fini(void)
{
	int error;

	error = mod_remove(&nvme_modlinkage);
	if (error == DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		kmem_cache_destroy(nvme_cmd_cache);
		bd_mod_fini(&nvme_dev_ops);
	}
	return (error);
}
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&nvme_modlinkage, modinfop));
}
static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
	uint64_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));
	return (val);
}

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
	uint32_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
	return (val);
}
static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
	ddi_fm_error_t error;

	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);
	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);
	return (B_FALSE);
}

static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
	ddi_fm_error_t error;

	if (dma == NULL)
		return (B_FALSE);
	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);
	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);
	return (B_FALSE);
}
static void
nvme_free_dma_common(nvme_dma_t *dma)
{
	if (dma->nd_dmah != NULL)
		(void) ddi_dma_unbind_handle(dma->nd_dmah);
	if (dma->nd_acch != NULL)
		ddi_dma_mem_free(&dma->nd_acch);
	if (dma->nd_dmah != NULL)
		ddi_dma_free_handle(&dma->nd_dmah);
}

static void
nvme_free_dma(nvme_dma_t *dma)
{
	nvme_free_dma_common(dma);
	kmem_free(dma, sizeof (*dma));
}

/* ARGSUSED */
static void
nvme_prp_dma_destructor(void *buf, void *private)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;

	nvme_free_dma_common(dma);
}
static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
	    &dma->nd_dmah) != DDI_SUCCESS) {
		/*
		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
		 * the only other possible error is DDI_DMA_BADATTR which
		 * indicates a driver bug which should cause a panic.
		 */
		dev_err(nvme->n_dip, CE_PANIC,
		    "!failed to get DMA handle, check DMA attributes");
		return (DDI_FAILURE);
	}

	/*
	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
	 * or the flags are conflicting, which isn't the case here.
	 */
	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
	    &dma->nd_len, &dma->nd_acch);

	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to bind DMA memory");
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}
static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
	    DDI_SUCCESS) {
		*ret = NULL;
		kmem_free(dma, sizeof (nvme_dma_t));
		return (DDI_FAILURE);
	}
	bzero(dma->nd_memp, dma->nd_len);
	*ret = dma;
	return (DDI_SUCCESS);
}
/* ARGSUSED */
static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;
	nvme_t *nvme = (nvme_t *)private;

	dma->nd_dmah = NULL;
	dma->nd_acch = NULL;
	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
		return (-1);
	}
	ASSERT(dma->nd_ncookie == 1);
	dma->nd_cached = B_TRUE;
	return (0);
}
static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
	uint32_t len = nentry * qe_len;
	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

	len = roundup(len, nvme->n_pagesize);
	q_dma_attr.dma_attr_minxfer = len;

	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to get DMA memory for queue");
		goto fail;
	}
	if ((*dma)->nd_ncookie != 1) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!got too many cookies for queue DMA");
		goto fail;
	}
	return (DDI_SUCCESS);

fail:
	if (*dma) {
		nvme_free_dma(*dma);
		*dma = NULL;
	}
	return (DDI_FAILURE);
}
static void
{
int i;
if (qp->nq_active_cmds > 0)
}
static int
int idx)
{
goto fail;
goto fail;
qp->nq_next_cmd = 0;
return (DDI_SUCCESS);
fail:
return (DDI_FAILURE);
}
static nvme_cmd_t *
{
return (cmd);
return (cmd);
}
static void
{
else
}
}
static int
{
nvme_reg_sqtdbl_t tail = { 0 };
return (DDI_FAILURE);
}
/*
* Try to insert the cmd into the active cmd array at the nq_next_cmd
* slot. If the slot is already occupied advance to the next slot and
* try again. This can happen for long running commands like async event
* requests.
*/
qp->nq_active_cmds++;
sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
return (DDI_SUCCESS);
}
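
/*
 * Illustrative sketch of the slot search described above (hypothetical
 * helper, not driver code): starting at the nq_next_cmd hint, advance
 * modulo the queue length until a free slot is found. The caller must
 * have verified that nq_active_cmds is below the queue length, so the
 * loop terminates.
 */
static inline uint16_t
nvme_find_free_cid_sketch(nvme_cmd_t **cmds, uint16_t qlen, uint16_t hint)
{
	while (cmds[hint] != NULL)
		hint = (uint16_t)((hint + 1) % qlen);

	return (hint);
}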
static nvme_cmd_t *
{
nvme_reg_cqhdbl_t head = { 0 };
/* Check phase tag of CQE. Hardware inverts it for new entries. */
return (NULL);
qp->nq_active_cmds--;
/* Toggle phase on wrap-around. */
return (cmd);
}
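
/*
 * Illustrative sketch of the phase tag logic noted above (hypothetical
 * types and helper, not driver code): the hardware inverts the phase bit
 * on every pass through the completion queue, so an entry is new only
 * while its phase matches the expected value; the expected phase is
 * toggled whenever the head index wraps around.
 */
struct nvme_cq_sketch {
	uint16_t cq_head;	/* next CQE slot to inspect */
	uint16_t cq_nentry;	/* number of CQE slots */
	uint8_t cq_phase;	/* expected phase tag, initially 1 */
};

static inline boolean_t
nvme_cq_consume_sketch(struct nvme_cq_sketch *cq, uint8_t cqe_phase)
{
	if (cqe_phase != cq->cq_phase)
		return (B_FALSE);	/* no new completion */

	if (++cq->cq_head == cq->cq_nentry) {
		cq->cq_head = 0;
		cq->cq_phase ^= 1;	/* toggle on wrap-around */
	}
	return (B_TRUE);
}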
static int
{
"!unknown command status received: opc = %x, sqid = %d, cid = %d, "
}
return (EIO);
}
static int
{
"!unknown command status received: opc = %x, sqid = %d, cid = %d, "
}
return (EIO);
}
static int
{
/* write fail */
/* TODO: post ereport */
return (EIO);
case NVME_CQE_SC_INT_NVM_READ:
/* read fail */
/* TODO: post ereport */
return (EIO);
default:
return (nvme_check_unknown_cmd_status(cmd));
}
}
static int
{
case NVME_CQE_SC_GEN_SUCCESS:
return (0);
/*
* Errors indicating a bug in the driver should cause a panic.
*/
case NVME_CQE_SC_GEN_INV_OPC:
/* Invalid Command Opcode */
"invalid opcode in cmd %p", (void *)cmd);
return (0);
case NVME_CQE_SC_GEN_INV_FLD:
/* Invalid Field in Command */
"invalid field in cmd %p", (void *)cmd);
return (0);
case NVME_CQE_SC_GEN_ID_CNFL:
/* Command ID Conflict */
"cmd ID conflict in cmd %p", (void *)cmd);
return (0);
case NVME_CQE_SC_GEN_INV_NS:
/* Invalid Namespace or Format */
return (0);
/* LBA Out Of Range */
"LBA out of range in cmd %p", (void *)cmd);
return (0);
/*
* Non-fatal errors, handle gracefully.
*/
/* Data Transfer Error (DMA) */
/* TODO: post ereport */
return (EIO);
/*
* Internal Error. The spec (v1.0, section 4.5.1.2) says
* detailed error information is returned as async event,
* so we pretty much ignore the error here and handle it
* in the async event handler.
*/
return (EIO);
/*
* Command Abort Requested. This normally happens only when a
* command times out.
*/
/* TODO: post ereport or change blkdev to handle this? */
return (ECANCELED);
/* Command Aborted due to Power Loss Notification */
return (EIO);
/* Command Aborted due to SQ Deletion */
return (EIO);
/* Capacity Exceeded */
return (EIO);
/* Namespace Not Ready */
return (EIO);
default:
return (nvme_check_unknown_cmd_status(cmd));
}
}
static int
{
case NVME_CQE_SC_SPC_INV_CQ:
/* Completion Queue Invalid */
return (EINVAL);
case NVME_CQE_SC_SPC_INV_QID:
/* Invalid Queue Identifier */
return (EINVAL);
/* Max Queue Size Exceeded */
return (EINVAL);
/* Abort Command Limit Exceeded */
"abort command limit exceeded in cmd %p", (void *)cmd);
return (0);
/* Async Event Request Limit Exceeded */
"async event request limit exceeded in cmd %p",
(void *)cmd);
return (0);
/* Invalid Interrupt Vector */
return (EINVAL);
/* Invalid Log Page */
return (EINVAL);
/* Invalid Format */
return (EINVAL);
/* Invalid Queue Deletion */
return (EINVAL);
/* Conflicting Attributes */
return (EINVAL);
/* Invalid Protection Information */
return (EINVAL);
/* Write to Read Only Range */
return (EROFS);
default:
return (nvme_check_unknown_cmd_status(cmd));
}
}
static inline int
nvme_check_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	/* take a shortcut if everything is alright */
	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
		return (0);

	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
		return (nvme_check_generic_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
		return (nvme_check_specific_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
		return (nvme_check_integrity_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
		return (nvme_check_vendor_cmd_status(cmd));

	return (nvme_check_unknown_cmd_status(cmd));
}
/*
* nvme_abort_cmd_cb -- replaces nc_callback of aborted commands
*
 * This function takes care of cleaning up aborted commands. The command
* status is checked to catch any fatal errors.
*/
static void
nvme_abort_cmd_cb(void *arg)
{
/*
* Grab the command mutex. Once we have it we hold the last reference
* to the command and can safely free it.
*/
(void) nvme_check_cmd_status(cmd);
}
static void
{
nvme_abort_cmd_t ac = { 0 };
/*
* Drop the mutex of the aborted command. From this point on
* we must assume that the abort callback has freed the command.
*/
/*
* Send the ABORT to the hardware. The ABORT command will return _after_
* the aborted command has completed (aborted or otherwise).
*/
"!nvme_admin_cmd failed for ABORT");
return;
}
if (nvme_check_cmd_status(cmd)) {
"!ABORT failed with sct = %x, sc = %x",
} else {
}
}
/*
* nvme_wait_cmd -- wait for command completion or timeout
*
* Returns B_TRUE if the command completed normally.
*
* Returns B_FALSE if the command timed out and an abort was attempted. The
* command mutex will be dropped and the command must be considered freed. The
* freeing of the command is normally done by the abort command callback.
*
* In case of a serious error or a timeout of the abort command the hardware
* will be declared dead and FMA will be notified.
*/
static boolean_t
{
while (!cmd->nc_completed) {
break;
}
if (cmd->nc_completed)
return (B_TRUE);
/*
* The command timed out. Change the callback to the cleanup function.
*/
/*
* Check controller for fatal status, any errors associated with the
* register or DMA handle, or for a double timeout (abort command timed
* out). If necessary log a warning and call FMA.
*/
} else {
/*
* Try to abort the command. The command mutex is released by
* nvme_abort_cmd().
* If the abort succeeds it will have freed the aborted command.
* If the abort fails for other reasons we must assume that the
* command may complete at any time, and the callback will free
* it for us.
*/
}
return (B_FALSE);
}
static void
nvme_wakeup_cmd(void *arg)
{
/*
* There is a slight chance that this command completed shortly after
* the timeout was hit in nvme_wait_cmd() but before the callback was
* changed. Catch that case here and clean up accordingly.
*/
return;
}
}
static void
nvme_async_event_task(void *arg)
{
int ret;
/*
* Check for errors associated with the async request itself. The only
* command-specific error is "async event limit exceeded", which
* indicates a programming error in the driver and causes a panic in
* nvme_check_cmd_status().
*
* Other possible errors are various scenarios where the async request
* was aborted, or internal errors in the device. Internal errors are
* reported to FMA, the command aborts need no special handling here.
*/
if (nvme_check_cmd_status(cmd)) {
"!async event request returned failure, sct = %x, "
}
return;
}
/* Clear CQE and re-submit the async request. */
if (ret != DDI_SUCCESS) {
"!failed to resubmit async event request");
}
case NVME_ASYNC_TYPE_ERROR:
} else {
}
case NVME_ASYNC_ERROR_INV_SQ:
"invalid submission queue");
return;
case NVME_ASYNC_ERROR_INV_DBL:
"invalid doorbell write value");
return;
break;
"device error");
break;
"device error");
/* TODO: send ereport */
break;
case NVME_ASYNC_ERROR_FW_LOAD:
"!firmware image load error");
break;
}
break;
case NVME_ASYNC_TYPE_HEALTH:
health_log = (nvme_health_log_t *)
} else {
}
"!device reliability compromised");
/* TODO: send ereport */
break;
"!temperature above threshold");
/* TODO: send ereport */
break;
case NVME_ASYNC_HEALTH_SPARE:
"!spare space below threshold");
/* TODO: send ereport */
break;
}
break;
case NVME_ASYNC_TYPE_VENDOR:
event.b.ae_logpage);
break;
default:
break;
}
if (error_log)
if (health_log)
}
static int
{
int ret;
if (ret != DDI_SUCCESS) {
"!nvme_submit_cmd failed");
return (DDI_FAILURE);
}
/*
* The command timed out. An abort command was posted that
* will take care of the cleanup.
*/
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static int
{
int ret;
if (ret != DDI_SUCCESS) {
"!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static void *
{
nvme_getlogpage_t getlogpage = { 0 };
switch (logpage) {
case NVME_LOGPAGE_ERROR:
sizeof (nvme_error_log_entry_t);
break;
case NVME_LOGPAGE_HEALTH:
bufsize = sizeof (nvme_health_log_t);
break;
case NVME_LOGPAGE_FWSLOT:
bufsize = sizeof (nvme_fwslot_log_t);
break;
default:
logpage);
goto fail;
}
"!nvme_zalloc_dma failed for GET LOG PAGE");
goto fail;
}
"!too many DMA cookies for GET LOG PAGE");
goto fail;
}
}
"!nvme_admin_cmd failed for GET LOG PAGE");
return (NULL);
}
if (nvme_check_cmd_status(cmd)) {
"!GET LOG PAGE failed with sct = %x, sc = %x",
goto fail;
}
fail:
return (buf);
}
static void *
{
"!nvme_zalloc_dma failed for IDENTIFY");
goto fail;
}
"!too many DMA cookies for IDENTIFY");
goto fail;
}
}
"!nvme_admin_cmd failed for IDENTIFY");
return (NULL);
}
if (nvme_check_cmd_status(cmd)) {
"!IDENTIFY failed with sct = %x, sc = %x",
goto fail;
}
fail:
return (buf);
}
static boolean_t
{
switch (feature) {
case NVME_FEAT_WRITE_CACHE:
if (!nvme->n_write_cache_present)
goto fail;
break;
case NVME_FEAT_NQUEUES:
break;
default:
goto fail;
}
"!nvme_admin_cmd failed for SET FEATURES");
return (ret);
}
if (nvme_check_cmd_status(cmd)) {
"!SET FEATURES %d failed with sct = %x, sc = %x",
goto fail;
}
fail:
return (ret);
}
static boolean_t
{
nvme_write_cache_t nwc = { 0 };
if (enable)
return (B_FALSE);
return (B_TRUE);
}
static int
{
nvme_nqueue_t nq = { 0 };
return (0);
}
/*
* Always use the same number of submission and completion queues, and
* never use more than the requested number of queues.
*/
}
static int
{
nvme_create_queue_dw10_t dw10 = { 0 };
nvme_create_cq_dw11_t c_dw11 = { 0 };
nvme_create_sq_dw11_t s_dw11 = { 0 };
"!nvme_admin_cmd failed for CREATE CQUEUE");
return (DDI_FAILURE);
}
if (nvme_check_cmd_status(cmd)) {
"!CREATE CQUEUE failed with sct = %x, sc = %x",
return (DDI_FAILURE);
}
"!nvme_admin_cmd failed for CREATE SQUEUE");
return (DDI_FAILURE);
}
if (nvme_check_cmd_status(cmd)) {
"!CREATE SQUEUE failed with sct = %x, sc = %x",
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
static boolean_t
{
int i;
break;
if (quiesce)
drv_usecwait(50000);
else
}
}
}
static void
{
int i;
for (i = 0; i != 10; i++) {
break;
if (quiesce)
drv_usecwait(100000);
else
}
}
static void
{
}
static int
{
nvme_reg_cc_t cc = { 0 };
nvme_reg_aqa_t aqa = { 0 };
nvme_reg_asq_t asq = { 0 };
nvme_reg_acq_t acq = { 0 };
int i = 0;
int nqueues;
/* Check controller version */
if (nvme->n_strict_version)
goto fail;
}
/* retrieve controller configuration */
"!NVM command set not supported by hardware");
goto fail;
}
/*
* The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
* the base page size of 4k (1<<12), so add 12 here to get the real
* page size value.
*/
/*
* Set up Queue DMA to transfer at least 1 page-aligned page at a time.
*/
/*
* Set up PRP DMA to transfer 1 page-aligned page at a time.
 * Maxxfer may be increased once we have identified the controller limits.
*/
/*
* Reset controller if it's still in ready state.
*/
goto fail;
}
/*
* Create the admin queue pair.
*/
!= DDI_SUCCESS) {
"!unable to allocate admin qpair");
goto fail;
}
/*
* Wait for the controller to become ready.
*/
"!controller fatal status at init");
goto fail;
}
break;
}
}
goto fail;
}
/*
* Assume an abort command limit of 1. We'll destroy and re-init
* that later when we know the true abort command limit.
*/
/*
* Setup initial interrupt for admin queue.
*/
!= DDI_SUCCESS) &&
!= DDI_SUCCESS) &&
!= DDI_SUCCESS)) {
"!failed to setup initial interrupt");
goto fail;
}
/*
* Post an asynchronous event command to catch errors.
*/
"!failed to post async event");
goto fail;
}
/*
* Identify Controller
*/
"!failed to identify controller");
goto fail;
}
/*
* Get Vendor & Product ID
*/
else
/*
* Get controller limits.
*/
/*
* Reinitialize the semaphore with the true abort command limit
* supported by the hardware. It's not necessary to disable interrupts
* as only command aborts use the semaphore, and no commands are
* executed or aborted while we're here.
*/
SEMA_DRIVER, NULL);
else
/*
* Limit n_max_data_transfer_size to what we can handle in one PRP.
* Chained PRPs are currently unsupported.
*
* This is a no-op on hardware which doesn't support a transfer size
* big enough to require chained PRPs.
*/
/*
*/
goto fail;
/*
* Check for the presence of a Volatile Write Cache. If present,
* enable or disable based on the value of the property
* volatile-write-cache-enable (default is enabled).
*/
"volatile-write-cache-present",
if (!nvme->n_write_cache_present) {
"!failed to %sable volatile write cache",
/*
* Assume the cache is (still) enabled.
*/
}
"volatile-write-cache-enable",
/*
* Grab a copy of all mandatory log pages.
*
* TODO: should go away once user space tool exists to print logs
*/
/*
* Identify Namespaces
*/
for (i = 0; i != nvme->n_namespace_count; i++) {
int last_rp;
"!failed to identify namespace %d", i + 1);
goto fail;
}
/*
* Find the LBA format with no metadata and the best relative
* performance. A value of 3 means "degraded", 0 is best.
*/
last_rp = 3;
break;
continue;
continue;
}
/*
* We currently don't support namespaces that use either:
* - thin provisioning
* - protection information
*/
"!ignoring namespace %d, unsupported features: "
"thin = %d, pinfo = %d", i + 1,
}
}
/*
*/
!= 0) {
nqueues) != DDI_SUCCESS) &&
nqueues) != DDI_SUCCESS)) {
goto fail;
}
}
/*
* Create I/O queue pairs.
*/
if (nvme->n_ioq_count == 0) {
"!failed to set number of I/O queues to %d", nqueues);
goto fail;
}
/*
* Reallocate I/O queue array
*/
/*
 * If we got fewer queues than we asked for we might as well give
* some of the interrupt vectors back to the system.
*/
"!failed to reduce number of interrupts");
goto fail;
}
}
/*
* Alloc & register I/O queue pairs
*/
"!unable to allocate I/O qpair %d", i);
goto fail;
}
!= DDI_SUCCESS) {
"!unable to create I/O qpair %d", i);
goto fail;
}
}
/*
* Post more asynchronous events commands to reduce event reporting
* latency as suggested by the spec.
*/
"!failed to post async event %d", i);
goto fail;
}
}
return (DDI_SUCCESS);
fail:
return (DDI_FAILURE);
}
static uint_t
{
/*LINTED: E_PTR_BAD_CAST_ALIGN*/
int ccnt = 0;
int qnum;
return (DDI_INTR_UNCLAIMED);
/*
* The interrupt vector a queue uses is calculated as queue_idx %
* intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
* in steps of n_intr_cnt to process all queues using this vector.
*/
ccnt++;
}
}
}
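
/*
 * Illustrative sketch of the vector-to-queue mapping described above
 * (hypothetical helper, not driver code): vector `inum` handles every
 * queue whose index is congruent to inum modulo the vector count. Queue
 * 0 is the admin queue, queues 1..n_ioq_count are the I/O queues.
 */
static inline void
nvme_vector_queues_sketch(uint16_t inum, uint16_t n_intr_cnt,
    uint16_t n_ioq_count, void (*process)(uint16_t))
{
	uint16_t qnum;

	for (qnum = inum; qnum <= n_ioq_count; qnum += n_intr_cnt)
		process(qnum);
}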
static void
{
int i;
for (i = 0; i < nvme->n_intr_cnt; i++) {
break;
else
}
}
static int
{
int ret;
int i;
if (nvme->n_intr_types == 0) {
&nvme->n_intr_types);
if (ret != DDI_SUCCESS) {
"!%s: ddi_intr_get_supported types failed",
__func__);
return (ret);
}
}
return (DDI_FAILURE);
if (ret != DDI_SUCCESS) {
__func__);
return (ret);
}
if (ret != DDI_SUCCESS) {
__func__);
return (ret);
}
/* We want at most one interrupt per queue pair. */
&count, 0);
if (ret != DDI_SUCCESS) {
__func__);
goto fail;
}
if (ret != DDI_SUCCESS) {
__func__);
goto fail;
}
for (i = 0; i < count; i++) {
if (ret != DDI_SUCCESS) {
"!%s: ddi_intr_add_handler failed", __func__);
goto fail;
}
}
for (i = 0; i < count; i++) {
else
if (ret != DDI_SUCCESS) {
"!%s: enabling interrupt %d failed", __func__, i);
goto fail;
}
}
return (DDI_SUCCESS);
fail:
return (ret);
}
static int
{
return (fm_error->fme_status);
}
static int
{
int instance;
int nregs;
int i;
char name[32];
if (cmd != DDI_ATTACH)
return (DDI_FAILURE);
return (DDI_FAILURE);
DDI_PROP_DONTPASS, "async-event-limit",
/*
* Setup FMA support.
*/
}
(void *)nvme);
}
/*
* The spec defines several register sets. Only the controller
* registers (set 1) are currently used.
*/
nregs < 2 ||
goto fail;
goto fail;
}
/*
* Create taskq for command completion.
*/
TASKQ_DEFAULTPRI, 0);
goto fail;
}
/*
* Create PRP DMA cache
*/
goto fail;
/*
* Attach the blkdev driver for each namespace.
*/
for (i = 0; i != nvme->n_namespace_count; i++) {
continue;
"!failed to get blkdev handle for namespace %d", i);
goto fail;
}
!= DDI_SUCCESS) {
"!failed to attach blkdev handle for namespace %d",
i);
goto fail;
}
}
return (DDI_SUCCESS);
fail:
/* attach successful anyway so that FMA can retire the device */
return (DDI_SUCCESS);
return (DDI_FAILURE);
}
static int
{
int instance, i;
if (cmd != DDI_DETACH)
return (DDI_FAILURE);
return (DDI_FAILURE);
for (i = 0; i != nvme->n_namespace_count; i++) {
(void) bd_detach_handle(
}
sizeof (nvme_identify_nsid_t));
}
}
if (nvme->n_cmd_taskq)
if (nvme->n_ioq_count > 0) {
/* TODO: send destroy queue commands */
}
}
}
}
}
if (nvme->n_cmd_taskq)
}
return (DDI_SUCCESS);
}
static int
{
int instance;
return (DDI_FAILURE);
return (DDI_FAILURE);
}
static int
{
return (DDI_FAILURE);
return (DDI_SUCCESS);
return (DDI_SUCCESS);
}
/*
* We currently don't support chained PRPs and set up our DMA
* attributes to reflect that. If we still get an I/O request
* that needs a chained PRP something is very wrong.
*/
/*LINTED: E_PTR_BAD_CAST_ALIGN*/
}
return (DDI_SUCCESS);
}
static nvme_cmd_t *
{
/*
* Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
*/
KM_NOSLEEP : KM_SLEEP);
return (NULL);
switch (opc) {
case NVME_OPC_NVM_WRITE:
case NVME_OPC_NVM_READ:
goto fail;
break;
case NVME_OPC_NVM_FLUSH:
break;
default:
goto fail;
}
return (cmd);
fail:
return (NULL);
}
static void
nvme_bd_xfer_done(void *arg)
{
int error = 0;
}
static void
{
/*
* blkdev maintains one queue size per instance (namespace),
 * but all namespaces share the I/O queues.
* TODO: need to figure out a sane default, or use per-NS I/O queues,
* or change blkdev to handle EAGAIN
*/
/*
* d_maxxfer is not set, which means the value is taken from the DMA
* attributes specified to bd_alloc_handle.
*/
}
static int
{
return (0);
}
static int
{
return (EIO);
/* No polling for now */
return (EIO);
return (ENOMEM);
!= DDI_SUCCESS)
return (EAGAIN);
return (0);
}
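
/*
 * Illustrative sketch of the queue selection described in the header
 * comment (hypothetical helper, not driver code): the submitting CPU's
 * id modulo the I/O queue count, plus one because queue 0 is the admin
 * queue.
 */
static inline uint16_t
nvme_pick_ioq_sketch(uint32_t cpu_id, uint16_t n_ioq_count)
{
	return ((uint16_t)(cpu_id % n_ioq_count) + 1);
}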
static int
{
}
static int
{
}
static int
{
return (EIO);
/*
* If the volatile write cache is not present or not enabled the FLUSH
* command is a no-op, so we can take a shortcut here.
*/
return (0);
}
bd_xfer_done(xfer, 0);
return (0);
}
}
static int
{
}