145e0143b4896d03ce53b1af6787afa1a7e73959dh * CDDL HEADER START
145e0143b4896d03ce53b1af6787afa1a7e73959dh * The contents of this file are subject to the terms of the
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Common Development and Distribution License (the "License").
145e0143b4896d03ce53b1af6787afa1a7e73959dh * You may not use this file except in compliance with the License.
145e0143b4896d03ce53b1af6787afa1a7e73959dh * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
145e0143b4896d03ce53b1af6787afa1a7e73959dh * See the License for the specific language governing permissions
145e0143b4896d03ce53b1af6787afa1a7e73959dh * and limitations under the License.
145e0143b4896d03ce53b1af6787afa1a7e73959dh * When distributing Covered Code, include this CDDL HEADER in each
145e0143b4896d03ce53b1af6787afa1a7e73959dh * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
145e0143b4896d03ce53b1af6787afa1a7e73959dh * If applicable, add the following below this CDDL HEADER, with the
145e0143b4896d03ce53b1af6787afa1a7e73959dh * fields enclosed by brackets "[]" replaced with your own identifying
145e0143b4896d03ce53b1af6787afa1a7e73959dh * information: Portions Copyright [yyyy] [name of copyright owner]
145e0143b4896d03ce53b1af6787afa1a7e73959dh * CDDL HEADER END
658280b6253b61dbb155f43d0e3cbcffa85ccb90David Hollister * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
145e0143b4896d03ce53b1af6787afa1a7e73959dh * PM8001 device state recovery routines
145e0143b4896d03ce53b1af6787afa1a7e73959dh * SAS Topology Configuration
6745c559e4b531cf336a91f4653445c32ee46693Jesse Butlerstatic void pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt);
145e0143b4896d03ce53b1af6787afa1a7e73959dhstatic void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name,
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Get device state. Called with statlock and PHY lock held.
145e0143b4896d03ce53b1af6787afa1a7e73959dhpmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
145e0143b4896d03ce53b1af6787afa1a7e73959dh pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__,
145e0143b4896d03ce53b1af6787afa1a7e73959dh (void *)xp);
f96f3b56078f1646f6c42036086a938c112fbb9fSrikanth, Ramana pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
145e0143b4896d03ce53b1af6787afa1a7e73959dh pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (0);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Set device state. Called with target's statlock and PHY lock held.
145e0143b4896d03ce53b1af6787afa1a7e73959dhpmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp,
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (0);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
6745c559e4b531cf336a91f4653445c32ee46693Jesse Butlerpmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt)
6745c559e4b531cf336a91f4653445c32ee46693Jesse Butler } else if (ddi_get_lbolt() < pptr->last_good_recovery +
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana pmcs_handle_ds_recovery_error(pptr, tgt, pwp, __func__,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana "Max recovery attempts reached. Declaring PHY dead");
6745c559e4b531cf336a91f4653445c32ee46693Jesse Butler /* Don't bother to run the work queues if the PHY is dead */
6745c559e4b531cf336a91f4653445c32ee46693Jesse Butler (void) ddi_taskq_dispatch(pwp->tq, pmcs_worker,
145e0143b4896d03ce53b1af6787afa1a7e73959dh * First time, check to see if we're already performing recovery
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Since ds_err_recovering is set, we can be assured these
145e0143b4896d03ce53b1af6787afa1a7e73959dh * PHYs won't disappear on us while we do this.
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana if (pptr->dead || !pptr->valid_device_id) {
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana if (pptr->iport && (pptr->iport->ua_state != UA_ACTIVE)) {
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, pptr->target,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana "%s: No DS recovery on PHY %s, iport not active",
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: DS recovery on PHY %s "
145e0143b4896d03ce53b1af6787afa1a7e73959dh "re-invoked too soon. Skipping...",
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Step 1: Put the device into the IN_RECOVERY state
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (rc != 0) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: pmcs_get_dev_state on PHY %s "
145e0143b4896d03ce53b1af6787afa1a7e73959dh "failed (rc=%d)",
6745c559e4b531cf336a91f4653445c32ee46693Jesse Butler /* If the chip says it's operational, we're done */
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: pmcs_send_err_recovery_cmd "
145e0143b4896d03ce53b1af6787afa1a7e73959dh "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)",
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: pmcs_send_err_recovery_cmd to PHY %s "
145e0143b4896d03ce53b1af6787afa1a7e73959dh "failed (rc=%d)",
6745c559e4b531cf336a91f4653445c32ee46693Jesse Butler * Step 2: Perform a hard reset on the PHY.
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana "%s: Issue HARD_RESET to PHY %s", __func__,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * Must release statlock here because pmcs_reset_phy
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * will drop and reacquire the PHY lock.
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET);
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana "%s: HARD_RESET to PHY %s failed (rc=%d)",
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Step 3: Abort all I/Os to the device
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Waiting for outstanding ABORT_ALL on "
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (rc != 0) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: pmcs_abort to PHY %s failed (rc=%d)",
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Step 4: Set the device back to OPERATIONAL state
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state",
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (rc == 0) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Failed to SET tgt 0x%p to OPERATIONAL state",
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Only clear ds_err_recovering if we're exiting for good and not
145e0143b4896d03ce53b1af6787afa1a7e73959dh * just unwinding from recursion
b0e5d1e5d408b31359b469816fe135bd0b5a5918Srikanth, Ramana SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Called with target's statlock held (if target is non-NULL) and PHY lock held.
145e0143b4896d03ce53b1af6787afa1a7e73959dhpmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp,
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (0);
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (-1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state);
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (rc != 0) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY",
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Target 0x%p not ready to go OPERATIONAL",
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (rc != 0) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL",
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Device at %s is non-operational",
145e0143b4896d03ce53b1af6787afa1a7e73959dh return (rc);
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Start ssp event recovery. We have to schedule recovery operation because
145e0143b4896d03ce53b1af6787afa1a7e73959dh * it involves sending multiple commands to device and we should not do it
145e0143b4896d03ce53b1af6787afa1a7e73959dh * in the interrupt context.
145e0143b4896d03ce53b1af6787afa1a7e73959dh * If it is failure of a recovery command, let the recovery thread deal with it.
225bf9057f97461ac410ec2e1432d1575360ee18Jesse Butler * Called with the work lock held.
145e0143b4896d03ce53b1af6787afa1a7e73959dhpmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb,
145e0143b4896d03ce53b1af6787afa1a7e73959dh * No target, need to run RE-DISCOVERY here.
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Although we cannot mark phy to force abort nor mark phy
145e0143b4896d03ce53b1af6787afa1a7e73959dh * as changed, killing of a target would take care of aborting
145e0143b4896d03ce53b1af6787afa1a7e73959dh * commands for the device.
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: No valid target for event processing. Reconfigure.",
225bf9057f97461ac410ec2e1432d1575360ee18Jesse Butler /* We have a phy pointer, we'll need to lock it */
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Device at %s is non-operational",
145e0143b4896d03ce53b1af6787afa1a7e73959dh * If this command is run in WAIT mode, it is a failing recovery
145e0143b4896d03ce53b1af6787afa1a7e73959dh * command. If so, just wake up recovery thread waiting for
145e0143b4896d03ce53b1af6787afa1a7e73959dh * command completion.
56976565c7cea0f2fbde67ea3a985d8b17a71288David Hollister "%s: Not scheduling SSP event recovery for NULL tgt"
56976565c7cea0f2fbde67ea3a985d8b17a71288David Hollister " pwrk(%p) tag(0x%x)", __func__, (void *)pwrk,
658280b6253b61dbb155f43d0e3cbcffa85ccb90David Hollister * If the SSP event was an OPEN_RETRY_TIMEOUT, we don't want
658280b6253b61dbb155f43d0e3cbcffa85ccb90David Hollister * to go through the recovery (abort/LU reset) process.
658280b6253b61dbb155f43d0e3cbcffa85ccb90David Hollister * Simply complete the command and return it as STATUS_BUSY.
658280b6253b61dbb155f43d0e3cbcffa85ccb90David Hollister * This will cause the target driver to simply retry.
658280b6253b61dbb155f43d0e3cbcffa85ccb90David Hollister if (event == PMCOUT_STATUS_IO_XFER_OPEN_RETRY_TIMEOUT) {
658280b6253b61dbb155f43d0e3cbcffa85ccb90David Hollister "%s: Got OPEN_RETRY_TIMEOUT event (htag 0x%08x)",
225bf9057f97461ac410ec2e1432d1575360ee18Jesse Butler /* Note: work remains locked for the callback */
145e0143b4896d03ce53b1af6787afa1a7e73959dh * To recover from primary failures,
145e0143b4896d03ce53b1af6787afa1a7e73959dh * we need to schedule handling events recovery.
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Scheduling SSP event recovery for tgt(0x%p) "
145e0143b4896d03ce53b1af6787afa1a7e73959dh "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk,
145e0143b4896d03ce53b1af6787afa1a7e73959dh /* Work cannot be completed until event recovery is completed. */
145e0143b4896d03ce53b1af6787afa1a7e73959dh * SSP target event recovery
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala * phy->lock should be held upon entry.
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala * pwrk->lock should be held upon entry and gets released by this routine.
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala * tgt->statlock should not be held.
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp);
145e0143b4896d03ce53b1af6787afa1a7e73959dh event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh /* Command may be still pending on device */
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (rv != 0) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh /* Command NOT pending on a device */
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: No pending command for tgt 0x%p",
145e0143b4896d03ce53b1af6787afa1a7e73959dh /* Nothing more to do, just abort it on chip */
145e0143b4896d03ce53b1af6787afa1a7e73959dh * All other events left the command pending in the host
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Send abort task and abort it on the chip
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (htag != 0) {
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala (void) pmcs_abort(pwp, pptr, htag, 0, 1);
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Abort either took care of work completion, or put device in
145e0143b4896d03ce53b1af6787afa1a7e73959dh * a recovery state
145e0143b4896d03ce53b1af6787afa1a7e73959dh /* Abort failed, do full device recovery */
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala pmcs_start_dev_state_recovery(tgt, pptr);
145e0143b4896d03ce53b1af6787afa1a7e73959dh * SSP event recovery task.
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana "%s: found target(0x%p)", __func__, (void *) tgt);
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana /* Check what cmd expects recovery */
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * aq may contain TMF commands, so we
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * may not find work structure with htag
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana "%s: pwrk(%p) htag(0x%x)",
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala * pwrk->lock gets dropped in
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala * pmcs_tgt_event_recovery()
219ebc8ed9819073f2dd55dead617f876abaa22dSrikanth Suravajhala /* All bets are off on tgt/aq now, restart */
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana "%s: end of SSP event recovery for target(0x%p)",
145e0143b4896d03ce53b1af6787afa1a7e73959dh (void *) pwp);
145e0143b4896d03ce53b1af6787afa1a7e73959dhpmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp)
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)",
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Rather than waiting for the watchdog timer, we'll
145e0143b4896d03ce53b1af6787afa1a7e73959dh * kick it right now.
145e0143b4896d03ce53b1af6787afa1a7e73959dh (void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp,
145e0143b4896d03ce53b1af6787afa1a7e73959dh * Increment the phy ds error retry count.
145e0143b4896d03ce53b1af6787afa1a7e73959dh * If too many retries, mark phy dead and restart discovery;
145e0143b4896d03ce53b1af6787afa1a7e73959dh * otherwise schedule ds recovery.
145e0143b4896d03ce53b1af6787afa1a7e73959dhstatic void
145e0143b4896d03ce53b1af6787afa1a7e73959dhpmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana pmcs_hw_t *pwp, const char *func_name, char *reason_string)
145e0143b4896d03ce53b1af6787afa1a7e73959dh if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) {
145e0143b4896d03ce53b1af6787afa1a7e73959dh "%s: retry limit reached after %s to PHY %s failed",
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * Mark the PHY as dead and it and its parent as changed,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * then restart discovery
145e0143b4896d03ce53b1af6787afa1a7e73959dh (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)
145e0143b4896d03ce53b1af6787afa1a7e73959dh pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of "
145e0143b4896d03ce53b1af6787afa1a7e73959dh "successful recoveries reached, declaring PHY %s dead",
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * Mark the PHY as dead and its parent as changed,
601c90f161ff0319c1b4a2c3362b466043a65d8dSrikanth, Ramana * then restart discovery