/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* sun4u Fire Error Handling
*/
#include <sys/pcie_impl.h>
#include "px_obj.h"
#include <px_regs.h>
#include <px_csr.h>
#include <sys/machcpuvar.h>
#include <sys/platform_module.h>
#include "px_lib4u.h"
#include "px_err.h"
#include "px_err_impl.h"
#include "oberon_regs.h"
/*
* Do not enable Link Interrupts
*/
/*
* (1ull << ILU_INTERRUPT_ENABLE_IHB_PE_S) |
* (1ull << ILU_INTERRUPT_ENABLE_IHB_PE_P);
*/
/*
* LPU Intr Registers are reverse encoding from the registers above.
* 1 = disable
* 0 = enable
*
* Log and Count are however still the same.
*/
/*
* JBC error bit table
*/
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_JBC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* JBC FATAL */
/* JBC MERGE */
/* JBC Jbusint IN */
/* JBC Jbusint Out */
/*
* JBC Dmcint ODCD
*
* Error bits which can be set via a bad PCItool access go through
* jbc_safe_acc instead.
*/
/* JBC Dmcint IDC */
/* JBC CSR */
};
/*
 * Size of px_err_jbc_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_jbc_keys \
	((sizeof (px_err_jbc_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* UBC error bit table
*/
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_UBC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* UBC FATAL */
};
/*
 * Size of px_err_ubc_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_ubc_keys \
	((sizeof (px_err_ubc_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
 * Qualifier suffix strings for UBC ereport classes.
 * NOTE(review): presumably indexed by the UBC error id (UBC_EID_*);
 * confirm against the code that consumes this table.
 */
char *ubc_class_eid_qualifier[] = { "-mem", "-channel", "-cpu", "-path" };
/*
* DMC error bit tables
*/
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_DMC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* DMC IMU RDS */
/* DMC IMU SCS */
/* DMC IMU */
};
/* mmu errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_DMC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* DMC MMU */
};
/*
* PEC error bit tables
*/
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* PEC ILU none */
};
/*
 * Size of px_err_ilu_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_ilu_keys \
	((sizeof (px_err_ilu_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* PEC UE errors implementation is incomplete pending PCIE generic
* fabric rules. Must handle both PRIMARY and SECONDARY errors.
*/
/* pec ue errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_OB_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* PCI-E Receive Uncorrectable Errors */
/* PCI-E Transmit Uncorrectable Errors */
/* Other PCI-E Uncorrectable Errors */
/* Not used */
};
/*
 * Size of px_err_tlu_ue_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_tlu_ue_keys \
	((sizeof (px_err_tlu_ue_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* PEC CE errors implementation is incomplete pending PCIE generic
* fabric rules.
*/
/* pec ce errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* PCI-E Correctable Errors */
};
/*
 * Size of px_err_tlu_ce_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_tlu_ce_keys \
	((sizeof (px_err_tlu_ce_tbl)) / (sizeof (px_err_bit_desc_t)))
/* pec oe errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_OB_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* TLU Other Event Status (receive only) */
/* TLU Other Event Status (rx + tx) */
/* TLU Other Event */
};
/*
 * Size of px_err_tlu_oe_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_tlu_oe_keys \
	((sizeof (px_err_tlu_oe_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* All the following tables below are for LPU Interrupts. These interrupts
* are *NOT* error interrupts, but event status interrupts.
*
* These events are probably of most interest to:
* o Hotplug
* o Power Management
* o etc...
*
* There are also a few events that would be interesting for FMA.
* Again, none of the registers below state that an error has occurred
* or that data has been lost. If anything, they give status that an
* error is *about* to occur. Examples:
* o INT_SKP_ERR - indicates clock between fire and child is too far
* off and is most unlikely able to compensate
* o INT_TX_PAR_ERR - A parity error occurred in ONE lane. This is
* HW recoverable, but will likely end up as a future
* fabric error as well.
*
* For now, we don't care about any of these errors; they should be ignored,
* but cleared.
*/
/* LPU Link Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Size of px_err_lpul_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_lpul_keys \
	((sizeof (px_err_lpul_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Physical Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Size of px_err_lpup_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_lpup_keys \
	((sizeof (px_err_lpup_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Receive Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Size of px_err_lpur_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_lpur_keys \
	((sizeof (px_err_lpur_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Transmit Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Size of px_err_lpux_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_lpux_keys \
	((sizeof (px_err_lpux_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU LTSSM Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Size of px_err_lpus_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_lpus_keys \
	((sizeof (px_err_lpus_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Gigablaze Glue Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Size of px_err_lpug_tbl divided by the size of one px_err_bit_desc_t
 * (the table's entry count, assuming it is an array of that type).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when it appears inside a larger expression.
 */
#define px_err_lpug_keys \
	((sizeof (px_err_lpug_tbl)) / (sizeof (px_err_bit_desc_t)))
/* Mask and Tables */
PX_REG_XBC, \
0
PX_REG_CSR, \
0
/* LPU Registers Addresses */
NULL, \
/* LPU Registers Addresses with Irregularities */
NULL, \
/* TLU Registers Addresses */
/* Registers Addresses for JBC, UBC, MMU, IMU and ILU */
pre ## _ERROR_LOG_ENABLE, \
pre ## _INTERRUPT_ENABLE, \
pre ## _INTERRUPT_STATUS, \
/* Bits in chip_mask, set according to type. */
/*
* Register error handling tables.
* The ID Field (first field) is identified by an enum px_err_id_t.
* It is located in px_err.h
*/
static const
};
typedef struct px_err_ss {
} px_err_ss_t;
px_err_ss_t *ss);
/*
* px_err_cb_intr:
* o lock
* o create derr
* o px_err_cmn_intr
* o unlock
* o handle error: fatal? fm_panic() : return INTR_CLAIMED)
*/
{
int err;
/* Create the derr */
goto done;
done:
return (DDI_INTR_CLAIMED);
}
/*
* px_err_dmc_pec_intr:
* o lock
* o create derr
* o px_err_cmn_intr(leaf, without cb)
* o pcie_scan_fabric (leaf)
* o unlock
* o handle error: fatal? fm_panic() : return INTR_CLAIMED)
*/
{
/* Create the derr */
goto done;
/* Check all child devices for errors */
/* Set the interrupt state to idle */
done:
return (DDI_INTR_CLAIMED);
}
/*
* Proper csr_base is responsibility of the caller. (Called from px_lib_dev_init
* via px_err_reg_setup_all for pcie error registers; called from
*
* Note: reg_id is passed in instead of reg_desc since this function is called
* from px_lib4u.c, which doesn't know about the structure of the table.
*/
void
{
/* Enable logs if it exists */
/*
* For readability, in code you set 1 to enable an interrupt.
* But in Fire it's backwards: you set 1 to *disable* an intr.
* Reverse the user tunable intr mask field.
*
* Disable All Errors
* Clear All Errors
* Enable Errors
*/
}
}
void
{
}
/*
* Set up pcie error registers.
*/
void
{
/*
* JBC or XBC are enabled during adding of common block interrupts,
* not done here.
*/
}
}
/*
* px_err_cmn_intr:
* Common function called by trap, mondo and fabric intr.
* o Snap shot current fire registers
* o check for safe access
* o send ereport and clear snap shot registers
* o create and queue RC info for later use in fabric scan.
* o check severity of snap shot registers
*
* @param px_p leaf in which to check access
* @param derr fm err data structure to be updated
* @param caller PX_TRAP_CALL | PX_INTR_CALL
* @param block PX_FM_BLOCK_HOST | PX_FM_BLOCK_PCIE | PX_FM_BLOCK_ALL
* @return err PX_NO_PANIC | PX_PANIC | PX_HW_RESET | PX_PROTECTED
*/
int
{
int err;
/* check for safe access */
/* snap shot the current fire registers */
/* check for error severity */
/* Mark the On Trap Handle if an error occurred */
if (err != PX_NO_ERROR) {
}
return (err);
}
/*
* Static function
*/
/*
* px_err_snapshot:
* Take a current snap shot of all the fire error registers. This includes
*
* @param px_p leaf in which to take the snap shot.
* @param ss pre-allocated memory to store the snap shot.
*/
static void
{
continue;
if ((block & PX_FM_BLOCK_HOST) &&
else if ((block & PX_FM_BLOCK_PCIE) &&
else {
continue;
}
}
}
/*
* px_err_erpt_and_clr:
* This function does the following things to all the fire registers based
* on an earlier snap shot.
* o Send ereport
* o Handle the error
* o Clear the error
*
* @param px_p leaf in which to take the snap shot.
* @param derr fm err in which the ereport is to be based on
* @param ss_p pre-allocated memory to store the snap shot.
*/
static int
{
int (*err_handler)();
int (*erpt_handler)();
int biterr = 0;
/* Get the correct register description table */
/* Only look at enabled groups. */
continue;
/* Get the correct CSR BASE */
/* If there are no errors in this register, continue */
if (!ss_reg)
continue;
/* Get pointers to masks and register addresses */
/* Get the register BIT description table */
/* For each known bit in the register send erpt and handle */
/*
* If the ss_reg is set for this bit,
* send ereport and handle
*/
continue;
/* Increment the counter if necessary */
err_bit_desc->counter++;
}
/* Error Handle for this bit */
if (err_handler) {
}
/*
* Send the ereport if it's an UNEXPECTED err.
* This is the only place where PX_EXPECTED is utilized.
*/
(biterr == PX_EXPECTED))
continue;
if (erpt_handler)
}
/* Clear the register and error */
}
return (err);
}
/*
* px_err_check_severity:
* Check the severity of the fire error based on an earlier snapshot
*
* @param px_p leaf in which to take the snap shot.
* @param derr fm err in which the ereport is to be based on
* @param err fire register error status
* @param caller PX_TRAP_CALL | PX_INTR_CALL | PX_LIB_CALL
*/
static int
{
/*
* Nothing to do if called with no error.
* The err could have already been set to PX_NO_PANIC, which means the
*/
if (err == PX_NO_ERROR)
return (err);
/* Cautious access error handling */
case DDI_FM_ERR_EXPECTED:
if (caller == PX_TRAP_CALL) {
/*
* for ddi_caut_get treat all events as nonfatal
* The trampoline will set err_ena = 0,
* err_status = NONFATAL.
*/
is_safeacc = B_TRUE;
} else {
/*
* For ddi_caut_put treat all events as nonfatal. Here
* we have the handle and can call ndi_fm_acc_err_set().
*/
is_safeacc = B_TRUE;
}
break;
case DDI_FM_ERR_PEEK:
case DDI_FM_ERR_POKE:
/*
*/
is_safeacc = B_TRUE;
break;
default:
}
/* re-adjust error status from safe access, forgive all errors */
if (is_safeacc)
return (PX_NO_PANIC);
return (err);
}
/* predefined convenience functions */
/* ARGSUSED */
void
{
"Bit %d, %s, at %s(0x%x) has occured %d times with a severity "
"of \"%s\"\n",
}
/* ARGSUSED */
int
{
if (px_log & PX_HW_RESET) {
"HW RESET");
}
return (PX_HW_RESET);
}
/* ARGSUSED */
int
{
}
return (PX_PANIC);
}
/* ARGSUSED */
int
{
if (px_log & PX_PROTECTED) {
"PROTECTED");
}
return (PX_PROTECTED);
}
/* ARGSUSED */
int
{
if (px_log & PX_NO_PANIC) {
"NO PANIC");
}
return (PX_NO_PANIC);
}
/* ARGSUSED */
int
{
if (px_log & PX_NO_ERROR) {
"NO ERROR");
}
return (PX_NO_ERROR);
}
/* ARGSUSED */
{
return (PX_NO_ERROR);
}
/*
* Search the px_cb_list_t embedded in the px_cb_t for the
* px_t of the specified Leaf (leaf_id). Return its associated dip.
*/
static dev_info_t *
{
int i;
OBERON_PORT_ID_LEAF_MASK) == leaf_id) {
}
}
return (NULL);
}
/* UBC FATAL - see io erpt doc, section 1.1 */
/* ARGSUSED */
{
int unum_length;
unum[0] = '\0';
if (eid == UBC_EID_MEM) {
if (&plat_get_mem_unum) {
if ((plat_get_mem_unum(0,
FM_MAX_CLASS, &unum_length)) != 0)
unum[0] = '\0';
}
} else if (eid == UBC_EID_CPU) {
(void) fm_fmri_cpu_set(resource,
&cpu_version, sbuf);
}
}
/*
* For most of the errors represented in the UBC Interrupt Status
* register, one can compute the dip of the actual Leaf that was
* involved in the error. To do this, find the px_cb_t structure
* that is shared between a pair of Leaves (eg, LeafA and LeafB).
*
* If any of the error bits for LeafA are set in the hardware
* register, search the list of px_t's rooted in the px_cb_t for
* the one corresponding to LeafA. If error bits for LeafB are set,
* search the list for LeafB's px_t. The px_t references its
* associated dip.
*/
/* read hardware register */
if ((ubc_intr_status & UBC_INTERRUPT_STATUS_LEAFA) != 0) {
/* then Leaf A is involved in the error */
rpdip = actual_dip;
} else if ((ubc_intr_status & UBC_INTERRUPT_STATUS_LEAFB) != 0) {
/* then Leaf B is involved in the error */
rpdip = actual_dip;
} /* else error cannot be associated with a Leaf */
if (resource) {
NULL);
} else {
NULL);
}
return (PX_NO_PANIC);
}
/* JBC FATAL */
{
NULL);
return (PX_NO_PANIC);
}
/* JBC MERGE */
{
NULL);
return (PX_NO_PANIC);
}
/*
* JBC Merge buffer retryable errors:
* Merge buffer parity error (rd_buf): PIO or DMA
* Merge buffer parity error (wr_buf): PIO or DMA
*/
/* ARGSUSED */
int
{
/*
* Holder function to attempt error recovery. When the features
* are in place, look up the address of the transaction in:
*
* paddr = CSR_XR(csr_base, MERGE_TRANSACTION_ERROR_LOG);
* paddr &= MERGE_TRANSACTION_ERROR_LOG_ADDRESS_MASK;
*
* If the error is a secondary error, there is no log information
* just panic as it is unknown which address has been affected.
*
* Remember the address is pretranslation and might be hard to look
* up the appropriate driver based on the PA.
*/
}
/* JBC Jbusint IN */
{
NULL);
return (PX_NO_PANIC);
}
/*
* JBC Jbusint IN retryable errors
* Log Reg[42:0].
* Write Data Parity Error: PIO Writes
* Read Data Parity Error: DMA Reads
*/
int
{
/*
* Holder function to attempt error recovery. When the features
* are in place, look up the address of the transaction in:
*
* paddr = CSR_XR(csr_base, JBCINT_IN_TRANSACTION_ERROR_LOG);
* paddr &= JBCINT_IN_TRANSACTION_ERROR_LOG_ADDRESS_MASK;
*
* If the error is a secondary error, there is no log information
* just panic as it is unknown which address has been affected.
*
* Remember the address is pretranslation and might be hard to look
* up the appropriate driver based on the PA.
*/
}
/* JBC Jbusint Out */
{
NULL);
return (PX_NO_PANIC);
}
/* JBC Dmcint ODCD */
{
NULL);
return (PX_NO_PANIC);
}
/*
* JBC Dmcint ODCO nonfatal error handling -
* PIO data parity error: PIO
*/
/* ARGSUSED */
int
{
/*
* Holder function to attempt error recovery. When the features
* are in place, look up the address of the transaction in:
*
* paddr = CSR_XR(csr_base, DMCINT_ODCD_ERROR_LOG);
* paddr &= DMCINT_ODCD_ERROR_LOG_ADDRESS_MASK;
*
* If the error is a secondary error, there is no log information
* just panic as it is unknown which address has been affected.
*
* Remember the address is pretranslation and might be hard to look
* up the appropriate driver based on the PA.
*/
}
/* Does address in DMCINT error log register match address of pcitool access? */
static boolean_t
{
return (pcitool_addr == errlog_addr);
}
/*
* JBC Dmcint ODCD error handling for errors which are forgivable during a safe
* access. (This will most likely be a PCItool access.) If not a safe
* access context, treat like jbc_dmcint_odcd.
* Unmapped PIO read error: pio:read:M:nonfatal
* Unmapped PIO write error: pio:write:M:nonfatal
*/
/* ARGSUSED */
int
{
if (!pri)
/*
* Got an error which is forgivable during a PCItool access.
*
* Don't do handler check since the error may otherwise be unfairly
* attributed to a device. Just return.
*
* Note: There is a hole here in that a legitimate error can come in
* while a PCItool access is in play and be forgiven. This is possible
* though not likely.
*/
}
/* JBC Dmcint IDC */
{
NULL);
return (PX_NO_PANIC);
}
/* JBC CSR */
{
"jbc-error-reg", DATA_TYPE_UINT64,
NULL);
return (PX_NO_PANIC);
}
/* DMC IMU RDS */
{
NULL);
return (PX_NO_PANIC);
}
/* handle EQ overflow */
/* ARGSUSED */
int
{
} else {
}
}
/* DMC IMU SCS */
{
NULL);
return (PX_NO_PANIC);
}
/* DMC IMU */
{
NULL);
return (PX_NO_PANIC);
}
{
if (pri) {
/* Only PIO Fault Addresses are valid, this is DMA */
}
NULL);
return (PX_NO_PANIC);
}
/* DMC MMU */
{
NULL);
return (PX_NO_PANIC);
}
/*
* IMU function to handle all Received but Not Enabled errors.
*
* These errors are due to transaction modes that the PX driver was not
* set up to handle. If possible, inform the driver that their DMA has
* failed by marking their DMA handle as failed, but do not panic the system.
* Most likely the address is not valid, as Fire wasn't set up to handle them in
* the first place.
*
* These errors are not retryable, unless the PX mode has changed, otherwise the
* same error will occur again.
*/
int
{
goto done;
bdf);
done:
}
/*
* IMU function to handle all invalid address errors.
*
* These errors are due to transactions in which the address is not recognized.
* If possible, inform the driver that all DMAs have failed by marking their DMA
* handles. Fire should not panic the system, it'll be up to the driver to
* panic. The address logged is invalid.
*
* These errors are not retryable since retrying the same transaction with the
* same invalid address will result in the same error.
*/
/* ARGSUSED */
int
{
goto done;
bdf);
done:
}
/*
* IMU function to handle normal transactions that encounter a parity error.
*
* These errors are due to transactions that encounter a parity error. If
* possible, inform the driver that their DMAs have failed and that they should
* retry. If Fire is unable to contact the leaf driver, panic the system.
* Otherwise, it'll be up to the device to determine if this is a panicable
* error.
*/
/* ARGSUSED */
int
{
goto done;
done:
if (status == PF_HDL_NOTFOUND)
else
}
/*
*/
/* ARGSUSED */
int
{
int sts;
goto done;
done:
}
/*
* TLU LUP event - if caused by power management activity, then it is expected.
* In all other cases, it is an error.
*/
/* ARGSUSED */
int
{
/*
* power management code is currently the only segment that sets
* px_lup_pending to indicate its expectation for a healthy LUP
* event. For all other occasions, the LUP event should be flagged as an
* error condition.
*/
}
/*
* TLU LDN event - if caused by power management activity, then it is expected.
* In all other cases, it is an error.
*/
/* ARGSUSED */
int
{
}
/* PEC ILU none - see io erpt doc, section 3.1 */
{
NULL);
return (PX_NO_PANIC);
}
/* PCIEX UE Errors */
/* ARGSUSED */
int
{
int err;
/*
* Log the Received Log for PTLP, UR and UC.
*/
err_bit) {
}
} else {
}
} else {
}
}
/* PCI-E Uncorrectable Errors */
{
NULL);
return (PX_NO_PANIC);
}
/* PCI-E Uncorrectable Errors */
{
NULL);
return (PX_NO_PANIC);
}
/* PCI-E Uncorrectable Errors */
{
NULL);
return (PX_NO_PANIC);
}
/* PCI-E Uncorrectable Errors */
{
NULL);
return (PX_NO_PANIC);
}
/* PCIEX UE Errors */
/* ARGSUSED */
int
{
int err;
else
} else {
}
}
/* PCI-E Correctable Errors - see io erpt doc, section 3.6 */
{
NULL);
return (PX_NO_PANIC);
}
/* TLU Other Event Status (receive only) - see io erpt doc, section 3.7 */
{
NULL);
return (PX_NO_PANIC);
}
/* TLU Other Event Status (rx + tx) - see io erpt doc, section 3.8 */
{
int sts;
if ((bit == TLU_OTHER_EVENT_STATUS_SET_RUC_P) ||
(bit == TLU_OTHER_EVENT_STATUS_SET_WUC_P)) {
/* get completer bdf (fault bdf) from rx logs */
/* get fault addr from tx logs */
if (sts == DDI_SUCCESS)
}
NULL);
return (PX_NO_PANIC);
}
/* TLU Other Event - see io erpt doc, section 3.9 */
{
NULL);
return (PX_NO_PANIC);
}