px_err.c revision 0168954460bd77d83497a4a6aa9c3f34c55dba25
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* sun4u Fire Error Handling
*/
#include <sys/pcie_impl.h>
#include "px_obj.h"
#include <px_regs.h>
#include <px_csr.h>
#include "pcie_pwr.h"
#include "px_lib4u.h"
#include "px_err.h"
#include "px_err_impl.h"
/*
* JBC error bit table
*/
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_JBC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* JBC FATAL - see io erpt doc, section 1.1 */
/* JBC MERGE - see io erpt doc, section 1.2 */
/* JBC Jbusint IN - see io erpt doc, section 1.3 */
/* JBC Jbusint Out - see io erpt doc, section 1.4 */
/* JBC Dmcint ODCD - see io erpt doc, section 1.5 */
/* JBC Dmcint IDC - see io erpt doc, section 1.6 */
/* JBC CSR - see io erpt doc, section 1.7 */
};
/*
 * Entry count of px_err_cb_tbl: the table's total size divided by the
 * size of one px_err_bit_desc_t.  The whole expansion is parenthesized
 * so the quotient stays intact when used inside a larger expression.
 */
#define px_err_cb_keys \
((sizeof (px_err_cb_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* DMC error bit tables
*/
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_DMC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* DMC IMU RDS - see io erpt doc, section 2.1 */
/* DMC IMU SCS - see io erpt doc, section 2.2 */
/* DMC IMU - see io erpt doc, section 2.3 */
};
/* mmu errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_DMC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* DMC MMU - see io erpt doc, section 2.5 */
};
/*
* PEC error bit tables
*/
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* PEC ILU none - see io erpt doc, section 3.1 */
};
/*
 * Entry count of px_err_ilu_tbl: the table's total size divided by the
 * size of one px_err_bit_desc_t.  The whole expansion is parenthesized
 * so the quotient stays intact when used inside a larger expression.
 */
#define px_err_ilu_keys \
((sizeof (px_err_ilu_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* PEC UE errors implementation is incomplete pending PCIE generic
* fabric rules. Must handle both PRIMARY and SECONDARY errors.
*/
/* pec ue errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* PCI-E Receive Uncorrectable Errors - see io erpt doc, section 3.2 */
/* PCI-E Transmit Uncorrectable Errors - see io erpt doc, section 3.3 */
/* Other PCI-E Uncorrectable Errors - see io erpt doc, section 3.5 */
/* Not used */
};
/*
 * Entry count of px_err_tlu_ue_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_tlu_ue_keys \
((sizeof (px_err_tlu_ue_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* PEC CE errors implementation is incomplete pending PCIE generic
* fabric rules.
*/
/* pec ce errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/* PCI-E Correctable Errors - see io erpt doc, section 3.6 */
};
/*
 * Entry count of px_err_tlu_ce_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_tlu_ce_keys \
((sizeof (px_err_tlu_ce_tbl)) / (sizeof (px_err_bit_desc_t)))
/* pec oe errors */
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
PX_ERR_PEC_CLASS(bit) }, \
0, \
PX_ERR_BIT_HANDLE(hdl), \
PX_ERPT_SEND(erpt), \
/*
* TLU Other Event Status (receive only) - see io erpt doc, section 3.7
*/
/* TLU Other Event Status (rx + tx) - see io erpt doc, section 3.8 */
/* TLU Other Event - see io erpt doc, section 3.9 */
};
/*
 * Entry count of px_err_tlu_oe_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_tlu_oe_keys \
((sizeof (px_err_tlu_oe_tbl)) / (sizeof (px_err_bit_desc_t)))
/*
* All the following tables below are for LPU Interrupts. These interrupts
* are *NOT* error interrupts, but event status interrupts.
*
* These events are probably of most interest to:
* o Hotplug
* o Power Management
* o etc...
*
* There are also a few events that would be interesting for FMA.
* Again, none of the registers below state that an error has occurred
* or that data has been lost. If anything, they give status that an
* error is *about* to occur. Examples:
* o INT_SKP_ERR - indicates the clock between fire and child is too far
* off and is most likely unable to compensate
* o INT_TX_PAR_ERR - A parity error occurred in ONE lane. This is
* HW recoverable, but will likely end up as a future
* fabric error as well.
*
* For now, we don't care about any of these events; they should be
* ignored, but cleared.
*/
/* LPU Link Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Entry count of px_err_lpul_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_lpul_keys \
((sizeof (px_err_lpul_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Physical Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Entry count of px_err_lpup_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_lpup_keys \
((sizeof (px_err_lpup_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Receive Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Entry count of px_err_lpur_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_lpur_keys \
((sizeof (px_err_lpur_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Transmit Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Entry count of px_err_lpux_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_lpux_keys \
((sizeof (px_err_lpux_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU LTSSM Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Entry count of px_err_lpus_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_lpus_keys \
((sizeof (px_err_lpus_tbl)) / (sizeof (px_err_bit_desc_t)))
/* LPU Gigablaze Glue Interrupt Table */
0, \
NULL, \
NULL, \
""
};
/*
 * Entry count of px_err_lpug_tbl: the table's total size divided by
 * the size of one px_err_bit_desc_t.  The whole expansion is
 * parenthesized so the quotient stays intact inside larger expressions.
 */
#define px_err_lpug_keys \
((sizeof (px_err_lpug_tbl)) / (sizeof (px_err_bit_desc_t)))
/* Mask and Tables */
B_FALSE, \
0
/* LPU Registers Addresses */
NULL, \
/* LPU Registers Addresses with Irregularities */
NULL, \
/* TLU Registers Addresses */
/* Registers Addresses for JBC, MMU, IMU and ILU */
pre ## _ERROR_LOG_ENABLE, \
pre ## _INTERRUPT_ENABLE, \
pre ## _INTERRUPT_STATUS, \
/*
* Register error handling tables.
* The ID Field (first field) is identified by an enum px_err_id_t.
* It is located in px_err.h
*/
};
/*
 * px_err_ss_t: storage for a snapshot of the fire error registers.
 * A pointer to it is passed as the "ss" argument, described in the
 * px_err_snapshot and px_err_erpt_and_clr comments as pre-allocated
 * memory to store the snap shot.
 */
typedef struct px_err_ss {
} px_err_ss_t;
px_err_ss_t *ss);
/*
* px_err_cb_intr:
* Interrupt handler for the JBC block.
* o lock
* o create derr
* o px_err_handle(leaf1, with jbc)
* o px_err_handle(leaf2, without jbc)
* o dispatch (leaf1)
* o dispatch (leaf2)
* o unlock
* o handle error: fatal? fm_panic() : return INTR_CLAIMED)
*/
{
int fatal = 0;
/* Create the derr */
switch (ret) {
case DDI_FM_FATAL:
fatal++;
break;
case DDI_FM_NONFATAL:
case DDI_FM_UNKNOWN:
default:
break;
}
/* Set the intr state to idle for the leaf that received the mondo */
/*
* A PX_FATAL_HW error is diagnosed after the system has recovered from
* a HW initiated reset, therefore no further handling is required.
*/
PX_FM_PANIC("Fatal System Bus Error has occurred\n");
return (DDI_INTR_CLAIMED);
}
/*
* px_err_dmc_pec_intr:
* o lock
* o create derr
* o px_err_handle(leaf, with jbc)
* o dispatch (leaf)
* o unlock
* o handle error: fatal? fm_panic() : return INTR_CLAIMED)
*/
{
/* Create the derr */
/* Check all child devices for errors */
/* Set the interrupt state to idle */
/*
* PX_FATAL_HW indicates a condition recovered from Fatal-Reset,
* therefore it does not cause panic.
*/
PX_FM_PANIC("Fatal System Port Error has occurred\n");
return (DDI_INTR_CLAIMED);
}
/*
* Error registers are handled by the px_hlib xxx_init functions.
* They are also called again by px_err_add_intr for mondo62 and 63
* from px_cb_attach and px_attach
*/
void
{
if (id == PX_ERR_JBC)
else
/* Enable logs if it exists */
/*
* For readability, in code you set 1 to enable an interrupt.
* But in Fire it's backwards. You set 1 to *disable* an intr.
* Reverse the user tunable intr mask field.
*
* Disable All Errors
* Clear All Errors
* Enable Errors
*/
}
}
void
{
if (id == PX_ERR_JBC)
else
switch (id) {
case PX_ERR_JBC:
case PX_ERR_MMU:
case PX_ERR_IMU:
case PX_ERR_TLU_UE:
case PX_ERR_TLU_CE:
case PX_ERR_TLU_OE:
case PX_ERR_ILU:
}
break;
case PX_ERR_LPU_LINK:
case PX_ERR_LPU_PHY:
case PX_ERR_LPU_RX:
case PX_ERR_LPU_TX:
case PX_ERR_LPU_LTSSM:
case PX_ERR_LPU_GIGABLZ:
}
break;
}
}
/*
* px_err_handle:
* Common function called by trap, mondo and fabric intr.
* o Snap shot current fire registers
* o check for safe access
* o send ereport and clear snap shot registers
* o check severity of snap shot registers
*
* @param px_p leaf in which to check access
* @param derr fm err data structure to be updated
* @param caller PX_TRAP_CALL | PX_INTR_CALL
* @param chkjbc whether to handle jbc registers
* @return err PX_OK | PX_NONFATAL |
* PX_FATAL_GOS | PX_FATAL_HW | PX_STUCK_FATAL
*/
int
{
/* snap shot the current fire registers */
/* check for safe access */
/* check for error severity */
/* Mark the On Trap Handle if an error occurred */
}
return (err);
}
/*
* Static function
*/
/*
* px_err_snapshot:
* Take a current snap shot of all the fire error registers. This includes
* JBC, DMC, and PEC, unless chkjbc == false;
*
* @param px_p leaf in which to take the snap shot.
* @param ss pre-allocated memory to store the snap shot.
* @param chkjbc boolean on whether to store jbc register.
*/
static void
{
int reg_id;
/* snapshot JBC interrupt status */
reg_id = PX_ERR_JBC;
} else {
}
}
}
/*
* px_err_erpt_and_clr:
* This function does the following thing to all the fire registers based
* on an earlier snap shot.
* o Send ereport
* o Handle the error
* o Clear the error
*
* @param px_p leaf in which to take the snap shot.
* @param derr fm err in which the ereport is to be based on
* @param ss pre-allocated memory to store the snap shot.
*/
static int
{
int (*err_handler)();
int (*erpt_handler)();
int biterr;
/* Get the correct register description table */
/* Get the correct CSR BASE */
if (reg_id == PX_ERR_JBC) {
} else {
}
/* Get pointers to masks and register addresses */
/* Get the register BIT description table */
/* For each known bit in the register send erpt and handle */
/* Get the bit description table for this register */
/*
* If the ss_reg is set for this bit,
* send ereport and handle
*/
/* Increment the counter if necessary */
err_bit_desc->counter++;
}
/* Error Handle for this bit */
if (err_handler) {
derr,
}
/* Send the ereport if it's an UNEXPECTED err */
if (erpt_handler)
(void) erpt_handler(rpdip,
derr,
}
}
}
/* Print register status */
/* Clear the register and error */
}
return (err);
}
/*
* px_err_check_severity:
* Check the severity of the fire error based on an earlier snapshot
*
* @param px_p leaf in which to take the snap shot.
* @param derr fm err in which the ereport is to be based on
* @param ss pre-allocated memory to store the snap shot.
*/
static int
{
/* nothing to do if called with no error */
return (err);
/* Cautious access error handling */
case DDI_FM_ERR_EXPECTED:
if (caller == PX_TRAP_CALL) {
/*
* for ddi_caut_get treat all events as nonfatal
* The trampoline will set err_ena = 0,
* err_status = NONFATAL.
*/
is_safeacc = B_TRUE;
} else {
/*
* For ddi_caut_put treat all events as nonfatal. Here
* we have the handle and can call ndi_fm_acc_err_set().
*/
is_safeacc = B_TRUE;
}
break;
case DDI_FM_ERR_PEEK:
case DDI_FM_ERR_POKE:
/*
*/
is_safeacc = B_TRUE;
break;
default:
}
/*
* The third argument "err" is passed in as error status from checking
* Fire register, re-adjust error status from safe access.
*/
return (PX_NONFATAL);
return (err);
}
/* predefined convenience functions */
/* ARGSUSED */
int
{
return (PX_FATAL_HW);
}
/* ARGSUSED */
int
{
return (PX_FATAL_GOS);
}
/* ARGSUSED */
int
{
return (PX_STUCK_FATAL);
}
/* ARGSUSED */
int
{
return (PX_FATAL_SW);
}
/* ARGSUSED */
int
{
return (PX_NONFATAL);
}
/* ARGSUSED */
int
{
return (PX_OK);
}
/* ARGSUSED */
int
{
return (PX_ERR_UNKNOWN);
}
/* ARGSUSED */
{
return (PX_OK);
}
/* JBC FATAL - see io erpt doc, section 1.1 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* JBC MERGE - see io erpt doc, section 1.2 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/*
* JBC Merge buffer nonfatal errors:
* Merge buffer parity error (rd_buf): dma:read:M:nonfatal
* Merge buffer parity error (wr_buf): dma:write:M:nonfatal
*/
/* ARGSUSED */
int
{
int ret;
if (!pri)
return (PX_FATAL_GOS);
}
/* JBC Jbusint IN - see io erpt doc, section 1.3 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/*
* JBC Jbusint IN nonfatal errors: PA logged in Jbusint In Transaction Error
* Log Reg[42:0].
* CE async fault error: nonfatal
* Jbus bus error: dma::nonfatal
* Jbus unmapped error: pio|dma:rdwr:M:nonfatal
* Illegal NCWR bytemask: pio:write:M:nonfatal
* Illegal NCRD bytemask: pio:write:M:nonfatal
* Invalid jbus transaction: nonfatal
*/
/* ARGSUSED */
int
{
int ret;
if (!pri)
return (PX_FATAL_GOS);
}
/* JBC Jbusint Out - see io erpt doc, section 1.4 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* JBC Dmcint ODCD - see io erpt doc, section 1.5 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/*
* JBC Dmcint ODCD nonfatal error handling -
* Unmapped PIO read error: pio:read:M:nonfatal
* Unmapped PIO write error: pio:write:M:nonfatal
* PIO data parity error: pio:write:M:nonfatal
*/
/* ARGSUSED */
int
{
int ret;
if (!pri)
return (PX_FATAL_GOS);
}
/* JBC Dmcint IDC - see io erpt doc, section 1.6 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* JBC CSR - see io erpt doc, section 1.7 */
{
char buf[FM_MAX_CLASS];
"jbc-error-reg", DATA_TYPE_UINT64,
NULL);
return (PX_OK);
}
/*
* JBC CSR error handling -
* Ebus ready timeout error: pio:rdwr:M:nonfatal
*/
/* ARGSUSED */
int
{
int ret;
if (!pri)
return (PX_FATAL_GOS);
}
/* JBC Dmcint IDC - see io erpt doc, section 1.6 */
/* DMC IMU RDS - see io erpt doc, section 2.1 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* imu function to handle all Received but Not Enabled errors */
/* ARGSUSED */
int
{
int err = PX_NONFATAL;
/*
* If matching bit is not set, meaning corresponding rbne not
* enabled, then receiving it indicates some sort of malfunction
* possibly in hardware.
*
* Otherwise, software may have intentionally disabled certain
* errors for a period of time, within which any occurrence of the
* disabled errors becomes rbne, which is nonfatal.
*/
err = PX_FATAL_SW;
return (err);
}
/*
* No platform uses PME. Any PME received is simply logged
* for analysis.
*/
/* ARGSUSED */
int
{
px_p->px_pme_ignored++;
return (PX_NONFATAL);
}
/* handle EQ overflow */
/* ARGSUSED */
int
{
int err = PX_NONFATAL;
int i;
for (i = 0; i < msiq_state_p->msiq_cnt; i++) {
DDI_SUCCESS) {
if (msiq_state == PCI_MSIQ_STATE_ERROR) {
err = PX_FATAL_SW;
}
}
}
return (err);
}
/* DMC IMU SCS - see io erpt doc, section 2.2 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* DMC IMU - see io erpt doc, section 2.3 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* DMC MMU - see io erpt doc, section 2.5 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* mmu function to handle all Received but Not Enabled errors */
int
{
uint64_t mmu_enable_bit = 0;
int err = PX_NONFATAL;
int ret;
switch (err_bit_descr->bit) {
break;
break;
default:
mmu_enable_bit = 0;
break;
}
/*
* If the interrupts are enabled and the Translation/Bypass Enable bit
* was set, then panic. This error should not have occurred.
*/
if (mmu_log_enable & mmu_intr_enable &
(mmu_ctrl & mmu_enable_bit)) {
err = PX_FATAL_SW;
} else {
if (!pri)
return (PX_FATAL_GOS);
/*
* S/W bug - this error should always be enabled
*/
/* enable error & intr reporting for this bit */
}
return (err);
}
/* Generic error handling functions that involve MMU Translation Fault Addr */
/* ARGSUSED */
int
{
if (!pri)
return (PX_FATAL_GOS);
}
/* MMU Table walk errors */
/* ARGSUSED */
int
{
if (!pri)
return (PX_FATAL_GOS);
}
/*
* TLU LUP event - if caused by power management activity, then it is expected.
* In all other cases, it is an error.
*/
/* ARGSUSED */
int
{
/*
* Power management code is currently the only segment that sets
* px_lup_pending to indicate its expectation for a healthy LUP
* event. On all other occasions, a LUP event should be flagged as
* an error condition.
*/
PX_NONFATAL : PX_OK);
}
/*
* TLU LDN event - if caused by power management activity, then it is expected.
* In all other cases, it is an error.
*/
/* ARGSUSED */
int
{
}
/* PEC ILU none - see io erpt doc, section 3.1 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* PCIEX UE Errors */
/* ARGSUSED */
int
{
}
/* PCI-E Uncorrectable Errors - see io erpt doc, section 3.2 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* PCI-E Uncorrectable Errors - see io erpt doc, section 3.3 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* PCI-E Uncorrectable Errors - see io erpt doc, section 3.4 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* PCI-E Uncorrectable Errors - see io erpt doc, section 3.5 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* PCIEX UE Errors */
/* ARGSUSED */
int
{
}
/* PCI-E Correctable Errors - see io erpt doc, section 3.6 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* TLU Other Event Status (receive only) - see io erpt doc, section 3.7 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* TLU Other Event Status (rx + tx) - see io erpt doc, section 3.8 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}
/* TLU Other Event - see io erpt doc, section 3.9 */
{
char buf[FM_MAX_CLASS];
NULL);
return (PX_OK);
}