pcie_fault.c revision 0c5eba8c5970fdedca3397ca86830ae5db5d98eb
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/pcie_impl.h>
/* size of error queue */
#define PF_SAER_FATAL_ERR (PCIE_AER_SUCE_USC_MSG_DATA_ERR | \
#define PF_SAER_NON_FATAL_ERR (PCIE_AER_SUCE_TA_ON_SC | \
#define PF_DATA_NOT_FOUND -1
#define GET_SAER_CMD(pf_data_p) \
#define CE_ADVISORY(pf_data_p) \
/* PCIe Fault Fabric Error analysis table */
typedef struct pf_fab_err_tbl {
int (*handler)(); /* Error handling function */
/* PCIe Fault Support Functions. */
/* PCIe Fabric Handle Lookup Support Functions. */
pf_data_t *q, int last_index);
int
{
}
{
return (B_FALSE);
return (B_TRUE);
}
void
{
}
/*
* SPARC PCI-E platforms
*/
/* Called during postattach to initialize FM lock */
void
{
int cap = DDI_FM_EREPORT_CAPABLE;
if (fmhdl) {
} else {
if (cmd == DDI_ATTACH)
}
/* If ddi_fm_init fails for any reason RETURN */
ppd_p->ppd_fm_flags = 0;
return;
}
}
/* undo OPL FMA lock, called at predetach */
void
{
/* Don't fini anything if device isn't FM Ready */
return;
/* undo non-hardened drivers */
if (cmd == DDI_DETACH) {
}
}
/* no other code should set the flag to false */
}
static boolean_t
{
/* check if given bdf falls within bridge's bus range */
if ((hdr_type == PCI_HEADER_ONE) &&
return (B_TRUE);
else
return (B_FALSE);
}
/*
* "addr" is in the assigned addr of a device.
*/
static boolean_t
{
/* check if given address belongs to this device */
return (B_TRUE);
}
/* check if given address belongs to a child below this device */
return (B_TRUE);
break;
}
}
}
return (B_FALSE);
}
int
{
/* for bridge, check all downstream */
/* make sure dip is attached, ie. fm_ready */
continue;
if (sts & PF_DO_NOT_SCAN)
continue;
}
return (ret);
}
int
{
/* Make sure dip is attached and fm_ready */
continue;
if (sts & PF_DO_NOT_SCAN)
continue;
} else {
continue;
}
/* match or in bridge bus-range */
switch (ppd_p->ppd_dev_type) {
return (ret);
case PCIE_PCIECAP_DEV_TYPE_UP:
/* FALLTHROUGH */
return (ret);
default:
}
}
return (ret);
}
/*
* Called by the RC to scan the fabric.
*
* After all the necessary fabric devices are scanned, the error queue will be
* analyzed for error severity and ereports will be sent.
*/
int
{
int last_rc_index = *dq_tail_p;
i = 0;
/*
* Scan the fabric using the fault_bdf and fault_addr in error q.
* fault_bdf will be valid in the following cases:
* - Fabric message
* - Poisoned TLP
* - PIO load failures
*/
rc_pf_data_p++, i++) {
*dq_tail_p) == PF_DATA_NOT_FOUND) ||
}
/* If this is due to safe access, don't analyse the errors and return */
ret = DDI_SUCCESS;
sts = PF_NO_PANIC;
} else {
}
*dq_tail_p = -1;
/*
* If ret is not SUCCESS that means we were not able to add 1 or more
* devices to the fault q. Since that device could have been the
* one which had an error, be conservative and panic here.
*/
if (ret != DDI_SUCCESS)
else
return (sts);
}
/*
* For each device in the fault queue ensure that no ereport is sent if that
* device was scanned as a result of a CE in one of its children.
*/
void
int i = dq_tail;
/*
* Always send ereport for the last device in a
* particular scan path.
*/
PF_NO_ERROR)) {
/*
* Since this device had a CE don't send ereport
* for parents.
*/
} else {
/* Send ereports for all parents */
}
}
}
}
void
int i;
}
}
/*
* Returns the index of the bdf if found in the PCIe Fault Data Queue
* Returns PF_DATA_NOT_FOUND if the bdf is not found.
* This function should not be called by RC.
*/
static int
{
int i;
/* Check if this is the first item in queue */
if (dq_tail == -1)
return (PF_DATA_NOT_FOUND);
for (i = dq_tail; i >= 0; i--) {
return (i);
}
return (PF_DATA_NOT_FOUND);
}
int
{
return (pf_dq_size);
}
/*
* Add PFD to queue.
* Return DDI_SUCCESS if successfully added.
* Return DDI_FAILURE if out of space or already in queue.
* Pass in pbdf = -1 if pfd is from RC.
*/
int
{
int parent_index = PF_DATA_NOT_FOUND;
if (*dq_tail_p >= (int)pf_dq_size)
return (DDI_FAILURE);
/* Look for parent BDF if pfd is not from RC and save rp_bdf */
}
*dq_tail_p += 1;
return (DDI_SUCCESS);
}
static int
{
/* Make sure this device hasn't already been snapshotted and cleared */
return (PF_SUCCESS);
/*
* could very well be a device that isn't responding anymore. Just
* stop. Save the basic info in the error q for post mortem debugging
* purposes.
*/
pbdf);
return (DDI_FAILURE);
}
if (hdr_type == PCI_HEADER_ONE) {
}
if (dev_type == PCIE_PCIECAP_DEV_TYPE_PCI_DEV) {
!= DDI_FAILURE) {
}
goto clear;
}
if (!pcie_off)
goto clear;
/*
* If a bridge does not have any error no need to scan any further down.
* For PCIe devices, check the PCIe device status and PCI secondary
* status.
* - Some non-compliant PCIe devices do not utilize PCIe
* error registers. If so rely on legacy PCI error registers.
* For PCI devices, check the PCI secondary status.
*/
if (hdr_type == PCI_HEADER_ONE) {
if ((dev_type == PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) &&
sts |= PF_DO_NOT_SCAN;
if ((dev_type == PCIE_PCIECAP_DEV_TYPE_PCI_DEV) &&
sts |= PF_DO_NOT_SCAN;
}
if (!aer_off)
goto clear;
PCIE_AER_HDR_LOG + 0x0);
PCIE_AER_HDR_LOG + 0x4);
PCIE_AER_HDR_LOG + 0x8);
PCIE_AER_HDR_LOG + 0xc);
if (dev_type == PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) {
PCIE_AER_SHDR_LOG + 0x0);
PCIE_AER_SHDR_LOG + 0x4);
PCIE_AER_SHDR_LOG + 0x8);
PCIE_AER_SHDR_LOG + 0xc);
}
/* Clear the Legacy PCI Errors */
if (hdr_type == PCI_HEADER_ONE)
if (!pcie_off)
goto queue;
/* Clear the Advanced PCIe Errors */
if (aer_off) {
}
/* Clear the PCIe Errors */
/*
* If the driver is FMA hardened and callback capable, call its
* callback function
*/
sts |= PF_FAILURE;
else
sts |= PF_SUCCESS;
}
/* Add the snapshot to the error q */
sts |= PF_FAILURE;
return (sts);
}
/*
* Function used by PCI error handlers to check if captured address is stored
* in the DMA or ACC handle caches.
* return: PF_HDL_NOTFOUND if a handle is not found
* PF_HDL_FOUND if a handle is found
*/
int
{
int found = 0;
/* If we don't know the addr or rid just return with UNKNOWN */
return (PF_HDL_NOTFOUND);
return (PF_HDL_NOTFOUND);
}
/* If we know the addr or bdf mark the handle as failed */
if (flag & PF_DMA_ADDR) {
found++;
}
if (flag & PF_PIO_ADDR) {
found++;
}
if (flag & PF_CFG_ADDR) {
found++;
}
}
/*
* Recursively search the tree for the handler that matches the given address.
* If the BDF is known, only check the handlers that are associated with the
* given BDF, otherwise search the entire tree.
*/
static int
{
int status = PF_HDL_NOTFOUND;
struct i_ddi_fmhdl *fmhdl;
struct i_ddi_fmtgt *tgt;
/* Check if dip and BDF match, if not recurse to its children. */
/* If we found the handler stop the search */
goto done;
}
/* If we can't find the handler check its children */
goto done;
}
done:
return (status);
}
/*
* Find and Mark CFG Handles as failed associated with the given BDF. We should
* always know the BDF for CFG accesses, since it is encoded in the address of
* the TLP. Since there can be multiple cfg handles, mark them all as failed.
*/
/* ARGSUSED */
static int
{
int status = PF_HDL_NOTFOUND;
/* Return NOTFOUND if this driver doesn't support ACC flagerr */
return (PF_HDL_NOTFOUND);
/* CFG space is always reg 0 */
if (hp->ah_rnumber == 0) {
}
}
return (status);
}
/*
* Find and Mark all ACC Handles associated with a given address and BDF as
* failed. If the BDF != NULL, then check to see if the device has an ACC Handle
* associated with ADDR. If the handle is not found, mark all the handles as
* failed. If the BDF == NULL, mark the handle as failed if it is associated
* with ADDR.
*/
static int
{
int status = PF_HDL_NOTFOUND;
return (PF_HDL_NOTFOUND);
/* CFG space is always reg 0, don't mark config handlers. */
if (hp->ah_rnumber == 0)
continue;
/*
* Normalize the base addr to the addr and strip off the
* HB info. All PIOs are 32 bit access only.
*/
}
}
/*
* If no handles found and we know this is the right device mark
* all the handles as failed.
*/
return (status);
}
/*
* Find and Mark all DMA Handles associated with a given address and BDF as
* failed. If the BDF != NULL, then check to see if the device has a DMA Handle
* associated with ADDR. If the handle is not found, mark all the handles as
* failed. If the BDF == NULL, mark the handle as failed if it is associated
* with ADDR.
*/
static int
{
int status = PF_HDL_NOTFOUND;
return (PF_HDL_NOTFOUND);
/*
* Mark the handle as failed if the ADDR is mapped, or if we
* know the BDF and ADDR == 0.
*/
}
}
/*
* If no handles found and we know this is the right device mark
* all the handles as failed.
*/
return (status);
}
/*
* If a PCIe device does not support AER, assume all AER statuses have been set,
* unless other registers do not indicate a certain error occurring.
*/
static void
{
return;
/* Check if the device received a PTLP */
aer_ue &= ~PCIE_AER_UCE_PTLP;
/* Check if the device signaled a CA */
aer_ue &= ~PCIE_AER_UCE_CA;
/* Check if the device sent a UR */
aer_ue &= ~PCIE_AER_UCE_UR;
/*
* Ignore ECRCs as it is optional and will manifest itself as
* another error like PTLP and MFP
*/
aer_ue &= ~PCIE_AER_UCE_ECRC;
}
aer_ue &= ~PCIE_AER_UCE_SD;
}
}
static void
{
return;
/* Check if the device received a UC_DATA */
}
}
}
}
/* Find the PCIe-PCI bridge of a PCI device */
static pf_data_t *
{
return (NULL);
return (NULL);
}
return (bdg_pf_data_p);
}
/*
* See if a leaf error was bubbled up to the RC and handled.
* Check if either the fault address found in the rc matches the device's
* assigned address range (PIO's only) or the fault BDF in the rc matches the
* device's BDF or Secondary Bus.
*/
static boolean_t
{
/* If device and rc abort type does not match continue */
continue;
/* The Fault BDF = Device's BDF */
return (B_TRUE);
/* The Fault Addr is in device's address range */
return (B_TRUE);
/* The Fault BDF is from PCIe-PCI Bridge's secondary bus */
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Decodes the TLP and returns the BDF of the handler, address and transaction
* type if known.
*
* Types of TLP logs seen in RC, and what to extract:
*
* Memory(DMA) - Requester BDF, address, PF_DMA_ADDR
* Memory(PIO) - address, PF_PIO_ADDR
* CFG - Should not occur and result in UR
* Completion(DMA) - Requester BDF, PF_DMA_ADDR
* Completion(PIO) - Requester BDF, PF_PIO_ADDR
*
*
* Memory(DMA) - Requester BDF, address, PF_DMA_ADDR
* Memory(PIO) - address, PF_PIO_ADDR
* CFG - Destined BDF, address, PF_CFG_ADDR
* Completion(DMA) - Requester BDF, PF_DMA_ADDR
* Completion(PIO) - Requester BDF, PF_PIO_ADDR
*
* If the TLP can be decoded the *bdf, *addr, and *trans_type will be populated
* with the TLP information. The caller may pass in NULL for any of the
* mentioned variables, if they are not interested in them.
*/
/* ARGSUSED */
int
{
case PCIE_TLP_TYPE_IO:
case PCIE_TLP_TYPE_MEM:
case PCIE_TLP_TYPE_MEMLK:
/* If the RID_BDF == RP_BDF, PIO, otherwise DMA */
} else {
}
break;
case PCIE_TLP_TYPE_CFG0:
case PCIE_TLP_TYPE_CFG1:
tlp_addr = 0;
break;
case PCIE_TLP_TYPE_CPL:
case PCIE_TLP_TYPE_CPLLK:
/*
* If the completer bdf == RP_BDF, DMA, otherwise PIO or a CFG
* completion.
*/
else
break;
default:
return (DDI_FAILURE);
}
if (addr)
if (trans_type)
if (bdf)
return (DDI_SUCCESS);
}
/*
* pf_pci_decode function decodes the secondary aer transaction logs in
* PCIe-PCI bridges.
*
* The log is 128 bits long and arranged in this manner.
* [0:35] Transaction Attribute (s_aer_h0-saer_h1)
* [36:39] Transaction lower command (saer_h1)
* [40:43] Transaction upper command (saer_h1)
* [44:63] Reserved
* [64:127] Address (saer_h2-saer_h3)
*/
/* ARGSUSED */
static int
switch (*cmd) {
case PCI_PCIX_CMD_MEMRD_DW:
case PCI_PCIX_CMD_MEMRD_BL:
case PCI_PCIX_CMD_MEMRDBL:
case PCI_PCIX_CMD_MEMWR:
case PCI_PCIX_CMD_MEMWR_BL:
case PCI_PCIX_CMD_MEMWRBL:
/*
* Could be DMA or PIO. Find out by looking at the requesting bdf.
* If the requester is the RC, then it's a PIO, otherwise, DMA
*/
*bdf = 0;
} else {
}
break;
case PCI_PCIX_CMD_CFRD:
case PCI_PCIX_CMD_CFWR:
/*
* CFG Access should always be down stream. Match the BDF in
* the address phase.
*/
*addr = 0;
break;
case PCI_PCIX_CMD_SPL:
/*
* Check for DMA read completions. The requesting BDF is in the
* Address phase.
*/
*addr = 0;
break;
default:
*addr = 0;
*bdf = 0;
*trans_type = 0;
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
/*
* For this function only the Primary AER Header Logs need to be valid in the
* pfd (PCIe Fault Data) arg.
*/
int
{
int err = PF_HDL_NOTFOUND;
DDI_SUCCESS) {
hdl_bdf);
}
return (err);
}
/*
* Last function called for PF Scan Fabric.
* Sends ereports for all devices that are not dev_type = RC.
* Will also unlock all the mutexes grabbed during fabric scan.
*/
/* ARGSUSED */
static void
int dq_tail)
{
char buf[FM_MAX_CLASS];
i = 0;
/*
* Search through the error queue and look for the number of pf_data
* from the RC and if the queue contains any errors. All the pf_data's
* from the RC will only be at the top of the queue.
*/
total--;
} else {
if (hasError)
break;
break;
}
}
}
i = dq_tail;
continue;
goto unlock;
"pcix_bdg_sts_reg", DATA_TYPE_UINT32,
NULL);
}
}
/*
* Ignore:
* - TRAINING: as leaves do not have children
* - SD: as leaves do not have children
*/
const pf_fab_err_tbl_t pcie_pcie_tbl[] = {
};
const pf_fab_err_tbl_t pcie_sw_tbl[] = {
};
const pf_fab_err_tbl_t pcie_pcie_bdg_tbl[] = {
};
const pf_fab_err_tbl_t pcie_pci_bdg_tbl[] = {
};
const pf_fab_err_tbl_t pcie_pci_tbl[] = {
};
/*
* Analyse all the PCIe Fault Data (pfd) gathered during dispatch in the pfd
* Queue.
*/
static int
int dq_tail)
{
pfd_err = 0;
break;
case PCIE_PCIECAP_DEV_TYPE_UP:
break;
/* Do not analyse RC info as it has already been done */
pfd_err |= PF_MATCHED_RC;
break;
if ((PCIE_DEVSTS_NFE_DETECTED |
& pf_data_p->dev_status) {
break;
}
/*
* Some non-compliant PCIe devices do not utilize PCIe
* error registers. So fallthrough and rely on legacy
* PCI error registers.
*/
/* FALLTHROUGH */
break;
}
}
return (err);
}
static int
const pf_fab_err_tbl_t *row;
int err = 0;
row++) {
}
if (!err)
err = PF_NO_ERROR;
return (err);
}
/*
* PCIe Completer Abort and Unsupported Request error analyser. If a PCIe device
* this error may be safely ignored. If not check the logs and see if an
* associated handler for this transaction can be found.
*/
/* ARGSUSED */
static int
{
if (bit == PCIE_AER_UCE_UR)
else
return (PF_MATCHED_RC);
return (PF_PANIC);
return (PF_MATCHED_DEVICE);
}
return (PF_PANIC);
}
/*
* PCIe-PCI Bridge Received Master Abort and Target error analyser. If a PCIe
* then this error may be safely ignored. If not check the logs and see if an
* associated handler for this transaction can be found.
*/
/* ARGSUSED */
static int
{
if (bit == PCIE_AER_SUCE_RCVD_MA)
else
return (PF_MATCHED_RC);
return (PF_PANIC);
return (PF_PANIC);
return (PF_PANIC);
return (PF_MATCHED_DEVICE);
}
/*
* Generic PCI error analyser. This function is used for Parity Errors,
* Received Master Aborts, Received Target Aborts, and Signaled Target Aborts.
* In general PCI devices do not have error logs, it is very difficult to figure
* out what transaction caused the error. Instead find the nearest PCIe-PCI
* Bridge and check to see if it has logs and if it has an error associated with
* this PCI Device.
*/
/* ARGSUSED */
static int
{
return (PF_PANIC);
} else {
}
if (parent_pfd_p == NULL)
return (PF_PANIC);
return (PF_PANIC);
!= DDI_SUCCESS)
return (PF_PANIC);
/*
* If the addr or bdf from the parent PCIe bridge logs belong to this
* PCI device, assume the PCIe bridge's error handling has already taken
* care of this PCI device's error.
*/
return (PF_MATCHED_PARENT);
/*
* If this device is a PCI-PCI bridge, check if the bdf in the parent
* PCIe bridge logs is in the range of this PCI-PCI Bridge's bus ranges.
* If they are, then assume the PCIe bridge's error handling has already
* taken care of this PCI-PCI bridge device's error.
*/
return (PF_MATCHED_PARENT);
return (PF_PANIC);
}
/*
* PCIe Bridge transactions associated with PERR.
* o Bridge received a poisoned Non-Posted Write (CFG Writes) from PCIe
* o Bridge received a poisoned Posted Write (MEM Writes) from PCIe
* o Bridge received a poisoned Completion on a Split Transaction from PCIe
* o Bridge received a poisoned Completion on a Delayed Transaction from PCIe
*
* Check for non-poisoned PCIe transactions that got forwarded to the secondary
* side and detects a PERR#. Except for delayed read completions, a poisoned
* TLP will be forwarded to the secondary bus and PERR# will be asserted.
*/
/* ARGSUSED */
static int
{
int sts;
int err = PF_NO_ERROR;
&trans_type) != DDI_SUCCESS)
return (PF_PANIC);
switch (cmd) {
case PCI_PCIX_CMD_MEMWR:
case PCI_PCIX_CMD_MEMWR_BL:
case PCI_PCIX_CMD_MEMWRBL:
/* Posted Writes Transactions */
if (trans_type == PF_PIO_ADDR)
break;
case PCI_PCIX_CMD_CFWR:
/*
* Check to see if it is a non-posted write. If so, a
* UR Completion would have been sent.
*/
sts = PF_HDL_FOUND;
err = PF_MATCHED_RC;
break;
}
break;
case PCI_PCIX_CMD_SPL:
break;
default:
/* Unexpected situation, panic */
}
if (sts == PF_HDL_NOTFOUND)
} else {
/*
* Check to see if it is a non-posted write. If so, a UR
* Completion would have been sent.
*/
err = PF_MATCHED_RC;
/* Check for posted writes. Transaction is lost. */
}
/*
* All other scenarios are due to read completions. Check for
* PERR on the primary side. If found the primary side error
* handling will take care of this error.
*/
if (err == PF_NO_ERROR) {
else
}
}
return (err);
}
/*
* PCIe Poisoned TLP error analyser. If a PCIe device receives a Poisoned TLP,
* check the logs and see if an associated handler for this transaction can be
* found.
*/
/* ARGSUSED */
static int
{
/*
* If AERs are supported find the logs in this device, otherwise look in
* its parent's logs.
*/
/*
* Double check that the log contains a poisoned TLP.
* Some devices like PLX switch do not log poison TLP headers.
*/
return (PF_MATCHED_DEVICE);
}
return (PF_PANIC);
}
return (PF_MATCHED_PARENT);
}
return (PF_PANIC);
}
/*
* PCIe-PCI Bridge Received Master and Target abort error analyser on Split
* associated handler for this transaction can be found.
*/
/* ARGSUSED */
static int
{
int sts = PF_HDL_NOTFOUND;
return (PF_PANIC);
return (PF_PANIC);
if (cmd == PCI_PCIX_CMD_SPL)
if (sts == PF_HDL_NOTFOUND)
return (PF_PANIC);
return (PF_MATCHED_DEVICE);
}
/*
* PCIe Timeout error analyser. This error can be forgiven if it is marked as
* CE Advisory. If it is marked as advisory, this means the HW can recover
*/
/* ARGSUSED */
static int
{
/*
* If the Advisory Non-Fatal is set, that means HW will automatically
* retry the failed transaction.
*/
return (PF_NO_PANIC);
return (PF_PANIC);
}
/*
* PCIe Unexpected Completion. This error can be forgiven if it is marked as
* CE Advisory. If it is marked as advisory, this means the HW can recover
*/
/* ARGSUSED */
static int
{
/*
* Check to see if this TLP was misrouted by matching the device BDF
* with the TLP Log. If misrouting panic, otherwise don't panic.
*/
return (PF_NO_PANIC);
return (PF_PANIC);
}
/*
* PCIe-PCI Bridge Uncorrectable Data error analyser. All Uncorrectable Data
* errors should have resulted in a PCIe Poisoned TLP to the RC, except for
* Posted Writes. Check the logs for Posted Writes and if the RC did not see a
* Poisoned TLP.
*
* Non-Posted Writes will also generate a UR in the completion status, which the
* RC should also see.
*/
/* ARGSUSED */
static int
{
return (PF_PANIC);
return (PF_MATCHED_RC);
return (PF_PANIC);
return (PF_PANIC);
return (PF_MATCHED_DEVICE);
}
/* ARGSUSED */
static int
{
return (PF_NO_PANIC);
}
/* ARGSUSED */
static int
{
return (PF_MATCHED_DEVICE);
}
/* ARGSUSED */
static int
{
return (PF_PANIC);
}