gcpu.esc revision 5b461e745fd57f3b9708a1b7ccb30395836c735a
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* eversholt rules for generic-sparc sparc cpu errors.
*
* Most propagations are generated by preprocessor macros. The event
* declarations are deliberately not part of the propagation macros
* so that we know we have full coverage - propagations defined without
* events, or events not used in propagations, will produce compiler
* whinges.
*/
/*
* Payload helper macros used inside propagation constraints.
*
* DIAGNOSE_ERPT is true when the ereport payload has a "diagnose"
* member and that member is non-zero. In this file it guards the
* fault and defect propagations; its negation guards the upsets.
*/
#define DIAGNOSE_ERPT (payloadprop_defined("diagnose") && \
payloadprop("diagnose") != 0x0)
/*
* SET_SERDT passes the "serd_t" payload member to setserdt() when the
* member is present. When it is absent the expression is true, so the
* rest of the enclosing constraint is still evaluated.
*/
#define SET_SERDT (!payloadprop_defined("serd_t") || \
setserdt(payloadprop("serd_t")))
/*
* SET_SERDN is the same pattern for the "serd_n" payload member and
* setserdn().
*/
#define SET_SERDN (!payloadprop_defined("serd_n") || \
setserdn(payloadprop("serd_n")))
/*
* SET_RATIO passes the "filter_ratio" payload member to
* setserdincrement() when the member is present and non-zero.
* Otherwise it evaluates to 1 and makes no call.
*/
#define SET_RATIO \
((payloadprop_defined("filter_ratio") && \
payloadprop("filter_ratio") != 0x0) ? \
(setserdincrement(payloadprop("filter_ratio"))) : 1)
/*
* The ereport and fault events are declared at multiple levels:
* some set of @chip, @core, and @strand resources since this is
* a generic DE and must be flexible and handle present and future
* sun4v platforms. For example, one processor may have an L2
* cache per chip, another may have an L2 per core.
*
* For UE errors, faults are produced immediately.
*
* For CE errors, the errors are put through a SERD engine. If
* the SERD engine trips, the fault is produced. SERD engine
* names are of the format:
* serd.cpu.generic-sparc.<resource><suffix>
* Ex: serd.cpu.generic-sparc.chipitlb
* SERD N/T values are set to default values, but can be
* overridden via the ereport or the eft.conf file. The
* order of precedence of the SERD N/T values is:
* - the 'serd_override' tunable via eft.conf
* - the 'serd_n' and 'serd_t' payload members in the
* incoming ereport
* - the built-in default values
*
* The increment rate of the SERD engines can also be
* controlled via the ereport payload using the 'filter_ratio'
* payload member. N is incremented by the value of
* 'filter_ratio' if the payload member is present, 1 otherwise.
*/
/*
* Ereport event for cpu errors
*
* ERPT_EVENT(level, leafclass) declares the event
* ereport.cpu.generic-sparc.<leafclass>@<level> with a within(1s)
* constraint. The macro body carries no trailing semicolon; each use
* supplies its own.
*/
#define ERPT_EVENT(level, leafclass) \
event ereport.cpu.generic-sparc.leafclass@level { within(1s) }
/*
* Ereports for uncorrectable cpu errors
*
* TLB (itlb, dtlb), register (ireg, freg, mreg) and int-mem-ue ereports
* are declared at chip, core and strand level. Cache ereports (icache,
* dcache, l2data, l2tagctl, l3data, l3tagctl) are declared at chip and
* core level only. The generic gchip-uc, gcore-uc and gstrand-uc
* ereports are each declared at the single level their name indicates.
*/
ERPT_EVENT(chip, itlb-uc);
ERPT_EVENT(core, itlb-uc);
ERPT_EVENT(strand, itlb-uc);
ERPT_EVENT(chip, dtlb-uc);
ERPT_EVENT(core, dtlb-uc);
ERPT_EVENT(strand, dtlb-uc);
ERPT_EVENT(chip, icache-uc);
ERPT_EVENT(core, icache-uc);
ERPT_EVENT(chip, dcache-uc);
ERPT_EVENT(core, dcache-uc);
ERPT_EVENT(chip, ireg-uc);
ERPT_EVENT(core, ireg-uc);
ERPT_EVENT(strand, ireg-uc);
ERPT_EVENT(chip, freg-uc);
ERPT_EVENT(core, freg-uc);
ERPT_EVENT(strand, freg-uc);
ERPT_EVENT(chip, mreg-uc);
ERPT_EVENT(core, mreg-uc);
ERPT_EVENT(strand, mreg-uc);
ERPT_EVENT(chip, l2data-uc);
ERPT_EVENT(core, l2data-uc);
ERPT_EVENT(chip, l2tagctl-uc);
ERPT_EVENT(core, l2tagctl-uc);
ERPT_EVENT(chip, l3data-uc);
ERPT_EVENT(core, l3data-uc);
ERPT_EVENT(chip, l3tagctl-uc);
ERPT_EVENT(core, l3tagctl-uc);
ERPT_EVENT(chip, int-mem-ue);
ERPT_EVENT(core, int-mem-ue);
ERPT_EVENT(strand, int-mem-ue);
ERPT_EVENT(chip, gchip-uc);
ERPT_EVENT(core, gcore-uc);
ERPT_EVENT(strand, gstrand-uc);
/*
* Propagations for CPU UE errors
* A fault is produced immediately for a CPU UE error.
*
* FLT_CPU_UE(level, erptleaf, fltleaf) expands to:
*   - a declaration of fault.cpu.generic-sparc.<fltleaf>@<level> and a
*     propagation from it to ereport.cpu.generic-sparc.<erptleaf>@<level>
*     guarded by DIAGNOSE_ERPT;
*   - a declaration of upset.cpu.generic-sparc.<fltleaf>@<level> and a
*     propagation from it to the same ereport guarded by !DIAGNOSE_ERPT.
* The last statement has no trailing semicolon; each use supplies it.
*/
#define FLT_CPU_UE(level, erptleaf, fltleaf) \
event fault.cpu.generic-sparc.fltleaf@level; \
\
prop fault.cpu.generic-sparc.fltleaf@level \
{ DIAGNOSE_ERPT } (0)-> \
ereport.cpu.generic-sparc.erptleaf@level; \
\
event upset.cpu.generic-sparc.fltleaf@level; \
\
prop upset.cpu.generic-sparc.fltleaf@level \
{ !DIAGNOSE_ERPT } (0)-> \
ereport.cpu.generic-sparc.erptleaf@level
/*
* Uncorrectable-error propagations. Regardless of the error type, every
* chip-level ereport maps to fault leaf chip-uc, every core-level
* ereport to core-uc and every strand-level ereport to strand-uc, so
* each use of the macro repeats the event declaration for its level.
* The int-mem-ue ereports are not listed here; they go through
* FLT_CPU_UE_UNRETIRED.
*/
FLT_CPU_UE(chip, itlb-uc, chip-uc);
FLT_CPU_UE(core, itlb-uc, core-uc);
FLT_CPU_UE(strand, itlb-uc, strand-uc);
FLT_CPU_UE(chip, dtlb-uc, chip-uc);
FLT_CPU_UE(core, dtlb-uc, core-uc);
FLT_CPU_UE(strand, dtlb-uc, strand-uc);
FLT_CPU_UE(chip, icache-uc, chip-uc);
FLT_CPU_UE(core, icache-uc, core-uc);
FLT_CPU_UE(chip, dcache-uc, chip-uc);
FLT_CPU_UE(core, dcache-uc, core-uc);
FLT_CPU_UE(chip, ireg-uc, chip-uc);
FLT_CPU_UE(core, ireg-uc, core-uc);
FLT_CPU_UE(strand, ireg-uc, strand-uc);
FLT_CPU_UE(chip, mreg-uc, chip-uc);
FLT_CPU_UE(core, mreg-uc, core-uc);
FLT_CPU_UE(strand, mreg-uc, strand-uc);
FLT_CPU_UE(chip, freg-uc, chip-uc);
FLT_CPU_UE(core, freg-uc, core-uc);
FLT_CPU_UE(strand, freg-uc, strand-uc);
FLT_CPU_UE(chip, l2data-uc, chip-uc);
FLT_CPU_UE(core, l2data-uc, core-uc);
FLT_CPU_UE(chip, l2tagctl-uc, chip-uc);
FLT_CPU_UE(core, l2tagctl-uc, core-uc);
FLT_CPU_UE(chip, l3data-uc, chip-uc);
FLT_CPU_UE(core, l3data-uc, core-uc);
FLT_CPU_UE(chip, l3tagctl-uc, chip-uc);
FLT_CPU_UE(core, l3tagctl-uc, core-uc);
FLT_CPU_UE(chip, gchip-uc, chip-uc);
FLT_CPU_UE(core, gcore-uc, core-uc);
FLT_CPU_UE(strand, gstrand-uc, strand-uc);
/*
* FLT_CPU_UE_UNRETIRED(level, erptleaf, fltleaf) is the same as
* FLT_CPU_UE except that the fault event is declared with retire=0.
* NOTE(review): retire=0 presumably tells the retire agent not to
* offline the resource; confirm against the eft documentation.
* The last statement has no trailing semicolon; each use supplies it.
*/
#define FLT_CPU_UE_UNRETIRED(level, erptleaf, fltleaf) \
event fault.cpu.generic-sparc.fltleaf@level, \
retire=0; \
\
prop fault.cpu.generic-sparc.fltleaf@level \
{ DIAGNOSE_ERPT } (0)-> \
ereport.cpu.generic-sparc.erptleaf@level; \
\
event upset.cpu.generic-sparc.fltleaf@level; \
\
prop upset.cpu.generic-sparc.fltleaf@level \
{ !DIAGNOSE_ERPT } (0)-> \
ereport.cpu.generic-sparc.erptleaf@level
/*
* int-mem-ue is the only uncorrectable error routed through the
* retire=0 variant; the fault leaves carry a -nr suffix.
*/
FLT_CPU_UE_UNRETIRED(chip, int-mem-ue, chip-uc-nr);
FLT_CPU_UE_UNRETIRED(core, int-mem-ue, core-uc-nr);
FLT_CPU_UE_UNRETIRED(strand, int-mem-ue, strand-uc-nr);
/*
* Ereport events for correctable errors.
*
* The set of classes and levels mirrors the uncorrectable list, without
* the -uc suffix (and int-mem in place of int-mem-ue): TLB, register
* and int-mem ereports at chip, core and strand level; cache ereports
* at chip and core level only; gchip, gcore and gstrand at their own
* level.
*/
ERPT_EVENT(chip, itlb);
ERPT_EVENT(core, itlb);
ERPT_EVENT(strand, itlb);
ERPT_EVENT(chip, dtlb);
ERPT_EVENT(core, dtlb);
ERPT_EVENT(strand, dtlb);
ERPT_EVENT(chip, icache);
ERPT_EVENT(core, icache);
ERPT_EVENT(chip, dcache);
ERPT_EVENT(core, dcache);
ERPT_EVENT(chip, ireg);
ERPT_EVENT(core, ireg);
ERPT_EVENT(strand, ireg);
ERPT_EVENT(chip, freg);
ERPT_EVENT(core, freg);
ERPT_EVENT(strand, freg);
ERPT_EVENT(chip, mreg);
ERPT_EVENT(core, mreg);
ERPT_EVENT(strand, mreg);
ERPT_EVENT(chip, l2data);
ERPT_EVENT(core, l2data);
ERPT_EVENT(chip, l2tagctl);
ERPT_EVENT(core, l2tagctl);
ERPT_EVENT(chip, l3data);
ERPT_EVENT(core, l3data);
ERPT_EVENT(chip, l3tagctl);
ERPT_EVENT(core, l3tagctl);
ERPT_EVENT(chip, int-mem);
ERPT_EVENT(core, int-mem);
ERPT_EVENT(strand, int-mem);
ERPT_EVENT(chip, gchip);
ERPT_EVENT(core, gcore);
ERPT_EVENT(strand, gstrand);
/*
* Propagations for CE errors
* Errors are put through a SERD engine and a fault is generated when
* the engine trips.
* The SERD name suffix and the N and T values are set at run time by
* the propagation constraint (setserdsuffix, setserdn, setserdt); the
* N=8, T=1week values below are the defaults written into the engine
* declarations for the core and strand levels.
*/
engine serd.cpu.generic-sparc.core@core, N=8, T=1week;
engine serd.cpu.generic-sparc.strand@strand, N=8, T=1week;
/*
* FLT_CPU_CE(erptleaf, level, fltleaf, n, t) expands to:
*   - a declaration of fault.cpu.generic-sparc.<fltleaf>@<level> bound
*     to the engine serd.cpu.generic-sparc.<fltleaf>@<level>;
*   - a propagation from that fault to
*     ereport.cpu.generic-sparc.<erptleaf>@<level>. Its constraint
*     requires DIAGNOSE_ERPT, then calls setserdsuffix(), setserdn(n)
*     and setserdt(t), then applies the payload overrides SET_SERDN,
*     SET_SERDT and SET_RATIO, in that order;
*   - a declaration of upset.cpu.generic-sparc.<fltleaf>@<level> and a
*     propagation from it to the same ereport guarded by !DIAGNOSE_ERPT.
* NOTE(review): the suffix is written as the string "erptleaf", so the
* per-error suffix depends on the preprocessor substituting macro
* parameters inside string literals; confirm the esc build does so.
* The last statement has no trailing semicolon; each use supplies it.
*/
#define FLT_CPU_CE(erptleaf, level, fltleaf, n, t) \
\
/* Simple fault event */ \
event fault.cpu.generic-sparc.fltleaf@level, \
engine=serd.cpu.generic-sparc.fltleaf@level; \
\
/* When the correctable engine trips, diagnose a fault */ \
prop fault.cpu.generic-sparc.fltleaf@level \
{ DIAGNOSE_ERPT && setserdsuffix("erptleaf") && \
setserdn(n) && setserdt(t) && SET_SERDN && \
SET_SERDT && SET_RATIO } (0) -> \
ereport.cpu.generic-sparc.erptleaf@level; \
\
event upset.cpu.generic-sparc.fltleaf@level; \
\
prop upset.cpu.generic-sparc.fltleaf@level \
{ !DIAGNOSE_ERPT } (0) -> \
ereport.cpu.generic-sparc.erptleaf@level
/*
* Correctable-error propagations for the core and strand levels. Every
* entry uses N=8 and T=1week and maps to the fault leaf named after its
* level (core or strand). Chip-level correctable errors are not listed
* here; they go through FLT_CPU_CE_UNRETIRED.
*/
FLT_CPU_CE(itlb, core, core, 8, 1week);
FLT_CPU_CE(itlb, strand, strand, 8, 1week);
FLT_CPU_CE(dtlb, core, core, 8, 1week);
FLT_CPU_CE(dtlb, strand, strand, 8, 1week);
FLT_CPU_CE(icache, core, core, 8, 1week);
FLT_CPU_CE(dcache, core, core, 8, 1week);
FLT_CPU_CE(ireg, core, core, 8, 1week);
FLT_CPU_CE(ireg, strand, strand, 8, 1week);
FLT_CPU_CE(freg, core, core, 8, 1week);
FLT_CPU_CE(freg, strand, strand, 8, 1week);
FLT_CPU_CE(mreg, core, core, 8, 1week);
FLT_CPU_CE(mreg, strand, strand, 8, 1week);
FLT_CPU_CE(l2data, core, core, 8, 1week);
FLT_CPU_CE(l2tagctl, core, core, 8, 1week);
FLT_CPU_CE(l3data, core, core, 8, 1week);
FLT_CPU_CE(l3tagctl, core, core, 8, 1week);
FLT_CPU_CE(gcore, core, core, 8, 1week);
FLT_CPU_CE(gstrand, strand, strand, 8, 1week);
/*
* SERD engines for the retire=0 correctable faults (chip-nr, core-nr,
* strand-nr). N=8, T=1week are the declared defaults; the propagation
* constraint in FLT_CPU_CE_UNRETIRED sets N and T again per error type.
*/
engine serd.cpu.generic-sparc.chip-nr@chip, N=8, T=1week;
engine serd.cpu.generic-sparc.core-nr@core, N=8, T=1week;
engine serd.cpu.generic-sparc.strand-nr@strand, N=8, T=1week;
/*
* FLT_CPU_CE_UNRETIRED(erptleaf, level, fltleaf, n, t) is the same as
* FLT_CPU_CE except that the fault event is declared with retire=0.
* It expands to:
*   - a declaration of fault.cpu.generic-sparc.<fltleaf>@<level> with
*     retire=0, bound to the engine
*     serd.cpu.generic-sparc.<fltleaf>@<level>;
*   - a propagation from that fault to
*     ereport.cpu.generic-sparc.<erptleaf>@<level>. Its constraint
*     requires DIAGNOSE_ERPT, then calls setserdsuffix(), setserdn(n)
*     and setserdt(t), then applies the payload overrides SET_SERDN,
*     SET_SERDT and SET_RATIO, in that order;
*   - a declaration of upset.cpu.generic-sparc.<fltleaf>@<level> and a
*     propagation from it to the same ereport guarded by !DIAGNOSE_ERPT.
* The upset class is upset.cpu.generic-sparc.*, matching every other
* upset declared for cpu errors in this file.
* NOTE(review): the suffix is written as the string "erptleaf", so the
* per-error suffix depends on the preprocessor substituting macro
* parameters inside string literals; confirm the esc build does so.
* The last statement has no trailing semicolon; each use supplies it.
*/
#define FLT_CPU_CE_UNRETIRED(erptleaf, level, fltleaf, n, t) \
\
/* Simple fault event */ \
event fault.cpu.generic-sparc.fltleaf@level, \
retire=0, \
engine=serd.cpu.generic-sparc.fltleaf@level; \
\
/* When the correctable engine trips, diagnose a fault */ \
prop fault.cpu.generic-sparc.fltleaf@level \
{ DIAGNOSE_ERPT && setserdsuffix("erptleaf") && setserdn(n) && \
setserdt(t) && SET_SERDN && SET_SERDT && SET_RATIO } (0) -> \
ereport.cpu.generic-sparc.erptleaf@level; \
\
event upset.cpu.generic-sparc.fltleaf@level; \
\
prop upset.cpu.generic-sparc.fltleaf@level \
{ !DIAGNOSE_ERPT } (0)-> \
ereport.cpu.generic-sparc.erptleaf@level
/*
* Correctable-error propagations that use the retire=0 faults.
* All chip-level correctable errors map to chip-nr. The L2 and L3
* cache entries use T=2h; every other entry uses T=1week. N is 8
* throughout.
*/
FLT_CPU_CE_UNRETIRED(itlb, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(dtlb, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(icache, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(dcache, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(ireg, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(freg, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(mreg, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(l2data, chip, chip-nr, 8, 2h);
FLT_CPU_CE_UNRETIRED(l2tagctl, chip, chip-nr, 8, 2h);
FLT_CPU_CE_UNRETIRED(l3data, chip, chip-nr, 8, 2h);
FLT_CPU_CE_UNRETIRED(l3tagctl, chip, chip-nr, 8, 2h);
FLT_CPU_CE_UNRETIRED(gchip, chip, chip-nr, 8, 1week);
/* int-mem uses the retire=0 fault at all three levels. */
FLT_CPU_CE_UNRETIRED(int-mem, chip, chip-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(int-mem, core, core-nr, 8, 1week);
FLT_CPU_CE_UNRETIRED(int-mem, strand, strand-nr, 8, 1week);
/*
* c2c-link-uc, c2c-prot-uc, c2c-failover errors
* The detector and sender are faulted immediately.
*
* Each ereport class is tied to its fault by two propagations:
*   - fault@chip -> ereport@chip, guarded only by DIAGNOSE_ERPT, which
*     covers the chip the ereport was reported against;
*   - fault@chip -> ereport@chip<x>, additionally guarded by
*     CONTAINS_CHIP, which covers a chip named in the "sender" payload
*     member of an ereport reported against chip<x>.
* Both c2c fault events are declared with retire=0.
*/
/* True when the "sender" payload member contains the ASRU of chip. */
#define CONTAINS_CHIP (payloadprop_contains("sender", asru(chip)))
event ereport.cpu.generic-sparc.c2c-link-uc@chip { within(1s) };
event ereport.cpu.generic-sparc.c2c-prot-uc@chip { within(1s) };
event ereport.cpu.generic-sparc.c2c-failover@chip { within(1s) };
event fault.cpu.generic-sparc.c2c-uc@chip, retire=0;
event fault.cpu.generic-sparc.c2c-failover@chip, retire=0;
event upset.cpu.generic-sparc.c2c-uc@chip;
event upset.cpu.generic-sparc.c2c-failover@chip;
/* c2c-link-uc: sender propagation, then detector propagation. */
prop fault.cpu.generic-sparc.c2c-uc@chip
{ DIAGNOSE_ERPT && CONTAINS_CHIP } (0) ->
ereport.cpu.generic-sparc.c2c-link-uc@chip<x>;
prop fault.cpu.generic-sparc.c2c-uc@chip
{ DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.c2c-link-uc@chip;
/* c2c-prot-uc: sender propagation, then detector propagation. */
prop fault.cpu.generic-sparc.c2c-uc@chip
{ DIAGNOSE_ERPT && CONTAINS_CHIP } (0) ->
ereport.cpu.generic-sparc.c2c-prot-uc@chip<x>;
prop fault.cpu.generic-sparc.c2c-uc@chip
{ DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.c2c-prot-uc@chip;
/* One upset covers both uncorrectable classes when not diagnosing. */
prop upset.cpu.generic-sparc.c2c-uc@chip
{ !DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.c2c-prot-uc@chip,
ereport.cpu.generic-sparc.c2c-link-uc@chip;
/* c2c-failover: sender propagation, detector propagation, upset. */
prop fault.cpu.generic-sparc.c2c-failover@chip
{ DIAGNOSE_ERPT && CONTAINS_CHIP } (0) ->
ereport.cpu.generic-sparc.c2c-failover@chip<x>;
prop fault.cpu.generic-sparc.c2c-failover@chip
{ DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.c2c-failover@chip;
prop upset.cpu.generic-sparc.c2c-failover@chip
{ !DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.c2c-failover@chip;
/*
* c2c-link, c2c-prot. Errors are serded. When the serd trips,
* the detector & sender will be faulted.
*
* Both classes share the fault c2c and the engine
* serd.cpu.generic-sparc.c2c (declared N=120, T=30min); the constraint
* sets the SERD suffix to "link" or "prot" according to the ereport
* class. Unlike the FLT_CPU_CE macros, the constraints here do not call
* setserdn()/setserdt() with built-in values; only the payload
* overrides SET_SERDN, SET_SERDT and SET_RATIO are applied.
* As for the uncorrectable c2c errors, each class has a propagation to
* ereport@chip<x> guarded by CONTAINS_CHIP and one to ereport@chip.
*/
event ereport.cpu.generic-sparc.c2c-link@chip { within(1s) };
event ereport.cpu.generic-sparc.c2c-prot@chip { within(1s) };
engine serd.cpu.generic-sparc.c2c@chip, N=120, T=30min;
event fault.cpu.generic-sparc.c2c@chip, retire=0,
engine=serd.cpu.generic-sparc.c2c@chip;
event upset.cpu.generic-sparc.c2c-link@chip;
/* c2c-link: sender propagation, then detector propagation. */
prop fault.cpu.generic-sparc.c2c@chip
{ DIAGNOSE_ERPT && CONTAINS_CHIP && setserdsuffix("link") &&
SET_SERDN && SET_SERDT && SET_RATIO } (0) ->
ereport.cpu.generic-sparc.c2c-link@chip<x>;
prop fault.cpu.generic-sparc.c2c@chip
{ DIAGNOSE_ERPT && setserdsuffix("link") && SET_SERDN &&
SET_SERDT && SET_RATIO } (0) ->
ereport.cpu.generic-sparc.c2c-link@chip;
/* c2c-prot: sender propagation, then detector propagation. */
prop fault.cpu.generic-sparc.c2c@chip
{ DIAGNOSE_ERPT && CONTAINS_CHIP && setserdsuffix("prot") &&
SET_SERDN && SET_SERDT && SET_RATIO } (0) ->
ereport.cpu.generic-sparc.c2c-prot@chip<x>;
prop fault.cpu.generic-sparc.c2c@chip
{ DIAGNOSE_ERPT && setserdsuffix("prot") && SET_SERDN &&
SET_SERDT && SET_RATIO } (0) ->
ereport.cpu.generic-sparc.c2c-prot@chip;
/*
* The upset is named c2c-link but is the non-diagnose sink for both the
* c2c-prot and the c2c-link ereports.
*/
prop upset.cpu.generic-sparc.c2c-link@chip
{ !DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.c2c-prot@chip,
ereport.cpu.generic-sparc.c2c-link@chip;
/*
* addr-oob is a firmware error - there is no associated FRU/ASRU
* and firmware is not represented in topology. Rather than ignore
* the error, the DE generates a defect with no FRU or ASRU. It
* is generated @chassis so no location (label) is picked up out
* of the topology. The associated knowledge article can instruct
* users what steps to take to address the error.
*
* The defect is produced when DIAGNOSE_ERPT holds; otherwise the
* ereport is explained by an upset. The ereport is declared without
* the within(1s) constraint used for the cpu ereports.
*/
/* NULL fru/asru declarations referenced by the defect events below. */
fru NULL;
asru NULL;
event ereport.cpu.generic-sparc.addr-oob@chassis;
event defect.fw.generic-sparc.addr-oob@chassis,
ASRU=NULL,
FRU=NULL;
event upset.fw.generic-sparc.addr-oob@chassis;
prop defect.fw.generic-sparc.addr-oob@chassis
{ DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.addr-oob@chassis;
prop upset.fw.generic-sparc.addr-oob@chassis
{ !DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.addr-oob@chassis;
/*
* The "inconsistent" ereport is handled the same way as addr-oob: it
* is declared @chassis and diagnosed to the firmware defect erpt-gen,
* which carries ASRU=NULL and FRU=NULL. When DIAGNOSE_ERPT does not
* hold, the ereport is explained by an upset instead.
*/
event ereport.cpu.generic-sparc.inconsistent@chassis;
event defect.fw.generic-sparc.erpt-gen@chassis,
ASRU=NULL,
FRU=NULL;
event upset.fw.generic-sparc.erpt-gen@chassis;
prop defect.fw.generic-sparc.erpt-gen@chassis
{ DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.inconsistent@chassis;
prop upset.fw.generic-sparc.erpt-gen@chassis
{ !DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.inconsistent@chassis;
/*
* bootbus-to and bootbus-par errors. Fault the detector.
*
* Both ereport classes map to the single fault bootbus@chip, declared
* with retire=0, with no SERD engine involved. One upset covers both
* classes when DIAGNOSE_ERPT does not hold.
*/
event ereport.cpu.generic-sparc.bootbus-to@chip;
event ereport.cpu.generic-sparc.bootbus-par@chip;
event upset.cpu.generic-sparc.bootbus@chip;
event fault.cpu.generic-sparc.bootbus@chip, retire=0;
prop fault.cpu.generic-sparc.bootbus@chip
{ DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.bootbus-to@chip;
prop fault.cpu.generic-sparc.bootbus@chip
{ DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.bootbus-par@chip;
prop upset.cpu.generic-sparc.bootbus@chip
{ !DIAGNOSE_ERPT } (0) ->
ereport.cpu.generic-sparc.bootbus-to@chip,
ereport.cpu.generic-sparc.bootbus-par@chip;
/*
* Ignore the pio-read error.
*
* The ereport is tied to an upset by a propagation with no constraint,
* so it is explained by the upset whether or not the payload asks for
* diagnosis. No fault or defect is declared for it.
*/
event ereport.cpu.generic-sparc.pio-read@chip;
event upset.cpu.generic-sparc.discard@chip;
prop upset.cpu.generic-sparc.discard@chip (0) ->
ereport.cpu.generic-sparc.pio-read@chip;