intel.esc revision f5961f52af4b214e15b671fc88e5f5ea948e9dee
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#pragma dictionary "INTEL"
/*
* Eversholt rules for the intel CPU/Memory
*/
/*
* Ereports for Simple error codes.
*/
/*
 * SMPL_EVENT(leaf, win) declares the ereport ereport.cpu.intel.<leaf>
 * at chip/core/strand with a within() constraint of <win>.
 */
#define SMPL_EVENT(leaf, win) \
    event ereport.cpu.intel.leaf@chip/core/strand { within(win) }

SMPL_EVENT(unknown, 1s);
SMPL_EVENT(unclassified, 1s);
SMPL_EVENT(microcode_rom_parity, 1s);
SMPL_EVENT(external, 1s);
SMPL_EVENT(frc, 1s);
SMPL_EVENT(internal_timer, 1s);
SMPL_EVENT(internal_parity, 1s);
SMPL_EVENT(internal_unclassified, 1s);
/*
* Propagations for all but "external" and "unknown" simple errors.
* If the error is uncorrected we produce a fault immediately, otherwise
* we diagnose it to an upset and declare a fault when the SERD engine
* trips. The prop statement for ereport.cpu.intel.internal_unclassified
* is moved to the Nehalem EX section to deal with the poison case.
*/
/* Per-strand SERD engine backing fault.cpu.intel.internal. */
engine serd.cpu.intel.simple@chip/core/strand, N=3, T=72h;

event fault.cpu.intel.internal@chip/core/strand,
    engine=serd.cpu.intel.simple@chip/core/strand;

/*
 * When the ereport carries error_uncorrected == 1 the SERD increment is
 * set to 4, which exceeds the engine's N=3; otherwise the constraint
 * simply evaluates to 1.
 */
prop fault.cpu.intel.internal@chip/core/strand
    { payloadprop("error_uncorrected") == 1 ? setserdincrement(4) : 1} (0)->
    ereport.cpu.intel.microcode_rom_parity@chip/core/strand,
    ereport.cpu.intel.internal_timer@chip/core/strand,
    ereport.cpu.intel.internal_parity@chip/core/strand,
    ereport.cpu.intel.unclassified@chip/core/strand,
    ereport.cpu.intel.frc@chip/core/strand;
/*
* Ereports for Compound error codes. These are in pairs "foo" and "foo_uc"
* for the corrected and uncorrected version of each error type. All are
* detected at chip/core/strand.
*/
/*
 * CMPND_EVENT(leaf, win) declares two ereports at chip/core/strand,
 * <leaf> and <leaf>_uc, both with a within() constraint of <win>.
 * The empty comment before _uc presumably relies on the preprocessor
 * removing it so that the leaf name and the suffix are joined.
 */
#define CMPND_EVENT(leaf, win) \
    event ereport.cpu.intel.leaf@chip/core/strand { within(win) }; \
    event ereport.cpu.intel.leaf/**/_uc@chip/core/strand { within(win) }
/*
* Ereports for Compound error codes - intel errors
*
* Every CMPND_EVENT invocation in this section declares both the
* corrected ereport and its "_uc" counterpart, each with a 1s within()
* constraint.
*/
CMPND_EVENT(l0cache, 1s);
CMPND_EVENT(l1cache, 1s);
CMPND_EVENT(l2cache, 1s);
CMPND_EVENT(cache, 1s);
/*
* Ereports for Compound error codes - TLB errors
*/
CMPND_EVENT(l0dtlb, 1s);
CMPND_EVENT(l1dtlb, 1s);
CMPND_EVENT(l2dtlb, 1s);
CMPND_EVENT(dtlb, 1s);
CMPND_EVENT(l0itlb, 1s);
CMPND_EVENT(l1itlb, 1s);
CMPND_EVENT(l2itlb, 1s);
CMPND_EVENT(itlb, 1s);
CMPND_EVENT(l0tlb, 1s);
CMPND_EVENT(l1tlb, 1s);
CMPND_EVENT(l2tlb, 1s);
CMPND_EVENT(tlb, 1s);
/*
* Ereports for Compound error codes - memory hierarchy errors
*/
CMPND_EVENT(l0dcache, 1s);
CMPND_EVENT(l1dcache, 1s);
CMPND_EVENT(l2dcache, 1s);
CMPND_EVENT(dcache, 1s);
CMPND_EVENT(l0icache, 1s);
CMPND_EVENT(l1icache, 1s);
CMPND_EVENT(l2icache, 1s);
CMPND_EVENT(icache, 1s);
/*
* Ereports for Compound error codes - bus and interconnect errors
*/
CMPND_EVENT(bus_interconnect, 1s);
CMPND_EVENT(bus_interconnect_memory, 1s);
CMPND_EVENT(bus_interconnect_io, 1s);
/*
* Compound error propagations.
*
* We resist the temptation to propagate, for example, a single dcache fault
* to all ereports mentioning dcache (l0dcache, l1dcache, l2dcache, dcache).
* Instead we will diagnose a distinct fault for each possible cache level,
* whether or not current chips have dcaches at all levels.
*
* Corrected errors are SERDed and produce a fault when the engine fires;
* the same fault is diagnosed immediately for a corresponding uncorrected
* error.
*/
/*
 * CMPND_FLT_PROP_1(erptleaf, fltleaf, cnt, win)
 *
 * Declares SERD engine serd.cpu.intel.<fltleaf> (N=cnt, T=win), the
 * fault fault.cpu.intel.<fltleaf> attached to that engine, and two
 * propagations from the fault: one to ereport <erptleaf>, and one to
 * ereport <erptleaf>_uc whose constraint sets the SERD increment to
 * cnt + 1, i.e. more than the engine's N.
 */
#define CMPND_FLT_PROP_1(erptleaf, fltleaf, cnt, win) \
    engine serd.cpu.intel.fltleaf@chip/core/strand, N=cnt, T=win; \
    event fault.cpu.intel.fltleaf@chip/core/strand, \
        engine=serd.cpu.intel.fltleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand (0)-> \
        ereport.cpu.intel.erptleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand \
        { setserdincrement(cnt + 1) } (0)-> \
        ereport.cpu.intel.erptleaf/**/_uc@chip/core/strand
/*
 * CMPND_FLT_PROP_2(erptleaf, fltleaf, cnt, win)
 *
 * Same shape as CMPND_FLT_PROP_1, except that the fault event is
 * declared with retire=0 and response=0.
 */
#define CMPND_FLT_PROP_2(erptleaf, fltleaf, cnt, win) \
    engine serd.cpu.intel.fltleaf@chip/core/strand, N=cnt, T=win; \
    event fault.cpu.intel.fltleaf@chip/core/strand, retire=0, response=0,\
        engine=serd.cpu.intel.fltleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand (0)-> \
        ereport.cpu.intel.erptleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand \
        { setserdincrement(cnt + 1) } (0)-> \
        ereport.cpu.intel.erptleaf/**/_uc@chip/core/strand
/*
 * Fault and SERD instantiations, one per compound ereport pair.
 * Level-specific classes (l0/l1/l2) use N=3 and the level-less classes
 * (cache, dtlb, itlb, tlb, dcache, icache) use N=12, all over 72h.
 */
CMPND_FLT_PROP_1(l0cache, l0cache, 3, 72h);
CMPND_FLT_PROP_1(l1cache, l1cache, 3, 72h);
CMPND_FLT_PROP_1(l2cache, l2cache, 3, 72h);
CMPND_FLT_PROP_1(cache, cache, 12, 72h);
CMPND_FLT_PROP_1(l0dtlb, l0dtlb, 3, 72h);
CMPND_FLT_PROP_1(l1dtlb, l1dtlb, 3, 72h);
CMPND_FLT_PROP_1(l2dtlb, l2dtlb, 3, 72h);
CMPND_FLT_PROP_1(dtlb, dtlb, 12, 72h);
CMPND_FLT_PROP_1(l0itlb, l0itlb, 3, 72h);
CMPND_FLT_PROP_1(l1itlb, l1itlb, 3, 72h);
CMPND_FLT_PROP_1(l2itlb, l2itlb, 3, 72h);
CMPND_FLT_PROP_1(itlb, itlb, 12, 72h);
CMPND_FLT_PROP_1(l0tlb, l0tlb, 3, 72h);
CMPND_FLT_PROP_1(l1tlb, l1tlb, 3, 72h);
CMPND_FLT_PROP_1(l2tlb, l2tlb, 3, 72h);
CMPND_FLT_PROP_1(tlb, tlb, 12, 72h);
CMPND_FLT_PROP_1(l0dcache, l0dcache, 3, 72h);
CMPND_FLT_PROP_1(l1dcache, l1dcache, 3, 72h);
CMPND_FLT_PROP_1(l2dcache, l2dcache, 3, 72h);
CMPND_FLT_PROP_1(dcache, dcache, 12, 72h);
CMPND_FLT_PROP_1(l0icache, l0icache, 3, 72h);
CMPND_FLT_PROP_1(l1icache, l1icache, 3, 72h);
CMPND_FLT_PROP_1(l2icache, l2icache, 3, 72h);
CMPND_FLT_PROP_1(icache, icache, 12, 72h);
/* Bus/interconnect faults use the retire=0, response=0 variant, N=10. */
CMPND_FLT_PROP_2(bus_interconnect, bus_interconnect, 10, 72h);
CMPND_FLT_PROP_2(bus_interconnect_memory, bus_interconnect_memory, 10, 72h);
CMPND_FLT_PROP_2(bus_interconnect_io, bus_interconnect_io, 10, 72h);
/* The "external" and "unknown" simple ereports propagate to a discard upset. */
event upset.discard@chip/core/strand;

prop upset.discard@chip/core/strand (0)->
    ereport.cpu.intel.external@chip/core/strand,
    ereport.cpu.intel.unknown@chip/core/strand;
/* errors detected in northbridge */
/*
* SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
* we diagnose for page faults, to record the physical address of the faulting
* page.
*/
/*
 * Each of SET_ADDR / SET_OFFSET evaluates true when the source payload
 * member is absent; otherwise it copies that member into the
 * corresponding "asru-" payload property.
 */
#define SET_ADDR \
    (!payloadprop_defined("physaddr") || \
    setpayloadprop("asru-physaddr", payloadprop("physaddr")))

#define SET_OFFSET \
    (!payloadprop_defined("offset") || \
    setpayloadprop("asru-offset", payloadprop("offset")))

/*
 * List of CPU bus/interconnect ereports (plus "external") that is
 * appended, via (0)-> propagations, to many of the diagnoses below.
 */
#define EREPORT_BUS_ERROR \
    ereport.cpu.intel.bus_interconnect_memory_uc@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect_uc@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect_memory@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect@chip/core/strand, \
    ereport.cpu.intel.external@chip/core/strand
/*
 * Statistics engine used as the count= target of the northbridge
 * page_ce fault and read back with count() in the dimm_ce constraints.
 * NOTE(review): this engine is declared at
 * memory-controller/dram-channel/dimm, while its users name
 * stat.ce_pgflt@MBDIMM (motherboard/memory-controller/dram-channel/dimm);
 * confirm the differing path prefix is intentional.
 */
engine stat.ce_pgflt@memory-controller/dram-channel/dimm;
/* Northbridge uncorrectable memory ereports, all within 12s. */
event ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller{within(12s)};
event ereport.cpu.intel.nb.ddr2_mem_ue@
motherboard/memory-controller{within(12s)};
event ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller{within(12s)};
/* Page-level UE fault: declared with message=0 and response=0. */
event fault.memory.intel.page_ue@
motherboard/memory-controller/dram-channel/dimm/rank,
message=0, response=0;
/* DIMM-level UE fault on the same rank path. */
event fault.memory.intel.dimm_ue@
motherboard/memory-controller/dram-channel/dimm/rank;
/*
 * Page UE: the rank instance must match the "rank" payload member and
 * at least one of physaddr/offset must be present; SET_ADDR and
 * SET_OFFSET then record them on the fault.
 */
prop fault.memory.intel.page_ue@
    motherboard/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

/* DIMM UE: only the rank match is required. */
prop fault.memory.intel.dimm_ue@
    motherboard/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") } (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

/*
 * Unconstrained discard upset covering the same UE ereports and the
 * CPU bus error ereports.
 */
event upset.memory.intel.discard@motherboard/memory-controller{within(1s)};

prop upset.memory.intel.discard@motherboard/memory-controller (0)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop upset.memory.intel.discard@motherboard/memory-controller (0)->
    EREPORT_BUS_ERROR;
/* Default SERD N and T for the page-level correctable-error engines. */
#define PAGE_CE_COUNT 2
#define PAGE_CE_TIME 72h
/* Default SERD N and T for the DIMM-level correctable-error engines. */
#define DIMM_CE_COUNT 10
#define DIMM_CE_TIME 1week
/* Shorthand for the northbridge (motherboard-rooted) DIMM path. */
#define MBDIMM motherboard/memory-controller/dram-channel/dimm
/* Northbridge correctable memory ereports. */
event ereport.cpu.intel.nb.mem_ce@MBDIMM/rank{within(12s)};
event ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank{within(12s)};
event ereport.cpu.intel.nb.ddr2_mem_ce@
    motherboard/memory-controller{within(12s)};

/*
 * Page CE fault: SERDed per rank with the PAGE_CE_* defaults; the fault
 * also names stat.ce_pgflt@MBDIMM as its count= engine.
 */
engine serd.memory.intel.page_ce@MBDIMM/rank, N=PAGE_CE_COUNT, T=PAGE_CE_TIME;

event fault.memory.intel.page_ce@MBDIMM/rank, message=0, response=0,
    count=stat.ce_pgflt@MBDIMM, engine=serd.memory.intel.page_ce@MBDIMM/rank;

prop fault.memory.intel.page_ce@MBDIMM/rank
    { (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.intel.nb.mem_ce@MBDIMM/rank,
    ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank;
/*
 * DIMM CE fault with the DIMM_CE_* defaults.  This propagation applies
 * when the DIMM has no "dimm-size" config property and the page-fault
 * statistic for the DIMM exceeds 512.
 */
engine serd.memory.intel.dimm_ce@MBDIMM/rank, N=DIMM_CE_COUNT, T=DIMM_CE_TIME;

event fault.memory.intel.dimm_ce@MBDIMM/rank,
    engine=serd.memory.intel.dimm_ce@MBDIMM/rank;

prop fault.memory.intel.dimm_ce@MBDIMM/rank
    { !confprop_defined(MBDIMM, "dimm-size") &&
    count(stat.ce_pgflt@MBDIMM) > 512 } (1)->
    ereport.cpu.intel.nb.mem_ce@MBDIMM/rank,
    ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank;
/*
 * DIMM_CE(dimm_size, n, t, fault_rate)
 *
 * Emits a dimm_ce propagation for DIMMs whose "dimm-size" config
 * property equals dimm_size: the page-fault statistic must exceed
 * fault_rate, and the SERD engine's N and T are overridden with n and t.
 * The macro body already ends in ';', so invocations carry none.
 */
#define DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@MBDIMM/rank { \
        confprop(MBDIMM, "dimm-size") == dimm_size && \
        count(stat.ce_pgflt@MBDIMM) > fault_rate && \
        setserdn(n) & setserdt(t) } (1)-> \
        ereport.cpu.intel.nb.mem_ce@MBDIMM/rank, \
        ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank;

DIMM_CE("8G", 8, 1week, 2000)
DIMM_CE("4G", 4, 1week, 1500)
DIMM_CE("2G", 4, 2week, 1000)
DIMM_CE("1G", 4, 4week, 500)
DIMM_CE("512M", 4, 8week, 250)
DIMM_CE("256M", 4, 16week, 125)
/* ddr2_mem_ce reported at the memory-controller level goes to the discard upset. */
prop upset.memory.intel.discard@motherboard/memory-controller (0)->
    ereport.cpu.intel.nb.ddr2_mem_ce@motherboard/memory-controller;
/*
 * FBD alert, CRC and reset-timeout diagnoses.  Each fault is declared
 * with retire=0, propagates (1)-> to its own ereport and (0)-> to the
 * CPU bus error ereports.
 */
event ereport.cpu.intel.nb.fbd.alert@rank{within(12s)};
event fault.memory.intel.fbd.alert@rank, retire=0;

prop fault.memory.intel.fbd.alert@rank (1)->
    ereport.cpu.intel.nb.fbd.alert@rank;

prop fault.memory.intel.fbd.alert@rank (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.fbd.crc@rank{within(12s)};
event fault.memory.intel.fbd.crc@rank, retire=0;

prop fault.memory.intel.fbd.crc@rank (1)->
    ereport.cpu.intel.nb.fbd.crc@rank;

prop fault.memory.intel.fbd.crc@rank (0)-> EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.fbd.reset_timeout@memory-controller {within(12s)};
event fault.memory.intel.fbd.reset_timeout@memory-controller, retire=0;

prop fault.memory.intel.fbd.reset_timeout@memory-controller (1)->
    ereport.cpu.intel.nb.fbd.reset_timeout@memory-controller;

prop fault.memory.intel.fbd.reset_timeout@memory-controller (0)->
    EREPORT_BUS_ERROR;
/* FBD channel errors: SERDed per dram-channel (N=2 within 1month). */
event ereport.cpu.intel.nb.fbd.ch@dram-channel {within(12s)};

engine serd.cpu.intel.nb.fbd.ch@dram-channel, N=2, T=1month;

event fault.memory.intel.fbd.ch@dram-channel, retire=0,
    engine=serd.cpu.intel.nb.fbd.ch@dram-channel;

prop fault.memory.intel.fbd.ch@dram-channel (1)->
    ereport.cpu.intel.nb.fbd.ch@dram-channel;

prop fault.memory.intel.fbd.ch@dram-channel (0)->
    EREPORT_BUS_ERROR;

/* FBD otf errors: SERDed per dram-channel (N=2 within 1week). */
event ereport.cpu.intel.nb.fbd.otf@dram-channel {within(12s)};

engine serd.cpu.intel.nb.fbd_otf@dram-channel, N=2, T=1week;

event fault.memory.intel.fbd.otf@dram-channel, retire=0, response=0,
    engine=serd.cpu.intel.nb.fbd_otf@dram-channel;

prop fault.memory.intel.fbd.otf@dram-channel (1)->
    ereport.cpu.intel.nb.fbd.otf@dram-channel;

/* Northbridge otf errors at the motherboard: no SERD engine attached. */
event ereport.cpu.intel.nb.otf@motherboard {within(12s)};
event fault.cpu.intel.nb.otf@motherboard, retire=0, response=0;

prop fault.cpu.intel.nb.otf@motherboard (1)->
    ereport.cpu.intel.nb.otf@motherboard;
/*
 * Northbridge "unknown" and SPD ereports are all propagated to a single
 * discard upset at the memory-controller.
 */
event ereport.cpu.intel.nb.unknown@motherboard {within(12s)};
event ereport.cpu.intel.nb.unknown@memory-controller {within(12s)};
event ereport.cpu.intel.nb.unknown@memory-controller/dram-channel {within(12s)};
event ereport.cpu.intel.nb.spd@memory-controller/dram-channel {within(12s)};
event ereport.cpu.intel.nb.ddr2_spd@
    memory-controller/dram-channel {within(12s)};

event upset.discard@memory-controller;

prop upset.discard@memory-controller (0)->
    ereport.cpu.intel.nb.unknown@motherboard,
    ereport.cpu.intel.nb.unknown@memory-controller,
    ereport.cpu.intel.nb.unknown@memory-controller/dram-channel,
    ereport.cpu.intel.nb.spd@memory-controller/dram-channel,
    ereport.cpu.intel.nb.ddr2_spd@memory-controller/dram-channel;
/*
 * mem_ds / ddr2_mem_ds ereports (30s window) diagnose to a retire=0
 * fault on the rank whose instance number equals the "rank" payload
 * member.
 */
event ereport.cpu.intel.nb.mem_ds@memory-controller{within(30s)};
event ereport.cpu.intel.nb.ddr2_mem_ds@memory-controller{within(30s)};

event fault.memory.intel.fbd.mem_ds@memory-controller/dram-channel/dimm/rank,
    retire=0;

prop fault.memory.intel.fbd.mem_ds@
    memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") } (1)->
    ereport.cpu.intel.nb.mem_ds@memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ds@memory-controller;
/* Northbridge fsb ereports diagnose to a retire=0 fault on the chip. */
event ereport.cpu.intel.nb.fsb@chip{within(12s)};
event fault.cpu.intel.nb.fsb@chip, retire=0;

prop fault.cpu.intel.nb.fsb@chip (1)->
    ereport.cpu.intel.nb.fsb@chip;

prop fault.cpu.intel.nb.fsb@chip (0)-> EREPORT_BUS_ERROR;

/*
 * Northbridge ie ereports: an "intel-error-list" payload of "B6" is
 * propagated from the ie_ce upset, any other value from the fault.
 */
event ereport.cpu.intel.nb.ie@motherboard{within(12s)};
event fault.cpu.intel.nb.ie@motherboard, retire=0;
event upset.cpu.intel.nb.ie_ce@motherboard{within(12s)};

prop upset.cpu.intel.nb.ie_ce@motherboard
    { payloadprop("intel-error-list") == "B6" } (0)->
    ereport.cpu.intel.nb.ie@motherboard;

prop fault.cpu.intel.nb.ie@motherboard
    { payloadprop("intel-error-list") != "B6" } (1)->
    ereport.cpu.intel.nb.ie@motherboard;

prop fault.cpu.intel.nb.ie@motherboard (0)-> EREPORT_BUS_ERROR;
/* Northbridge dma ereports diagnose to an upset. */
event ereport.cpu.intel.nb.dma@motherboard{within(12s)};
event upset.cpu.intel.nb.dma@motherboard;

prop upset.cpu.intel.nb.dma@motherboard (1)->
    ereport.cpu.intel.nb.dma@motherboard;

/* esi and pex ereports share one upset at the hostbridge. */
event ereport.cpu.intel.nb.esi@motherboard{within(12s)};
event ereport.cpu.intel.nb.pex@hostbridge{within(12s)};
event upset.cpu.intel.nb.pex@hostbridge;

prop upset.cpu.intel.nb.pex@hostbridge (1)->
    ereport.cpu.intel.nb.esi@motherboard,
    ereport.cpu.intel.nb.pex@hostbridge;

prop upset.cpu.intel.nb.pex@hostbridge (0)-> EREPORT_BUS_ERROR;

/* "unknown" ereports against a rank go to a per-rank discard upset. */
event ereport.cpu.intel.nb.unknown@rank{within(12s)};
event upset.discard@rank;

prop upset.discard@rank (1)->
    ereport.cpu.intel.nb.unknown@rank;

prop upset.discard@rank (0)-> EREPORT_BUS_ERROR;
/*
* CPU integrated memory controller
*/
/*
 * True when the ereport's "resource" payload contains the ASRU of
 * either the rank or its DIMM.
 */
#define CONTAINS_RANK \
    (payloadprop_contains("resource", \
        asru(chip/memory-controller/dram-channel/dimm/rank)) || \
    payloadprop_contains("resource", \
        asru(chip/memory-controller/dram-channel/dimm)))

/* Per-DIMM statistics engine for the CPU integrated memory controller. */
#define STAT_CPU_MEM_CE_PGFLTS \
    stat.ce_pgflt@chip/memory-controller/dram-channel/dimm

/*
 * Like SET_OFFSET, but the offset is read from
 * resource[0].hc-specific.offset.
 */
#define SET_RES_OFFSET \
    (!payloadprop_defined("resource[0].hc-specific.offset") || \
    setpayloadprop("asru-offset", \
        payloadprop("resource[0].hc-specific.offset")))

engine STAT_CPU_MEM_CE_PGFLTS;
/* Uncorrectable memory ereport from the integrated memory controller. */
event ereport.cpu.intel.quickpath.mem_ue@chip/memory-controller
    {within(12s)}, discard_if_config_unknown=1;

event fault.memory.intel.page_ue@
    chip/memory-controller/dram-channel/dimm/rank,
    message=0, response=0; /* do not message individual pageflts */

/*
 * Page UE: the resource payload must name this rank (or its DIMM) and
 * a physaddr or resource offset must be present to record on the fault.
 */
prop fault.memory.intel.page_ue@
    chip/memory-controller/dram-channel/dimm/rank
    { CONTAINS_RANK && (payloadprop_defined("physaddr") ||
    payloadprop_defined("resource[0].hc-specific.offset")) &&
    SET_ADDR && SET_RES_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ue@chip/memory-controller;
/* Shorthand for the chip-rooted DIMM path. */
#define CHIPDIMM chip/memory-controller/dram-channel/dimm

/*
 * DIMM UE fault.  It propagates to two intermediate error events:
 * dimm_ue_ep is tied to the mem_ue ereport in this section, while the
 * propagation from dimm_ue_ex is written in the Nehalem EX section.
 */
event fault.memory.intel.dimm_ue@CHIPDIMM/rank;
event error.memory.intel.dimm_ue_ep@CHIPDIMM/rank;
event error.memory.intel.dimm_ue_ex@CHIPDIMM/rank;

prop fault.memory.intel.dimm_ue@CHIPDIMM/rank (1)->
    error.memory.intel.dimm_ue_ep@CHIPDIMM/rank,
    error.memory.intel.dimm_ue_ex@CHIPDIMM/rank;

prop error.memory.intel.dimm_ue_ep@CHIPDIMM/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.intel.quickpath.mem_ue@chip/memory-controller;

prop fault.memory.intel.dimm_ue@CHIPDIMM/rank (0)-> EREPORT_BUS_ERROR;
/* Correctable memory ereport from the integrated memory controller. */
event ereport.cpu.intel.quickpath.mem_ce@
    chip/memory-controller {within(12s)}, discard_if_config_unknown=1;

/*
 * Page CE fault: SERDed per rank with the PAGE_CE_* defaults and
 * counted in STAT_CPU_MEM_CE_PGFLTS.
 */
engine serd.memory.intel.page_ce@CHIPDIMM/rank, N=PAGE_CE_COUNT, T=PAGE_CE_TIME;

event fault.memory.intel.page_ce@CHIPDIMM/rank, message=0, response=0,
    count=STAT_CPU_MEM_CE_PGFLTS,
    engine=serd.memory.intel.page_ce@CHIPDIMM/rank;

prop fault.memory.intel.page_ce@CHIPDIMM/rank
    { CONTAINS_RANK && (payloadprop_defined("physaddr") ||
    payloadprop_defined("resource[0].hc-specific.offset")) &&
    SET_ADDR && SET_RES_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ce@chip/memory-controller;
/*
 * DIMM-level correctable-error diagnosis for the CPU integrated memory
 * controller.
 *
 * The engine defaults are the DIMM thresholds (DIMM_CE_COUNT within
 * DIMM_CE_TIME), matching the northbridge dimm_ce engine and the
 * setserdn(10)/setserdt(1week) default used for DIMMs without a
 * "dimm-size" property in the Nehalem EX rules.  The page thresholds
 * (PAGE_CE_*) belong to the page_ce engine only; using them here would
 * apply N=2, T=72h to a whole DIMM whenever no size-specific
 * propagation overrides N and T.
 */
engine serd.memory.intel.dimm_ce@CHIPDIMM, N=DIMM_CE_COUNT, T=DIMM_CE_TIME;

event fault.memory.intel.dimm_ce@CHIPDIMM,
    engine=serd.memory.intel.dimm_ce@CHIPDIMM;

/*
 * Default propagation, taken when the DIMM has no "dimm-size" config
 * property and its page-fault statistic exceeds 512.
 */
prop fault.memory.intel.dimm_ce@CHIPDIMM
    { !confprop_defined(CHIPDIMM, "dimm-size") &&
    count(STAT_CPU_MEM_CE_PGFLTS) > 512 } (0)->
    ereport.cpu.intel.quickpath.mem_ce@chip/memory-controller;
/*
 * CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate)
 *
 * Size-specific dimm_ce propagation for the integrated memory
 * controller: applies when "dimm-size" equals dimm_size and the
 * page-fault statistic exceeds fault_rate, overriding the SERD N and T
 * with n and t.  The macro body already ends in ';'.
 */
#define CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@CHIPDIMM { \
        confprop(CHIPDIMM, "dimm-size") == dimm_size && \
        count(STAT_CPU_MEM_CE_PGFLTS) > fault_rate && \
        setserdn(n) & setserdt(t) } (0)-> \
        ereport.cpu.intel.quickpath.mem_ce@ \
        chip/memory-controller;

CPU_MEM_DIMM_CE("16G", 16, 1week, 2000)
CPU_MEM_DIMM_CE("8G", 8, 1week, 2000)
CPU_MEM_DIMM_CE("4G", 4, 1week, 1500)
CPU_MEM_DIMM_CE("2G", 4, 2week, 1000)
CPU_MEM_DIMM_CE("1G", 4, 4week, 500)
CPU_MEM_DIMM_CE("512M", 4, 8week, 250)
/*
 * mem_unknown ereports at the memory-controller, dram-channel and rank
 * levels are propagated to discard upsets.
 */
event ereport.cpu.intel.quickpath.mem_unknown@chip/memory-controller
    {within(12s)}, discard_if_config_unknown=1;
event ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel {within(12s)},
    discard_if_config_unknown=1;
event ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel/dimm/rank{within(12s)};

event upset.discard@chip/memory-controller;
event upset.discard@chip/memory-controller/dram-channel/dimm/rank;

prop upset.discard@chip/memory-controller (0)->
    ereport.cpu.intel.quickpath.mem_unknown@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel;

prop upset.discard@
    chip/memory-controller/dram-channel/dimm/rank (1)->
    ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel/dimm/rank;
/* mem_parity: fault on the memory controller. */
event ereport.cpu.intel.quickpath.mem_parity@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;

event fault.cpu.intel.quickpath.mem_parity@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_parity@chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_parity@chip/memory-controller;

/*
 * mem_addr_parity: faults are declared on the memory controller, the
 * DIMM and the rank; the DIMM and rank propagations require the
 * "resource" payload to contain the respective ASRU.
 */
event ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;

event fault.cpu.intel.quickpath.mem_addr_parity@
    chip/memory-controller;
event fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM;
event fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM/rank;

prop fault.cpu.intel.quickpath.mem_addr_parity@
    chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM
    { payloadprop_contains("resource", asru(CHIPDIMM)) } (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM/rank
    { payloadprop_contains("resource", asru(CHIPDIMM/rank)) } (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller;
/* mem_bad_addr: fault on the memory controller. */
event ereport.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;

event fault.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller;

/* mem_spare: fault on the DIMM. */
event ereport.cpu.intel.quickpath.mem_spare@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;

event fault.cpu.intel.quickpath.mem_spare@
    chip/memory-controller/dram-channel/dimm;

prop fault.cpu.intel.quickpath.mem_spare@
    chip/memory-controller/dram-channel/dimm (1)->
    ereport.cpu.intel.quickpath.mem_spare@chip/memory-controller;

/* mem_bad_id: fault on the memory controller. */
event ereport.cpu.intel.quickpath.mem_bad_id@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;

event fault.cpu.intel.quickpath.mem_bad_id@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_bad_id@chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_bad_id@chip/memory-controller;
/*
 * mem_redundant: SERDed per DIMM (N=2 within 72h).  The DIMM fault
 * propagates through a per-rank error event, which in turn requires the
 * ereport's resource payload to name the rank or its DIMM.
 */
event ereport.cpu.intel.quickpath.mem_redundant@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;

engine serd.cpu.intel.quickpath.mem_redundant@CHIPDIMM, N=2, T=72h;

event fault.cpu.intel.quickpath.mem_redundant@CHIPDIMM,
    engine=serd.cpu.intel.quickpath.mem_redundant@CHIPDIMM;
event error.cpu.intel.quickpath.mem_redundant@CHIPDIMM/rank;

prop fault.cpu.intel.quickpath.mem_redundant@CHIPDIMM (1)->
    error.cpu.intel.quickpath.mem_redundant@CHIPDIMM/rank<>;

prop error.cpu.intel.quickpath.mem_redundant@CHIPDIMM/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.intel.quickpath.mem_redundant@
    chip/memory-controller;
/* True when the ereport's error_uncorrected payload member is 1. */
#define STATUS_UC (payloadprop("error_uncorrected") == 1)

event ereport.cpu.intel.quickpath.interconnect@chip
    {within(1s)};

event upset.cpu.intel.quickpath.interconnect@chip;

/* Diagnose corrected events to upsets */
prop upset.cpu.intel.quickpath.interconnect@chip
    { !STATUS_UC } (1)->
    ereport.cpu.intel.quickpath.interconnect@chip;

engine serd.cpu.intel.quickpath.interconnect@chip,
    N=3, T=72h;

event fault.cpu.intel.quickpath.interconnect@chip,
    engine=serd.cpu.intel.quickpath.interconnect@chip;

/*
 * Diagnose uncorrected events to faults
 *
 * NOTE(review): the fault is attached to an N=3 SERD engine and this
 * constraint sets no SERD increment, unlike the other uncorrected
 * propagations in this file -- confirm whether an immediate fault is
 * intended here.
 */
prop fault.cpu.intel.quickpath.interconnect@chip
    { STATUS_UC } (0)->
    ereport.cpu.intel.quickpath.interconnect@chip;
/*
* Nehalem EX specific rules
*/
/* MBox errors */
/*
 * EX_MEM_EVENT(leafclass, t) declares
 * ereport.cpu.intel.quickpath.<leafclass> at chip/memory-controller
 * with within(t) and discard_if_config_unknown=1.
 */
#define EX_MEM_EVENT(leafclass, t) \
    event ereport.cpu.intel.quickpath.leafclass@ \
        chip/memory-controller { within(t) }, discard_if_config_unknown=1

/* Corrected memory link errors: SERDed, N=500 within 1week. */
EX_MEM_EVENT(mem_lnktrns, 1s);
EX_MEM_EVENT(mem_lnkpers, 1s);
EX_MEM_EVENT(mem_sbfbdlinkerr, 1s);
EX_MEM_EVENT(mem_nbfbdlnkerr, 1s);
EX_MEM_EVENT(mem_lnkcrcvld, 1s);

engine serd.cpu.intel.quickpath.mem_link_ce@chip/memory-controller,
    N=500, T=1week;

event fault.cpu.intel.quickpath.mem_link_ce@chip/memory-controller,
    engine=serd.cpu.intel.quickpath.mem_link_ce@chip/memory-controller,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.mem_link_ce@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_lnktrns@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkpers@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_sbfbdlinkerr@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_nbfbdlnkerr@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkcrcvld@chip/memory-controller;
/* Uncorrected memory link errors: retire=0 fault, no SERD engine. */
EX_MEM_EVENT(mem_lnkuncorr_uc, 1s);
EX_MEM_EVENT(mem_lnkpers_uc, 1s);
EX_MEM_EVENT(mem_sbfbdlinkerr_uc, 1s);
EX_MEM_EVENT(mem_nbfbdlnkerr_uc, 1s);
EX_MEM_EVENT(mem_lnkcrcvld_uc, 1s);

event fault.cpu.intel.quickpath.mem_link_ue@chip/memory-controller,
    retire=0;

prop fault.cpu.intel.quickpath.mem_link_ue@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_lnkuncorr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkpers_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_sbfbdlinkerr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_nbfbdlnkerr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkcrcvld_uc@chip/memory-controller;
/* Corrected memory controller errors: SERDed, N=500 within 1week. */
EX_MEM_EVENT(mem_ptrl_fsm_err, 1s);
EX_MEM_EVENT(mem_errflw_fsm_fail, 1s);
EX_MEM_EVENT(mem_vberr, 1s);

engine serd.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller,
    N=500, T=1week;

event fault.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller,
    engine=serd.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_ptrl_fsm_err@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_errflw_fsm_fail@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_vberr@chip/memory-controller;

/* Uncorrected memory controller errors: retire=0 fault, no SERD engine. */
EX_MEM_EVENT(mem_ptrl_fsm_err_uc, 1s);
EX_MEM_EVENT(mem_errflw_fsm_fail_uc, 1s);
EX_MEM_EVENT(mem_mcpar_fsmerr_uc, 1s);
EX_MEM_EVENT(mem_vberr_uc, 1s);
EX_MEM_EVENT(mem_fberr_uc, 1s);

event fault.cpu.intel.quickpath.mem_controller_ue@chip/memory-controller,
    retire=0;

prop fault.cpu.intel.quickpath.mem_controller_ue@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_ptrl_fsm_err_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_errflw_fsm_fail_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_mcpar_fsmerr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_vberr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_fberr_uc@chip/memory-controller;
/*
 * Scrubbing errors: fault on the rank named by the "rank" payload
 * member, with the physaddr/offset recorded through SET_ADDR and
 * SET_OFFSET.
 */
EX_MEM_EVENT(mem_scrubbing_uc, 1s);

event fault.cpu.intel.quickpath.mem_scrubbing@
    chip/memory-controller/dram-channel/dimm/rank,
    response=0;

prop fault.cpu.intel.quickpath.mem_scrubbing@
    chip/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.quickpath.mem_scrubbing_uc@chip/memory-controller;
/*
 * Nehalem EX ECC and even-parity ereports.  These feed the page_ue,
 * page_ce, dimm_ue and dimm_ce faults that are declared in the CPU
 * integrated memory controller section.
 */
EX_MEM_EVENT(mem_ecc_uc, 12s);
EX_MEM_EVENT(mem_even_parity_uc, 1s);
EX_MEM_EVENT(mem_ecc, 12s);
EX_MEM_EVENT(mem_even_parity, 1s);

event error.memory.intel.ex_dimm_ce@
    chip/memory-controller/dram-channel/dimm/rank;

/* Uncorrected: page UE on the rank named by the "rank" payload member. */
prop fault.memory.intel.page_ue@
    chip/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ecc_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity_uc@chip/memory-controller;

/* Corrected: page CE on the rank named by the "rank" payload member. */
prop fault.memory.intel.page_ce@
    chip/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;

/* Uncorrected: EX leg of the DIMM UE fault. */
prop error.memory.intel.dimm_ue_ex@
    chip/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") } (1)->
    ereport.cpu.intel.quickpath.mem_ecc_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity_uc@chip/memory-controller;

/*
 * Corrected, DIMM without a "dimm-size" property: the DIMM CE fault
 * goes through the per-rank ex_dimm_ce error with SERD N=10, T=1week,
 * and the error requires the page-fault statistic to exceed 512.
 */
prop fault.memory.intel.dimm_ce@
    chip/memory-controller/dram-channel/dimm
    { !confprop_defined(chip/memory-controller/dram-channel/dimm,
    "dimm-size") && setserdn(10) & setserdt(1week) } (0)->
    error.memory.intel.ex_dimm_ce@
    chip/memory-controller/dram-channel/dimm/rank;

prop error.memory.intel.ex_dimm_ce@
    chip/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") &&
    !confprop_defined(chip/memory-controller/dram-channel/dimm,
    "dimm-size") &&
    count(STAT_CPU_MEM_CE_PGFLTS) > 512 } (1)->
    ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;
/*
 * EX_CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate)
 *
 * Size-specific pair of propagations for Nehalem EX: the DIMM CE fault
 * (SERD N and T overridden with n and t) goes to the per-rank
 * ex_dimm_ce error, and that error goes to the corrected ecc/parity
 * ereports when the rank matches and the page-fault statistic exceeds
 * fault_rate.  The macro body already ends in ';'.
 */
#define EX_CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@ \
        chip/memory-controller/dram-channel/dimm { \
        confprop(chip/memory-controller/dram-channel/dimm, \
        "dimm-size") == dimm_size && \
        setserdn(n) & setserdt(t) } (0)-> \
        error.memory.intel.ex_dimm_ce@ \
        chip/memory-controller/dram-channel/dimm/rank; \
    prop error.memory.intel.ex_dimm_ce@ \
        chip/memory-controller/dram-channel/dimm/rank[rank_num] { \
        payloadprop_defined("rank") && rank_num == payloadprop("rank") && \
        confprop(chip/memory-controller/dram-channel/dimm, \
        "dimm-size") == dimm_size && \
        count(STAT_CPU_MEM_CE_PGFLTS) > fault_rate } (1)-> \
        ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller, \
        ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;

EX_CPU_MEM_DIMM_CE("16G", 16, 1week, 2000)
EX_CPU_MEM_DIMM_CE("8G", 8, 1week, 2000)
EX_CPU_MEM_DIMM_CE("4G", 4, 1week, 1500)
EX_CPU_MEM_DIMM_CE("2G", 4, 2week, 1000)
EX_CPU_MEM_DIMM_CE("1G", 4, 4week, 500)
/*
 * Unconstrained discard upset covering the scrubbing, ecc and parity
 * ereports.
 */
event upset.memory.intel.discard@chip/memory-controller{within(1s)};

prop upset.memory.intel.discard@chip/memory-controller (0)->
    ereport.cpu.intel.quickpath.mem_scrubbing_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_ecc_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;

/* mem_failover_mir: retire=0 fault on the memory controller. */
EX_MEM_EVENT(mem_failover_mir, 1s);

event fault.cpu.intel.quickpath.mem_failover_mir@chip/memory-controller,
    retire=0;

prop fault.cpu.intel.quickpath.mem_failover_mir@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_failover_mir@chip/memory-controller;
/*
* RBox errors
*/
/*
 * EX_EVENT(leafclass, t) declares
 * ereport.cpu.intel.quickpath.<leafclass> at chip with within(t).
 */
#define EX_EVENT(leafclass, t) \
    event ereport.cpu.intel.quickpath.leafclass@chip { within(t) }

/* Shared bus interconnect fault, SERDed with N=3 within 72h. */
engine serd.cpu.intel.quickpath.bus_interconnect@chip,
    N=3, T=72h;

event fault.cpu.intel.quickpath.bus_interconnect@chip,
    engine=serd.cpu.intel.quickpath.bus_interconnect@chip,
    retire=0;

/* Retry-abort and link-init ereports go to the discard upset. */
EX_EVENT(bus_retry_abort, 1s);
EX_EVENT(bus_link_init_ce, 1s);

event upset.cpu.intel.quickpath.discard@chip;

prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.bus_retry_abort@chip,
    ereport.cpu.intel.quickpath.bus_link_init_ce@chip;

/* These ereports feed the SERDed bus interconnect fault. */
EX_EVENT(bus_unknown, 1s);
EX_EVENT(bus_single_ecc, 1s);
EX_EVENT(bus_crc_flit, 1s);

prop fault.cpu.intel.quickpath.bus_interconnect@chip (0)->
    ereport.cpu.intel.quickpath.bus_unknown@chip,
    ereport.cpu.intel.quickpath.bus_single_ecc@chip,
    ereport.cpu.intel.quickpath.bus_crc_flit@chip;

/* The "_external" variants go to the discard upset. */
EX_EVENT(bus_unknown_external, 1s);
EX_EVENT(bus_crc_flit_external, 1s);

prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.bus_unknown_external@chip,
    ereport.cpu.intel.quickpath.bus_crc_flit_external@chip;
/*
 * These bus ereports propagate from the bus interconnect fault with
 * the SERD increment set to 4, which exceeds the engine's N=3.
 */
EX_EVENT(bus_unknown_uc, 1s);
EX_EVENT(bus_opr_poison_err, 1s);
EX_EVENT(bus_eot_parity, 1s);
EX_EVENT(bus_rta_parity, 1s);
EX_EVENT(bus_bad_sbu_route, 1s);
EX_EVENT(bus_bad_msg, 1s);
EX_EVENT(bus_bad_vn_credit, 1s);
EX_EVENT(bus_hdr_double_ecc, 1s);
EX_EVENT(bus_link_retry_err, 1s);

prop fault.cpu.intel.quickpath.bus_interconnect@chip
    { setserdincrement(4) } (0)->
    ereport.cpu.intel.quickpath.bus_unknown_uc@chip,
    ereport.cpu.intel.quickpath.bus_opr_poison_err@chip,
    ereport.cpu.intel.quickpath.bus_eot_parity@chip,
    ereport.cpu.intel.quickpath.bus_rta_parity@chip,
    ereport.cpu.intel.quickpath.bus_bad_sbu_route@chip,
    ereport.cpu.intel.quickpath.bus_bad_msg@chip,
    ereport.cpu.intel.quickpath.bus_bad_vn_credit@chip,
    ereport.cpu.intel.quickpath.bus_hdr_double_ecc@chip,
    ereport.cpu.intel.quickpath.bus_link_retry_err@chip;

/* The "_external" variants of the same ereports go to the discard upset. */
EX_EVENT(bus_unknown_uc_external, 1s);
EX_EVENT(bus_opr_poison_err_external, 1s);
EX_EVENT(bus_eot_parity_external, 1s);
EX_EVENT(bus_rta_parity_external, 1s);
EX_EVENT(bus_bad_sbu_route_external, 1s);
EX_EVENT(bus_bad_msg_external, 1s);
EX_EVENT(bus_bad_vn_credit_external, 1s);
EX_EVENT(bus_hdr_double_ecc_external, 1s);
EX_EVENT(bus_link_retry_err_external, 1s);

prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.bus_unknown_uc_external@chip,
    ereport.cpu.intel.quickpath.bus_opr_poison_err_external@chip,
    ereport.cpu.intel.quickpath.bus_eot_parity_external@chip,
    ereport.cpu.intel.quickpath.bus_rta_parity_external@chip,
    ereport.cpu.intel.quickpath.bus_bad_sbu_route_external@chip,
    ereport.cpu.intel.quickpath.bus_bad_msg_external@chip,
    ereport.cpu.intel.quickpath.bus_bad_vn_credit_external@chip,
    ereport.cpu.intel.quickpath.bus_hdr_double_ecc_external@chip,
    ereport.cpu.intel.quickpath.bus_link_retry_err_external@chip;
/*
* CBox errors
*/
/*
 * llc_ewb_uc: the fault requires a physaddr or offset in the payload
 * (recorded via SET_ADDR / SET_OFFSET); the ereport is also listed
 * under the discard upset.
 */
EX_EVENT(llc_ewb_uc, 1s);

event fault.cpu.intel.quickpath.llc_ewb@chip,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.llc_ewb@chip
    { (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.quickpath.llc_ewb_uc@chip;

prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.llc_ewb_uc@chip;

/*
 * SBox errors
 */
EX_EVENT(system_cache_uc, 1s);

event fault.cpu.intel.quickpath.system_cache@chip,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.system_cache@chip ->
    ereport.cpu.intel.quickpath.system_cache_uc@chip;

/*
 * BBox errors
 */
EX_EVENT(home_agent_uc, 1s);

event fault.cpu.intel.quickpath.home_agent@chip,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.home_agent@chip ->
    ereport.cpu.intel.quickpath.home_agent_uc@chip;

/*
 * UBox errors
 *
 * sys_cfg_cfa_ecc is SERDed (N=2 within 72h); sys_cfg_uc sets the SERD
 * increment to 3, which exceeds N.
 */
EX_EVENT(sys_cfg_cfa_ecc, 1s);
EX_EVENT(sys_cfg_uc, 1s);

engine serd.cpu.intel.quickpath.sys_cfg@chip,
    N=2, T=72h;

event fault.cpu.intel.quickpath.sys_cfg@chip,
    engine=serd.cpu.intel.quickpath.sys_cfg@chip,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.sys_cfg@chip (0)->
    ereport.cpu.intel.quickpath.sys_cfg_cfa_ecc@chip;

prop fault.cpu.intel.quickpath.sys_cfg@chip
    { setserdincrement(3) } (0)->
    ereport.cpu.intel.quickpath.sys_cfg_uc@chip;
/*
* Handling poison errors
*/
/*
 * Two bookkeeping faults, both declared with message=0, retire=0 and
 * response=0, each naming a motherboard-level stat engine as its
 * count= target.  The constraints below compare the two statistics.
 */
engine stat.has_poison@motherboard;
event fault.cpu.intel.has_poison@motherboard,
count=stat.has_poison@motherboard[0],
message=0, retire=0, response=0;
engine stat.discard_fatal@motherboard;
event fault.cpu.intel.discard_fatal@motherboard,
count=stat.discard_fatal@motherboard[0],
message=0, retire=0, response=0;
/*
 * has_poison propagates to the listed ereports (from any chip or
 * memory-controller instance) when their payload carries poison == 1.
 */
prop fault.cpu.intel.has_poison@motherboard
{ payloadprop_defined("poison") && 1 == payloadprop("poison") } (1)->
ereport.cpu.intel.quickpath.mem_scrubbing_uc@chip<>/memory-controller<>,
ereport.cpu.intel.quickpath.llc_ewb_uc@chip<>,
ereport.cpu.intel.quickpath.system_cache_uc@chip<>,
ereport.cpu.intel.quickpath.bus_opr_poison_err@chip<>,
ereport.cpu.intel.quickpath.bus_opr_poison_err_external@chip<>;
/*
 * discard_fatal propagates to an internal_unclassified ereport (10s
 * window) when the has_poison statistic exceeds the discard_fatal
 * statistic and the payload has bank_number == 5 and
 * processor_context_corrupt == 1.
 */
prop fault.cpu.intel.discard_fatal@motherboard
{ count(stat.has_poison@motherboard[0]) > count(stat.discard_fatal@motherboard[0]) &&
payloadprop_defined("bank_number") && 5 == payloadprop("bank_number") &&
payloadprop_defined("processor_context_corrupt") &&
1 == payloadprop("processor_context_corrupt") } (0)->
ereport.cpu.intel.internal_unclassified@chip<>/core<>/strand<> {within(10s)};
/*
 * Otherwise internal_unclassified is propagated from
 * fault.cpu.intel.internal, using the same error_uncorrected /
 * setserdincrement(4) test as the other simple internal errors.
 * NOTE(review): the first group of terms is the negation of the
 * discard_fatal constraint only if processor_context_corrupt is always
 * 0 or 1 (it tests "0 ==" rather than "1 !=") -- confirm.
 */
prop fault.cpu.intel.internal@chip/core/strand
{ (count(stat.has_poison@motherboard[0]) <= count(stat.discard_fatal@motherboard[0]) ||
!payloadprop_defined("bank_number") || 5 != payloadprop("bank_number") ||
!payloadprop_defined("processor_context_corrupt") ||
0 == payloadprop("processor_context_corrupt")) &&
(payloadprop("error_uncorrected") == 1 ? setserdincrement(4) : 1) } (0)->
ereport.cpu.intel.internal_unclassified@chip/core/strand;