/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Support routines for managing per-Lxcache state.
*/
#include <errno.h>
#include <strings.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>
#include <cmd_Lxcache.h>
#include <cmd_mem.h>
#include <cmd_cpu.h>
#include <cmd_state.h>
#include <cmd.h>
#define _KERNEL
#include <sys/cheetahregs.h>
#include <sys/mem_cache.h>
#include <sys/errclassify.h>
#include <fmd_adm.h>
#include <fmd_adm_impl.h>
#include <fmd_rpc_adm.h>
CMD_ERRCL_CPC | CMD_ERRCL_EDC | \
/* Note that these are the same for panther L2 and L3 (see prm) */
#define PN_ECSTATE_INV 0
/* Macro for putting 64-bit onto stack as two 32-bit ints */
/*
* e (for ecctable) maps single bit positions (0-127, or 0-0x7F) to the
* corresponding ECC syndromes for an error in that position.
*/
int e[] = {
/* From Table P-4, JPS1 US-III Supplement */
/*
 * Entries 0x00-0x7F: syndrome produced by a single-bit error in the
 * corresponding data bit position of the 128-bit word.
 */
/* 0 1 2 3 4 5 6 7 */
/* 00 */ 0x03B, 0x127, 0x067, 0x097, 0x10F, 0x08F, 0x04F, 0x02C,
/* 08 */ 0x147, 0x0C7, 0x02F, 0x01C, 0x117, 0x032, 0x08A, 0x04A,
/* 10 */ 0x01F, 0x086, 0x046, 0x026, 0x09B, 0x08C, 0x0C1, 0x0A1,
/* 18 */ 0x01A, 0x016, 0x061, 0x091, 0x052, 0x00E, 0x109, 0x029,
/* 20 */ 0x02A, 0x019, 0x105, 0x085, 0x045, 0x025, 0x015, 0x103,
/* 28 */ 0x031, 0x00D, 0x083, 0x043, 0x051, 0x089, 0x023, 0x007,
/* 30 */ 0x0B9, 0x049, 0x013, 0x0A7, 0x057, 0x00B, 0x07A, 0x187,
/* 38 */ 0x0F8, 0x11B, 0x079, 0x034, 0x178, 0x1D8, 0x05B, 0x04C,
/* 40 */ 0x064, 0x1B4, 0x037, 0x03D, 0x058, 0x13C, 0x1B1, 0x03E,
/* 48 */ 0x1C3, 0x0BC, 0x1A0, 0x1D4, 0x1CA, 0x190, 0x124, 0x13A,
/* 50 */ 0x1C0, 0x188, 0x122, 0x114, 0x184, 0x182, 0x160, 0x118,
/* 58 */ 0x181, 0x150, 0x148, 0x144, 0x142, 0x141, 0x130, 0x0A8,
/* 60 */ 0x128, 0x121, 0x0E0, 0x094, 0x112, 0x10C, 0x0D0, 0x0B0,
/* 68 */ 0x10A, 0x106, 0x062, 0x1B2, 0x0C8, 0x0C4, 0x0C2, 0x1F0,
/* 70 */ 0x0A4, 0x0A2, 0x098, 0x1D1, 0x070, 0x1E8, 0x1C6, 0x1C5,
/* 78 */ 0x068, 0x1E4, 0x1E2, 0x1E1, 0x1D2, 0x1CC, 0x1C9, 0x1B8,
/* Now we have the check bits */
/*
 * Entries 0x80-0x88: an error in check bit Cn flips only that bit of
 * the syndrome, so each entry is a single-bit pattern.
 */
/* C0 C1 C2 C3 C4 C5 C6 C7 C8 */
0x001, 0x002, 0x004, 0x008, 0x010, 0x020, 0x040, 0x080, 0x100,
};
#define NBITS (sizeof (e)/sizeof (e[0]))
/*
* This table is used to determine which bit(s) is(are) bad when an ECC
* error occurs. The array is indexed by a 9-bit syndrome. The entries
* of this array have the following semantics:
*
* 00-127 The number of the bad bit, when only one bit is bad.
* 128 ECC bit C0 is bad.
* 129 ECC bit C1 is bad.
* 130 ECC bit C2 is bad.
* 131 ECC bit C3 is bad.
* 132 ECC bit C4 is bad.
* 133 ECC bit C5 is bad.
* 134 ECC bit C6 is bad.
* 135 ECC bit C7 is bad.
* 136 ECC bit C8 is bad.
* 137-143 reserved for Mtag Data and ECC.
* 144(M2) Two bits are bad within a nibble.
* 145(M3) Three bits are bad within a nibble.
* 146(M4) Four bits are bad within a nibble.
* 147(M) Multiple bits (5 or more) are bad.
* 148 NO bits are bad.
* Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-4,11-5.
*/
#define M 147
#else /* JALAPENO || SERRANO */
#endif /* JALAPENO || SERRANO */
#endif /* JALAPENO || SERRANO */
{
#else /* JALAPENO || SERRANO */
#endif /* JALAPENO || SERRANO */
#else /* JALAPENO || SERRANO */
#endif /* JALAPENO || SERRANO */
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 0, 0, 0, 0,
/* 2 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 3 */ 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 4 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -1, -1, -1, -1,
/* 5 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
/* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 7 */ 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, -1, -1, -1,
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 19, 20, 21, 22,
/* 2 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 3 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 6 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 7 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3,
/* 2 */ 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3,
/* 3 */ 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, -1, -1,
/* 4 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 5 */ 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
/* 6 */ 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
/* 7 */ 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, -1, -1,
/* 8 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 0, 0, 1, 1, 2, 2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 2 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 5 */ 0, 0, 1, 1, 2, 2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 6 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
};
{
/*
 * NOTE(review): this fragment is truncated -- the function signature and
 * the accumulation target of the conditional expression are missing.
 * The visible loop walks half the data bits (NDATABITS/2 iterations),
 * consuming one bit per iteration from each of the two 64-bit halves
 * (chi = high, clo = low -- presumably; confirm against the original
 * source).  As written the `(clo & 1) ? e[i] : 0` result is discarded;
 * in the original it is almost certainly XORed/ORed into a syndrome
 * accumulator -- TODO confirm.
 */
int i;
for (i = 0; i < (NDATABITS/2); i++) {
((clo & 1) ? e[i] : 0);
chi >>= 1;
clo >>= 1;
}
}
{
}
static uint8_t
/*
 * NOTE(review): function name, parameter list, and the per-case
 * assignments are missing from this fragment.  The visible structure
 * selects a way-bit value (`way_bit`) depending on whether the
 * persistent-state type is an L2 or L3 tag -- the cases themselves
 * are empty here, so the selected values cannot be documented.
 */
{
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
break;
case CMD_PTR_CPU_L3TAG:
break;
}
return (way_bit);
}
static int8_t
/*
 * NOTE(review): truncated fragment -- name, parameters, and case bodies
 * are missing.  Returns a signed way number (`way`); int8_t return and
 * usage elsewhere in this file suggest -1 may denote "anonymous/unknown
 * way" -- TODO confirm against the original source.
 */
{
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
break;
case CMD_PTR_CPU_L3TAG:
break;
}
return (way);
}
static int32_t
/*
 * NOTE(review): truncated fragment -- name, parameters, and the left
 * half of the index computations are missing.  Both visible case arms
 * shift some (missing) AFAR-derived expression right by
 * PN_CACHE_LINE_SHIFT, i.e. they convert an address to a cache-line
 * index for Panther L2 or L3 tags respectively.
 */
{
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
>> PN_CACHE_LINE_SHIFT);
break;
case CMD_PTR_CPU_L3TAG:
>> PN_CACHE_LINE_SHIFT);
break;
}
return (index);
}
static int
/*
 * Count how many of the PN_CACHE_NWAYS (4) ways at an index are retired.
 * NOTE(review): the function signature and the right-hand side of the
 * state comparison are missing from this fragment; based on
 * PN_ECSTATE_NA usage elsewhere in this file, the comparison is
 * presumably `== PN_ECSTATE_NA` (way marked Not Available) -- TODO
 * confirm.  tag_data[] holds the raw tag words for the 4 ways;
 * CH_ECSTATE_MASK isolates the line-state field.
 */
{
int i, retired_ways;
retired_ways = 0;
for (i = 0; i < PN_CACHE_NWAYS; i++) {
if ((tag_data[i] & CH_ECSTATE_MASK) ==
retired_ways++;
}
return (retired_ways);
}
static cmd_evdisp_t
const char *fltnm)
{
char *payload_namep;
int tag_afar_status;
int i;
if (tag_afar_status == -1) {
"\n%s:cpu_id = %d Invalid afar status in nvlist\n",
return (CMD_EVD_BAD);
}
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
break;
case CMD_PTR_CPU_L3TAG:
break;
default:
return (CMD_EVD_BAD);
}
&recheck_of_tags) != 0)
recheck_of_tags = 0;
if ((recheck_of_tags) || (test_mode))
"\n%s: cpu_id = %d index = %d could not find %s"
" in nvlist\n",
"\n%s: cpu_id = %d Reading tag data through"
" mem_cache driver.\n",
tag_data));
}
for (i = 0; i < PN_CACHE_NWAYS; i++) {
}
return (CMD_EVD_OK);
}
static void
/*
 * Debug helper: extract the ECC field from each of the 4 ways' tag
 * words and print them.
 * NOTE(review): signature, the shift completing the tag_ecc[i]
 * extraction, and the fmd_hdl_debug() call itself are missing from this
 * fragment; only the PN_TAG_ECC_MASK masking and the 4-value format
 * string survive.
 */
{
int i;
for (i = 0; i < PN_CACHE_NWAYS; i++) {
tag_ecc[i] =
((tag_data[i] & PN_TAG_ECC_MASK)
}
"\n%s: cpu_id = %d ecc[0] = 0x%03x ecc[1] = 0x%03x"
" ecc[2] = 0x%03x ecc[3] = 0x%03x\n",
tag_ecc[3]);
}
static int
/*
 * NOTE(review): this appears to be the matching_ecc() helper referenced
 * at L533 (`while (matching_ecc(tag_data) != 0)`): it extracts the ECC
 * field of each way's tag and returns 1 when a mismatch is detected,
 * 0 when all ways' ECCs agree.  The comparison that triggers the
 * `return (1)` is missing from this fragment -- TODO confirm.
 */
{
int i;
for (i = 0; i < PN_CACHE_NWAYS; i++) {
tag_ecc[i] =
((tag_data[i] & PN_TAG_ECC_MASK)
return (1);
}
}
return (0);
}
static void
{
int i;
for (i = 0; i < PN_CACHE_NWAYS; i++) {
tag_ecc[i] =
((tag_data[i] & PN_TAG_ECC_MASK)
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
break;
case CMD_PTR_CPU_L3TAG:
break;
}
}
/*
* We now assemble the 128 bit data swizzling the Physical tags
* and states we obtained for all the 4 ways.
*/
data_for_ecc_gen[0] = 0; /* high order 64 bits */
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
data_for_ecc_gen[1] |=
data_for_ecc_gen[1] |=
data_for_ecc_gen[1] |=
/* bits 63:60 of low order 64 bits are 0s */
/*
* We now start with the high order 64 bits.
* the low 12 bits are 0s
*/
break;
case CMD_PTR_CPU_L3TAG:
bit_position = 0;
/*
* Swizzle state bits for way 1 and way 3
*/
for (i = 0; i < 3; i++) {
data_for_ecc_gen[1] |=
bit_position++;
data_for_ecc_gen[1] |=
bit_position++;
}
/*
* Swizzle physical tag bits for way 1 and way 3
*/
for (i = 0; i < 20; i++) {
data_for_ecc_gen[1] |=
bit_position++;
data_for_ecc_gen[1] |=
bit_position++;
}
/*
* start the high order 64 bits.
*/
bit_position = 0;
/*
* Swizzle state bits for way 0 and way 2
*/
for (i = 0; i < 3; i++) {
data_for_ecc_gen[0] |=
bit_position++;
data_for_ecc_gen[0] |=
bit_position++;
}
/*
* Swizzle physical tag bits for way 0 and way 2
*/
for (i = 0; i < 20; i++) {
data_for_ecc_gen[0] |=
bit_position++;
data_for_ecc_gen[0] |=
bit_position++;
}
break;
}
}
static uint16_t
/*
 * NOTE(review): truncated -- name, parameters, and the computation of
 * tag_synd are missing.  By its uint16_t return and callers' use, this
 * computes the ECC syndrome for the assembled tag data.
 */
{
return (tag_synd);
}
static int16_t
/*
 * Encode a cache bit number as a "sticky" bit value by setting the MSB.
 * Per the comments around L732-L739, the MSB of the FM_FMRI_CPU_CACHE_BIT
 * value distinguishes a SUSPECT_1 retirement (MSB set) from SUSPECT_0
 * (MSB clear).  NOTE(review): the signature is missing; `bit` is the
 * input bit number and `sticky_bit` is presumably a local -- confirm.
 */
{
sticky_bit = bit;
sticky_bit |= MSB_BIT;
return (sticky_bit);
}
static cmd_Lxcache_t *
{
const char *fltnm;
/*
* We first create a new Lxcache and add the event ep
* that is in Lxcache to the new case we create.
* we then destroy the Lxcache that has the event ep in its SERD engine.
*/
if (new_Lxcache == NULL) {
"\n%s:cpu_id %d:Failed to create a Lxcache for"
" index %d way %d bit %d\n",
return (NULL);
}
return (new_Lxcache);
}
int
{
if (reason == CMD_LXSUSPECT_0_TAG) {
/*
* clear MSB bit to retire as SUSPECT_0_TAG
* We need to update the Lxcache asru to reflect
* the change in bit value.
*/
if (errno) {
"\n%s:cpu_id %d: failed to update",
" CACHE_BIT in asru.\n",
return (CMD_EVD_BAD);
}
}
if (reason == CMD_LXCONVICTED)
else
if (reason == CMD_LXSUSPECT_0_TAG)
return (CMD_EVD_BAD);
/*
* Update the persistence storage of
* Lxcache.
*/
"\n%s:cpu_id %d:reason = %s flags = %s\n",
return (CMD_EVD_OK);
}
int
{
/*
* This routine is called only when handling anonymous TAG or DATA
* errors. When we exit this routine we would have destroyed the
* anonymous_Lxcache structure that was passed to us and created
* a new Lxcache if we were successful in determining a way to retire.
*/
if (ways_retired == -1) {
/*
* Couldn't determine how many ways have been retired at this
* index. Destroy the anonymous_Lxcache and return failure.
*/
return (CMD_EVD_BAD);
}
/*
* Before retiring a way check if we have already
* retired 3 ways for this index.
* For TAG errors we will not perform this check because
* we could re-retire cachelines retired for DATA errors.
* The get_lowest_retirable_way() will ensure that we do
* not end up retiring all 4 ways.
*/
if (ways_retired >= 3) {
"\n%s: cpu %d: num of ways retired for index %d"
" is %d will fault the CPU\n",
/*
* destroy the anonymous_Lxcache
*/
return (CMD_EVD_OK);
}
}
/*
* No ways have been retired as "SUSPECT" for this bit.
* We need to retire the lowest unretired way as suspect.
*/
"\n%s: cpu_id %d Checking for the lowest retirable"
" way at index %d\n",
if (lowest_retirable_way != -1) {
"\n%s: cpu_id %d lowest retirable way is %d\n",
if ((new_Lxcache == NULL) ||
return (CMD_EVD_BAD);
}
else
} else {
"\n%s:cpu_id %d we are unable to determine which"
" way is faulty at cache index %d."
" Will retire the CPU.\nRecommended-Action:"
" Service action required\n",
/*
* destroy the anonymous_Lxcache
*/
return (CMD_EVD_OK);
}
}
int
const char *fltnm)
{
/*
* This routine is called only when handling anonymous TAG or DATA
* errors. When we exit this routine we would have destroyed the
* anonymous_Lxcache structure that was passed to us.
*/
"\n%s:cpu_id %d found index %d way %d"
" bit %d retired as %s. Will unretire this now.\n",
/*
* Save the way because we will destroy the
* suspect_Lxcache after we successfully unretire it.
*/
/*
* unretire the retired_way.
*/
== B_TRUE) {
"\n%s:cpu_id %d index %d way %d"
" successfully unretired. Will"
" destroy this Lxcache now.\n",
} else {
/*
* destroy the anonymous_Lxcache
*/
return (CMD_EVD_BAD);
}
/*
* retire the next retirable way
*/
if (next_retirable_way == -1) {
/*
* There is no retirable way that is next to the
* one we just retired. We need to offline the
* CPU since we are unable to determine which
* way is reporting the errors.
*/
"\n%s:cpu_id %d we are unable to determine"
" which way is faulty at cache index %d."
" It is likely that we have a leaky bit"
" that gets corrected.\n Will retire"
" the CPU.\nRecommended-Action: Service"
" action required\n",
/*
* destroy the anonymous_Lxcache
*/
return (CMD_EVD_OK);
} else {
"\n%s:cpu_id %d found way %d at index %d to"
" retire as SUSPECT_0/SUSPECT_DATA\n",
/*
* We need to create a new Lxcache structure.
* The existing Lxcache is for anonymous way.
*/
if ((new_Lxcache == NULL) ||
return (CMD_EVD_BAD);
}
else
}
}
void
{
const char *fltnm;
if (anonymous_Lxcache != NULL) {
"\n%s:cpu_id = %d index = %d We are destroying the"
" anonymous Lxcache now.\n",
/*
* Free the resources allocated to handle
* recheck_of_tags. Delete the Lxcache.
*/
}
}
void
{
const char *class;
/*
* We search thru the entire Lxcache structures to find
* a matching id.
*/
"Could not find Lxcache for timeout_id 0x%x\n", id);
return;
}
"\n%s:anonymous_tag_error_timeout:index = %d\n",
/*
* Set timeout_id to -1 to indicate that we have processed the
* timeout.
*/
switch (Lxcache->Lxcache_type) {
case CMD_PTR_CPU_L2TAG:
class = "ereport.cpu.ultraSPARC-IVplus.thce";
break;
case CMD_PTR_CPU_L3TAG:
break;
default:
"Unexpected pstype 0x%x found in"
" anonymous_tag_error_timeout: index = %d\n",
return;
}
}
{
int way_already_retired = 0;
/*
* We now extract physical tags and states
* and also look for matching ECC on all 4 ways.
*/
if (ret != 0)
return (ret);
while (matching_ecc(tag_data) != 0) {
return (CMD_EVD_BAD);
"\n%s:cpu_id = %d index = %d ECCs don't match.\n"
"Reading tag info again.\n",
}
"\n%s:cpu_id %d: found %d ways retired at the index %d\n",
if (ret != CMD_EVD_OK) {
"ret value = %d for nvlist_lookup of recheck_of_tags\n",
ret);
recheck_of_tags = 0;
}
if (tag_synd == 0) {
/*
* The bit has been corrected by writeback, we will
* first check if we are processing the re-check of tags
* that we scheduled thru the timeout call.
* if so we will exit if we reached the max retries.
* Else we start a timeout and exit.
* We will create a Lxcache structure for this index with way
* as -1 and bit as -1. We will also keep a count of
* attempts we made to check the tag data at this index.
*
*/
way = -1;
bit = -1;
if (recheck_of_tags) {
/*
* We are processing the re-read of tags scheduled by
* timeout. Exit if retry limit has been
* reached. Else start another timeout.
*/
/*
* This shouldn't happen.
*/
"\n%s: cpu_id = %d failed to lookup"
" index = %d way %d bit %d\n",
return (CMD_EVD_BAD);
}
"\n%s: cpu_id = %d index = %d syndrome"
" computed is 0 in attempt #%d.\n",
if (Lxcache->Lxcache_retry_count >=
/*
* We free only the nvl list here.
* anonymous SERD engine will be freed
* when the Lxcache gets destroyed.
* We need the anonymous SERD engine still
* because it has the event ep.
* reset or destroy of SERD engine frees the
* event ep.
*/
}
"\n%s:cpu_id %d Max retry count reached. Giving up.\n",
Lxcache->Lxcache_retry_count = 0;
} else {
(void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR,
NULL,
return (CMD_EVD_OK);
}
}
/*
* Check if we already have a Lxcache structure
* with anonymous way and bit created.
*/
"\n%s:cpu_id %d Failed to create Lxcache"
" for index=%d\n",
return (CMD_EVD_BAD);
}
}
/*
* We have another syndrome = 0 condition while we are
* still in the process of retrying for the previous
* condition.
*/
"\n%s: cpu_id = %d index = %d We have another"
" syndrome = 0 condition while we have already"
" scheduled a timeout. We will ignore this"
" event.\n",
return (CMD_EVD_OK);
}
"\n%s: cpu_id = %d index = %d syndrome computed is 0."
"Looks like the bit got corrected."
" Will check later to see if it is OK.\n",
/*
* We need to store the following arguments passed to
* this function(tag_error_handler) so that we can
* invoke this function from timeout routine.
*
* nvl, ep, clcode
*/
"\n%s:cpu_id %d Failed to duplicate nvl"
" for index=%d\n",
return (CMD_EVD_BAD);
}
FM_EREPORT_RECHECK_OF_TAGS, 1) != 0) {
"\n%s:cpu_id %d Failed to add"
" RECHECK_OF_TAGS in nvl for index=%d\n",
return (CMD_EVD_BAD);
}
}
/*
* We are called with CMP_CPU_LEVEL_CORE masked out
* from cmd_txce(), cmd_l3_thce() routines.
* We need to set CMD_CPU_LEVEL_CORE because we want to handle
* both the cores on the Chip as one single cpu_id.
*/
/*
* we need to preserve the event ep so that it does
* not get destroyed when we return from this call.
* We do that by adding the event ep to the SERD engine.
* The SERD engine we create is different from the one
* we create when we handle the actual event at label
* process_after_finding_way_bit.
*/
"\n%s: cpu_id %d: created a SERD engine"
" %s\n",
}
(void) fmd_serd_record(hdl,
ep);
}
(void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR, NULL,
return (CMD_EVD_OK);
} else {
/*
* tag_synd != 0
* determine way and bit
*/
"\n%s: cpu_id = %d index = %d tag_bit %03d is faulty.\n",
" Unexpected MTAG or Multiple bit error detected\n",
index);
return (CMD_EVD_BAD);
}
/*
* ECC bit is corrupted.
* Need to offline the CPU
*/
way = 0;
"\n%s: cpu_id = %d ECC bit is faulty.\n",
} else {
if (way < 0) {
"\n%s: cpu_id = %d %d bit indicted is a"
" meta bit !!\n",
index);
return (CMD_EVD_BAD);
}
}
} /* end of tag_synd != 0 */
"\n%s:cpu %d: the case for %s is already solved.\n",
return (CMD_EVD_REDUND);
}
"\n%s:cpu %d: Failed to create Lxcache for index %d",
" way %d bit %d\n",
return (CMD_EVD_BAD);
}
return (CMD_EVD_BAD);
}
"\n%s: cpu_id %d: created a SERD engine %s\n",
}
}
"\n%s:cpu_id %d: Checking if the SERD engine %s has fired.\n",
if (way >= 0) {
/*
* Now that we have recorded the event ep we can do the
* necessary cleanup of resources allocated for recheck of tags.
*/
}
return (CMD_EVD_OK);
if (way == -1) {
/*
* The assignment below is to make the code easier to maintain.
* We need to destroy the anonymous_Lxcache after we have
* identified a way to retire. If we cannot determine a way to
* retire we will destroy the anonymous_Lxcache and fault the
* cpu.
*/
/*
* Anonymous TAG way retirement.
* - if a way at this index has already been retired as
* "suspect-1", unretire that way, and retire the next
* unretired way as "suspect-0", using a pattern of all zeros
* for the PA bits.
* - if a way at this index has already been retired as
* "suspect-0", re-retire that way as "suspect-1", using a
* pattern of all ones for the PA bits.
* - if no ways have been retired as "suspect" for this index,
* retire the lowest unretired way as "suspect-0" for this
* bit, using a pattern of all zeros for the PA bits.
* - if there is no next retirable way, fault the CPU.
*/
if (suspect_Lxcache) {
fltnm);
return (ret);
} /* end SUSPECT_1_TAG */
if (suspect_Lxcache) {
"\n%s:cpu_id %d found index %d way %d"
" bit %d retired as SUSPECT_0_TAG. Will"
" re-retire this now as SUSPECT_1_TAG.\n",
/*
* destroy the anonymous_Lxcache
*/
/*
* We need to update the FM_FMRI_CPU_CACHE_BIT entry
* in the Lxcache_asru_nvl. This entry was last updated
* when the cacheline was retired as SUSPECT_0.
* Therefore the MSB of FM_FMRI_CPU_CACHE_BIT entry
* value will be reset. To retire cacheline as
* SUSPECT_1 the MSB has to be set.
*/
if (errno) {
"\n%s:cpu_id %d: failed to update",
" CACHE_BIT in asru.\n",
}
} /* end of SUSPECT_0_TAG */
/*
* No ways have been retired as "SUSPECT_x" for this bit.
* We need to retire the lowest unretired way as suspect.
*/
fltnm);
return (ret);
} /* End of Anonymous TAG retirement */
/*
* Identified bit and way has fired.
* - Destroy any anonymous SERD engine at that index.
* - If the bad bit is an ECC bit, fault the CPU.
* - If the way was already convicted due to tag errors, fault the CPU.
* - If the bad bit is a state bit, then:
* - if the stable value of the bad bit will hold the NA encoding,
* retire the containing way as "convicted".
* - if the stable value of the bad bit will not hold the NA
* encoding, fault the CPU.
*/
if ((bit >= PN_LX_TAG_ECC_START_BIT) &&
(bit <= PN_LX_TAG_ECC_END_BIT)) {
"\n%s:cpu_id %d Bad ECC bit %d at cache index %d way %d"
" detected. Will offline the CPU.\n",
return (CMD_EVD_OK);
}
/*
* Check if a STATE bit is faulty.
* If so we need to ensure that we will be able to
* make the way NA, else fault the CPU.
*/
if (bit <= PN_LX_STATE_END_BIT) {
"%s cpu_id = %d: STATE bit %d is faulty.\n",
/*
* If the stable value of bit will hold the NA encoding
* retire the containing way Else fault the cpu.
*/
/*
* The stable value of the bad bit will not hold the
* NA encoding. will fault the CPU.
*/
"\n%s:cpu_id %d STATE bit %d is faulty at"
" cache index %d way %d. STATE = 0x%x\n"
" The bad bit will not hold the encoding we need"
" to mark the cacheline as retired, so will offline"
" the CPU.\n",
return (CMD_EVD_OK);
}
}
/*
* Check if we are getting fault on a way that is already retired.
* if the way was already convicted due to tag errors, fault the CPU.
* Note that the way could have previously been retired due to
* data errors. This is okay; we just re-retire it due to tag errors,
* so that we can write the offending tag bit to a stable value.
*/
/*
* Looking for CONVICTED TAG fault first.
* If found retire the CPU.
*/
if (retired_Lxcache) {
"\n%s: cpu %d: The cache index %d way %d previously"
" retired for %s fault at bit %d is reporting"
" fault. Will fault the CPU\n",
return (CMD_EVD_OK);
}
way_already_retired = 1;
}
/*
* If any way(Including the current way) at this index is retired as
* "suspect" due to tag errors, unretire it. (If that suspect way
* really was bad, it will start producing errors again and will
* eventually be retired again.)
*/
if (suspect_Lxcache) {
"\n%s:cpu_id %d found index %d way %d"
" bit %d retired as SUSPECT_x. Will"
" unretire this now.\n",
/*
* unretire the suspect_x retired_way.
*/
== B_TRUE) {
"\n%s:cpu_id %d index %d way %d"
" successfully unretired. Will"
" destroy this Lxcache now.\n",
} else {
/*
* We are unable to unretire the previously retired
* SUSPECT way at the fault index.
* If the previously retired way is same as the way
* we are attempting to retire then return failure.
*/
if (suspect_Lxcache->Lxcache_way ==
return (CMD_EVD_BAD);
}
}
if (ways_retired == -1)
return (CMD_EVD_BAD);
/*
* Before retiring a way check if we have already
* retired 3 ways for this index.
* If the way was already retired due to DATA error or
* SUSPECT_X TAG error then we skip the check.
*/
if (!way_already_retired) {
if (ways_retired >= 3) {
"\n%s: cpu %d: num of ways retired for index %d"
" is %d will fault the CPU\n",
return (CMD_EVD_OK);
}
}
"\n%s: cpu %d: num of ways retired for index %d is %d\n",
sticky_bit)) != 0 ||
}
static boolean_t
{
int ec_data_idx, i;
/*
* skip Retired and Invalid ways
*/
if ((state == PN_ECSTATE_NA) ||
(state == CH_ECSTATE_INV))
continue;
/*
* Each 16 bytes of data are protected by 9-bit ECC field.
*/
for (i = 0; i < (CH_ECACHE_SUBBLK_SIZE/16); i++) {
ec_data_idx = (i/2);
if ((i & 1) == 0) {
} else {
}
if ((calc_synd != 0) &&
"\ncomputed syndrome matches with the reported syndrome"
" 0x%x index = %d way = %d\n",
xr->xr_error_way);
} else {
"\ncomputed syndrome matches with"
" the reported syndrome"
" 0x%x index = %d way = %d\n",
way);
}
return (B_TRUE);
}
}
}
return (B_FALSE);
}
/* add to cheetahregs.h */
static int32_t
{
switch (type) {
case CMD_PTR_CPU_L2DATA:
>> PN_CACHE_LINE_SHIFT);
break;
case CMD_PTR_CPU_L3DATA:
>> PN_CACHE_LINE_SHIFT);
break;
}
return (index);
}
/*
* cmd_cache_ce_panther
*
* This routine handles L2 and L3 cachedata errors for the Panther.
* It's called when the train processing for L2 and L3 correctable
* data errors are about to issue a fault.
*
* This routine retrieves payload information gathered during the XR
* processing and generates a unique SERD engine and cache data
* associated with the CPU if one does not exist.
* If the SERD fires for the given engine it will initiate a cache
* line fault if the way is not anonymous.
* If the way is anonymous, it will attempt to choose a way for the
* given index to fault. If the maximum for the index has not been
* reached, it will attempt to unretire a different way previously retired
* under suspicion for the index prior to faulting
* the selected way.
* The routine will also fault the CPU if the maximum number of
* retired ways for the CPU has been exceeded based on the category.
*/
/*ARGSUSED*/
int
{
int ways_retired;
int ret;
/*
* The caller of this routine cmd_xxc_hdlr() expects us to
* return CMD_EVD_OK for success and CMD_EVD_BAD for failures.
* If this is not a Panther or one of the Panther specific
* errors that we handle here, then exit
*/
return (CMD_EVD_BAD);
return (CMD_EVD_BAD);
/* Set up Cache specific structs */
} else {
}
/* Ensure that our case is not solved */
return (CMD_EVD_OK);
/* L3 errors arrive as mem scheme errors - convert to CPU */
if (type == CMD_PTR_CPU_L3DATA) {
}
return (CMD_EVD_BAD);
}
/*
* Data bit. Set bit in the range 0-511
*/
} else {
/*
* ECC bit. Set bit in the range 512-547
*/
}
return (CMD_EVD_BAD);
}
/*
* The payload information for the DATA errors are assembled
* after first looking for a valid line that matches the fault AFAR.
* If no match is found all 4 ways are logged and xr_num_ways
* will be 4. If a matching way is found only that entry is logged
* and xr_num_ways is set as 1.
* The xr_error_way is set as -1 when xr_num_ways is 4, else
* xr_error_way is set to the matching way.
* errors.
* For UCC and EDC errors the xr_error_way will be set correctly.
*/
case CMD_ERRCL_WDC:
case CMD_ERRCL_L3_WDC:
/*
* WDC is a disrupting trap, and invalidates and
* overwrites the problematic way. Any match is due to
* a refetch of the AFAR, which could have been to any
* way. So these are treated as "anonymous".
*/
break;
case CMD_ERRCL_CPC:
case CMD_ERRCL_L3_CPC:
/*
* CPC is a disrupting trap, but since it happens due to
* a snoop, the problematic way could become invalid,
* overwritten by a different cache line, and then the
* AFAR accessed and pulled into a different way,
* causing a false positive match. So it's best to not
* look for a matching way and just ascribe these to
* the "anonymous" way.
*/
break;
case CMD_ERRCL_UCC:
case CMD_ERRCL_L3_UCC:
/*
* UCC is a precise trap, so, absent activity from the
* other core, the tag address values read by the TL=1
* trap handler are likely to be the same as those at
* the time of the trap.
* (A snoop from another CPU might cause a change in
* state from valid to invalid, but the tag address
* won't change.) If we find a matching valid tag,
* that identifies the way.
*/
xr->xr_num_ways);
"\n%s:cpu_id %d: error way = %d\n",
xr->xr_error_way);
break;
case CMD_ERRCL_EDC:
case CMD_ERRCL_L3_EDC:
/*
* EDC is a disrupting trap, but again if a matching
* valid way is found, it is likely to be the correct
* way.
*/
xr->xr_num_ways);
"\n%s:cpu_id %d: error way = %d\n",
xr->xr_error_way);
break;
default:
}
if ((type == CMD_PTR_CPU_L2DATA) &&
}
"\n%s: cpu %d: creating a case for index %d way %d"
" bit %d\n",
"\n%s:cpu_id %d:Failed to create a Lxcache for"
" index %d way %d bit %d\n",
return (CMD_EVD_BAD);
}
}
return (CMD_EVD_BAD);
if (!fmd_serd_exists(hdl,
"\n%s: cpu_id %d: created a SERD engine %s\n",
}
}
/* Ensure that our case is not solved */
"\n%s:cpu %d: the case for %s is already solved.\n",
return (CMD_EVD_REDUND);
}
"\n%s:cpu_id %d: checking if SERD engine %s has fired.\n",
== FMD_B_FALSE)
return (CMD_EVD_OK); /* serd engine hasn't fired yet */
/*
* as suspect. We need this information for both anonymous way and
* identified way handling. We store this info in suspect_Lxcache.
*/
"\n%s:cpu_id %d checking if there is a way at"
" index %d retired as suspect due to bit %d\n",
/*
* IDENTIFIED WAY DATA error handling.
*
* If there is a way at that index retired as suspect due
* to that bit, unretire it.
* retire the identified way, and mark the way as "convicted"
* for this bit. Destroy any anonymous SERD engine named by
* that index and bit.
*/
if (suspect_Lxcache != NULL) {
"\n%s:cpu_id %d found index %d way %d"
" bit %d retired on suspicion. Will"
" unretire this now.\n",
/*
* unretire the retired_way.
*/
}
/*
* We proceed to retire the identified way even if
* we are unable to unretire the suspect way.
* We will not end up retiring all 4 ways because
* we check the actual number of ways retired
* at this index by reading the info from processor
* directly. The call to get_index_retired_ways() does
* that.
*/
}
/*
* Before retiring a way check if we have already
* retired 3 ways for this index.
*/
if (ways_retired == -1) {
"\n%s: cpu %d: We are unable to determine how many"
" ways are retired at this index. We will not be"
" retiring the identified cacheline at index %d"
" way %d\n",
return (CMD_EVD_BAD);
}
if (ways_retired >= 3) {
"\n%s: cpu %d: num of ways retired for index %d"
" is %d. Will fault the CPU\n",
return (CMD_EVD_OK);
}
/*
* retire the cache line
*/
if (ret != CMD_EVD_OK)
return (ret);
/*
* anonymous serd engines for DATA faults will have valid bit
* but way as -1.
*/
bit);
return (CMD_EVD_OK);
} /* end of IDENTIFIED WAY error handling */
/*
* ANONYMOUS WAY DATA error handling.
*
* - if a way at this index has already been retired as "suspect"
* for this bit, unretire that way, and retire the next retirable
* way as "suspect" for this bit.
* - if no ways have been retired as "suspect" for this bit,
* retire the lowest unretired way as "suspect" for this bit.
* - if there is no next retirable way, fault the CPU.
*/
/*
* The assignment below is to make the code easier to maintain.
* We need to destroy the anonymous_Lxcache after we have
* identified a way to retire. If we cannot determine a way to
* retire we will destroy the anonymous_Lxcache and fault the cpu.
*/
if (suspect_Lxcache != NULL) {
} else {
}
return (ret);
}
/* ARGSUSED */
int
{
return (0);
&xr->xr_detector_nvlist) != 0) {
return (-1);
}
"look up for FM_EREPORT_PAYLOAD_NAME_AFSR failed\n");
return (-1);
}
if (CMD_ERRCL_ISL3XXCU(clcode)) {
&xr->xr_num_ways) != 0) {
"look up for FM_EREPORT_PAYLOAD_NAME_L3_WAYS failed\n");
return (-1);
}
&sz) != 0) {
"look up for FM_EREPORT_PAYLOAD_NAME_L3_DATA failed\n");
}
} else {
&xr->xr_num_ways) != 0) {
"look up for FM_EREPORT_PAYLOAD_NAME_L2_WAYS failed\n");
return (-1);
}
&sz) != 0) {
"look up for FM_EREPORT_PAYLOAD_NAME_L2_DATA failed\n");
}
}
"xr_num_ways > PN_CACHE_WAYS\n");
return (-1);
}
if (cache_data == NULL) {
return (0);
}
/*
* Our error handler checks for a matching valid way
* If there is a match, there is only 1 data set, the set
* associated with the cache-line/way that was "valid"
* Otherwise, it stores all of the ways
*/
/* If there is more than 1 way structure, set way to Anonymous */
return (0);
}