/* cmd_Lxcacheerr.c revision 7bebe46c240b554f47faeed19186123896281967 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Support routines for managing per-Lxcache state.
*/
#include <errno.h>
#include <strings.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>
#include <cmd_Lxcache.h>
#include <cmd_mem.h>
#include <cmd_cpu.h>
#include <cmd_state.h>
#include <cmd.h>
#define _KERNEL
#include <sys/cheetahregs.h>
#include <sys/mem_cache.h>
#include <sys/errclassify.h>
#include <fmd_adm.h>
#include <fmd_adm_impl.h>
#include <fmd_rpc_adm.h>
CMD_ERRCL_CPC | CMD_ERRCL_EDC | \
/* Note that these are the same for panther L2 and L3 (see prm) */
#define LX_INDEX_MASK PN_L2_INDEX_MASK
#define LX_INDEX_SHIFT 6
#define PN_ECSTATE_NA 5
#define LX_PA_MASK2_32BIT_CORRECT 16
#define LX_PA_MASK3_32BIT_CORRECT 24
#define LX_PA_MASK2 0x7fffff8
#define LX_PA_MASK3 0x7ffff8
#define MAX_RETRIES_FOR_ECC_MATCH 3
#define PN_TAG_ECC_MASK 0x7fc0
#define PN_L2_PTAG_SHIFT 19
#define PN_L3_PTAG_SHIFT 24
#define L2_PTAG_MASK 0xffffff
#define L3_PTAG_MASK 0xfffff
#define BIT_MASK 0x7f
#define MSB_BIT 0x8000
#define LX_NWAYS 4
/*
 * Debug switch.  When non-zero the handlers appear to take the cache tag
 * data from the event payload instead of reading it through the mem_cache
 * driver (see the test_mode checks in the tag-handling code below) --
 * TODO confirm against the full source.  Must remain 0 in production.
 */
int test_mode = 0; /* should be 0 in production version. */
/*
 * ECC syndrome table.  Entry i holds the 9-bit syndrome produced by a
 * single-bit error in data bit i (0-127); the trailing nine entries are
 * the one-hot syndromes of the check bits C0-C8 themselves.
 * Values transcribed from Table P-4, JPS1 US-III Supplement.
 */
int e[] = {
	/* data bits 0-7 */
	0x03B, 0x127, 0x067, 0x097, 0x10F, 0x08F, 0x04F, 0x02C,
	/* data bits 8-15 */
	0x147, 0x0C7, 0x02F, 0x01C, 0x117, 0x032, 0x08A, 0x04A,
	/* data bits 16-23 */
	0x01F, 0x086, 0x046, 0x026, 0x09B, 0x08C, 0x0C1, 0x0A1,
	/* data bits 24-31 */
	0x01A, 0x016, 0x061, 0x091, 0x052, 0x00E, 0x109, 0x029,
	/* data bits 32-39 */
	0x02A, 0x019, 0x105, 0x085, 0x045, 0x025, 0x015, 0x103,
	/* data bits 40-47 */
	0x031, 0x00D, 0x083, 0x043, 0x051, 0x089, 0x023, 0x007,
	/* data bits 48-55 */
	0x0B9, 0x049, 0x013, 0x0A7, 0x057, 0x00B, 0x07A, 0x187,
	/* data bits 56-63 */
	0x0F8, 0x11B, 0x079, 0x034, 0x178, 0x1D8, 0x05B, 0x04C,
	/* data bits 64-71 */
	0x064, 0x1B4, 0x037, 0x03D, 0x058, 0x13C, 0x1B1, 0x03E,
	/* data bits 72-79 */
	0x1C3, 0x0BC, 0x1A0, 0x1D4, 0x1CA, 0x190, 0x124, 0x13A,
	/* data bits 80-87 */
	0x1C0, 0x188, 0x122, 0x114, 0x184, 0x182, 0x160, 0x118,
	/* data bits 88-95 */
	0x181, 0x150, 0x148, 0x144, 0x142, 0x141, 0x130, 0x0A8,
	/* data bits 96-103 */
	0x128, 0x121, 0x0E0, 0x094, 0x112, 0x10C, 0x0D0, 0x0B0,
	/* data bits 104-111 */
	0x10A, 0x106, 0x062, 0x1B2, 0x0C8, 0x0C4, 0x0C2, 0x1F0,
	/* data bits 112-119 */
	0x0A4, 0x0A2, 0x098, 0x1D1, 0x070, 0x1E8, 0x1C6, 0x1C5,
	/* data bits 120-127 */
	0x068, 0x1E4, 0x1E2, 0x1E1, 0x1D2, 0x1CC, 0x1C9, 0x1B8,
	/* check bits C0-C8: one-hot syndromes */
	0x001, 0x002, 0x004, 0x008, 0x010, 0x020, 0x040, 0x080, 0x100,
};
#define NBITS (sizeof (e)/sizeof (e[0]))
#define NDATABITS (128)
/*
* This table is used to determine which bit(s) is(are) bad when an ECC
* error occurs. The array is indexed by a 9-bit syndrome. The entries
* of this array have the following semantics:
*
* 00-127 The number of the bad bit, when only one bit is bad.
* 128 ECC bit C0 is bad.
* 129 ECC bit C1 is bad.
* 130 ECC bit C2 is bad.
* 131 ECC bit C3 is bad.
* 132 ECC bit C4 is bad.
* 133 ECC bit C5 is bad.
* 134 ECC bit C6 is bad.
* 135 ECC bit C7 is bad.
* 136 ECC bit C8 is bad.
* 137-143 reserved for Mtag Data and ECC.
* 144(M2) Two bits are bad within a nibble.
* 145(M3) Three bits are bad within a nibble.
* 146(M4) Four bits are bad within a nibble.
* 147(M) Multiple bits (5 or more) are bad.
* 148 NO bits are bad.
* Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-4,11-5.
*/
#define C0 128
#define C1 129
#define C2 130
#define C3 131
#define C4 132
#define C5 133
#define C6 134
#define C7 135
#define C8 136
#define MT1 138
#define MT2 139
#define MTC1 141
#define MTC2 142
#define MTC3 143
#define M2 144
#define M3 145
#define M4 146
#define M 147
#define NA 148
#else /* JALAPENO || SERRANO */
#endif /* JALAPENO || SERRANO */
#define BPAR15 167
#endif /* JALAPENO || SERRANO */
static uint8_t ecc_syndrome_tab[] =
{
#else /* JALAPENO || SERRANO */
#endif /* JALAPENO || SERRANO */
#else /* JALAPENO || SERRANO */
#endif /* JALAPENO || SERRANO */
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 0, 0, 0, 0,
/* 2 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 3 */ 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 7 */ 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 19, 20, 21, 22,
/* 2 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 3 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 6 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 7 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3,
/* 2 */ 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3,
/* 5 */ 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
/* 6 */ 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
};
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
/* 1 */ 0, 0, 1, 1, 2, 2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 2 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 5 */ 0, 0, 1, 1, 2, 2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 6 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
};
{
int i;
for (i = 0; i < (NDATABITS/2); i++) {
((clo & 1) ? e[i] : 0);
chi >>= 1;
clo >>= 1;
}
}
{
int i;
for (i = 0; i < (NDATABITS/2); i++) {
((clo & 1) ? e[i] : 0);
chi >>= 1;
clo >>= 1;
}
}
static uint8_t
{
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
return (way_bit);
break;
case CMD_PTR_CPU_L3TAG:
return (way_bit);
break;
}
return (way_bit);
}
static uint8_t
{
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
break;
case CMD_PTR_CPU_L3TAG:
break;
}
return (way);
}
static uint32_t
{
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
>> PN_CACHE_LINE_SHIFT);
break;
case CMD_PTR_CPU_L3TAG:
>> PN_CACHE_LINE_SHIFT);
break;
}
return (index);
}
static cmd_evdisp_t
const char *fltnm)
{
char *payload_namep;
int i;
&tag_afar) != 0) {
"%s:cpu_id = %d could not find AFAR in nvlist\n",
return (CMD_EVD_BAD);
}
&tag_afar_status) != 0) {
"%s: cpu_id = %d index = %d could not find AFAR_STATUS"
" in nvlist\n",
return (CMD_EVD_BAD);
}
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
break;
case CMD_PTR_CPU_L3TAG:
break;
}
if (test_mode) {
tag_data));
} else {
"%s: cpu_id = %d index = %d could not find %s"
" in nvlist\n",
"%s: cpu_id = %d Reading tag data through"
" mem_cache driver.\n",
tag_data));
}
}
for (i = 0; i < PN_CACHE_NWAYS; i++) {
}
return (CMD_EVD_OK);
}
static void
{
int i;
for (i = 0; i < PN_CACHE_NWAYS; i++) {
}
"%s: cpu_id = %d ecc[0] = 0x%03x, ecc[1] = 0x%03x, ecc[2] = 0x%03x,"
" ecc[3] = 0x%03x\n",
tag_ecc[3]);
}
static int
{
int i;
for (i = 0; i < PN_CACHE_NWAYS; i++) {
return (1);
}
}
return (0);
}
static void
{
int i;
for (i = 0; i < PN_CACHE_NWAYS; i++) {
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
break;
case CMD_PTR_CPU_L3TAG:
break;
}
}
/*
* We now assemble the 128 bit data swizzling the Physical tags
* and states we obtained for all the 4 ways.
*/
data_for_ecc_gen[0] = 0; /* high order 64 bits */
switch (pstype) {
case CMD_PTR_CPU_L2TAG:
data_for_ecc_gen[1] |=
data_for_ecc_gen[1] |=
data_for_ecc_gen[1] |=
/* bits 63:60 of low order 64 bits are 0s */
/*
* We now start with the high order 64 bits.
* the low 12 bits are 0s
*/
break;
case CMD_PTR_CPU_L3TAG:
bit_position = 0;
/*
* Swizzle state bits for way 1 and way 3
*/
for (i = 0; i < 3; i++) {
data_for_ecc_gen[1] |=
bit_position++;
data_for_ecc_gen[1] |=
bit_position++;
}
/*
* Swizzle physical tag bits for way 1 and way 3
*/
for (i = 0; i < 20; i++) {
data_for_ecc_gen[1] |=
bit_position++;
data_for_ecc_gen[1] |=
bit_position++;
}
/*
* start the high order 64 bits.
*/
bit_position = 0;
/*
* Swizzle state bits for way 0 and way 2
*/
for (i = 0; i < 3; i++) {
data_for_ecc_gen[0] |=
bit_position++;
data_for_ecc_gen[0] |=
bit_position++;
}
/*
* Swizzle physical tag bits for way 0 and way 2
*/
for (i = 0; i < 20; i++) {
data_for_ecc_gen[0] |=
bit_position++;
data_for_ecc_gen[0] |=
bit_position++;
}
break;
}
}
static uint16_t
{
return (tag_synd);
}
static uint16_t
{
return (sticky_bit);
}
return (bit);
}
const char *fltnm)
{
const char *uuid;
int ways_retired, ret;
#if defined(lint)
#endif
/*
* We now extract physical tags and states
* and also look for matching ECC on all 4 ways.
*/
if (ret != 0)
return (ret);
while (matching_ecc(tag_data) != 0) {
return (CMD_EVD_BAD);
"%s:cpu_id = %d index = %d ECCs don't match.\n"
"Reading tag info again.\n",
continue;
}
if (tag_synd == 0) {
/*
* The bit has been corrected by writeback, we will
* check this later to see if the bit becomes sticky again
*/
"%s: cpu_id = %d index = %d syndrome computed is 0."
"Looks like the bit got corrected."
" Will check later to see if it is OK.\n",
return (CMD_EVD_OK);
}
"%s: cpu_id = %d index = %d tag_bit %03d is faulty.\n",
" Unexpected MTAG or Multiple bit error detected\n",
return (CMD_EVD_BAD);
}
/*
* ECC bit is corrupted.
* Need to offline the CPU
*/
way = 0;
} else {
"%s: cpu_id = %d %d bit indicted is a meta bit !!\n",
return (CMD_EVD_BAD);
}
}
"%s:cpu %d: the case for %s is already solved.\n",
return (CMD_EVD_REDUND);
}
&uuid);
"%s:cpu_id %d:created a case for index %d way %d bit %d\n",
}
"%s: cpu_id %d: created a SERD engine %s\n",
}
"%s:cpu_id %d: Checking if the SERD engine %s has fired.\n",
return (CMD_EVD_OK); /* engine hasn't fired */
return (CMD_EVD_OK);
}
/*
* Check if a STATE bit is faulty.
* If so we need to ensure that we will be able to
* make the way NA, else fault the CPU.
*/
if (bit <= 2) {
"%s cpu_id = %d: STATE bit %d is faulty.\n",
/*
* If the stable value of bit will hold the NA encoding
* retire the containing way Else fault the cpu.
*/
cpu_fault = 0;
if (bit == 1) {
/*
* The stable value should be 0.
*/
cpu_fault = 1;
} else {
/*
* The stable value should be 1.
*/
cpu_fault = 1;
}
"%s cpu_id = %d: STATE bit %d is faulty."
"cpu_fault = %d STATE = 0x%x\n",
if (cpu_fault) {
return (CMD_EVD_OK);
}
}
/*
* Before retiring a way check if we have already
* retired 3 ways for this index.
*/
if (ways_retired == -1)
return (CMD_EVD_BAD);
if (ways_retired >= 3) {
"%s: cpu %d: num of ways retired for index %d is %d"
" will fault the CPU\n",
return (CMD_EVD_OK);
}
/*
* Check if we are getting a fault on a way that is already retired.
* If so we need to retire the CPU.
*/
"%s: cpu %d: An already retired index %d way %d is"
" reporting fault. will fault the CPU\n",
return (CMD_EVD_OK);
}
"%s: cpu %d: num of ways retired for index %d is %d\n",
sticky_bit)) != 0 ||
return (CMD_EVD_OK);
}
static int
{
int ret_val = 0; /* 0 = failure */
if (type == CMD_PTR_CPU_L2DATA) {
ret_val = 1;
}
} else {
ret_val = 1;
}
}
if ((ec_tag == CH_ECSTATE_INV) ||
(ec_tag == CH_ECSTATE_OWN) ||
(ec_tag == CH_ECSTATE_MOD)) {
ret_val = 0;
ec_tag);
}
return (ret_val);
}
{
int err = 0;
if (err)
return (-1);
"cpu:///%s=%u/%s=%s/%s=%u/%s=%u/%s=%u/%s=%d",
}
int
{
int err;
FMD_ADM_VERSION)) == NULL) {
return (-1);
}
err = -1;
goto out;
}
if (err)
err = -1;
out:
return (err);
}
/* Find the lowest way SERD engine not faulted for the given index */
{
*other_cache = NULL;
*other_cache = cache;
break;
}
}
if (pstype == CMD_PTR_CPU_L2DATA) {
} else {
}
/* return this way if larger */
*other_cache = cache;
return (way);
} else {
return (way1);
}
}
}
}
/*
* Find the lowest way SERD engine faulted but not convicted for the
* given index
*/
{
*other_cache = NULL;
*other_cache = cache;
break;
}
}
if (pstype == CMD_PTR_CPU_L2DATA) {
} else {
}
/* Return the smaller of the two */
*other_cache = cache;
return (way);
} else {
return (way1);
}
}
}
return ((uint32_t)-1);
}
/* Count the number of ways convicted for a given index */
{
way_count++;
}
}
return (way_count);
}
/*
* cmd_cache_ce_panther
*
* This routine handles L2 and L3 cachedata errors for the Panther.
* It's called when the train processing for L2 and L3 correctable
* data errors are about to issue a fault.
*
* This routine retrieves payload information gathered during the XR
* processing and generates a unique SERD engine and cache data
* associated with the CPU if one does not exist.
* If the SERD fires for the given engine it will initiate a cache
* line fault if the way is not anonymous.
* If the way is anonymous, it will attempt to choose a way for the
* given index to fault. If the maximum for the index has not been
* reached, it will attempt to unretire a different way previously retired
* under suspicion for the index prior to faulting
* the selected way.
* The routine will also fault the CPU if the maximum number of
* retired ways for the CPU has been exceeded based on the category.
*/
/*ARGSUSED*/
int
{
struct ch_ec_data *data_ptr;
const char *uuid;
/*
* If this is not a Panther or one of the Panther specific
* errors that we handle here, then exit
*/
return ((int)~CPU_ULTRASPARC_IVplus);
return (-1);
} else
/* Account for trw deref if necessary */
/* Set up Cache specific structs */
} else {
}
/* Ensure that our case is not solved */
return (0);
/* L3 errors arrive as mem scheme errors - convert to CPU */
if (type == CMD_PTR_CPU_L3DATA) {
}
/* Check for valid syndrome */
return (0);
}
/* Retrieve pointer to our payload data */
offset = 0;
for (i = 0; i < 4; i++) {
if ((i < 3) &&
break;
} else {
offset = (7 - i);
break;
}
}
if ((i < 3) &&
break;
} else {
offset = (3 - i);
break;
}
}
}
else {
}
xr->xr_error_tag);
xr->xr_error_index);
xr->xr_error_way);
/* First, register cache error */
"%s: cpu %d: creating a case for index %d way %d"
" bit %x\n",
if (!fmd_serd_exists(hdl,
}
} else {
"retired -- ignoring\n");
return (0);
}
}
/* Ensure that our case is not solved */
return (0);
" fired.\n",
ep) == FMD_B_FALSE)
return (0); /* serd engine hasn't fired yet */
&rsrc_nvl, 0) != 0) {
}
/* This cache line's SERD Engine has fired. Prepare to convict it */
/* Get the number of ways convicted for this index */
/* Tally appropriate ways convicted due to tag faults */
if (type == CMD_PTR_CPU_L2DATA) {
} else {
}
/* If there are none left fault the CPU */
if (way_count == 4) {
"Already 3 ways are retired for this line."
"Retiring the CPU because all ways are faulty.\n");
return (CMD_EVD_OK);
} else {
/* Find the lowest way not faulted */
"setting to 0\n");
new_way = 0;
}
/*
* If a previous case for this way exists,
* destroy it as we are replacing it with the
* triggered Anonymous Way SERD case
*/
if (other_cache != NULL) {
}
/* Change the way for this case */
"from %d to %d\n", uuid,
/* Remove the ANON way SERD Engine */
/* Now replace it with the new_way SERD */
/*
* If we have retired a previous way in this
* cache line under suspicion, unretire it
*/
&repair_nvl, 0) != 0) {
"duplicate resource FMRI for "
"repair");
}
/* Repair the cache line */
FM_FMRI_CPU_CACHE_WAY, unretire_way) == 0)
if (cmd_Lx_repair_rsrc(hdl,
repair_nvl)) {
" to repair index %d"
" way %d\n",
}
else
"way to nvl to repair resource",
"with way %d\n", unretire_way);
}
/* Indicate our reason for retiring */
}
}
/*
* if this SERD engine specifies a way, then destroy any
*/
}
return (0);
}
/* ARGSUSED */
int
{
struct ch_ec_data *data_ptr;
int i;
return (0);
&xr->xr_detector_nvlist) != 0)
return (-1);
return (-1);
if (CMD_ERRCL_ISL3XXCU(clcode)) {
&xr->xr_num_ways) != 0)
return (-1);
&sz) != 0)
return (-1);
} else {
&xr->xr_num_ways) != 0)
return (-1);
&sz) != 0)
return (-1);
}
return (-1);
for (i = 0; i < xr->xr_num_ways; i++) {
xr->xr_error_index =
}
/* If there is more than 1 way structure, set way to Anonymous */
return (0);
}