cmd_memerr.c revision f176577eacb9d0c0c87cbb13b1c92417a58a8890
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Ereport-handling routines for memory errors
*/
#include <cmd_mem.h>
#include <cmd_dimm.h>
#include <cmd_bank.h>
#include <cmd_page.h>
#include <cmd_cpu.h>
#ifdef sun4u
#include <cmd_dp.h>
#include <cmd_dp_page.h>
#endif
#include <cmd.h>
#include <strings.h>
#include <string.h>
#include <errno.h>
#include <sys/errclassify.h>
/*
* One entry of a table that maps a CE disposition description string to a
* disposition code.
*
* NOTE(review): the tables built from this struct in this file initialize
* each entry with two values (a CE_DISP_DESC_* string and a CE_DISP_* code),
* but only `name` is declared here. A second member appears to be missing
* from this view -- confirm against the complete source.
*/
struct ce_name2type {
const char *name;	/* description string; NULL marks the end of a table */
};
/*
* NOTE(review): the function header for this body is not visible in this
* view, and neither is the loop that would walk the tables. What is visible:
* two static lookup tables, each terminated by a { NULL } sentinel, and a
* final fallback that returns CE_DISP_UNKNOWN.
*/
{
/* Table named `old`; only its { NULL } terminator is visible here. */
static const struct ce_name2type old[] = {
{ NULL }
};
/* Pairs each CE_DISP_DESC_* description with a CE_DISP_* value. */
static const struct ce_name2type new[] = {
{ CE_DISP_DESC_P, CE_DISP_PERS },
{ CE_DISP_DESC_L, CE_DISP_LEAKY },
{ CE_DISP_DESC_S, CE_DISP_STICKY },
{ NULL }
};
/* Table cursor; its use is not visible in this view -- TODO confirm. */
const struct ce_name2type *tp;
/* Fallback result when nothing above returned. */
return (CE_DISP_UNKNOWN);
}
/*
* NOTE(review): the name, parameter list and several statements of this
* file-local function are missing from this view; the conditions guarding
* most of the `return`/`continue` statements below are not visible.
*
* From the visible lines: it returns early when a complaint has already
* been made about the DIMM or a system-memory percentage has not been
* exceeded, accumulates dimm_nretired counts into `dret` while walking a
* DIMM list, and, when `foundrw` is set, marks DIMMs CMD_MEM_F_FAULTING
* in connection with a "fault.memory.datapath" fault.
*/
static void
{
fmd_case_t *cp;
cmd_dimm_t *d;
int foundrw;
/* We've already complained about this DIMM */
return;
}
return; /* Don't warn until over specified % of system memory */
/* Look for CEs on DIMMs in other banks */
d != NULL; d = cmd_list_next(d)) {
/* The DIMM under examination adds its own retirements to the total. */
if (d == dimm) {
dret += d->dimm_nretired;
continue;
}
continue;
/* Another DIMM qualified; remember that and count its retirements too. */
foundrw = 1;
dret += d->dimm_nretired;
}
}
if (foundrw) {
/*
* Found a DIMM in another bank with a significant number of
* retirements. Something strange is going on, perhaps in the
* datapath or with a bad CPU. A real person will need to
* figure out what's really happening. Emit a fault designed
* to trigger just that.
*/
d = cmd_list_next(d)) {
continue;
continue;
/* Set the faulting flag only once per DIMM, then call cmd_dimm_dirty(). */
if (!(d->dimm_flags & CMD_MEM_F_FAULTING)) {
d->dimm_flags |= CMD_MEM_F_FAULTING;
cmd_dimm_dirty(hdl, d);
}
"fault.memory.datapath",
}
return;
}
}
/*
* Create a fresh index block for MQSC CE correlation.
*
* NOTE(review): the function name, its parameters and the statements that
* allocate and fill in `cp` are not visible in this view; only the return
* of the cmd_mq_t pointer `cp` is.
*/
cmd_mq_t *
{
return (cp);
}
/*
* Add an index block for a new CE, sorted
* a) by ascending unit position
* b) order of arrival (~= time order)
*
* NOTE(review): the function name, parameters and the list-walking code
* are not visible in this view. The visible lines show an early return for
* events that are not CEs and a branch that handles an existing node with
* the same cw, unit_position and afar.
*/
void
{
int cw, unit_position;
return; /* not a CE */
/*
* Found a duplicate cw, unit_position, and afar.
* Delete this node, to be superseded by the new
* node added below.
*/
}
else
}
/*
* Prune the MQSC index lists (one for each checkword), by deleting
* outdated index blocks from each list.
*
* NOTE(review): the function name, parameters and loop bodies are not
* visible in this view. The surviving closing-brace comments indicate an
* outer loop over cw = 0...3, an inner per-checkword walk, and a test of
* the form "tstamp < now - ce_t" for outdated entries.
*/
void
{
int cw;
} /* tstamp < now - ce_t */
} /* per checkword */
} /* cw = 0...3 */
}
/*
* Check the MQSC index lists (one for each checkword) by making a
* complete pass through each list, checking if the criteria for either
* Rule 4A or 4B have been met. Rule 4A checking is done for each checkword;
* 4B check is done at end.
*
* Rule 4A: fault a DIMM "whenever Solaris reports two or more CEs from
* two or more different physical addresses on each of two or more different
* bit positions from the same DIMM within 72 hours of each other, and all
* the addresses are in the same relative checkword (that is, the AFARs
* are all the same modulo 64). [Note: This means at least 4 CEs; two
* from one bit position, with unique addresses, and two from another,
* also with unique addresses, and the lower 6 bits of all the addresses
* are the same.]"
*
* Rule 4B: fault a DIMM "whenever Solaris reports two or more CEs from
* two or more different physical addresses on each of three or more
* different outputs from the same DRAM within 72 hours of each other, as
* long as the three outputs do not all correspond to the same relative
* bit position in their respective checkwords. [Note: This means at least
* 6 CEs; two from one DRAM output signal, with unique addresses, two from
* another output from the same DRAM, also with unique addresses, and two
* more from yet another output from the same DRAM, again with unique
* addresses, as long as the three outputs do not all correspond to the
* same relative bit position in their respective checkwords.]"
*/
/*
* NOTE(review): the function name, parameters, the declarations of
* upos_array/upos_pairs/curr_upos/i/j/k and the list-walking statements are
* not visible in this view. The notes below describe only the surviving
* lines.
*/
void
{
/* A recorded pair: a unit position together with a DRAM identifier. */
typedef struct upos_pair {
int upos;
int dram;
} upos_pair_t;
upos_pairs = 0;
/* i starts at the number of pairs already recorded and advances from there. */
i = upos_pairs;
curr_upos = -1;
else if (i > upos_pairs &&
continue; /* skip triples, quads, etc. */
/* we have a pair */
}
}
/*
* At least two new pairs were recorded in this pass (i moved two or more
* past upos_pairs). Presumably this is the Rule 4A case -- TODO confirm.
*/
if (i - upos_pairs >= 2) {
for (j = upos_pairs; j < i; j++) {
}
return;
}
upos_pairs = i;
}
if (upos_pairs < 3)
return; /* 4B violation needs at least 3 pairs */
/* Examine every combination of three recorded pairs with i < j < k. */
for (i = 0; i < upos_pairs; i++) {
for (j = i+1; j < upos_pairs; j++) {
continue;
for (k = j+1; k < upos_pairs; k++) {
continue;
/* True unless all three entries carry the same upos value. */
if ((upos_array[i].upos !=
upos_array[j].upos) ||
(upos_array[j].upos !=
upos_array[k].upos)) {
dimm, "fault.memory.dimm",
return;
}
}
}
}
}
/*
* NOTE(review): the function header and most of the statements of this
* correctable-error handler are missing from this view, so the braces below
* do not balance. The visible lines validate afar_status, dispatch on the CE
* disposition `type` twice, and on the final path increment
* dimm->dimm_nretired and return CMD_EVD_OK.
*/
/*ARGSUSED*/
{
const char *uuid;
if (afar_status != AFLT_STAT_VALID ||
return (CMD_EVD_UNUSED);
return (CMD_EVD_REDUND);
#ifdef sun4u
return (CMD_EVD_UNUSED);
}
#endif /* sun4u */
/*
* NOTE(review): this return yields NULL while every other visible return
* in this fragment yields a CMD_EVD_* value -- confirm the intended result.
*/
return (NULL);
}
return (CMD_EVD_UNUSED);
}
/*
* Add to MQSC correlation lists all CEs which pass validity
* checks above.
*/
}
}
/* First dispatch on disposition: decide whether the event is used at all. */
switch (type) {
case CE_DISP_UNKNOWN:
return (CMD_EVD_UNUSED);
case CE_DISP_INTERMITTENT:
return (CMD_EVD_UNUSED);
case CE_DISP_POSS_PERS:
break;
case CE_DISP_PERS:
break;
case CE_DISP_LEAKY:
break;
case CE_DISP_POSS_STICKY:
{
if (CE_XDIAG_TESTVALID(ptnrinfo)) {
/* Should have been CE_DISP_STICKY */
return (CMD_EVD_BAD);
} else if (ce1) {
/* Partner could see and could fix CE */
} else {
/* Partner could not see ce1 (ignore ce2) */
}
} else {
}
return (CMD_EVD_UNUSED);
}
case CE_DISP_STICKY:
break;
default:
return (CMD_EVD_BAD);
}
}
/* Second dispatch on disposition: decide whether to go on to retirement. */
switch (type) {
case CE_DISP_POSS_PERS:
case CE_DISP_PERS:
}
return (CMD_EVD_OK); /* engine hasn't fired */
break; /* to retire */
case CE_DISP_LEAKY:
case CE_DISP_STICKY:
break; /* to retire */
}
/* Count one more retirement against this DIMM. */
dimm->dimm_nretired++;
return (CMD_EVD_OK);
}
/*
* Solve a bank case with suspect "fault.memory.bank". The caller must
* have populated bank->bank_case.cc_cp and is also responsible for adding
* associated ereport(s) to that case.
*
* NOTE(review): the function name, parameters, the whole sun4u branch and
* many sun4v statements are not visible in this view.
*/
void
{
return; /* Only complain once per bank */
#ifdef sun4u
#else /* sun4v */
{
/*
* Break up the bank's unum into separate unums for each dimm.
* Create an asru from each unum.
*/
cmd_bank_memb_t *d;
char dimm_unum_string[MAXPATHLEN];
const char *q, *r;
/*
* This method of breaking apart the bank unum works for
* sun4v bank unums, until such time as a dimm enumerator
* is written for libtopo.
*/
/*
* q rests on a space character at the top of each pass. The piece handed
* on below starts at q+1 and has length r-q-1, so r presumably marks the
* following delimiter -- TODO confirm how r is computed.
*/
while (*q == ' ') {
if (r == NULL)
q+1, r-q-1);
"failed to expand dimm FMRI from "
"previously validated bank\n");
}
/*
* If dimm structure doesn't already exist for
* each dimm, create and link to bank.
*/
/* Continue from the position r found in this pass. */
q = r;
}
/* create separate fault for each dimm in bank */
d != NULL; d = cmd_list_next(d)) {
"fault.memory.bank", CMD_FLTMAXCONF);
}
}
#endif /* sun4u */
}
/*
* NOTE(review): the function header and most of the statements of this
* uncorrectable-error handler are missing from this view, so the braces
* below do not balance. The visible lines check afar_status and
* synd_status, have a sun4u-only section, and on the final path increment
* bank->bank_nretired and return CMD_EVD_OK.
*/
/*ARGSUSED*/
{
#ifdef sun4u
/*
* Note: Currently all sun4u processors using this code share
* L2 and L3 cache at CMD_CPU_LEVEL_CORE.
*/
#else /* sun4v */
#endif /* sun4u */
return (CMD_EVD_UNUSED);
}
/*
* The following code applies only to sun4u, because sun4u does
* not poison data in L2 cache resulting from the fetch of a
* memory UE.
*/
#ifdef sun4u
if (afar_status != AFLT_STAT_VALID) {
/*
* Had this report's AFAR been valid, it would have
* contributed an address to the UE cache. We don't
* know what the AFAR would have been, and thus we can't
* add anything to the cache. If a xxU is caused by
* this UE, we won't be able to detect it, and will thus
* erroneously offline the CPU. To prevent this
* situation, we need to assume that all xxUs generated
* through the next E$ flush are attributable to the UE.
*/
} else {
}
#endif /* sun4u */
/* An invalid syndrome status means the event is not used. */
if (synd_status != AFLT_STAT_VALID) {
return (CMD_EVD_UNUSED);
}
cpu) == CMD_EVD_UNUSED)
return (CMD_EVD_UNUSED);
/* An invalid AFAR status likewise means the event is not used. */
if (afar_status != AFLT_STAT_VALID)
return (CMD_EVD_UNUSED);
return (CMD_EVD_REDUND);
/*
* NOTE(review): this return yields NULL while every other visible return
* in this fragment yields a CMD_EVD_* value -- confirm the intended result.
*/
return (NULL);
}
return (CMD_EVD_UNUSED);
/*
* NOTE(review): a declaration directly after a return looks like damage
* from missing lines in this view rather than intended code.
*/
const char *uuid;
}
#ifdef sun4u
if (cmd_dp_error(hdl)) {
return (CMD_EVD_OK);
return (CMD_EVD_UNUSED);
}
#endif /* sun4u */
/* Count one more retirement against this bank. */
bank->bank_nretired++;
return (CMD_EVD_OK);
}
/*
* NOTE(review): the name, parameter list and body of this void function
* are not visible in this view; nothing about its behavior can be stated
* from here.
*/
void
{
}
/*
* NOTE(review): the name, parameter list and body of this void function
* are not visible in this view; nothing about its behavior can be stated
* from here.
*/
void
{
}