gmem_memerr.c revision aab83bb83be7342f6cfccaed8d5fe0b2f404855d
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* Ereport-handling routines for memory errors
*/
#include <gmem_mem.h>
#include <gmem_dimm.h>
#include <gmem_page.h>
#include <gmem.h>
#include <strings.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <sys/errclassify.h>
/*
* Physical-address (PA) bit masks. The field names match the PA bit
* ranges used by the unhashed-to-hashed address formula documented later
* in this file; presumably that conversion is their consumer -- confirm.
*/
/* every bit set except PA[17:11] */
#define OFFBIT 0xFFFFFFFFFFFC07FFULL
/* PA[32:28] */
#define BIT28_32 0x00000001F0000000ULL
/* PA[17:13] */
#define BIT13_17 0x000000000003E000ULL
/* PA[19:18] */
#define BIT18_19 0x00000000000C0000ULL
/* PA[12:11] */
#define BIT11_12 0x0000000000001800ULL
/*
 * One row of the CE classification table: a classification name and the
 * disposition it maps to.
 */
struct ce_name2type {
	const char *name;	/* classification name, e.g. "mem-cs" */
	ce_dispact_t type;	/* disposition reported for that name */
};

/*
 * Translate a CE classification name into its ce_dispact_t disposition.
 *
 * name - NUL-terminated classification name to look up; must not be NULL.
 *
 * Returns the disposition paired with the first table entry whose name
 * matches (compared without regard to case), or CE_DISP_UNKNOWN when no
 * entry matches.
 */
static ce_dispact_t
gmem_mem_name2type(const char *name)
{
	static const struct ce_name2type new[] = {
		{ "mem-unk",	CE_DISP_UNKNOWN },
		{ "mem-is",	CE_DISP_INTERMITTENT },
		{ "mem-cs",	CE_DISP_PERS },
		{ "mem-ss",	CE_DISP_STICKY },
		{ NULL }	/* sentinel: terminates the scan below */
	};
	const struct ce_name2type *tp;

	for (tp = new; tp->name != NULL; tp++) {
		if (strcasecmp(name, tp->name) == 0)
			return (tp->type);
	}

	return (CE_DISP_UNKNOWN);
}
/*
* Returns TOPO_WALK_NEXT on every early-exit path visible here and
* TOPO_WALK_TERMINATE at the end; presumably a topology-walk callback.
* NOTE(review): the function name, parameter list and most of the body
* are not visible in this view -- confirm against the complete source.
*/
/*ARGSUSED*/
static int
{
int err, i;
return (TOPO_WALK_NEXT);
if (err != 0) {
/* a non-zero err takes the TOPO_WALK_NEXT exit */
return (TOPO_WALK_NEXT);
}
return (TOPO_WALK_NEXT);
}
return (TOPO_WALK_NEXT);
}
/* iterate over n1 entries; the per-entry work is not visible here */
for (i = 0; i < n1; i++) {
&name1);
return (TOPO_WALK_NEXT);
}
}
}
return (TOPO_WALK_TERMINATE);
}
/*
* Returns fru_nvl on the final path and NULL on the two earlier paths
* visible here.
* NOTE(review): the function name, parameter list and the statements that
* produce fru_nvl are not visible in this view.
*/
nvlist_t *
int err;
return (NULL);
return (NULL);
}
return (fru_nvl);
}
/*
* Fault the FRU of the common detector between two DIMMs.
*
* NOTE(review): the function name, parameter list and the declaration of
* hcl[] are not visible in this view. What can be seen is that hcl[0..n-1]
* entries are allocated with nvlist_alloc() and released with nvlist_free()
* on each of the three exit paths shown below.
*/
void
{
uint_t n;
int i, j;
fmd_case_t *cp;
return;
for (i = 0; i < n; i++) {
break;
}
/* keep the entries up to and including index i */
n = i + 1;
return;
/* allocate one nvlist per retained entry; the result is ignored */
for (i = 0; i < n; i++) {
(void) nvlist_alloc(&hcl[i],
}
for (i = 0, j = 0; i < n; i++) {
j++;
break;
}
/* release every hcl[] entry on this exit path */
for (i = 0; i < n; i++) {
nvlist_free(hcl[i]);
}
}
/* release every hcl[] entry on this exit path */
for (i = 0; i < n; i++) {
nvlist_free(hcl[i]);
}
}
}
/* final release of every hcl[] entry */
for (i = 0; i < n; i++) {
nvlist_free(hcl[i]);
}
}
/*
* Formula to convert an unhashed address to a hashed address:
* PA[17:11] = (PA[32:28] xor PA[17:13]) :: (PA[19:18] xor PA[12:11])
* ("::" presumably denotes concatenation: the 5-bit and 2-bit results
* together fill the 7-bit field PA[17:11] -- confirm.)
*
* NOTE(review): the function name, parameter list and body are not
* visible in this view.
*/
static void
{
}
/*
* Check if a DIMM has n CEs that have the same symbol-in-error.
*
* Returns 1 from inside the scan and 0 if the scan completes.
* NOTE(review): the function name, parameter list and the conditions
* guarding count++ and the early return are not visible in this view.
*/
int
{
int i;
int count = 0;
/* one pass per index up to GMEM_MAX_CKWDS */
for (i = 0; i < GMEM_MAX_CKWDS; i++) {
count++;
return (1);
}
}
}
return (0);
}
/*
* Check if the smaller number of retired pages is greater than 1/16 of
* the larger number of retired pages.
*
* Returns 1 or 0.
* NOTE(review): the function name, parameter list and the computation of
* ratio are not visible in this view, so which condition selects each
* return value cannot be confirmed from here.
*/
int
{
double ratio;
} else
return (0);
return (1);
}
return (0);
}
/*
* Check for bad rw on any two DIMMs. The check succeeds if
* - each DIMM has n CEs which have the same symbol-in-error, and
* - the smaller number of retired pages > 1/16 of the larger number of
*   retired pages.
*
* Returns 1 from inside the scan and 0 if the scan completes.
* NOTE(review): the function name, parameter list and the conditions
* inside the loop are not visible in this view.
*/
static int
{
int i;
/* one pass per index up to GMEM_MAX_CKWDS */
for (i = 0; i < GMEM_MAX_CKWDS; i++) {
return (1);
}
}
}
}
}
return (0);
}
/*
* Walks a list of DIMMs (advancing with gmem_list_next()), skipping
* ce_dimm itself, and returns after the "check_bad_rw_dimms succeeded"
* message path.
* NOTE(review): the function name, parameter list, loop header and the
* second skip condition are not visible in this view.
*/
static void
{
gmem_dimm_t *d, *next;
/* fetch the successor of d before d is examined */
next = gmem_list_next(d);
/* never compare the DIMM against itself */
if (d == ce_dimm)
continue;
continue;
"check_bad_rw_dimms succeeded: %s %s\n",
return;
}
}
}
/*
* Rule 5a checking. The check succeeds if
* - nretired >= 512, or
* - nretired >= 128 and (addr_hi - addr_low) / (nretired - 1) > 512KB
* (NOTE(review): the two conditions read as alternatives; confirm.)
*
* NOTE(review): the function name, parameter list and the comparisons
* themselves are not visible in this view.
*/
static void
{
fmd_case_t *cp;
uint64_t delta_addr = 0;
return;
return;
/* tail of an expression divided by (nret - 1); its head is not visible */
(nret - 1);
}
}
/*
* Rule 5b checking. The check succeeds if more than 120
* non-intermittent CEs are reported against one symbol
* position of one afar in 72 hours.
*
* On success the "mq_5b_check succeeded" message reports
* ip->mq_dupce_count and the function returns.
* NOTE(review): the function name (presumably mq_5b_check, going by the
* message text), parameter list and loop structure are not visible in
* this view.
*/
static void
{
fmd_case_t *cp;
int cw;
"mq_5b_check succeeded: duplicate CE=%d",
ip->mq_dupce_count);
dimm->dimm_serial);
return;
}
}
}
}
/*
* Delete the expired duplicate CE time stamps.
*
* NOTE(review): the function name, parameter list and the expiry test
* are not visible in this view.
*/
static void
{
/* one fewer duplicate CE is tracked for this index block */
ip->mq_dupce_count--;
}
}
}
/*
* Increments the duplicate-CE counter of the index block ip.
* NOTE(review): the function name, parameter list and any other
* statements of the body are not visible in this view.
*/
static void
{
ip->mq_dupce_count++;
}
/*
* Create a fresh index block for MQSC CE correlation.
*
* NOTE(review): two bodies appear below, one returning cp and one
* returning jp; their names, return types, parameter lists and nearly
* all statements are not visible in this view.
*/
{
/*
* Create SERD to keep this event from being removed
* by fmd which may not know there is an event pointer
* saved here. This SERD is *never* meant to fire.
*/
return (cp);
}
{
}
}
return (jp);
}
/*
* Add an index block for a new CE, sorted
* a) by ascending unit position
* b) by order of arrival (~= time order)
*
* NOTE(review): the function name, parameter list, loop header and the
* insertion statements are not visible in this view.
*/
void
{
/* list is in unit position order */
break;
/*
* Found a duplicate cw, unit_position, and afar.
* Delete this node, to be superseded by the new
* node added below.
* Update the mq_t structure.
*/
return;
} else {
}
}
else
}
/*
* Prune the MQSC index lists (one for each checkword), by deleting
* outdated index blocks from each list.
*
* NOTE(review): the function name, parameter list, both loop headers and
* the timeout comparison are not visible in this view.
*/
void
{
/* checkword index; the closing-brace comment below gives cw = 0...3 */
int cw;
/*
* This event has timed out - delete the
* mq block as well as serd for the event.
*/
} else {
/* tstamp < now - ce_t */
}
} /* per checkword */
} /* cw = 0...3 */
}
/*
* Check the MQSC index lists (one for each checkword) by making a
* complete pass through each list, checking if the criteria for
* Rule 4A has been met. Rule 4A checking is done for each checkword.
*
* Rule 4A: fault a DIMM "whenever Solaris reports two or more CEs from
* two or more different physical addresses on each of two or more different
* bit positions from the same DIMM within 72 hours of each other, and all
* the addresses are in the same relative checkword (that is, the AFARs
* are all the same modulo 64). [Note: This means at least 4 CEs; two
* from one bit position, with unique addresses, and two from another,
* also with unique addresses, and the lower 6 bits of all the addresses
* are the same.]"
*
* NOTE(review): the function name, parameter list, most local
* declarations and the loop headers are not visible in this view.
*/
void
{
typedef struct upos_pair {
int upos;
} upos_pair_t;
/*
* Each upos_array[] member represents a pair of CEs for the same
* unit position (symbol) which is a 4 bit nibble.
* MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM
* for rule 4A, and same DRAM for rule 4B) for a violation - this
* is why CE pairs are tracked.
*/
upos_pairs = 0;
i = upos_pairs;
/* -1: no unit position has been seen yet */
curr_upos = -1;
/*
* mq_root[] is an array of cumulative lists of CEs
* indexed by checkword where the list is in unit position
* order. Loop through checking for duplicate unit position
* entries (filled in at mq_create()).
* The upos_array[] is filled in each time a duplicate
* unit position is found; the first time through the loop
* of a unit position sets curr_upos but does not fill in
* upos_array[] until the second symbol is found.
*/
/* Set initial current position */
} else if (i > upos_pairs &&
/*
* Only keep track of CE pairs; skip
* triples, quads, etc...
*/
continue;
/* Have a pair. Add to upos_array[] */
}
}
/* i has advanced at least two entries past upos_pairs */
if (i - upos_pairs >= 2) {
/* Rule 4A violation */
for (j = upos_pairs; j < i; j++) {
}
return;
}
/* the next pass starts counting from the current fill position */
upos_pairs = i;
}
}
/*
* Ereport handler fragment. The visible exit paths return one of
* GMEM_EVD_UNUSED, GMEM_EVD_BAD, GMEM_EVD_REDUND or GMEM_EVD_OK.
* NOTE(review): the function name, return type, parameter list, several
* local declarations and most call sites are not visible in this view.
*/
/*ARGSUSED*/
{
uint32_t filter_ratio = 0;
const char *uuid;
char *sn;
int skip_error = 0;
&diagnose);
return (GMEM_EVD_UNUSED);
&phyaddr) != 0) ||
&offset) != 0)) {
/* a failed lookup of phyaddr or offset makes the event bad */
return (GMEM_EVD_BAD);
}
return (GMEM_EVD_REDUND);
&rsrc) != 0 ||
return (GMEM_EVD_BAD);
}
return (GMEM_EVD_BAD);
/*
* Find dimm fru by serial number.
*/
return (GMEM_EVD_UNUSED);
}
return (GMEM_EVD_UNUSED);
}
}
/*
* Add to MQSC correlation lists all CEs which pass validity
* checks above. If there is no symbol_pos & relative ckword
* in the ereport, skip rule 4A checking.
*/
&symbol_pos);
if (err == 0) {
if (!skip_error ||
if (!skip_error)
}
}
}
/*
* First dispatch on type: unknown and intermittent events are
* unused, persistent and sticky ones continue below, and any other
* value is treated as a bad event.
*/
switch (type) {
case CE_DISP_UNKNOWN:
return (GMEM_EVD_UNUSED);
case CE_DISP_INTERMITTENT:
return (GMEM_EVD_UNUSED);
case CE_DISP_PERS:
break;
case CE_DISP_STICKY:
break;
default:
return (GMEM_EVD_BAD);
}
return (GMEM_EVD_REDUND);
}
return (GMEM_EVD_UNUSED);
}
}
}
/*
* Second dispatch on type: a persistent CE returns early on the
* "engine hasn't fired" path and otherwise breaks out to the
* retirement step, as a sticky CE always does.
*/
switch (type) {
case CE_DISP_PERS:
return (GMEM_EVD_OK); /* engine hasn't fired */
}
break; /* to retire */
case CE_DISP_STICKY:
break; /* to retire */
}
/* a non-zero rc bumps this DIMM's retired-page count */
if (rc) {
dimm->dimm_nretired++;
}
return (GMEM_EVD_OK);
}
/*
* NOTE(review): the name, parameter list and body of this void function
* are not visible in this view; nothing can be documented from here.
*/
void
{
}