gmem_memerr.c revision 1529f529004c61fcfd0d95ab79b0f257d6ad4451
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * CDDL HEADER START
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * The contents of this file are subject to the terms of the
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Common Development and Distribution License (the "License").
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * You may not use this file except in compliance with the License.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * or http://www.opensolaris.org/os/licensing.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * See the License for the specific language governing permissions
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * and limitations under the License.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * When distributing Covered Code, include this CDDL HEADER in each
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * If applicable, add the following below this CDDL HEADER, with the
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * fields enclosed by brackets "[]" replaced with your own identifying
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * information: Portions Copyright [yyyy] [name of copyright owner]
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * CDDL HEADER END
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Use is subject to license terms.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Ereport-handling routines for memory errors
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenportce_thresh_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport /* We've already complained about this DIMM */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * fault the dimm if the number of retired pages >= max_retired_pages
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport dflt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_PAGES, GMEM_FLTMAXCONF,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Create a fresh index block for MQSC CE correlation.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport uint64_t afar, uint16_t upos, uint16_t dram, uint16_t ckwd, uint64_t now)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport cp = fmd_hdl_zalloc(hdl, sizeof (gmem_mq_t), FMD_SLEEP);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport gmem_mq_serdnm_create(hdl, "mq", afar, ckwd, upos);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Create SERD to keep this event from being removed
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * by fmd which may not know there is an event pointer
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * saved here. This SERD is *never* meant to fire.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_serd_create(hdl, cp->mq_serdnm, GMEM_MQ_SERDN, GMEM_MQ_SERDT);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport (void) fmd_serd_record(hdl, cp->mq_serdnm, ep);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenportmq_destroy(fmd_hdl_t *hdl, gmem_list_t *lp, gmem_mq_t *ip)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Add an index block for a new CE, sorted
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * a) by ascending unit position
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * b) order of arrival (~= time order)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenportmq_add(fmd_hdl_t *hdl, gmem_dimm_t *dimm, fmd_event_t *ep,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport uint64_t afar, uint16_t unit_position, uint16_t dram, uint16_t ckwd,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport for (ip = gmem_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport /* list is in unit position order */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport } else if (ip->mq_unit_position == unit_position &&
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Found a duplicate cw, unit_position, and afar.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Delete this node, to be superseded by the new
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * node added below.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport jp = mq_create(hdl, ep, afar, unit_position, dram, cw, now);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport gmem_list_insert_before(&dimm->mq_root[cw], ip, jp);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Prune the MQSC index lists (one for each checkword), by deleting
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * outdated index blocks from each list.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenportmq_prune(fmd_hdl_t *hdl, gmem_dimm_t *dimm, uint64_t now)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport for (ip = gmem_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if (ip->mq_tstamp < now - GMEM_MQ_TIMELIM) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * This event has timed out - delete the
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * mq block as well as serd for the event.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport /* tstamp < now - ce_t */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport } /* per checkword */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport } /* cw = 0...3 */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Check the MQSC index lists (one for each checkword) by making a
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * complete pass through each list, checking if the criteria for either
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Rule 4A or 4B have been met. Rule 4A checking is done for each checkword;
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * 4B check is done at end.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Rule 4A: fault a DIMM "whenever Solaris reports two or more CEs from
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * two or more different physical addresses on each of two or more different
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * bit positions from the same DIMM within 72 hours of each other, and all
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * the addresses are in the same relative checkword (that is, the AFARs
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * are all the same modulo 64). [Note: This means at least 4 CEs; two
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * from one bit position, with unique addresses, and two from another,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * also with unique addresses, and the lower 6 bits of all the addresses
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * are the same.]"
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Rule 4B: fault a DIMM "whenever Solaris reports two or more CEs from
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * two or more different physical addresses on each of three or more
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * different outputs from the same DRAM within 72 hours of each other, as
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * long as the three outputs do not all correspond to the same relative
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * bit position in their respective checkwords. [Note: This means at least
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * 6 CEs; two from one DRAM output signal, with unique addresses, two from
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * another output from the same DRAM, also with unique addresses, and two
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * more from yet another output from the same DRAM, again with unique
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * addresses, as long as the three outputs do not all correspond to the
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * same relative bit position in their respective checkwords.]"
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenportmq_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm, int16_t dram)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport typedef struct upos_pair {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport upos_pair_t upos_array[16]; /* max per cw = 2, * 8 cw's */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Each upos_array[] member represents a pair of CEs for the same
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * unit position (symbol) which is a 4 bit nibble.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * for rule 4A, and same DRAM for rule 4B) for a violation - this
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * is why CE pairs are tracked.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * mq_root[] is an array of cumulative lists of CEs
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * indexed by checkword where the list is in unit position
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * order. Loop through checking for duplicate unit position
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * entries (filled in at mq_create()).
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * The upos_array[] is filled in each time a duplicate
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * unit position is found; the first time through the loop
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * of a unit position sets curr_upos but does not fill in
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * upos_array[] until the second symbol is found.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport for (ip = gmem_list_next(&dimm->mq_root[cw]); ip != NULL;
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport /* Set initial current position */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport } else if (i > upos_pairs &&
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Only keep track of CE pairs; skip
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * triples, quads, etc...
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport /* Have a pair. Add to upos_array[] */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport /* Rule 4A violation */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_4A,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsc);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport for (j = upos_pairs; j < i; j++) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if ((dram == INVALID_DRAM) || (upos_pairs < 3)) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "Skip rules 4B upos_pairs=%d\n", upos_pairs);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport return; /* 4B violation needs at least 3 pairs */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Walk through checking for a rule 4B violation.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Since we only keep track of two CE pairs per CW we'll only have
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * a max of potentially 16 elements in the array. So as not to run
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * off the end of the array, need to be careful with i and j indexes.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if (upos_array[i].dram != upos_array[j].dram)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * These two pairs aren't the same dram;
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * continue looking for pairs that are.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if (upos_array[j].dram != upos_array[k].dram)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * DRAMs must be the same for a rule
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * 4B violation. Continue looking for
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * pairs that have the same DRAMs.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenportgmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport err = nvlist_lookup_boolean_value(nvl, GMEM_ERPT_PAYLOAD_DIAGNOSE,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if ((nvlist_lookup_uint64(nvl, GMEM_ERPT_PAYLOAD_PHYSADDR,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport (nvlist_lookup_uint64(nvl, GMEM_ERPT_PAYLOAD_OFFSET,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "Can't get page phyaddr or offset");
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "phyaddr %llx offset %llx", phyaddr, offset);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if ((page = gmem_page_lookup(phyaddr)) != NULL &&
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_case_solved(hdl, page->page_case.cc_cp))
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if (nvlist_lookup_nvlist(nvl, GMEM_ERPT_PAYLOAD_RESOURCE,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport nvlist_lookup_string(rsrc, FM_FMRI_HC_SERIAL_ID, &sn) != 0) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "Can't get dimm serial\n");
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Find dimm fru by serial number.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "Dimm is not present\n");
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if ((dimm = gmem_dimm_lookup(hdl, fru)) == NULL &&
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport (dimm = gmem_dimm_create(hdl, fru)) == NULL) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport dimm->dimm_case.cc_cp = gmem_case_create(hdl,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport &dimm->dimm_header, GMEM_PTR_DIMM_CASE, &uuid);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * Add to MQSC correlation lists all CEs which pass validity
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * checks above. If there is no symbol_pos & relative ckword
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * in the ereport, skip rules 4A & 4B checking.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport * If there is no dram in the ereport, skip the rule 4B checking.
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if (nvlist_lookup_uint16(nvl, GMEM_ERPT_PAYLOAD_DRAM, &erpt_dram) != 0)
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport err = nvlist_lookup_uint16(nvl, GMEM_ERPT_PAYLOAD_SYMBOLPOS,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport err |= nvlist_lookup_uint16(nvl, GMEM_ERPT_PAYLOAD_CKW, &cw);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "symbol_pos=%d dram=%d cw=%d",
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if (!(dimm->dimm_flags & GMEM_F_FAULTING) && (err == 0)) {
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport mq_add(hdl, dimm, ep, phyaddr, symbol_pos, dram,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport type = gmem_mem_name2type(strstr(class, "mem"));
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport page = gmem_page_create(hdl, fru, phyaddr, offset);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport page->page_case.cc_cp = gmem_case_create(hdl,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport &page->page_header, GMEM_PTR_PAGE_CASE, &uuid);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "adding persistent event to CE serd");
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport filter_ratio = gmem_get_serd_filter_ratio(nvl);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_hdl_debug(hdl, "filter_ratio %d\n", filter_ratio);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport if (gmem_serd_record(hdl, page->page_case.cc_serdnm,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport return (GMEM_EVD_OK); /* engine hasn't fired */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_case_add_serd(hdl, page->page_case.cc_cp,
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_serd_reset(hdl, page->page_case.cc_serdnm);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport break; /* to retire */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep);
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport break; /* to retire */
1529f529004c61fcfd0d95ab79b0f257d6ad4451Scott Davenport topo_rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);