cmd_Lxcache.c revision d69c2551e89e9440043ac6ff5739b58746286f33
7bebe46c240b554f47faeed19186123896281967jc * CDDL HEADER START
7bebe46c240b554f47faeed19186123896281967jc * The contents of this file are subject to the terms of the
7bebe46c240b554f47faeed19186123896281967jc * Common Development and Distribution License (the "License").
7bebe46c240b554f47faeed19186123896281967jc * You may not use this file except in compliance with the License.
7bebe46c240b554f47faeed19186123896281967jc * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
7bebe46c240b554f47faeed19186123896281967jc * See the License for the specific language governing permissions
7bebe46c240b554f47faeed19186123896281967jc * and limitations under the License.
7bebe46c240b554f47faeed19186123896281967jc * When distributing Covered Code, include this CDDL HEADER in each
7bebe46c240b554f47faeed19186123896281967jc * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
7bebe46c240b554f47faeed19186123896281967jc * If applicable, add the following below this CDDL HEADER, with the
7bebe46c240b554f47faeed19186123896281967jc * fields enclosed by brackets "[]" replaced with your own identifying
7bebe46c240b554f47faeed19186123896281967jc * information: Portions Copyright [yyyy] [name of copyright owner]
7bebe46c240b554f47faeed19186123896281967jc * CDDL HEADER END
7bebe46c240b554f47faeed19186123896281967jc * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
7bebe46c240b554f47faeed19186123896281967jc * Use is subject to license terms.
7bebe46c240b554f47faeed19186123896281967jc#pragma ident "%Z%%M% %I% %E% SMI"
7bebe46c240b554f47faeed19186123896281967jc * Support routines for managing per-Lxcache state.
7bebe46c240b554f47faeed19186123896281967jc * These values are our threshold values for SERDing CPUs based on the
7bebe46c240b554f47faeed19186123896281967jc * number of times we have retired a cache line for each category.
7bebe46c240b554f47faeed19186123896281967jcstatic void
7bebe46c240b554f47faeed19186123896281967jc fmd_buf_write(hdl, NULL, Lxcache->Lxcache_bufname, Lxcache,
d69c2551e89e9440043ac6ff5739b58746286f33jc switch (pstype) {
d69c2551e89e9440043ac6ff5739b58746286f33jc return ("l2data");
d69c2551e89e9440043ac6ff5739b58746286f33jc return ("l3data");
d69c2551e89e9440043ac6ff5739b58746286f33jc return ("l2tag");
d69c2551e89e9440043ac6ff5739b58746286f33jc return ("l3tag");
d69c2551e89e9440043ac6ff5739b58746286f33jc return ("unknown");
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_free(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache,
7bebe46c240b554f47faeed19186123896281967jc fmd_hdl_debug(hdl, "Entering cmd_Lxcache_free for %s destroy = %d\n",
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_destroy(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
7bebe46c240b554f47faeed19186123896281967jcLxcache_lookup_by_type_index_way_bit(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
7bebe46c240b554f47faeed19186123896281967jc for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches); Lxcache != NULL;
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_create(fmd_hdl_t *hdl, cmd_xr_t *xr, cmd_cpu_t *cpu,
7bebe46c240b554f47faeed19186123896281967jc nvlist_t *modasru, cmd_ptrsubtype_t pstype, uint32_t index,
7bebe46c240b554f47faeed19186123896281967jc const char *pstype_name;
7bebe46c240b554f47faeed19186123896281967jc "creating new Lxcache for cachetype=%d index=%lx way=%lx bit=%x\n",
7bebe46c240b554f47faeed19186123896281967jc Lxcache = fmd_hdl_zalloc(hdl, sizeof (cmd_Lxcache_t), FMD_SLEEP);
7bebe46c240b554f47faeed19186123896281967jc switch (pstype) {
7bebe46c240b554f47faeed19186123896281967jc cmd_bufname(Lxcache->Lxcache_bufname, sizeof (Lxcache->Lxcache_bufname),
7bebe46c240b554f47faeed19186123896281967jc "Lxcache_%s_%04d_%08d_%02d_%03d", pstype_name, cpu->cpu_cpuid,
7bebe46c240b554f47faeed19186123896281967jc (errno = nvlist_add_uint32(asru, FM_FMRI_CPU_CACHE_INDEX,
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_lookup_by_index_way(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
7bebe46c240b554f47faeed19186123896281967jc for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_lookup(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype, uint32_t index,
7bebe46c240b554f47faeed19186123896281967jc return (Lxcache_lookup_by_type_index_way_bit(cpu, pstype, index, way,
d69c2551e89e9440043ac6ff5739b58746286f33jccmd_fmri_nvl2str(fmd_hdl_t *hdl, nvlist_t *nvl, char *buf, size_t buflen)
d69c2551e89e9440043ac6ff5739b58746286f33jc if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) != 0)
d69c2551e89e9440043ac6ff5739b58746286f33jc if (nvlist_lookup_string(nvl, FM_FMRI_CPU_SERIAL_ID, &serstr) != 0)
d69c2551e89e9440043ac6ff5739b58746286f33jc if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_CACHE_INDEX, &index) != 0)
d69c2551e89e9440043ac6ff5739b58746286f33jc if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_CACHE_WAY, &way) != 0)
d69c2551e89e9440043ac6ff5739b58746286f33jc if (nvlist_lookup_uint8(nvl, FM_FMRI_CPU_CACHE_TYPE, &type) != 0)
d69c2551e89e9440043ac6ff5739b58746286f33jc "\ncmd_fmri_nvl2str: missing %s in fmri\n",
d69c2551e89e9440043ac6ff5739b58746286f33jc return (-1);
d69c2551e89e9440043ac6ff5739b58746286f33jc "cpu:///%s=%u/%s=%s/%s=%u/%s=%u/%s=%d",
d69c2551e89e9440043ac6ff5739b58746286f33jc fmd_hdl_debug(hdl, "Could not contact fmadm to unretire\n");
d69c2551e89e9440043ac6ff5739b58746286f33jc return (-1);
7bebe46c240b554f47faeed19186123896281967jcLxcache_wrapv1(fmd_hdl_t *hdl, cmd_Lxcache_pers_t *pers, size_t psz)
7bebe46c240b554f47faeed19186123896281967jc fmd_hdl_abort(hdl, "size of state doesn't match size of "
7bebe46c240b554f47faeed19186123896281967jc "version 1 state (%u bytes).\n",
7bebe46c240b554f47faeed19186123896281967jc Lxcache = fmd_hdl_zalloc(hdl, sizeof (cmd_Lxcache_t), FMD_SLEEP);
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
7bebe46c240b554f47faeed19186123896281967jc * We need to first extract the cpu name by reading directly
7bebe46c240b554f47faeed19186123896281967jc * from fmd buffers in order to begin our search for Lxcache in
7bebe46c240b554f47faeed19186123896281967jc * the appropriate cpu list.
7bebe46c240b554f47faeed19186123896281967jc * After we identify the cpu list using buf name we could look
7bebe46c240b554f47faeed19186123896281967jc * in cpu list for our Lxcache states.
7bebe46c240b554f47faeed19186123896281967jc fmd_hdl_debug(hdl, "restoring Lxcache from %s\n", ptr->ptr_name);
7bebe46c240b554f47faeed19186123896281967jc if ((Lxcachesz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) {
7bebe46c240b554f47faeed19186123896281967jc "not exist in saved state\n",
7bebe46c240b554f47faeed19186123896281967jc fmd_hdl_abort(hdl, "Lxcache buffer referenced by case %s "
7bebe46c240b554f47faeed19186123896281967jc "is %d bytes. Expected size is %d bytes\n",
7bebe46c240b554f47faeed19186123896281967jc cpu = cmd_restore_cpu_only(hdl, cp, Lxcache->Lxcache_cpu_bufname);
7bebe46c240b554f47faeed19186123896281967jc recovered_Lxcache = Lxcache; /* save the recovered Lxcache */
7bebe46c240b554f47faeed19186123896281967jc for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches); Lxcache != NULL;
7bebe46c240b554f47faeed19186123896281967jc if (strcmp(Lxcache->Lxcache_bufname, ptr->ptr_name) == 0)
7bebe46c240b554f47faeed19186123896281967jc "for Lxcache state referenced by case %s.\n",
7bebe46c240b554f47faeed19186123896281967jc "cpu_id %d: serdname for the case is %s\n",
7bebe46c240b554f47faeed19186123896281967jc "cpu_id %d: restoring the case for index %d way %d bit %d\n",
7bebe46c240b554f47faeed19186123896281967jc cmd_case_restore(hdl, &Lxcache->Lxcache_case, cp, serdnm);
7bebe46c240b554f47faeed19186123896281967jc/*ARGSUSED*/
7bebe46c240b554f47faeed19186123896281967jc if (fmd_nvl_fmri_unusable(hdl, Lxcache->Lxcache_asru_nvl)) {
7bebe46c240b554f47faeed19186123896281967jc /* No need to rewrite the FMRIs in the Lxcache - they don't change */
7bebe46c240b554f47faeed19186123896281967jc while ((Lxcache = cmd_list_next(&cpu->cpu_Lxcaches)) != NULL)
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_serdnm_create(fmd_hdl_t *hdl, uint32_t cpu_id,
7bebe46c240b554f47faeed19186123896281967jc const char *serdbase;
7bebe46c240b554f47faeed19186123896281967jc switch (pstype) {
7bebe46c240b554f47faeed19186123896281967jc sz = (snprintf(NULL, 0, fmt, cpu_id, serdbase, index, way, bit) + 1);
7bebe46c240b554f47faeed19186123896281967jc (void) snprintf(nm, sz, fmt, cpu_id, serdbase, index, way, bit);
7bebe46c240b554f47faeed19186123896281967jc return (nm);
7bebe46c240b554f47faeed19186123896281967jc * Count the number of SERD type 2 ways retired for a given cpu
7bebe46c240b554f47faeed19186123896281967jc * These are defined to be L3 Cache data retirements
7bebe46c240b554f47faeed19186123896281967jc for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
7bebe46c240b554f47faeed19186123896281967jc * Count the number of SERD type 1 ways retired for a given cpu
7bebe46c240b554f47faeed19186123896281967jc * These are defined to be L2 Data, tag and L3 Tag retirements
7bebe46c240b554f47faeed19186123896281967jc for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
7bebe46c240b554f47faeed19186123896281967jccmd_fault_the_cpu(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
7bebe46c240b554f47faeed19186123896281967jc const char *fltnm)
7bebe46c240b554f47faeed19186123896281967jc const char *uuid;
7bebe46c240b554f47faeed19186123896281967jc if ((errno = fmd_nvl_fmri_expand(hdl, cpu->cpu_asru_nvl)) != 0)
7bebe46c240b554f47faeed19186123896281967jc cmd_cpu_create_faultlist(hdl, cp, cpu, fltnm, NULL, 100);
7bebe46c240b554f47faeed19186123896281967jccmd_Lxcache_fault(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache,
7bebe46c240b554f47faeed19186123896281967jc (void) snprintf(fltnm, sizeof (fltnm), "fault.cpu.%s.%s-line",
7bebe46c240b554f47faeed19186123896281967jc if (nvlist_add_boolean_value(flt, FM_SUSPECT_MESSAGE, B_FALSE) != 0)
7bebe46c240b554f47faeed19186123896281967jc fmd_hdl_abort(hdl, "failed to add no-message member to fault");
7bebe46c240b554f47faeed19186123896281967jc fmd_case_add_suspect(hdl, Lxcache->Lxcache_case.cc_cp, flt);
d69c2551e89e9440043ac6ff5739b58746286f33jc if (cmd_fmri_nvl2str(hdl, Lxcache->Lxcache_asru.fmri_nvl,
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpu_id %d: Failed to save the"
d69c2551e89e9440043ac6ff5739b58746286f33jc " retired fmri string\n",
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpu_id %d:Saved the retired fmri string %s\n",
7bebe46c240b554f47faeed19186123896281967jc /* Retrieve the number of retired ways for each category */
7bebe46c240b554f47faeed19186123896281967jc fmd_hdl_debug(hdl, "CPU %d retired Type 1 way count is: %d\n",
7bebe46c240b554f47faeed19186123896281967jc fmd_hdl_debug(hdl, "CPU %d retired Type 2 way count is: %d\n",
7bebe46c240b554f47faeed19186123896281967jc * We will destroy the case and serd engine.
7bebe46c240b554f47faeed19186123896281967jc * The rest will be destroyed when we retire the CPU
7bebe46c240b554f47faeed19186123896281967jc * until then we keep the Lxcache structures alive.
7bebe46c240b554f47faeed19186123896281967jc "cpu_id = %d could not open %s to read tag info.\n",
7bebe46c240b554f47faeed19186123896281967jc switch (pstype) {
7bebe46c240b554f47faeed19186123896281967jc if (ioctl(fd, MEM_CACHE_READ_ERROR_INJECTED_TAGS, &cache_info)
7bebe46c240b554f47faeed19186123896281967jc " MEM_CACHE_READ_ERROR_INJECTED_TAGS failed"
7bebe46c240b554f47faeed19186123896281967jc " errno = %d\n",
7bebe46c240b554f47faeed19186123896281967jc " MEM_CACHE_READ_TAGS failed"
7bebe46c240b554f47faeed19186123896281967jc " errno = %d\n",
7bebe46c240b554f47faeed19186123896281967jcget_cpu_retired_ways(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype)
7bebe46c240b554f47faeed19186123896281967jc return (-1);
7bebe46c240b554f47faeed19186123896281967jc switch (pstype) {
7bebe46c240b554f47faeed19186123896281967jc return (-1);
7bebe46c240b554f47faeed19186123896281967jc for (i = 0; i < PN_CACHE_NWAYS; i++) {
7bebe46c240b554f47faeed19186123896281967jcget_index_retired_ways(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype, uint32_t index)
7bebe46c240b554f47faeed19186123896281967jc return (-1);
7bebe46c240b554f47faeed19186123896281967jc for (i = 0; i < PN_CACHE_NWAYS; i++) {
7bebe46c240b554f47faeed19186123896281967jcis_index_way_retired(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype, uint32_t index,
7bebe46c240b554f47faeed19186123896281967jc return (-1);
7bebe46c240b554f47faeed19186123896281967jc return (1);
7bebe46c240b554f47faeed19186123896281967jc return (0);
d69c2551e89e9440043ac6ff5739b58746286f33jccmd_cache_way_retire(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
d69c2551e89e9440043ac6ff5739b58746286f33jc "fltnm:cpu_id %d open of %s failed\n",
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpu %d: Retiring index %d, way %d bit %d\n",
d69c2551e89e9440043ac6ff5739b58746286f33jc "fltnm:cpu_id %d MEM_CACHE_RETIRE ioctl failed\n",
d69c2551e89e9440043ac6ff5739b58746286f33jccmd_cache_way_unretire(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
d69c2551e89e9440043ac6ff5739b58746286f33jc "fltnm:cpu_id %d open of %s failed\n",
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpu %d: Unretiring index %d, way %d bit %d\n",
d69c2551e89e9440043ac6ff5739b58746286f33jc "fltnm:cpu_id %d MEM_CACHE_UNRETIRE ioctl failed\n",
d69c2551e89e9440043ac6ff5739b58746286f33jccmd_Lxcache_lookup_by_type_index_way_flags(cmd_cpu_t *cpu,
d69c2551e89e9440043ac6ff5739b58746286f33jc cmd_ptrsubtype_t type, uint32_t index, int8_t way, int32_t flags)
d69c2551e89e9440043ac6ff5739b58746286f33jccmd_Lxcache_unretire(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *cmd_Lxcache,
d69c2551e89e9440043ac6ff5739b58746286f33jc const char *fltnm)
d69c2551e89e9440043ac6ff5739b58746286f33jc * If we are unretiring a cacheline retired due to suspected TAG
d69c2551e89e9440043ac6ff5739b58746286f33jc * fault, then we must first check if we are using a cacheline
d69c2551e89e9440043ac6ff5739b58746286f33jc * that was retired earlier for DATA fault.
d69c2551e89e9440043ac6ff5739b58746286f33jc * If so we will not unretire the cacheline.
d69c2551e89e9440043ac6ff5739b58746286f33jc * We will change the flags to reflect the current condition.
d69c2551e89e9440043ac6ff5739b58746286f33jc * We will return success, though.
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpuid %d checking if there is a %s"
d69c2551e89e9440043ac6ff5739b58746286f33jc " cacheline re-retired at this index %d and way %d\n",
d69c2551e89e9440043ac6ff5739b58746286f33jc retired_Lxcache = cmd_Lxcache_lookup_by_type_index_way_flags(
d69c2551e89e9440043ac6ff5739b58746286f33jc if (cmd_cache_way_unretire(hdl, cpu, cmd_Lxcache) == B_FALSE)
d69c2551e89e9440043ac6ff5739b58746286f33jc * We have unretired the cacheline. We need to inform the fmd
d69c2551e89e9440043ac6ff5739b58746286f33jc * that we have repaired the faulty fmri that we retired earlier.
d69c2551e89e9440043ac6ff5739b58746286f33jc * The cpumem agent will not unretire cacheline in response to
d69c2551e89e9440043ac6ff5739b58746286f33jc * the list.repair events it receives.
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpuid %d Repairing the retired fmri %s",
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpuid %d Failed to repair"
d69c2551e89e9440043ac6ff5739b58746286f33jc " retired fmri.",
d69c2551e89e9440043ac6ff5739b58746286f33jc * We need to retire the cacheline that we just
d69c2551e89e9440043ac6ff5739b58746286f33jc * unretired.
d69c2551e89e9440043ac6ff5739b58746286f33jc * A hopeless situation.
d69c2551e89e9440043ac6ff5739b58746286f33jc * cannot maintain consistency of cacheline
d69c2551e89e9440043ac6ff5739b58746286f33jc * state between fmd and DE.
d69c2551e89e9440043ac6ff5739b58746286f33jc * Aborting the DE.
d69c2551e89e9440043ac6ff5739b58746286f33jc "\n%s:cpuid %d We are unable to repair"
d69c2551e89e9440043ac6ff5739b58746286f33jc " the fmri we just unretired and are"
d69c2551e89e9440043ac6ff5739b58746286f33jc " unable to restore the DE and fmd to"
d69c2551e89e9440043ac6ff5739b58746286f33jc " a sane state.\n",