gcpu_mca.c revision 0ad0f0b2adb964c7bd56bbf5a831721e1a67beaf
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/mca_x86.h>
#include <sys/cpu_module_impl.h>
#include <sys/cpu_module_ms.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/pghw.h>
#include <sys/x86_archext.h>
#include <sys/sysmacros.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/log.h>
#include <sys/psw.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/errorq.h>
#include <sys/fm/cpu/GMCA.h>
#include <sys/sysevent.h>
#include <sys/ontrap.h>
#include "gcpu.h"
/*
* Clear gcpu_suppress_log_on_init to log telemetry found at initialization.
* While processor docs say you should process this telemetry on all but
* Intel family 0x6, there are way too many exceptions and we want to avoid
* bogus diagnoses.
*/
int gcpu_suppress_log_on_init = 1;
/*
* gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
* error logout time. The stack will be included in the ereport if the
* error type selects stack inclusion, or in all cases if
* gcpu_mca_stack_ereport_include is nonzero.
*/
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;
/*
* The number of times to re-read MCA telemetry to try to obtain a
* consistent snapshot if we find it to be changing under our feet.
*/
int gcpu_mca_telemetry_retries = 5;
#ifndef __xpv
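/*
* CMCI throttling tunables, used by gcpu_cmci_throttle() and
* gcpu_cmci_logout() below: a bank that raises more than
* gcpu_mca_cmci_throttling_threshold CMCIs between two polls has its
* CMCI disabled, and CMCI is re-enabled once the bank has been quiet
* for gcpu_mca_cmci_reenable_threshold consecutive polls.
*/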
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;
#endif
static gcpu_error_disp_t gcpu_errtypes[] = {
/*
* Unclassified
*/
{
FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
},
/*
* Microcode ROM Parity Error
*/
{
FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
},
/*
* External - BINIT# from another processor during power-on config
*/
{
FM_EREPORT_CPU_GENERIC_EXTERNAL,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
MCAX86_SIMPLE_EXTERNAL_MASKON,
MCAX86_SIMPLE_EXTERNAL_MASKOFF
},
/*
* Functional redundancy check master/slave error
*/
{
FM_EREPORT_CPU_GENERIC_FRC,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
MCAX86_SIMPLE_FRC_MASKON,
MCAX86_SIMPLE_FRC_MASKOFF
},
/*
* Internal parity error
*/
{
FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
},
/*
* Internal timer error
*/
{
FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
},
/*
* Internal unclassified
*/
{
FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
},
/*
* Compound error codes - generic memory hierarchy
*/
{
FM_EREPORT_CPU_GENERIC_GENMEMHIER,
NULL,
FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
},
/*
* Compound error codes - TLB errors
*/
{
FM_EREPORT_CPU_GENERIC_TLB,
"%1$s" "TLB" "%2$s" "_ERR",
FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
MCAX86_COMPOUND_TLB_MASKON,
MCAX86_COMPOUND_TLB_MASKOFF
},
/*
* Compound error codes - memory hierarchy
*/
{
FM_EREPORT_CPU_GENERIC_MEMHIER,
"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
MCAX86_COMPOUND_MEMHIER_MASKON,
MCAX86_COMPOUND_MEMHIER_MASKOFF
},
/*
* Compound error codes - bus and interconnect errors
*/
{
FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
},
/*
* Compound error codes - memory controller errors
*/
{
FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
"MC" "_" "%8$s" "_" "%9$s" "_ERR",
FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
},
};
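/*
* gcpu_disp_match() below walks this table in order and returns the
* first entry whose mask-on bits are all set in the error code and
* whose mask-off bits are all clear.
*/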
static gcpu_error_disp_t gcpu_unknown = {
FM_EREPORT_CPU_GENERIC_UNKNOWN,
"UNKNOWN",
FM_EREPORT_PAYLOAD_FLAGS_COMMON,
0,
0
};
static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;
#ifdef __xpv
static int isxpv = 1;
#else
static int isxpv = 0;
#endif
static const gcpu_error_disp_t *
gcpu_disp_match(uint16_t code)
{
const gcpu_error_disp_t *ged = gcpu_errtypes;
int i;
for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
i++, ged++) {
uint16_t on = ged->ged_errcode_mask_on;
uint16_t off = ged->ged_errcode_mask_off;
if ((code & on) == on && (code & off) == 0)
return (ged);
}
return (NULL);
}
static uint8_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
return ((uint8_t)((code & mask) >> shift));
}
#define BIT_STRIP(code, name) \
bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
MCAX86_ERRCODE_##name##_SHIFT)
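/*
* Worked example (illustrative, per the IA32 compound error encoding):
* a cache hierarchy error code of 0x116 has LL (bits 1:0) = 2,
* TT (bits 3:2) = 1 and RRRR (bits 7:4) = 1, so BIT_STRIP(0x116, LL)
* yields 2 (L2), BIT_STRIP(0x116, TT) yields 1 (data) and
* BIT_STRIP(0x116, RRRR) yields 1 (read).
*/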
#define GCPU_MNEMONIC_UNDEF "undefined"
#define GCPU_MNEMONIC_RESVD "reserved"
/*
* Mappings of TT, LL, RRRR, PP, II and T values to compound error name
* mnemonics and to ereport class name components.
*/
struct gcpu_mnexp {
const char *mne_compound; /* used in expanding compound errname */
const char *mne_ereport; /* used in expanding ereport class */
};
static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */
{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */
{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */
{ GCPU_MNEMONIC_UNDEF, "" }
};
static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */
{ "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */
{ "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */
{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */
};
static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */
{ "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */
{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */
{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */
{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */
{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */
{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */
{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */
{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */
};
static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */
{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */
{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */
{ "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */
};
static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
{ "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */
{ GCPU_MNEMONIC_RESVD, "" },
{ "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */
{ "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */
};
static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */
{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */
{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */
};
static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */
{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */
{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */
{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */
{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */
{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */
{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */
{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */
{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */
{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */
{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */
{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */
{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */
{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */
{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */
{ "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */
};
static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */
{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */
{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */
{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */
{ GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */
{ GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */
{ GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */
{ GCPU_MNEMONIC_RESVD, ""} /* RESERVED */
};
enum gcpu_mn_namespace {
GCPU_MN_NAMESPACE_COMPOUND,
GCPU_MN_NAMESPACE_EREPORT
};
static const char *
gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val,
enum gcpu_mn_namespace nspace)
{
if (val >= tbl_sz)
return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */
switch (nspace) {
case GCPU_MN_NAMESPACE_COMPOUND:
return (tbl[val].mne_compound);
/*NOTREACHED*/
case GCPU_MN_NAMESPACE_EREPORT:
return (tbl[val].mne_ereport);
/*NOTREACHED*/
default:
return (GCPU_MNEMONIC_UNDEF);
/*NOTREACHED*/
}
}
/*
* The ereport class leaf component is either a simple string with no
* format specifiers, or a string with one or more embedded %n$s specifiers -
* positional selection for string arguments. The kernel snprintf does
* not support %n$ (and teaching it to do so is too big a headache) so
* we will expand this restricted format string ourselves.
*/
#define GCPU_CLASS_VARCOMPS 9
#define GCPU_MNEMONIC(code, name, nspace) \
gcpu_mnemonic(gcpu_##name##_mnemonics, \
sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
BIT_STRIP(code, name), nspace)
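/*
* Continuing the example above (illustrative): for a status with error
* code 0x116 the MEMHIER format "%1$sCACHE%2$s_%3$s_ERR" expands in the
* compound namespace to "DCACHEL2_RD_ERR" - TT selects "D", LL selects
* "L2" and RRRR selects "RD".
*/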
static void
gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
enum gcpu_mn_namespace nspace)
{
uint16_t code = MCAX86_ERRCODE(status);
const char *mn[GCPU_CLASS_VARCOMPS];
char *p = buf; /* current position in buf */
char *q = buf + buflen; /* pointer past last char in buf */
int which, expfmtchar, error;
char c;
mn[0] = GCPU_MNEMONIC(code, TT, nspace);
mn[1] = GCPU_MNEMONIC(code, LL, nspace);
mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
mn[3] = GCPU_MNEMONIC(code, PP, nspace);
mn[4] = GCPU_MNEMONIC(code, II, nspace);
mn[5] = GCPU_MNEMONIC(code, T, nspace);
mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
while (p < q - 1 && (c = *fmt++) != '\0') {
if (c != '%') {
/* not the beginning of a format specifier - copy */
*p++ = c;
continue;
}
error = 0;
which = -1;
expfmtchar = -1;
nextfmt:
if ((c = *fmt++) == '\0')
break; /* early termination of fmt specifier */
switch (c) {
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
if (which != -1) { /* allow only one positional digit */
error++;
break;
}
which = c - '1';
goto nextfmt;
/*NOTREACHED*/
case '$':
if (which == -1) { /* no position specified */
error++;
break;
}
expfmtchar = 's';
goto nextfmt;
/*NOTREACHED*/
case 's':
if (expfmtchar != 's') {
error++;
break;
}
(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
mn[which]);
p += strlen(p);
break;
default:
error++;
break;
}
if (error)
break;
}
*p = '\0'; /* NUL termination */
}
static void
gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
const char *cpuclass, const char *leafclass)
{
char *p = buf; /* current position in buf */
char *q = buf + buflen; /* pointer past last char in buf */
(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
p += strlen(p);
if (p >= q)
return;
if (leafclass == NULL) {
gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
GCPU_MN_NAMESPACE_EREPORT);
} else {
(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
leafclass);
}
}
/*
* Create an "hc" scheme FMRI identifying the given cpu with
* motherboard/chip/core/strand instance numbers.
*/
static nvlist_t *
gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
{
nvlist_t *nvl;
if ((nvl = fm_nvlist_create(nva)) == NULL)
return (NULL);
fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
"motherboard", 0,
"chip", cmi_hdl_chipid(hdl),
"core", cmi_hdl_coreid(hdl),
"strand", cmi_hdl_strandid(hdl));
return (nvl);
}
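/*
* For example (illustrative instance numbers), strand 1 of core 0 on
* chip 2 would be identified as
* hc:///motherboard=0/chip=2/core=0/strand=1.
*/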
int gcpu_bleat_count_thresh = 5;
hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
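/*
* With the defaults above, up to five complaints may be emitted
* back-to-back; thereafter at least ten seconds must elapse between
* further complaints.
*/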
/*
* Called when we are unable to propagate a logout structure onto an
* errorq for subsequent ereport preparation and logging etc. The caller
* should usually only decide to call this for severe errors - those we
* suspect we may need to panic for.
*/
static void
gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
{
hrtime_t now = gethrtime_waitfree();
static hrtime_t gcpu_last_bleat;
gcpu_bank_logout_t *gbl;
static int bleatcount;
int i;
/*
* Throttle spamming of the console. The first gcpu_bleat_count_thresh
* complaints can come as fast as we like, but once we've spammed that many
* to the console we require a minimum interval to pass before
* any more complaints.
*/
if (++bleatcount > gcpu_bleat_count_thresh) {
if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
return;
else
bleatcount = 0;
}
gcpu_last_bleat = now;
cmn_err(CE_WARN,
"Machine-Check Errors unlogged on chip %d core %d strand %d, "
"raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
cmi_hdl_strandid(hdl));
cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
(u_longlong_t)gcl->gcl_mcg_status);
for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
uint64_t status = gbl->gbl_status;
if (!(status & MSR_MC_STATUS_VAL))
continue;
switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
"STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
i, IA32_MSR_MC(i, STATUS),
(u_longlong_t)status,
(u_longlong_t)gbl->gbl_addr,
(u_longlong_t)gbl->gbl_misc);
break;
case MSR_MC_STATUS_ADDRV:
cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
"STAT 0x%016llx ADDR 0x%016llx",
i, IA32_MSR_MC(i, STATUS),
(u_longlong_t)status,
(u_longlong_t)gbl->gbl_addr);
break;
case MSR_MC_STATUS_MISCV:
cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
"STAT 0x%016llx MISC 0x%016llx",
i, IA32_MSR_MC(i, STATUS),
(u_longlong_t)status,
(u_longlong_t)gbl->gbl_misc);
break;
default:
cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
"STAT 0x%016llx",
i, IA32_MSR_MC(i, STATUS),
(u_longlong_t)status);
break;
}
}
}
#define _GCPU_BSTATUS(status, what) \
FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
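/*
* For example, _GCPU_BSTATUS(bstat, UC) expands to the payload
* name/type/value triple for FM_EREPORT_PAYLOAD_NAME_MC_STATUS_UC,
* ready for use in a fm_payload_set() argument list as below.
*/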
static void
gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
{
uint64_t members = ged ? ged->ged_ereport_members :
FM_EREPORT_PAYLOAD_FLAGS_COMMON;
uint64_t mcg = gcl->gcl_mcg_status;
int mcip = mcg & MCG_STATUS_MCIP;
const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
uint64_t bstat = gbl->gbl_status;
/*
* Include the compound error name if requested and if this
* is a compound error type.
*/
if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
ged->ged_compound_fmt != NULL) {
char buf[FM_MAX_CLASS];
gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
GCPU_MN_NAMESPACE_COMPOUND);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
DATA_TYPE_STRING, buf, NULL);
}
/*
* Include disposition information for this error
*/
if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
gbl->gbl_disp != 0) {
int i, empty = 1;
char buf[128];
char *p = buf, *q = buf + 128;
static struct _gcpu_disp_name {
uint64_t dv;
const char *dn;
} disp_names[] = {
{ CMI_ERRDISP_CURCTXBAD,
"processor_context_corrupt" },
{ CMI_ERRDISP_RIPV_INVALID,
"return_ip_invalid" },
{ CMI_ERRDISP_UC_UNCONSTRAINED,
"unconstrained" },
{ CMI_ERRDISP_FORCEFATAL,
"forcefatal" },
{ CMI_ERRDISP_IGNORED,
"ignored" },
{ CMI_ERRDISP_PCC_CLEARED,
"corrupt_context_cleared" },
{ CMI_ERRDISP_UC_CLEARED,
"uncorrected_data_cleared" },
{ CMI_ERRDISP_POISONED,
"poisoned" },
{ CMI_ERRDISP_INCONSISTENT,
"telemetry_unstable" },
};
for (i = 0; i < sizeof (disp_names) /
sizeof (struct _gcpu_disp_name); i++) {
if ((gbl->gbl_disp & disp_names[i].dv) == 0)
continue;
(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
"%s%s", empty ? "" : ",", disp_names[i].dn);
p += strlen(p);
empty = 0;
}
if (p != buf)
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
DATA_TYPE_STRING, buf, NULL);
}
/*
* If MCG_STATUS is included add that and an indication of whether
* this ereport was the result of a machine check or poll.
*/
if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
DATA_TYPE_UINT64, mcg, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
}
/*
* If an instruction pointer is to be included add one provided
* MCG_STATUS indicated it is valid; meaningless for polled events.
*/
if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
mcg & MCG_STATUS_EIPV) {
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
}
/*
* Add an indication of whether the trap occurred during privileged code.
*/
if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
DATA_TYPE_BOOLEAN_VALUE,
gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
}
/*
* If requested, add the index of the MCA bank. This indicates the
* n'th bank of 4 MCA registers, and does not necessarily correspond
* to MCi_* - use the bank offset to correlate.
*/
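/*
* For example, on processors using the architectural MSR layout the
* four MSRs of bank 2 start at IA32_MSR_MC(2, CTL) == 0x408, with
* MC2_STATUS at the following address, 0x409.
*/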
if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
fm_payload_set(ereport,
/* Bank number */
FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
/* Offset of MCi_CTL */
FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
IA32_MSR_MC(bankno, CTL),
NULL);
}
/*
* Add MCi_STATUS if requested, and decode it.
*/
if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
const char *tbes[] = {
"No tracking", /* 00 */
"Green - below threshold", /* 01 */
"Yellow - above threshold", /* 10 */
"Reserved" /* 11 */
};
fm_payload_set(ereport,
/* Bank MCi_STATUS */
FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
/* Overflow? */
_GCPU_BSTATUS(bstat, OVER),
/* Uncorrected? */
_GCPU_BSTATUS(bstat, UC),
/* Enabled? */
_GCPU_BSTATUS(bstat, EN),
/* Processor context corrupt? */
_GCPU_BSTATUS(bstat, PCC),
/* Error code */
FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
/* Model-specific error code */
FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
NULL);
/*
* If MCG_CAP.TES_P indicates that thresholding info
* is present in the architectural component of the bank status
* then include threshold information for this bank.
*/
if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
NULL);
}
}
/*
* MCi_ADDR info if requested and valid.
*/
if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
bstat & MSR_MC_STATUS_ADDRV) {
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
}
/*
* MCi_MISC info if requested and MCi_STATUS.MISCV is set.
*/
if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
bstat & MSR_MC_STATUS_MISCV) {
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
}
}
/*
* Construct and post an ereport based on the logout information from a
* single MCA bank. We are not necessarily running on the cpu that
* detected the error.
*/
static void
gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
{
gcpu_data_t *gcpu = gcl->gcl_gcpu;
cmi_hdl_t hdl = gcpu->gcpu_hdl;
const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
const char *cpuclass = NULL, *leafclass = NULL;
uint16_t code = MCAX86_ERRCODE(status);
errorq_elem_t *eqep, *scr_eqep;
nvlist_t *ereport, *detector;
char buf[FM_MAX_CLASS];
const char *classfmt;
nv_alloc_t *nva;
if (panicstr) {
if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
return;
ereport = errorq_elem_nvl(ereport_errorq, eqep);
/*
* Allocate another element for scratch space, but fall back
* to the one we have if that fails. We'd like to use the
* additional scratch space for nvlist construction.
*/
if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
nva = errorq_elem_nva(ereport_errorq, scr_eqep);
else
nva = errorq_elem_nva(ereport_errorq, eqep);
} else {
ereport = fm_nvlist_create(NULL);
nva = NULL;
}
if (ereport == NULL)
return;
/*
* Common payload data required by the protocol:
* - ereport class
* - detector
* - ENA
*/
/*
* Ereport class - call into model-specific support to allow it to
* provide a cpu class or leaf class, otherwise calculate our own.
*/
cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
leafclass);
/*
* The detector FMRI.
*/
if ((detector = cms_ereport_detector(hdl, mscookie, nva)) == NULL)
detector = gcpu_fmri_create(hdl, nva);
/*
* Should we define a new ENA format 3?? for chip/core/strand?
* It will be better when virtualized.
*/
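/*
* In the meantime we pack chip, core and strand into the FMT1 cpu id,
* which assumes no more than 8 cores per chip and 8 strands per core.
*/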
fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
fm_ena_generate_cpu(gcl->gcl_timestamp,
cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
if (panicstr) {
fm_nvlist_destroy(detector, FM_NVA_RETAIN);
nv_alloc_reset(nva);
} else {
fm_nvlist_destroy(detector, FM_NVA_FREE);
}
/*
* Add the architectural ereport class-specific payload data.
*/
gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
/*
* Allow model-specific code to add ereport members.
*/
cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
/*
* Include the stack if the option is turned on and it is either selected
* in the payload member bitmask or inclusion is forced.
*/
if (gcpu_mca_stack_flag &&
(cms_ereport_includestack(hdl, mscookie) ==
B_TRUE || gcpu_mca_stack_ereport_include)) {
fm_payload_stack_add(ereport, gcl->gcl_stack,
gcl->gcl_stackdepth);
}
/*
* If injection has taken place anytime in the past then note this
* on the ereport.
*/
if (cmi_inj_tainted() == B_TRUE) {
fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
B_TRUE, NULL);
}
/*
* Post ereport.
*/
if (panicstr) {
errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
if (scr_eqep)
errorq_cancel(ereport_errorq, scr_eqep);
} else {
(void) fm_ereport_post(ereport, EVCH_TRYHARD);
fm_nvlist_destroy(ereport, FM_NVA_FREE);
}
}
/*ARGSUSED*/
void
gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
{
const gcpu_logout_t *gcl = data;
const gcpu_bank_logout_t *gbl;
int i;
for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
const gcpu_error_disp_t *gened;
cms_cookie_t mscookie;
if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
!(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
/*
* Perform a match based on IA32 MCA architectural
* components alone.
*/
gened = gcpu_disp_match(code); /* may be NULL */
/*
* Now see if a model-specific match can be made.
*/
mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i,
gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
gcl->gcl_ms_logout);
/*
* Prepare and dispatch an ereport for logging and
* diagnosis.
*/
gcpu_ereport_post(gcl, i, gened, mscookie,
gbl->gbl_status);
} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
/*
* Telemetry kept changing as we tried to read
* it. Force an unknown ereport leafclass but
* keep the telemetry unchanged for logging.
*/
gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
gbl->gbl_status);
}
}
}
static size_t gcpu_mca_queue_datasz = 0;
/*
* The following code is ready to make a weak attempt at growing the
* errorq structure size. Since it is not foolproof (we don't know
* who may already be producing to the outgoing errorq) our caller
* instead assures that we'll always be called with no greater data
* size than on our first call.
*/
static void
gcpu_errorq_init(size_t datasz)
{
int slots;
mutex_enter(&gcpu_mca_queue_lock);
if (gcpu_mca_queue_datasz >= datasz) {
mutex_exit(&gcpu_mca_queue_lock);
return;
}
if (gcpu_mca_queue) {
gcpu_mca_queue_datasz = 0;
membar_producer();
errorq_destroy(gcpu_mca_queue);
}
slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
NULL, slots, datasz, 1, ERRORQ_VITAL);
if (gcpu_mca_queue != NULL)
gcpu_mca_queue_datasz = datasz;
mutex_exit(&gcpu_mca_queue_lock);
}
/*
* Perform MCA initialization as described in section 14.6 of Intel 64
* and IA-32 Architectures Software Developer's Manual Volume 3A.
*/
static uint_t global_nbanks;
void
gcpu_mca_init(cmi_hdl_t hdl)
{
gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
uint64_t cap;
uint_t vendor = cmi_hdl_vendor(hdl);
uint_t family = cmi_hdl_family(hdl);
gcpu_mca_t *mca = &gcpu->gcpu_mca;
int mcg_ctl_present;
uint_t nbanks;
uint32_t ctl_skip_mask = 0;
uint32_t status_skip_mask = 0;
size_t mslsz;
int i;
#ifndef __xpv
int mcg_ctl2_present;
uint32_t cmci_capable = 0;
#endif
if (gcpu == NULL)
return;
/*
* Protect from some silly /etc/system settings.
*/
if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
gcpu_mca_telemetry_retries = 5;
if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
return;
/*
* CPU startup code only calls cmi_mca_init if x86_feature indicates
* both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier
* processors, which have their own more primitive way of doing
* machine checks, will not have cmi_mca_init called since their
* CPUID information will not indicate both MCA and MCE features.
*/
ASSERT(x86_feature & X86_MCA);
/*
* Determine whether the IA32_MCG_CTL register is present. If it
* is we will enable all features by writing -1 to it towards
* the end of this initialization; if it is absent then volume 3A
* says we must nonetheless continue to initialize the individual
* banks.
*/
mcg_ctl_present = cap & MCG_CAP_CTL_P;
#ifndef __xpv
mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
#endif
/*
* We squirrel values away for inspection/debugging.
*/
mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
if (mcg_ctl_present)
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
&mca->gcpu_mca_bioscfg.bios_mcg_ctl);
/*
* Determine the number of error-reporting banks implemented.
*/
mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
if (nbanks != 0 && global_nbanks == 0)
global_nbanks = nbanks; /* no race - BSP will get here first */
/*
* If someone is hiding the number of banks (perhaps we are fully
* virtualized?) or if this processor has more banks than the
* first to set global_nbanks then bail. The latter requirement
* is because we need to size our errorq data structure and we
* don't want to have to grow the errorq (destroy and recreate)
* which may just lose some telemetry.
*/
if (nbanks == 0 || nbanks > global_nbanks)
return;
mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
/*
* Calculate the size we need to allocate for a gcpu_logout_t
* with a gcl_data array big enough for all banks of this cpu.
* Add any space requested by the model-specific logout support.
*/
mslsz = cms_logout_size(hdl);
mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
(nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
gcpu_logout_t *gcl;
mca->gcpu_mca_logout[i] = gcl =
kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
gcl->gcl_gcpu = gcpu;
gcl->gcl_nbanks = nbanks;
gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
(char *)(&gcl->gcl_data[0]) + nbanks *
sizeof (gcpu_bank_logout_t);
}
#ifdef __xpv
gcpu_xpv_mca_init(nbanks);
#endif
mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
#ifndef __xpv
mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
KM_SLEEP);
#endif
/*
* Create our errorq to transport the logout structures. This
* can fail so users of gcpu_mca_queue must be prepared for NULL.
*/
gcpu_errorq_init(mca->gcpu_mca_lgsz);
/*
* Not knowing which, if any, banks are shared between cores we
* assure serialization of MCA bank initialization by each cpu
* on the chip. On chip architectures in which some banks are
* shared this will mean the shared resource is initialized more
* than once - we're simply aiming to avoid simultaneous MSR writes
* to the shared resource.
*
* Even with these precautions, some platforms may yield a GP fault
* if a core other than a designated master tries to write anything
* but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform
* those writes under on_trap protection.
*/
mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
/*
* Initialize poller data, but don't start polling yet.
*/
gcpu_mca_poll_init(hdl);
/*
* Work out which MCA banks we will initialize. In MCA logout
* code we will only read those banks which we initialize here.
*/
for (i = 0; i < nbanks; i++) {
boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
if (!cms_present(hdl)) {
/*
* Model-specific support is not present, try to use
* sane defaults.
*
* On AMD family 6 processors, reports about spurious
* machine checks indicate that bank 0 should be
* skipped.
*
* On Intel family 6 processors, the documentation tells
* us not to write to MC0_CTL.
*/
if (i == 0 && family == 6) {
switch (vendor) {
case X86_VENDOR_AMD:
skipstatus = B_TRUE;
/*FALLTHRU*/
case X86_VENDOR_Intel:
skipctl = B_TRUE;
break;
}
}
}
ctl_skip_mask |= skipctl << i;
status_skip_mask |= skipstatus << i;
if (skipctl && skipstatus)
continue;
/*
* Record which MCA banks were enabled, from the point of view
* of the whole chip (if some cores share a bank we must be
* sure either can logout from it).
*/
atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
#ifndef __xpv
/*
* check CMCI capability
*/
if (mcg_ctl2_present) {
uint64_t ctl2;
uint32_t cap = 0;
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
if (ctl2 & MSR_MC_CTL2_EN)
continue;
ctl2 |= MSR_MC_CTL2_EN;
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
mca->gcpu_bank_cmci[i].cmci_cap = cap =
(ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
if (cap)
cmci_capable++;
/*
* Set the threshold to 1 and clear the enable bit, to avoid
* CMCI being triggered before the APIC LVT entry is
* initialized.
*/
ctl2 = (ctl2 & ~MSR_MC_CTL2_EN) | 1;
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
/*
* Initialize CMCI-related counts.
*/
mca->gcpu_bank_cmci[i].cmci_enabled = 0;
mca->gcpu_bank_cmci[i].drtcmci = 0;
mca->gcpu_bank_cmci[i].ncmci = 0;
}
#endif
}
#ifndef __xpv
if (cmci_capable)
cmi_enable_cmci = 1;
#endif
#ifndef __xpv
/*
* Log any valid telemetry lurking in the MCA banks, but do not
* clear the status registers. Ignore the disposition returned -
* we have already panicked or reset for any nasty errors found here.
*
* Intel vol 3A says that we should not do this on family 0x6,
* and that for any extended family the BIOS clears things
* on power-on reset so you'll only potentially find valid telemetry
* on warm reset (we do it for both - on power-on reset we should
* just see zeroes).
*
* AMD docs since K7 say we should process anything we find here.
*/
if (!gcpu_suppress_log_on_init &&
((vendor == X86_VENDOR_Intel && family >= 0xf) ||
vendor == X86_VENDOR_AMD))
gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
GCPU_MPT_WHAT_POKE_ERR);
/*
* Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
* model-specific module the power of veto.
*/
for (i = 0; i < nbanks; i++) {
struct gcpu_bios_bankcfg *bcfgp =
mca->gcpu_mca_bioscfg.bios_bankcfg + i;
/*
* Stash inherited bank MCA state, even for banks we will
* not initialize ourselves. Do not read the MISC register
* unconditionally - on some processors that will #GP on
* banks that do not implement the MISC register (would be
* caught by on_trap, anyway).
*/
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
&bcfgp->bios_bank_ctl);
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
&bcfgp->bios_bank_status);
if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV)
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
&bcfgp->bios_bank_addr);
/*
* In some old BIOS the status value after boot can indicate
* MISCV when there is actually no MISC register for
* that bank. The following read could therefore
* provoke a general protection fault. This should be
* caught by on_trap, but the #GP fault handler is busted
* and can suffer a double fault even before we get to
* trap() to check for on_trap protection. Until that
* issue is fixed we remove the one access that we know
* can cause a #GP.
*
* if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
* (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
* &bcfgp->bios_bank_misc);
*/
bcfgp->bios_bank_misc = 0;
if (!(ctl_skip_mask & (1 << i))) {
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
cms_bankctl_val(hdl, i, -1ULL));
}
if (!(status_skip_mask & (1 << i))) {
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
cms_bankstatus_val(hdl, i, 0ULL));
}
}
#endif
/*
* Now let the model-specific support perform further initialization
* of non-architectural features.
*/
cms_mca_init(hdl, nbanks);
#ifndef __xpv
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
membar_producer();
/* enable all machine-check features */
if (mcg_ctl_present)
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
cms_mcgctl_val(hdl, nbanks, -1ULL));
#endif
mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
#ifndef __xpv
/* enable machine-check exception in CR4 */
cmi_hdl_enable_mce(hdl);
#endif
}
static uint64_t
gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
{
int curctxbad = 0, unconstrained = 0, forcefatal = 0;
gcpu_mca_t *mca = &gcpu->gcpu_mca;
int nbanks = mca->gcpu_mca_nbanks;
gcpu_mce_status_t mce;
gcpu_bank_logout_t *gbl;
uint64_t disp = 0;
int i;
if (mcesp == NULL)
mcesp = &mce;
mcesp->mce_nerr = nerr;
mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
/*
* If this is a machine check and the return instruction pointer
* is not valid, the current context is lost.
*/
if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
disp |= CMI_ERRDISP_RIPV_INVALID;
for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
uint64_t mcistatus = gbl->gbl_status;
uint32_t ms_scope;
int pcc, uc;
int poisoned;
if (!(mcistatus & MSR_MC_STATUS_VAL))
continue;
if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
continue;
pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
mcesp->mce_npcc += pcc;
mcesp->mce_nuc += uc;
ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
pcc = 0;
mcesp->mce_npcc_ok++;
gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
}
if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
uc = 0;
mcesp->mce_nuc_ok++;
gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
}
if (uc) {
poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
if (poisoned) {
mcesp->mce_nuc_poisoned++;
gbl->gbl_disp |= CMI_ERRDISP_POISONED;
}
}
if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
/*
* We're not being instructed to ignore the error,
* so apply our standard disposition logic to it.
*/
if (uc && !poisoned) {
unconstrained++;
gbl->gbl_disp |= disp |
CMI_ERRDISP_UC_UNCONSTRAINED;
}
if (pcc && ismc) {
curctxbad++;
gbl->gbl_disp |= disp |
CMI_ERRDISP_CURCTXBAD;
}
/*
* Even if the above may not indicate that the error
* is terminal, model-specific support may insist
* that we treat it as such. Such errors will be
* fatal even if discovered via poll.
*/
if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
forcefatal++;
mcesp->mce_forcefatal++;
gbl->gbl_disp |= disp |
CMI_ERRDISP_FORCEFATAL;
}
} else {
mcesp->mce_ignored++;
gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
}
}
if (unconstrained > 0)
disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
if (curctxbad > 0)
disp |= CMI_ERRDISP_CURCTXBAD;
if (forcefatal > 0)
disp |= CMI_ERRDISP_FORCEFATAL;
if (gcpu_mca_queue != NULL) {
int how;
if (ismc) {
how = cmi_mce_response(rp, disp) ?
ERRORQ_ASYNC : /* no panic, so arrange drain */
ERRORQ_SYNC; /* panic flow will drain */
} else {
how = (disp & CMI_ERRDISP_FORCEFATAL &&
cmi_panic_on_ue()) ?
ERRORQ_SYNC : /* poller will panic */
ERRORQ_ASYNC; /* no panic */
}
errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
} else if (disp != 0) {
gcpu_bleat(hdl, gcl);
}
mcesp->mce_disp = disp;
return (disp);
}
/*
* Gather error telemetry from our source, and then submit it for
* processing.
*/
#define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)
#define STATUS_EQV(s1, s2) \
(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))
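/*
* IS_MCE_CANDIDATE identifies telemetry that could also raise a #MC:
* the error is enabled and is uncorrected or context-corrupting.
* STATUS_EQV treats two status snapshots as equivalent if they differ
* only in the OVerflow bit, which hardware may set between our reads.
*/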
static uint32_t gcpu_deferred_polled_clears;
#ifndef __xpv
static void
gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
uint64_t status, int what)
{
uint64_t ctl2;
if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
(!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
!(status & MSR_MC_STATUS_CEC_MASK)))) {
if (!(bank_cmci_p->cmci_enabled)) {
/*
* When CMCI is disabled and the bank has seen no error, or
* no corrected error, for gcpu_mca_cmci_reenable_threshold
* consecutive polls, turn this bank's CMCI back on.
*/
bank_cmci_p->drtcmci++;
if (bank_cmci_p->drtcmci >=
gcpu_mca_cmci_reenable_threshold) {
/* turn on cmci */
(void) cmi_hdl_rdmsr(hdl,
IA32_MSR_MC_CTL2(bank), &ctl2);
ctl2 |= MSR_MC_CTL2_EN;
(void) cmi_hdl_wrmsr(hdl,
IA32_MSR_MC_CTL2(bank), ctl2);
/* reset counter and set flag */
bank_cmci_p->drtcmci = 0;
bank_cmci_p->cmci_enabled = 1;
}
} else {
/*
* When CMCI is enabled and a cyclic poll finds the bank
* has no error, or no corrected error, reset the ncmci
* counter.
*/
bank_cmci_p->ncmci = 0;
}
}
}
static void
gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
int what)
{
uint64_t ctl2 = 0;
/*
* If more than gcpu_mca_cmci_throttling_threshold CMCIs
* occurred on this bank between two polls, turn off this
* bank's CMCI.
*/
if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
/* this was a CMCI trap, so increase the count */
bank_cmci_p->ncmci++;
if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
/* turn off cmci */
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
&ctl2);
ctl2 &= ~MSR_MC_CTL2_EN;
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
ctl2);
/* clear the flag and count */
bank_cmci_p->cmci_enabled = 0;
bank_cmci_p->ncmci = 0;
}
}
}
#endif
static void
clear_mc(int first, int last, int ismc, boolean_t clrstatus,
cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
{
int i;
gcpu_bank_logout_t *gbl, *pgbl;
uint64_t status;
if (first < 0 || last < 0)
return;
for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
status = gbl->gbl_status;
if (status == 0)
continue;
if (clrstatus == B_FALSE)
goto serialize;
/*
* For i86xpv we always clear status in order to invalidate
* the interposed telemetry.
*
* For native machine checks we always clear status here. For
* native polls we must be a little more cautious since there
* is an outside chance that we may clear telemetry from a
* shared MCA bank on which a sibling core is machine checking.
*
* For polled observations of errors that look like they may
* produce a machine check (UC/PCC and ENabled, although these
* do not guarantee a machine check on error occurrence)
* we will not clear the status at this wakeup unless
* we saw the same status at the previous poll. We will
* always process and log the current observations - it
* is only the clearing of MCi_STATUS which may be
* deferred until the next wakeup.
*/
if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
goto serialize;
}
/*
* We have a polled observation of a machine check
* candidate. If we saw essentially the same status at the
* last poll then clear the status now since this appears
* not to be a #MC candidate after all. If we see quite
* different status now then do not clear, but reconsider at
* the next poll. If no actual machine check clears
* the status in the interim then the status should not
* keep changing forever (meaning we'd never clear it)
* since before long we'll simply have latched the highest-
* priority error and set the OVerflow bit. Nonetheless
* we count how many times we defer clearing and after
* a while insist on clearing the status.
*/
pgbl = &pgcl->gcl_data[i];
if (pgbl->gbl_clrdefcnt != 0) {
/* We deferred clear on this bank at last wakeup */
if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
pgbl->gbl_clrdefcnt > 5) {
/*
* Status is unchanged so clear it now and,
* since we have already logged this info,
* avoid logging it again.
*/
gbl->gbl_status = 0;
(void) cmi_hdl_wrmsr(hdl,
IA32_MSR_MC(i, STATUS), 0ULL);
} else {
/* Record deferral for next wakeup */
gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
}
} else {
/* Record initial deferral for next wakeup */
gbl->gbl_clrdefcnt = 1;
gcpu_deferred_polled_clears++;
}
serialize:
{
#ifdef __xpv
;
#else
/*
* Intel Vol 3A says to execute a serializing
* instruction here, i.e. CPUID. Well WRMSR is also
* defined to be serializing, so the status clear above
* should suffice. To be a good citizen, and since
* some clears are deferred, we'll execute a CPUID
* instruction here.
*/
struct cpuid_regs tmp;
(void) __cpuid_insn(&tmp);
#endif
}
}
}
/*ARGSUSED5*/
void
gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
{
gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
gcpu_mca_t *mca = &gcpu->gcpu_mca;
int nbanks = mca->gcpu_mca_nbanks;
gcpu_bank_logout_t *gbl;
gcpu_logout_t *gcl, *pgcl = NULL;
int ismc = (rp != NULL);
int ispoll = !ismc;
int i, nerr = 0;
cmi_errno_t err;
uint64_t mcg_status;
uint64_t disp;
uint64_t cap;
int first = -1;
int last = -1;
int willpanic = 0;
if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
CMI_SUCCESS) {
if (mcesp != NULL)
mcesp->mce_nerr = mcesp->mce_disp = 0;
return;
}
if (ismc) {
gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
} else {
int pidx = mca->gcpu_mca_nextpoll_idx;
int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
gcl = mca->gcpu_mca_logout[pidx]; /* current logout */
pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */
mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */
}
gcl->gcl_timestamp = gethrtime_waitfree();
gcl->gcl_mcg_status = mcg_status;
gcl->gcl_ip = rp ? rp->r_pc : 0;
gcl->gcl_flags = (rp && !USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
if (cap & MCG_CAP_TES_P)
gcl->gcl_flags |= GCPU_GCL_F_TES_P;
for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
uint64_t status, status2, addr, misc;
int retries = gcpu_mca_telemetry_retries;
gbl->gbl_status = 0;
gbl->gbl_disp = 0;
gbl->gbl_clrdefcnt = 0;
/*
* Only logout from MCA banks we have initialized from at
* least one core. If a core shares an MCA bank with another
* but perhaps lost the race to initialize it, then it must
* still be allowed to logout from the shared bank.
*/
if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
continue;
/*
* On a poll look only at the banks we've been asked to check.
*/
if (rp == NULL && !(bankmask & 1 << i))
continue;
if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
CMI_SUCCESS)
continue;
#ifndef __xpv
gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
#endif
retry:
if (!(status & MSR_MC_STATUS_VAL))
continue;
if (first < 0)
first = i;
last = i;
addr = -1;
misc = 0;
if (status & MSR_MC_STATUS_ADDRV)
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
if (status & MSR_MC_STATUS_MISCV)
(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
#ifndef __xpv
gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
#endif
/*
* Allow the model-specific code to extract bank telemetry.
*/
cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
/*
* Not all cpu models assure us that the status/address/misc
* data will not change during the above sequence of MSR reads,
* or that it can only change by the addition of the OVerflow
* bit to the status register. If the status has changed
* other than in the overflow bit then we attempt to reread
* for a consistent snapshot, but eventually give up and
* go with what we've got. We only perform this check
* for a poll - a further #MC during a #MC will reset, and
* polled errors should not overwrite higher-priority
* trapping errors (but could set the overflow bit).
*/
if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
&status2)) == CMI_SUCCESS) {
if (!STATUS_EQV(status, status2)) {
if (retries-- > 0) {
status = status2;
goto retry;
} else {
gbl->gbl_disp |=
CMI_ERRDISP_INCONSISTENT;
}
}
} else if (ispoll && err != CMI_SUCCESS) {
gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
}
nerr++;
gbl->gbl_status = status;
gbl->gbl_addr = addr;
gbl->gbl_misc = misc;
}
if (gcpu_mca_stack_flag)
gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
else
gcl->gcl_stackdepth = 0;
/*
* Decide our disposition for this error or errors, and submit for
* logging and subsequent diagnosis.
*/
if (nerr != 0) {
disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
if (!willpanic)
clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
} else {
disp = 0;
if (mcesp) {
mcesp->mce_nerr = mcesp->mce_disp = 0;
}
}
/*
* Clear MCG_STATUS if MCIP is set (machine check in progress).
* If a second #MC had occurred before now the system would have
* reset. We can only do this once gcpu_mca_process has copied
* the logout structure.
*/
if (ismc && mcg_status & MCG_STATUS_MCIP)
(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
/*
* At this point we have read and logged all telemetry that is visible
* under the MCA. On architectures for which the NorthBridge is
* on-chip this may include NB-observed errors, but where the NB
* is off chip it may have been the source of the #MC request and
* so we must call into the memory-controller driver to give it
* a chance to log errors.
*/
if (ismc) {
cmi_mc_logout(hdl, 1, willpanic);
}
}
#ifndef __xpv
int gcpu_mca_trap_vomit_summary = 0;
/*
* On a native machine check exception we come here from mcetrap via
* cmi_mca_trap. A machine check on one cpu of a chip does not trap other
* cpus of the chip, so it is possible that another cpu on this chip could
* initiate a poll while we're in the #mc handler; it is also possible that
* this trap has occurred during a poll on this cpu. So we must acquire
* the chip-wide poll lock, but be careful to avoid deadlock.
*
* The 'data' pointer cannot be NULL due to init order.
*/
uint64_t
gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
{
gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
kmutex_t *poll_lock = NULL;
gcpu_mce_status_t mce;
uint64_t mcg_status;
int tooklock = 0;
if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
return (0);
/*
* Synchronize with any poller from another core that may happen
* to share access to one or more of the MCA banks.
*/
if (gcpu->gcpu_shared != NULL)
poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
if (poll_lock != NULL && !mutex_owned(poll_lock)) {
/*
* The lock is not owned by the thread we have
* interrupted. Spin for this adaptive lock.
*/
while (!mutex_tryenter(poll_lock)) {
while (mutex_owner(poll_lock) != NULL)
;
}
tooklock = 1;
}
gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
if (tooklock)
mutex_exit(poll_lock);
/*
* gcpu_mca_trap_vomit_summary may be set for debug assistance.
*/
if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
"%u PCC (%u ok), "
"%u UC (%d ok, %u poisoned), "
"%u forcefatal, %u ignored",
mce.mce_nerr, (u_longlong_t)mce.mce_disp,
mce.mce_npcc, mce.mce_npcc_ok,
mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
mce.mce_forcefatal, mce.mce_ignored);
}
return (mce.mce_disp);
}
#endif
/*ARGSUSED*/
void
gcpu_faulted_enter(cmi_hdl_t hdl)
{
/* Nothing to do here */
}
/*ARGSUSED*/
void
gcpu_faulted_exit(cmi_hdl_t hdl)
{
gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
}
/*
* Write the requested values to the indicated MSRs. Having no knowledge
* of the model-specific requirements for writing to these model-specific
* registers, we will only blindly write to those MSRs if the 'force'
* argument is nonzero. That option should only be used in prototyping
* and debugging.
*/
/*ARGSUSED*/
cmi_errno_t
gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
int force)
{
int i, errs = 0;
for (i = 0; i < nregs; i++) {
uint_t msr = regs[i].cmr_msrnum;
uint64_t val = regs[i].cmr_msrval;
if (cms_present(hdl)) {
if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
errs++;
} else if (force) {
errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
} else {
errs++;
}
}
return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
}