ao_poll.c revision 7aec1d6e253b21f9e9b7ef68b4d81ab9859b51fe
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin/*
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * CDDL HEADER START
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * The contents of this file are subject to the terms of the
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Common Development and Distribution License, Version 1.0 only
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * (the "License"). You may not use this file except in compliance
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * with the License.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * or http://www.opensolaris.org/os/licensing.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * See the License for the specific language governing permissions
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * and limitations under the License.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * When distributing Covered Code, include this CDDL HEADER in each
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * If applicable, add the following below this CDDL HEADER, with the
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * fields enclosed by brackets "[]" replaced with your own identifying
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * information: Portions Copyright [yyyy] [name of copyright owner]
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * CDDL HEADER END
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin/*
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * Use is subject to license terms.
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#pragma ident "%Z%%M% %I% %E% SMI"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
/*
 * AMD Athlon64/Opteron CPU Module Machine-Check Poller
 *
 * The AMD Opteron processor doesn't yet report correctable errors via #mc's.
 * Instead, it fixes the problem, silently updates the error state MSRs, and
 * resumes operation.  In order to discover occurrences of correctable errors,
 * we have to poll in the background using the omni cyclics mechanism.  The
 * error injector also has the ability to manually request an immediate poll.
 * Locking is fairly simple within the poller: the per-CPU mutex
 * ao->ao_mca.ao_mca_poll_lock ensures that only one poll request is active.
 */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#include <sys/types.h>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#include <sys/sysmacros.h>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#include <sys/x86_archext.h>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#include <sys/ddi.h>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#include <sys/sunddi.h>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#include <sys/ksynch.h>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#include <sys/sdt.h>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner#include "ao.h"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinstatic uint_t ao_mca_poll_trace_nent = 100;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#ifdef DEBUG
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulknerstatic uint_t ao_mca_poll_trace_always = 1;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin#else
34f9b3eef6fdadbda0a846aa4d68691ac40eace5Roland Mainzstatic uint_t ao_mca_poll_trace_always = 0;
34f9b3eef6fdadbda0a846aa4d68691ac40eace5Roland Mainz#endif
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinstatic cyclic_id_t ao_mca_poll_cycid;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinstatic hrtime_t ao_mca_poll_interval = NANOSEC * 10ULL;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
/*
 * Append one record to this CPU's poll trace ring.
 *
 *	mca	per-CPU MCA state that owns the ring; the caller must hold
 *		mca->ao_mca_poll_lock
 *	what	AO_MPT_WHAT_* code describing the event being recorded
 *	nerr	error count associated with the event (callers pass 0 when
 *		there is none); stored clamped to UINT8_MAX
 *
 * The DTrace probe fires on every call, whether or not a ring has been
 * allocated.  If no ring exists the function returns without recording.
 */
static void
ao_mca_poll_trace(ao_mca_t *mca, uint32_t what, uint32_t nerr)
{
	uint_t next;
	ao_mca_poll_trace_t *pt;

	ASSERT(MUTEX_HELD(&mca->ao_mca_poll_lock));
	DTRACE_PROBE2(ao__poll__trace, uint32_t, what, uint32_t, nerr);

	if (mca->ao_mca_poll_trace == NULL)
		return; /* poll trace buffer is disabled */

	next = (mca->ao_mca_poll_curtrace + 1) % ao_mca_poll_trace_nent;
	pt = &mca->ao_mca_poll_trace[next];

	/*
	 * The timestamp is zeroed first and filled in last.
	 * NOTE(review): presumably this lets an observer of the ring
	 * recognise a slot that is in the middle of being rewritten -- confirm.
	 */
	pt->mpt_when = 0;
	pt->mpt_what = what;

	/*
	 * Slots are recycled, so the count must be rewritten for every kind
	 * of record.  Storing it only for AO_MPT_WHAT_CYC_ERR left poke and
	 * unfaulting records carrying the count from whatever event used the
	 * slot on a previous lap, and discarded the count reported for a poke.
	 */
	pt->mpt_nerr = MIN(nerr, UINT8_MAX);

	pt->mpt_when = gethrtime();
	mca->ao_mca_poll_curtrace = next;
}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
/*
 * Perform one poll pass on behalf of either the cyclic or a manual poke.
 *
 *	mca	per-CPU MCA state; callers in this file hold ao_mca_poll_lock
 *	what	AO_MPT_WHAT_* code identifying who asked for the poll
 *
 * Panics via fm_panic() if ao_mca_logout() reports a fatal condition and
 * cmi_panic_on_uncorrectable_error is set.
 */
static void
ao_mca_poll_common(ao_mca_t *mca, int what)
{
	ao_cpu_logout_t *logout = &mca->ao_mca_logout[AO_MCA_LOGOUT_POLLER];
	int bank, nerr, is_fatal;

	if ((mca->ao_mca_flags & AO_MCA_F_UNFAULTING) != 0) {
		mca->ao_mca_flags &= ~AO_MCA_F_UNFAULTING;
		ao_mca_poll_trace(mca, AO_MPT_WHAT_UNFAULTING, 0);

		/*
		 * The first cyclic poll after a faulty CPU is re-enabled only
		 * wipes the bank status registers and does no logout; the
		 * rationale is described with ao_faulted_exit().  A poke in
		 * the same situation falls through to a normal logout.
		 */
		if (what == AO_MPT_WHAT_CYC_ERR) {
			bank = 0;
			while (bank < AMD_MCA_BANK_COUNT) {
				wrmsr(ao_bank_regs[bank].abr_status, 0);
				bank++;
			}
			return;
		}
	}

	is_fatal = ao_mca_logout(logout, NULL, &nerr);
	ao_mca_poll_trace(mca, what, nerr);

	if (!is_fatal || !cmi_panic_on_uncorrectable_error)
		return;

	fm_panic("Unrecoverable Machine-Check Exception");
}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulknerstatic void
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinao_mca_poll_cyclic(void *arg)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin{
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ao_data_t *ao = arg;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (ao != NULL && mutex_tryenter(&ao->ao_mca.ao_mca_poll_lock)) {
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_CYC_ERR);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin }
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinvoid
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinao_mca_poke(void *arg)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin{
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ao_data_t *ao = arg;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_enter(&ao->ao_mca.ao_mca_poll_lock);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_POKE_ERR);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
/*
 * Omni-cyclic online callback: fill in the handler and timing for the cyclic
 * being created on `cpu'.
 *
 *	arg	unused
 *	cpu	the CPU the cyclic is being set up for
 *	cyh	handler description to fill in
 *	cyt	firing time and interval to fill in
 */
/*ARGSUSED*/
static void
ao_mca_poll_online(void *arg, cpu_t *cpu, cyc_handler_t *cyh, cyc_time_t *cyt)
{
	/* True when the CPU is bound to a CPU module other than this one. */
	int foreign = (cpu->cpu_m.mcpu_cmi != NULL &&
	    cpu->cpu_m.mcpu_cmi->cmi_ops != &_cmi_ops);

	/* Settings that do not depend on which module owns the CPU. */
	cyh->cyh_func = ao_mca_poll_cyclic;
	cyh->cyh_level = CY_LOW_LEVEL;
	cyt->cyt_when = 0;

	if (foreign) {
		/*
		 * Not our CPU: effectively disable the cyclic by giving it
		 * the largest possible interval and a NULL argument, which
		 * ao_mca_poll_cyclic() treats as "do nothing".
		 */
		cyt->cyt_interval = INT64_MAX;
		cyh->cyh_arg = NULL;
	} else {
		cyt->cyt_interval = ao_mca_poll_interval;
		cyh->cyh_arg = cpu->cpu_m.mcpu_cmidata;
	}
}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
/*
 * Omni-cyclic offline callback.  All three arguments are ignored and no
 * work is done here.
 */
/*ARGSUSED*/
static void
ao_mca_poll_offline(void *arg, cpu_t *cpu, void *cyh_arg)
{
	/* intentionally empty */
}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
/*
 * Set up the poller state embedded in `mca': initialise the poll lock and,
 * if ao_mca_poll_trace_always is set, allocate a zero-filled trace ring of
 * ao_mca_poll_trace_nent entries.
 *
 * When tracing is off, the trace pointer is left exactly as the caller
 * supplied it.
 */
void
ao_mca_poll_init(ao_mca_t *mca)
{
	size_t ringsz;

	mutex_init(&mca->ao_mca_poll_lock, NULL, MUTEX_DRIVER, NULL);

	if (!ao_mca_poll_trace_always)
		return;

	ringsz = sizeof (ao_mca_poll_trace_t) * ao_mca_poll_trace_nent;
	mca->ao_mca_poll_trace = kmem_zalloc(ringsz, KM_SLEEP);
	mca->ao_mca_poll_curtrace = 0;
}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinvoid
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinao_mca_poll_start(void)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin{
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin cyc_omni_handler_t cyo;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin if (ao_mca_poll_interval == 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin return; /* if manually tuned to zero, disable polling */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin cyo.cyo_online = ao_mca_poll_online;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin cyo.cyo_offline = ao_mca_poll_offline;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin cyo.cyo_arg = NULL;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_enter(&cpu_lock);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin ao_mca_poll_cycid = cyclic_add_omni(&cyo);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin mutex_exit(&cpu_lock);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin}
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin