error.c revision 367c34e9209dae40b80ed2062f852875086a137f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/hypervisor_api.h>
#include <sys/prom_plat.h>
#include <sys/archsystm.h>
#define MAX_CE_FLTS 10
#define MAX_ASYNC_FLTS 6
/*
* Used by the memory test driver.
* ce_verbose_memory - covers CEs in DIMMs
* ce_verbose_other - covers "others" (ecache, IO, etc.)
*
* If the value is 0, nothing is logged.
* If the value is 1, the error is logged to the log file, but not console.
* If the value is 2, the error is logged to the log file and console.
*/
/* Logging level for correctable errors in DIMMs: 0, 1 or 2 (default 1). */
int ce_verbose_memory = 1;
/* Logging level for correctable errors elsewhere (ecache, IO, etc.). */
int ce_verbose_other = 1;
/*
* NOTE(review): ce_show_data, ce_debug and ue_debug are not referenced in
* the visible code; presumably debug switches -- confirm against their users.
*/
int ce_show_data = 0;
int ce_debug = 0;
int ue_debug = 0;
/*
* When non-zero, error_init() emits the "System booting after fatal error"
* message.
*/
int reset_debug = 0;
/*
* Tunables for controlling the handling of asynchronous faults (AFTs). Setting
* these to non-default values on a non-DEBUG kernel is NOT supported.
*/
int aft_verbose = 0; /* log AFT messages > 1 to log only */
int aft_panic = 0; /* panic (not reboot) on fatal usermode AFLT */
int aft_testfatal = 0; /* force all AFTs to panic immediately */
/*
* Used for vbsc hostshutdown (power-off button)
*/
int err_shutdown_triggered = 0; /* only once */
/*
* Defined in bus_func.c but initialised in error_init
*/
/*
* Forward declarations for file-local (static) helpers used by the
* error handlers in this file.
*/
/* Queues one error report for further processing. */
static void cpu_queue_one_event(errh_async_flt_t *);
/* Called by the resumable error handler for the "queue full" case. */
static void errh_rq_full(struct async_flt *);
/* NOTE(review): presumably the per-attribute special handling -- confirm. */
static void errh_handle_attr(errh_async_flt_t *);
/* NOTE(review): presumably handles the ASR bit set in ATTR -- confirm. */
static void errh_handle_asr(errh_async_flt_t *);
/*ARGSUSED*/
void
{
while (head_offset != tail_offset) {
/* kernel buffer starts right after the resumable queue */
/* Copy the error report to local buffer */
sizeof (errh_er_t));
/* Increment the queue head */
/* Wrap around */
/* set error handle to zero so it can hold new error report */
case ERRH_DESC_UCOR_RE:
/*
* Check error attribute, handle individual error
* if it is needed.
*/
break;
case ERRH_DESC_WARN_RE:
/*
* Power-off requested, but handle it one time only.
*/
if (!err_shutdown_triggered) {
}
continue;
default:
" invalid in resumable error handler",
continue;
}
>> ERRH_MODE_SHIFT) == ERRH_MODE_PRIV);
/* If it is an error on other cpu */
else
/*
* Handle resumable queue full case.
*/
(void) errh_rq_full(aflt);
}
/*
* Queue the error on the ce or ue queue, depending on flt_panic.
* Even if flt_panic is set, the code keeps processing the
* remaining elements on the rq until the panic starts.
*/
(void) cpu_queue_one_event(&errh_flt);
/*
* Panic here if aflt->flt_panic has been set.
* Enqueued errors will be logged as part of the panic flow.
*/
fm_panic("Unrecoverable error on another CPU");
}
}
}
void
{
int trampolined = 0;
int expected = DDI_FM_ERR_UNEXPECTED;
while (head_offset != tail_offset) {
/* kernel buffer starts right after the nonresumable queue */
/* Copy the error report to local buffer */
sizeof (errh_er_t));
/* Increment the queue head */
/* Wrap around */
/* set error handle to zero so it can hold new error report */
trampolined = 0;
else
>> ERRH_MODE_SHIFT;
(aft_testfatal != 0));
/*
* For the first error packet on the queue, check if it
* has the user spill/fill flag (ERRH_U_SPILL_FILL) set.
*/
if (flags & ERRH_U_SPILL_FILL) {
u_spill_fill = 1;
} else
u_spill_fill = 0;
case ERRH_DESC_PR_NRE:
if (u_spill_fill) {
break;
}
/*
* Fall through; a precise fault also needs to be checked
* to see if it was protected.
*/
/*FALLTHRU*/
case ERRH_DESC_DEF_NRE:
/*
* If the trap occurred in privileged mode at TL=0,
* we need to check to see if we were executing
* in kernel under on_trap() or t_lofault
* protection. If so, and if it was a PIO or MEM
* error, then modify the saved registers so that
* we return from the trap to the appropriate
* trampoline routine.
*/
}
} else if (!trampolined &&
}
/*
* Check error attribute, handle individual error
* if it is needed.
*/
/*
* If PIO error, we need to query the bus nexus
* for fatal errors.
*/
expected);
}
break;
case ERRH_DESC_USER_DCORE:
/*
* User generated panic. Call panic directly
* since there are no FMA e-reports to
* display.
*/
panic("Panic - Generated at user request");
break;
default:
" invalid in non-resumable error handler",
break;
}
/*
* Queue the error report for further processing. If
* flt_panic is set, the code still processes the other
* errors in the queue until the panic routine stops the
* kernel.
*/
(void) cpu_queue_one_event(&errh_flt);
/*
* Panic here if aflt->flt_panic has been set.
* Enqueued errors will be logged as part of the panic flow.
*/
fm_panic("Unrecoverable hardware error");
}
/*
* Call page_retire() to handle memory errors.
*/
/*
* If we queued an error and it was in user mode, or
* protected by t_lofault, or user_spill_fill is set, we
* set AST flag so the queue will be drained before
* returning to user mode.
*/
u_spill_fill) {
int pcb_flag = 0;
pcb_flag |= ASYNC_HWERR;
pcb_flag |= ASYNC_BERR;
}
}
}
/*
* For PIO errors, this routine calls nexus driver's error
* callback routines. If the callback routine returns fatal, and
* we are in kernel or unknown mode without any error protection,
* we need to turn on the panic flag.
*/
void
{
int status;
/*
* If error is protected, it will jump to proper routine
* to handle the error; if it is in user level, we just
* kill the user process; if the driver thinks the error is
* not fatal, we can drive on. If none of above are true,
* we panic
*/
(status == DDI_FM_FATAL))
}
/*
* This routine checks to see if we are under any error protection when
* the error happens. If we are under error protection, we unwind to
* the protection and indicate fault.
*/
static int
{
int trampolined = 0;
trampolined = 1;
}
trampolined = 1;
/*
* for peek and caut_gets
* errors are expected
*/
if (!hp)
}
trampolined = 1;
}
return (trampolined);
}
/*
* Queue one event.
*/
static void
{
else
}
/*
* Handle logging for CPU events that are dequeued. As such, it can be invoked
* from softint context, from AST processing in the trap() flow, or from the
* panic flow. We decode the CPU-specific data, and log appropriate messages.
*/
void
cpu_async_log_err(void *flt)
{
/*
* flt is an untyped pointer to the dequeued fault record; the handling
* below is selected by the error descriptor (ERRH_DESC_*).
*/
/* Descriptor also accepted by the resumable error handler. */
case ERRH_DESC_UCOR_RE:
/*
* Turn on the PR_UE flag. The page will be
* scrubbed when it is freed.
*/
}
break;
/* Both non-resumable descriptors share the same page handling. */
case ERRH_DESC_PR_NRE:
case ERRH_DESC_DEF_NRE:
/*
* For non-resumable memory error, retire
* the page here.
*/
/*
* If we are going to panic, scrub the page first
*/
}
break;
/* Any other descriptor: nothing further to do here. */
default:
break;
}
}
/*
* Called from ce_drain().
*/
void
{
case CPU_FAULT:
break;
case BUS_FAULT:
break;
default:
break;
}
}
/*
* Called from ue_drain().
*/
void
{
case CPU_FAULT:
break;
case BUS_FAULT:
break;
default:
break;
}
}
/*
* Turn on flag on the error memory region.
*/
static void
{
return;
for (current_addr = flt_real_addr_start;
}
}
void
{
scrubbed_len = 0;
while (length > 0) {
break;
pa += scrubbed_len;
length -= scrubbed_len;
}
}
/*
* Call hypervisor to flush the memory region.
* Both va and len must be MMU_PAGESIZE aligned.
* Returns the total number of bytes flushed.
*/
{
uint64_t total_flushed = 0;
if (orig_len == 0)
return (total_flushed);
/* align va */
/* round up len to MMU_PAGESIZE aligned */
while (len > 0) {
return (total_flushed);
flushed = 0;
while (length > 0) {
return (total_flushed);
total_flushed += flushed;
}
}
return (total_flushed);
}
/*
* If resumable queue is full, we need to check if any cpu is in
* error state. If not, we drive on. If yes, we need to panic. The
* hypervisor call hv_cpu_state() is used for checking the
* cpu state. Also reset %tick_compr in case tick-compare was lost.
*/
static void
{
break;
}
}
}
/*
* Return processor specific async error structure
* size used.
*/
int
cpu_aflt_size(void)
{
	/* Byte size of the processor-specific async fault structure. */
	int aflt_bytes = sizeof (errh_async_flt_t);

	return (aflt_bytes);
}
#define SZ_TO_ETRS_SHIFT 6
/*
* Message printed when the resumable queue overflows
*/
/*ARGSUSED*/
void
{
}
/*
* Handler to process a fatal error. This routine can be called from a
* softint, called from trap()'s AST handling, or called from the panic flow.
*/
/*ARGSUSED*/
static void
{
}
/*
* Handler to process a correctable error. This routine can be called from a
* softint. We just call the CPU module's logging routine.
*/
/*ARGSUSED*/
static void
{
}
/*
* Handler to process vbsc hostshutdown (power-off button).
*/
static int
{
do_shutdown();
/*
* just in case do_shutdown() fails
*/
return (DDI_INTR_CLAIMED);
}
/*
* Allocate error queue sizes based on max_ncpus. max_ncpus is set just
* after ncpunode has been determined. ncpus is set in start_other_cpus
* which is called after error_init() but may change dynamically.
*/
void
error_init(void)
{
/* Holds the name string reported in the reset_debug message below. */
char tmp_name[MAXSYSNAME];
/*
* Initialize the correctable and uncorrectable error queues.
*/
/* The error queues are required; panic if one could not be created. */
panic("failed to create required system error queue");
/*
* Set up interrupt handler for power-off button.
*/
/*
* Initialize the busfunc list mutex. This must be a PIL_15 spin lock
* because we will need to acquire it from cpu_async_error().
*/
node = prom_rootnode();
return;
}
/* Only accept a value that fits in tmp_name (MAXSYSNAME bytes). */
(size <= MAXSYSNAME) &&
/* Diagnostic message, emitted only when the reset_debug tunable is set. */
if (reset_debug) {
"System booting after fatal error %s\n", tmp_name);
}
}
}
/*
* Nonresumable queue is full, panic here
*/
/*ARGSUSED*/
void
{
fm_panic("Nonresumable queue full");
}
/*
* This is the place for special error handling for individual errors.
*/
static void
{
case ERRH_ATTR_CPU:
case ERRH_ATTR_MEM:
case ERRH_ATTR_PIO:
case ERRH_ATTR_IRF:
case ERRH_ATTR_FRF:
case ERRH_ATTR_SHUT:
break;
case ERRH_ATTR_ASR:
break;
case ERRH_ATTR_ASI:
case ERRH_ATTR_PREG:
case ERRH_ATTR_RQF:
break;
default:
break;
}
}
/*
* Handle ASR bit set in ATTR
*/
static void
{
case ASR_REG_VALID | ASR_REG_TICK:
/*
* For Tick Compare Register error, it only happens when
* the register is being read or compared with the %tick
* register. Since we lost the contents of the register,
* we set the %tick_compr in the future. An interrupt will
* happen when %tick matches the value field of %tick_compr.
*/
/* Do not panic */
break;
default:
break;
}
}