/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* Panic software-diagnosis subsidiary
*
* We model a system panic as a defect diagnosis in FMA. When a system
* panics, savecore publishes events which we subscribe to here.
*
* Our driving events are all raised by savecore, run either from
* startup of the dumpadm service or interactively at the command line.
* The following describes the logic for the handling of these events.
*
* On reboot after panic we will run savecore as part of the dumpadm
* service startup; we run savecore even if savecore is otherwise
* disabled (ie dumpadm -n in effect) - we run savecore -c to check for
* a valid dump and raise the initial event.
*
* If savecore (or savecore -c) observes a valid dump pending on the
* device, it raises a "dump_pending_on_device" event provided this
* was not an FMA-initiated panic (for those we will replay ereports
* from the dump device as usual and make a diagnosis from those; we do
* not need to open a case for the panic). We subscribe to the
* "dump_pending_on_device" event and use that to open a case; we
* open a case requesting the same case uuid as the panic dump image
* has for the OS instance uuid - if that fails because of a duplicate
* uuid then we have already opened a case for this panic so no need
* to open another.
*
* Included in the "dump_pending_on_device" event is an indication of
* whether or not dumpadm is enabled. If not (dumpadm -n in effect)
* then we do not expect any further events regarding this panic
* until such time as the admin runs savecore manually (if ever).
* So in this case we solve the case immediately after open. If
* subsequent events arrive when savecore is run manually, we will toss
* them.
*
* If dumpadm is enabled then savecore, run from dumpadm service startup,
* will attempt to process the dump - either to copy it off the dump
* device (if saving compressed) or to uncompress it off the dump device.
* If this succeeds savecore raises a "dump_available" event which
* includes information on the directory it was saved in, the instance
* number, image uuid, compressed form or not, and whether the dump
* was complete (as per the dumphdr). If the savecore fails for
* some reason then it exits and raises a "savecore_failed" event.
* These two events are raised even for FMA-initiated panics.
*
* We subscribe to both the "dump_available" and "savecore_failed" events,
* and in the handling thereof we will close the case opened earlier (if
* this is not an FMA-initiated panic). On receipt of the initial
* "dump_pending_on_device" event we also arm a timer for +10 minutes if
* dumpadm is enabled - if no "dump_available" or "savecore_failed" arrives
* in that time we will solve the case on timeout.
*
* When the timer fires we check whether the initial event for each panic
* case was received more than 30 minutes ago; if it was we solve the case
* with what we have. If we're still within the waiting period we rearm
* for a further 10 minutes. The timer is shared by all cases that we
* create, which is why the fire interval is shorter than the maximum time
* we are prepared to wait.
*/
#include <strings.h>
#include <alloca.h>
#include <zone.h>
#include "panic.h"
/*
* Our serialization structure type.
*/
typedef struct swde_panic_casedata {
/* packed attr nvlist follows */
/*
* Module statistics. Each initializer below gives a statistic's name, its
* type (FMD_TYPE_UINT64 for all of them) and a short description string.
* NOTE(review): no members are declared between the braces of the struct
* here; the init code treats the object as an array of fmd_stat_t, so
* presumably one fmd_stat_t member per entry belongs there - confirm.
* NOTE(review): the descriptions for swde_panic_incomplete and
* swde_panic_failed read oddly ("missing panic ...") - confirm wording.
*/
static struct {
} swde_panic_stats = {
{ "swde_panic_diagnosed", FMD_TYPE_UINT64,
"panic defects published" },
{ "swde_panic_badclass", FMD_TYPE_UINT64,
"incorrect event class received" },
{ "swde_panic_noattr", FMD_TYPE_UINT64,
"malformed event - missing attr nvlist" },
{ "swde_panic_unexpected_fm_panic", FMD_TYPE_UINT64,
"dump available for an fm_panic()" },
{ "swde_panic_badattr", FMD_TYPE_UINT64,
"malformed event - invalid attr list" },
{ "swde_panic_badfmri", FMD_TYPE_UINT64,
"malformed event - fmri2str fails" },
{ "swde_panic_noinstance", FMD_TYPE_UINT64,
"malformed event - no instance number" },
{ "swde_panic_nouuid", FMD_TYPE_UINT64,
"malformed event - missing uuid" },
{ "swde_panic_dupuuid", FMD_TYPE_UINT64,
"duplicate events received" },
{ "swde_panic_nocase", FMD_TYPE_UINT64,
"case missing for uuid" },
{ "swde_panic_notime", FMD_TYPE_UINT64,
"missing crash dump time" },
{ "swde_panic_nopanicstr", FMD_TYPE_UINT64,
"missing panic string" },
{ "swde_panic_nodumpdir", FMD_TYPE_UINT64,
"missing crashdump save directory" },
{ "swde_panic_nostack", FMD_TYPE_UINT64,
"missing panic stack" },
{ "swde_panic_incomplete", FMD_TYPE_UINT64,
"missing panic incomplete" },
{ "swde_panic_failed", FMD_TYPE_UINT64,
"missing panic failed" },
{ "swde_panic_badcasedata", FMD_TYPE_UINT64,
"bad case data during timeout" },
{ "swde_panic_failsrlz", FMD_TYPE_UINT64,
"failures to serialize case data" },
};
/*
* Returns fmri when err is zero, and 0 (a null nvlist_t pointer) otherwise.
* NOTE(review): the function name, its parameters and the statements that
* build fmri and update err are not present here, so as written err is
* always 0. A comment later in this file refers to an sw-scheme FMRI
* created in panic_sw_fmri - presumably this function; confirm.
*/
static nvlist_t *
{
int err = 0;
/* Hand back the FMRI only if no step recorded an error. */
if (!err)
return (fmri);
else
return (0);
}
/*
* NOTE(review): the signature and most statements of this function are
* not present here. From what remains: it lists the attribute members to
* copy into the event-specific defect payload (toadd), follows one path
* when savecore_success is set (apparently counting dump files in "files")
* and another otherwise (a reason string, rsn), and, per the closing
* comment, ends by closing the case.
*/
static void
{
int i;
/*
* Attribute members to include in event-specific defect
* payload. Some attributes will not be present for some
* cases - e.g., if we timed out and solved the case without
* a "dump_available" report.
*/
const char *toadd[] = {
"os-instance-uuid", /* same as case uuid */
"panicstr", /* for initial classification work */
"panicstack", /* for initial classification work */
"crashtime", /* in epoch time */
"panic-time", /* Formatted crash time */
};
/*
* As a temporary solution we create an fmri in the sw scheme
* in panic_sw_fmri. This should become a generic fmri constructor.
*
* We need to use a resource FMRI which will have a sufficiently
* unique string representation such that fmd will not see
* repeated panic diagnoses (all using the same defect class)
* as duplicates and discard later cases. We can't actually diagnose
* the panic to anything specific (e.g., a path to a module), and
* picking some generic representative instead could lead
* to misunderstanding. So we choose a path based on <dumpdir>
* and the OS instance UUID - "<dumpdir>/.<os-instance-uuid>".
* There's no file at that path (*) but no matter. We can't use
* <dumpdir>/vmdump.N or similar because if savecore is disabled
* or failed we don't have any file or instance number.
*
* (*) Some day it would seem tidier to keep all files to do
* with a single crash in a distinct directory, and <dumpdir>/.<uuid>
* seems like a good choice. For compatibility we'd symlink into it.
* So that is another reason for this choice - some day it may exist!
*/
/* LINTED: E_SEC_SPRINTF_UNBOUNDED_COPY */
if (savecore_success) {
const char **pathfmts;
int files = 0;
/* NOTE(review): this i shadows the function-scope i declared above. */
int i;
&compressed);
/* Two iterations; "files" is incremented when an iteration is not skipped. */
for (i = 0; i < 2; i++) {
continue;
}
files++;
}
files);
} else {
char *rsn;
}
/*
* Not all attributes will necessarily be available - eg if
* dumpadm was not enabled there'll be no instance and dumpdir.
*/
}
/*
* Close the case. Do not free casedata - framework does that for us
* on closure callback.
*/
}
/*
* Timeout handling. "remain" is incremented on one of the else paths below
* and is tested once at the end.
* NOTE(review): the signature and most of the body are not present here.
* Per the comment at the top of this file, the shared timer is rearmed
* while cases are still within their waiting period; presumably "remain"
* counts those cases and the final test does the rearming - confirm.
*/
/*ARGSUSED*/
static void
{
int remain = 0;
/* Serialized case data is checked against the expected version. */
if (vers != SWDE_PANIC_CASEDATA_VERS)
} else {
}
} else {
remain++;
}
}
if (remain) {
}
}
/*
* Our verify entry point is called for each of our open cases during
* module load. We must return 0 for the case to be closed by our caller,
* or 1 to keep it (or if we have already closed it during this call).
*/
/*
* Returns 0 where the case is to be closed by the caller and 1 where it is
* retained or has already been closed, as annotated on each return below.
* NOTE(review): the signature and several conditions are not present here.
*/
static int
{
/* Case data of any other serialization version: let the caller close it. */
if (vers != SWDE_PANIC_CASEDATA_VERS)
return (0); /* case will be closed */
return (1); /* case already closed */
} else {
return (0); /* close case */
}
}
/* NOTE(review): the statement guarded by this test is missing here. */
if (mytimerid != 0)
return (1); /* retain case */
}
/*
* Handler for ireport.os.sunos.panic.dump_pending_on_device.
*
* A future RFE should try adding a means of avoiding diagnosing repeated
* defects on panic loops, which would just add to the mayhem and potentially
* log lots of calls through ASR. Panics with similar enough panic
* strings and/or stacks should not diagnose to new defects within some
* period of time, for example.
*/
/*
* NOTE(review): the signature and most statements of this handler are not
* present here; each bare "return" below closes a check whose condition is
* missing. The surviving comments give the sequence: prepare serialization
* data, open a case keyed by the panicking kernel's UUID, then either solve
* at once (the "solve now" message) or rely on the timer.
*/
/*ARGSUSED*/
void
{
char *fmribuf;
char *uuid;
return;
}
return;
}
return;
}
/*
* Prepare serialization data to be associated with a new
* case. Our serialization data consists of a swde_panic_casedata_t
* structure followed by a packed nvlist of the attributes of
* the initial event.
*/
return;
}
/*
* Open a case with UUID matching the panicking kernel, add this
* event to the case.
*/
return;
}
"solve now\n");
return;
}
/*
* We expect to see either a "dump_available" or a "savecore_failed"
* event before too long. In case that never shows up, for whatever
* reason, we want to be able to solve the case anyway.
*/
/* Presumably mytimerid == 0 means no timer is currently armed - confirm. */
if (mytimerid == 0) {
} else {
}
}
/*
* savecore has now run and saved a crash dump to the filesystem. It is
* either a compressed dump (vmdump.n) or uncompressed {unix.n, vmcore.n}.
* Savecore has raised an ireport to say the dump is there.
*/
/*
* NOTE(review): the signature and most statements of this handler are not
* present here; each bare "return" below closes a check whose condition is
* missing. What remains shows a "success"/"fail" string being selected, a
* lookup of the case for the crashed image's uuid, and an early return with
* a message naming the uuid when no case is found.
*/
/*ARGSUSED*/
void
{
char *uuid;
"success" : "fail");
return;
}
return; /* not expected, but just in case */
}
return;
}
/*
* Find the case related to the panicking kernel; our cases have
* the same uuid as the crashed OS image.
*/
if (!cp) {
/* Unable to find the case. */
"image %s\n", uuid);
return;
}
}
/*
* Something has to subscribe to every fault
* or defect diagnosed in fmd. We do that here, but throw it away.
*/
};
/*
* Initialization entry point. Outside the global zone it declines with
* SW_SUB_INIT_FAIL_VOLUNTARY. Otherwise it passes swde_panic_stats, viewed
* as an array of fmd_stat_t, to a call whose name is not present here,
* points *dpp at the first entry of swde_panic_disp, and returns
* SW_SUB_INIT_SUCCESS.
* NOTE(review): the function name and leading parameters are not present
* here, and nothing visible stores through nelemp - confirm.
*/
/*ARGSUSED*/
int
int *nelemp)
{
/* Only operate in the global zone. */
if (getzoneid() != GLOBAL_ZONEID)
return (SW_SUB_INIT_FAIL_VOLUNTARY);
/* Element count is the object's size divided by sizeof (fmd_stat_t). */
sizeof (swde_panic_stats) / sizeof (fmd_stat_t),
(fmd_stat_t *)&swde_panic_stats);
*dpp = &swde_panic_disp[0];
return (SW_SUB_INIT_SUCCESS);
}
/*
* Teardown entry point; it acts only when mytimerid is non-zero.
* NOTE(review): the function name, parameters and the guarded statement
* are not present here; presumably the outstanding timer is removed -
* confirm.
*/
void
{
if (mytimerid)
}
"panic diagnosis", /* swsub_name */
SW_CASE_PANIC, /* swsub_casetype */
swde_panic_init, /* swsub_init */
swde_panic_fini, /* swsub_fini */
swde_panic_timeout, /* swsub_timeout */
NULL, /* swsub_case_close */
swde_panic_vrfy, /* swsub_case_vrfy */
};