fme.c revision 3e8d8e182b274ca2da0da571da1f62acac10fe6f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* fme.c -- fault management exercise module
*
* this module provides the simulated fault management exercise.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <alloca.h>
#include <libnvpair.h>
#include "alloc.h"
#include "out.h"
#include "stats.h"
#include "stable.h"
#include "literals.h"
#include "lut.h"
#include "tree.h"
#include "ptree.h"
#include "itree.h"
#include "ipath.h"
#include "fme.h"
#include "evnv.h"
#include "eval.h"
#include "config.h"
#include "platform.h"
/* imported from eft.c... */
extern int Autoconvict;
extern char *Autoclose;
extern nv_alloc_t Eft_nv_hdl;
/* fme under construction is global so we can free it on module abort */
static const char *Undiag_reason;
static int Nextid = 0;
/* list of fault management exercises underway */
static struct fme {
unsigned long long ull; /* time when fme was created */
int id; /* FME id */
/*
* The initial error report that created this FME is kept in
* two forms. e0 points to the instance tree node and is used
* by fme_eval() as the starting point for the inference
* algorithm. e0r is the event handle FMD passed to us when
* the ereport first arrived and is used when setting timers,
* which are always relative to the time of this initial
* report.
*/
int nsuspects; /* count of suspects */
int nonfault; /* zero if all suspects T_FAULT */
int posted_suspects; /* true if we've posted a diagnosis */
int hesitated; /* true if we hesitated */
int uniqobs; /* number of unique events observed */
int peek; /* just peeking, don't track suspects */
enum fme_state {
FME_WAIT, /* need to wait for more info */
FME_CREDIBLE, /* suspect list is credible */
FME_DISPROVED /* no valid suspects found */
} state;
unsigned long long pull; /* time passed since created */
unsigned long long wull; /* wait until this time for re-eval */
/* fmd interfacing */
/* stats */
struct stats *Hcallcount;
struct stats *Rcallcount;
struct stats *Ccallcount;
struct stats *Ecallcount;
struct stats *Tcallcount;
struct stats *Marrowcount;
static struct case_list {
unsigned long long at_latest_by, unsigned long long *pdelay,
static void destroy_fme(struct fme *f);
static struct fme *
alloc_fme(void)
{
return (fmep);
}
/*
* fme_ready -- called when all initialization of the FME (except for
* stats) has completed successfully. Adds the fme to global lists
* and establishes its stats.
*/
static struct fme *
{
char nbuf[100];
if (EFMElist) {
} else
"calls to requirements_test()", 1);
fmep->Ecallcount =
"arrows marked by mark_arrows()", 1);
return (fmep);
}
static struct fme *
{
return (NULL);
}
Nfmep->posted_suspects = 0;
return (NULL);
}
return (NULL);
}
}
void
fme_fini(void)
{
}
/* clean up closed fmes */
fp = ClosedFMEs;
}
ClosedFMEs = NULL;
}
/* if we were in the middle of creating an fme, free it now */
if (Nfmep) {
}
}
/*
* Allocated space for a buffer name. 20 bytes allows for
* a ridiculous 9,999,999 unique observations.
*/
#define OBBUFNMSZ 20
/*
* serialize_observation
*
* Create a recoverable version of the current observation
* (f->ecurrent). We keep a serialized version of each unique
* observation so that we can resume the fme in the correct
* state if eft or fmd crashes and we're restarted.
*/
static void
{
char *estr;
}
}
/*
* init_fme_bufs -- We keep several bits of state about an fme for
* use if eft or fmd crashes and we're restarted.
*/
static void
{
sizeof (cfglen));
if (cfglen != 0) {
}
sizeof (fp->posted_suspects));
}
static void
{
int o;
}
}
/*
* reconstitute_observations -- convert a case's serialized observations
* back into struct events. Returns zero if all observations are
* successfully reconstituted.
*/
static int
{
char *sepptr;
char *estr;
int ocnt;
int elen;
if (elen == 0) {
"reconstitute_observation: no %s buffer found.",
tmpbuf);
break;
}
"reconstitute_observation: %s: "
"missing @ separator in %s.",
break;
}
*sepptr = '\0';
"reconstitute_observation: %s: "
"trouble converting path string \"%s\" "
"to internal representation.",
break;
}
/* construct the event */
"reconstitute_observation: %s: "
"lookup of \"%s\" in itree failed.",
break;
}
/*
* We may or may not have a saved nvlist for the observation
*/
if (pkdlen != 0) {
if (nvlist_xunpack(pkd,
}
if (ocnt == 0)
/* link it into list of observations seen */
}
return (0);
}
return (1);
}
/*
* restart_fme -- called during eft initialization. Reconstitutes
* an in-progress fme.
*/
void
{
goto badcase;
}
sizeof (size_t));
goto badcase;
}
if (rawsz > 0) {
goto badcase;
}
} else {
}
/* case not properly saved or irretrievable */
goto badcase;
}
goto badcase;
} else {
}
goto badcase;
} else {
(void *)&fmep->posted_suspects,
sizeof (fmep->posted_suspects));
}
goto badcase;
} else {
}
goto badcase;
} else {
}
if (reconstitute_observations(fmep) != 0)
goto badcase;
/* give the diagnosis algorithm a shot at the new FME state */
return;
/*
* Since we're unable to restart the case, add it to the undiagable
* list and solve and close it as appropriate.
*/
if (Undiagablecaselist != NULL)
} else {
if (Undiag_reason != NULL)
(void) nvlist_add_string(defect,
}
} else {
}
}
/*
 * destroy_fme -- tear down an fme and release everything hanging off it.
 *
 * Each of the fme's stats handles is handed to stats_delete(), the
 * event tree goes to itree_free(), the config data goes to
 * config_free(), and the fme structure itself is passed to FREE().
 * The caller should treat f as invalid once this returns.
 */
void
destroy_fme(struct fme *f)
{
/* per-fme statistics */
stats_delete(f->Rcount);
stats_delete(f->Hcallcount);
stats_delete(f->Rcallcount);
stats_delete(f->Ccallcount);
stats_delete(f->Ecallcount);
stats_delete(f->Tcallcount);
stats_delete(f->Marrowcount);
stats_delete(f->diags);
/* instance tree and configuration snapshot owned by this fme */
itree_free(f->eventtree);
config_free(f->cfgdata);
/* the structure itself goes last, after its members are released */
FREE(f);
}
/*
 * fme_state2str -- translate an fme_state value into a printable name.
 *
 * Returns a pointer to a constant string.  A value that matches none of
 * the recognized states yields "UNKNOWN".
 */
static const char *
fme_state2str(enum fme_state s)
{
	const char *name = "UNKNOWN";

	if (s == FME_NOTHING)
		name = "NOTHING";
	else if (s == FME_WAIT)
		name = "WAIT";
	else if (s == FME_CREDIBLE)
		name = "CREDIBLE";
	else if (s == FME_DISPROVED)
		name = "DISPROVED";

	return (name);
}
static int
is_problem(enum nametype t)
{
}
static int
{
return (t == N_FAULT);
}
static int
{
return (t == N_DEFECT);
}
static int
{
return (t == N_UPSET);
}
/*ARGSUSED*/
static void
{
continue;
}
}
/*
* call this function with initcode set to 0 to initialize cycle tracking
*/
static void
{
}
static void
{
}
else
}
}
}
static struct node *
pathstring2epnamenp(char *path)
{
char *sep = "/";
char *ptr;
return (ret);
}
/*
* for a given upset sp, increment the corresponding SERD engine. if the
* SERD engine trips, return the ename and ipp of the resulting ereport.
* returns true if engine tripped and *enamep and *ippp were filled in.
*/
static int
{
char *serdname;
/*
* obtain instanced SERD engine from the upset sp. from this
* derive serdname, the string used to identify the SERD engine.
*/
return (NULL);
/* no SERD engine yet, so create it */
}
/*
* increment SERD engine. if engine fires, reset serd
* engine and return trip_strcode
*/
return (1);
}
return (0);
}
/*
* search a suspect list for upsets. feed each upset to serd_eval() and
* build up tripped[], an array of ereports produced by the firing of
* any SERD engines. then feed each ereport back into
* fme_receive_report().
*
* returns ntrip, the number of these ereports produced.
*/
static int
{
/* we build an array of tripped ereports that we send ourselves */
struct {
const char *ename;
} *tripped;
/*
* we avoid recursion by calling fme_receive_report() at the end of
* this function with a NULL ffep
*/
return (0);
/*
* count the number of upsets to determine the upper limit on
* expected trip ereport strings. remember that one upset can
* lead to at most one ereport.
*/
nupset = 0;
nupset++;
}
if (nupset == 0)
return (0);
/*
* get to this point if we have upsets and expect some trip
* ereports
*/
ntrip = 0;
ntrip++;
for (i = 0; i < ntrip; i++)
return (ntrip);
}
/*
* fme_receive_external_report -- call when an external ereport comes in
*
* this routine just converts the relevant information from the ereport
* into a format used internally and passes it on to fme_receive_report().
*/
void
const char *eventstring)
{
/*
* XFILE: If we ended up without a path, it's an X-file.
* For now, use our undiagnosable interface.
*/
return;
}
}
static void
{
int matched = 0;
/* decide which FME it goes to */
int prev_verbose;
unsigned long long my_delay = TIMEVAL_EVENTUALLY;
/* look up event in event tree for this FME */
continue;
/* note observation */
/* link it into list of observations seen */
}
/* tell hypothesise() not to mess with suspect list */
/* don't want this to be verbose (unless Debug is set) */
if (Debug == 0)
Verbose = 0;
/* put verbose flag back */
if (state != FME_DISPROVED) {
/* found an FME that explains the ereport */
matched++;
if (ffep)
/* re-eval FME */
} else {
/* not a match, undo noting of observation */
/* unlink it from observations */
}
}
}
if (matched)
return; /* explained by at least one existing FME */
/* clean up closed fmes */
ofmep = ClosedFMEs;
}
ClosedFMEs = NULL;
/* start a new FME */
return;
}
/* open a case */
/* note observation */
/* link it into list of observations seen */
}
if (ffep) {
}
/* give the diagnosis algorithm a shot at the new FME state */
}
void
fme_status(int flags)
{
return;
}
}
/*
* "indent" routines used mostly for nicely formatted debug output, but also
* for sanity checking for infinite recursion bugs.
*/
#define MAX_INDENT 1024
static const char *indent_s[MAX_INDENT];
static int current_indent;
static void
indent_push(const char *s)
{
if (current_indent < MAX_INDENT)
indent_s[current_indent++] = s;
else
}
/*
 * indent_set -- restart the indent stack with s as its only entry.
 *
 * Zeroing current_indent discards whatever was on the stack; s is then
 * pushed through indent_push() so it lands in the first slot.
 */
static void
indent_set(const char *s)
{
current_indent = 0;
indent_push(s);
}
static void
indent_pop(void)
{
if (current_indent > 0)
else
}
static void
indent(void)
{
int i;
if (!Verbose)
return;
for (i = 0; i < current_indent; i++)
}
static int
{
return (1);
}
}
/*
 * Circumstance codes.  The code below compares its "circumstance"
 * value against SLCHANGED, SLWAIT and SLDISPROVED in turn; any other
 * value (SLNEW included) falls through to the final else branch.
 */
#define SLNEW 1
#define SLCHANGED 2
#define SLWAIT 3
#define SLDISPROVED 4
static void
{
if (circumstance == SLCHANGED) {
} else if (circumstance == SLWAIT) {
} else if (circumstance == SLDISPROVED) {
} else {
}
return;
}
}
}
static struct node *
{
}
#define MAXDIGITIDX 23
static int
{
if (n == NULL)
return (1);
/*
* check value.v since we are being asked to convert an unsigned
* long long int to an unsigned int
*/
return (1);
return (0);
}
static nvlist_t *
{
char *failure;
int err, i;
/* XXX do we need to be able to handle a non-T_NAME node? */
return (NULL);
break;
depth++;
}
/* We bailed early, something went wrong */
return (NULL);
}
for (i = 0; i < depth; i++)
if (err != 0) {
failure = "basic construction of FMRI failed";
goto boom;
}
i = 0;
if (err != 0) {
failure = "alloc of an hc-pair failed";
goto boom;
}
if (err != 0) {
failure = "construction of an hc-pair failed";
goto boom;
}
pa[i++] = p;
}
if (err == 0) {
for (i = 0; i < depth; i++)
nvlist_free(pa[i]);
return (f);
}
failure = "addition of hc-pair array to FMRI failed";
boom:
for (i = 0; i < depth; i++)
nvlist_free(pa[i]);
nvlist_free(f);
/*NOTREACHED*/
}
static uint_t
{
unsigned long long s = sum * 10;
}
static uint8_t
{
unsigned long long p = part * 1000;
}
static struct rsl {
};
/*
* rslfree -- free internal members of struct rsl not expected to be
* freed elsewhere.
*/
static void
{
}
/*
* rslcmp -- compare two rsl structures. Use the following
* comparisons to establish ordering:
*
* 1. Name of the suspect's class. (simple strcmp)
* 2. Name of the suspect's ASRU. (trickier, since nvlist)
*
*/
static int
rslcmp(const void *a, const void *b)
{
int rv;
if (rv != 0)
return (rv);
return (0);
return (-1);
return (1);
}
/*
* rsluniq -- given an array of rsl structures, seek out and "remove"
* any duplicates. Dups are "remove"d by NULLing the suspect pointer
* of the array element. Removal also means updating the number of
* problems and the number of problems which are not faults. User
* provides the first and last element pointers.
*/
static void
{
if (*nprobs == 1)
return;
/*
* At this point, we only expect duplicate defects.
* Eversholt's diagnosis algorithm prevents duplicate
* suspects, but we rewrite defects in the platform code after
* the diagnosis is made, and that can introduce new
* duplicates.
*/
first++;
continue;
}
(*nprobs)--;
(*nnonf)--;
}
}
/*
* assume all defects are in order after our
* sort and short circuit here with "else break" ?
*/
cr++;
}
first++;
}
}
/*
* get_resources -- for a given suspect, determine what ASRU, FRU and
* RSRC nvlists should be advertised in the final suspect list.
*/
void
{
char *pathstr;
/*
* initial fault tree.
*/
/*
* Create FMRIs based on those definitions
*/
/*
* Allow for platform translations of the FMRIs
*/
pathstr);
}
/*
* trim_suspects -- prior to publishing, we may need to remove some
* suspects from the list. If we're auto-closing upsets, we don't
* want any of those in the published list. If the ASRUs for multiple
* defects resolve to the same ASRU (driver) we only want to publish
* that as a single suspect.
*/
static void
{
int rpcnt;
/*
* First save the suspects in the psuspects, then copy back
* only the ones we wish to retain. This resets nsuspects to
* zero.
*/
/*
* allocate an array of resource pointers for the suspects.
* We may end up using less than the full allocation, but this
* is a very short-lived array. publish_suspects() will free
* this array when it's done using it.
*/
/* first pass, remove any unwanted upsets and populate our array */
continue;
rp++;
}
/* if all we had was unwanted upsets, we're done */
return;
/* sort the array */
}
static void
{
/*
* The current fmd interfaces don't allow us to solve a case
* that's already solved. If we make a new case, what of the
* ereports? We don't appear to have an interface that allows
* us to access the ereports attached to a case (if we wanted
* to copy the original case's ereport attachments to the new
* case) and it's also a bit unclear if there would be any
* problems with having ereports attached to multiple cases.
* For now, we'll just output a message.
*/
if (fmep->posted_suspects ||
}
return;
}
/*
* If we're auto-closing upsets, we don't want to include them
* in any produced suspect lists or certainty accounting.
*/
/*
* If the resulting suspect list has no members, we're
* done. Returning here will simply close the case.
*/
"[FME%d, case %s (all suspects are upsets)]",
return;
}
/*
* If the suspect list is all faults, then for a given fault,
* say X of N, X's certainty is computed via:
*
* fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
*
* If none of the suspects are faults, and there are N suspects,
* the certainty of a given suspect is 100/N.
*
* If there is a mixture of faults and other problems in
* the suspect list, we take an average of the faults'
* FITrates and treat this average as the FITrate for any
* non-faults. The fitrate of any given suspect is then
* computed per the first formula above.
*/
/* NO faults in the suspect list */
} else {
/* sum the fitrates */
struct node *n;
continue;
continue;
}
fr = 1;
} else if (fr == 0) {
fr = 1;
}
frcnt++;
}
}
}
/* Add them in reverse order of our sort, as fmd reverses order */
continue;
cert,
}
if (Autoconvict) {
continue;
}
}
/*
* revert to the original suspect list
*/
}
static void
{
"[undiagnosable ereport received, "
"creating and closing a new case (%s)]",
if (Undiagablecaselist != NULL)
if (Undiag_reason != NULL)
}
static void
fme_undiagnosable(struct fme *f)
{
if (Undiag_reason != NULL)
destroy_fme_bufs(f);
}
/*
* fme_close_case
*
* Find the requested case amongst our fmes and close it. Free up
* the related fme.
*/
void
{
continue;
}
else
return;
}
break;
}
return;
}
else
/* Get rid of any timer this fme has set */
if (ClosedFMEs == NULL) {
ClosedFMEs = fmep;
} else {
ClosedFMEs = fmep;
}
}
/*
* fme_set_timer()
* If the time we need to wait for the given FME is less than the
* current timer, kick that old timer out and establish a new one.
*/
static void
{
/* we've waited at least wull already, don't need timer */
return;
}
} else {
}
/* New timer would fire later than established timer */
return;
}
void
{
break;
(void *)fmep);
return;
}
/*
* normal timer (not the hesitation timer)
*/
} else {
}
}
/*
* Preserve the fme's suspect list in its psuspects list, NULLing the
* suspects list in the meantime.
*/
static void
{
/* zero out the previous suspect list */
}
/* zero out the suspect list, copying it to previous suspect list */
ep->is_suspect = 0;
}
}
/*
* Retrieve the fme's suspect list from its psuspects list.
*/
static void
{
}
}
/*
* this is what we use to call the Emrys prototype code instead of main()
*/
static void
{
unsigned long long my_delay = TIMEVAL_EVENTUALLY;
indent_set(" ");
}
if (fmep->posted_suspects) {
/*
* this FME has already posted a diagnosis, so see if
* the event changed the diagnosis and print a warning
* if it did.
*
*/
if (suspects_changed(fmep)) {
}
} else {
case FME_CREDIBLE:
/*
* if the suspect list contains any upsets, we
* turn off the hesitation logic (by setting
* the hesitate flag which normally indicates
* we've already done the hesitate logic).
* this is done because hesitating with upsets
* causes us to explain away additional soft errors
* while the upset FME stays open.
*/
struct event *s;
if (s->t == N_UPSET) {
break;
}
}
}
if (Hesitate &&
/*
* about to publish multi-entry suspect list,
* set the hesitation timer if not already set.
*/
"[hesitate FME%d, case %s ",
(unsigned long long *)&Hesitate);
} else {
"[still hesitating FME%d, case %s]",
}
} else {
(void *)&fmep->posted_suspects,
sizeof (fmep->posted_suspects));
}
break;
case FME_WAIT:
/*
* singleton suspect list implies
* no point in waiting
*/
(void *)&fmep->posted_suspects,
sizeof (fmep->posted_suspects));
} else {
}
break;
case FME_DISPROVED:
break;
}
}
int doclose = 0;
doclose = 1;
doclose = 1;
doclose = 0;
break;
}
}
}
if (doclose) {
}
}
}
/*
* below here is the code derived from the Emrys prototype
*/
static void indent(void);
struct event *fault_event);
unsigned long long at_latest_by, unsigned long long *pdelay,
unsigned long long at_latest_by, unsigned long long *pdelay);
static int
{
int count = 0;
continue;
/* check count of marks against K in the bubble */
return (1);
}
}
return (0);
}
static void
{
continue;
struct constraintlist *ctp;
int do_not_follow = 0;
/*
* see if false constraint prevents us
* from traversing this arrow, but don't
* bother if the event is an ereport we
* haven't seen
*/
&value) == 0 ||
value.v == 0) {
do_not_follow = 1;
break;
}
}
}
if (do_not_follow) {
indent();
" False arrow to ");
continue;
}
mark))
}
}
}
}
static enum fme_state
{
struct event *error_event;
indent_push(" E");
indent();
indent();
break;
} else {
}
}
indent();
indent_pop();
return (return_value);
}
static enum fme_state
unsigned long long at_latest_by, unsigned long long *pdelay,
{
int waiting_events;
int credible_events;
unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
unsigned long long arrow_delay;
unsigned long long my_delay;
indent_push(" R");
indent();
} else {
*pdelay = at_latest_by;
}
/*
* evaluate constraints only for current observation
*/
struct constraintlist *ctp;
indent();
" False constraint ");
break;
}
}
}
indent();
switch (return_value) {
case FME_CREDIBLE:
break;
case FME_DISPROVED:
break;
case FME_WAIT:
break;
default:
break;
}
indent_pop();
return (return_value);
}
/* this event is not a report, descend the tree */
continue;
credible_events = 0;
waiting_events = 0;
/*
* n is -1 for 'A' so adjust it.
* XXX just count up the arrows for now.
*/
if (n < 0) {
n = 0;
n++;
indent();
} else {
indent();
}
if (n <= credible_events)
break;
/* XXX adding max timevals! */
case FME_CREDIBLE:
break;
case FME_DISPROVED:
break;
case FME_WAIT:
if (my_delay < arrow_delay)
break;
default:
"Bug in requirements_test.");
}
else
}
indent();
if (credible_events + waiting_events < n) {
/* Can never meet requirements */
indent();
indent_pop();
return (FME_DISPROVED);
}
if (credible_events < n) { /* will have to wait */
/* wait time is shortest known */
if (arrow_delay < overall_delay)
}
} else {
indent();
}
}
}
/*
* evaluate constraints for ctlist, which is the list of
* constraints for the arrow pointing into this node of the tree
*/
struct constraintlist *ctp;
indent();
" False constraint ");
break;
}
}
}
if (return_value == FME_WAIT)
*pdelay = overall_delay;
indent();
indent_pop();
return (return_value);
}
static enum fme_state
unsigned long long at_latest_by, unsigned long long *pdelay)
{
unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
unsigned long long my_delay;
int credible_results = 0;
int waiting_results = 0;
struct event *tail_event;
int k = 1;
indent_push(" C");
indent();
continue;
struct constraintlist *ctp;
int do_not_follow = 0;
/*
* see if false constraint prevents us
* from traversing this arrow
*/
&value) == 0 ||
value.v == 0) {
do_not_follow = 1;
break;
}
}
if (do_not_follow) {
indent();
" False arrow from ");
continue;
}
/*
* get to this point if this is not the
* first time we're going through this
* arrow in the causes test. consider this
* branch to be credible and let the
* credible/noncredible outcome depend on
* the other branches in this cycle.
*/
} else {
/*
* get to this point if this is the first
* time we're going through this arrow.
*/
}
switch (fstate) {
case FME_WAIT:
if (my_delay < overall_delay)
break;
case FME_CREDIBLE:
break;
case FME_DISPROVED:
break;
default:
}
}
}
/* compare against K */
if (credible_results + waiting_results < k) {
indent();
indent_pop();
return (FME_DISPROVED);
}
if (waiting_results != 0) {
*pdelay = overall_delay;
indent();
indent_pop();
return (FME_WAIT);
}
indent();
indent_pop();
return (FME_CREDIBLE);
}
static enum fme_state
unsigned long long at_latest_by, unsigned long long *pdelay,
{
unsigned long long my_delay;
unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
indent_push(" H");
indent();
if (rtr != FME_DISPROVED) {
if (is_problem(ep->t)) {
if (otr != FME_DISPROVED) {
}
}
} else
if ((otr != FME_DISPROVED) &&
*pdelay = overall_delay;
}
if (rtr == FME_DISPROVED) {
indent();
indent_pop();
return (FME_DISPROVED);
}
indent();
indent_pop();
return (FME_DISPROVED);
}
if (otr == FME_DISPROVED) {
indent();
indent_pop();
return (FME_DISPROVED);
}
indent();
indent_pop();
return (FME_WAIT);
}
indent();
indent_pop();
return (FME_CREDIBLE);
}