fmd_case.c revision 97c04605eb1864c046164dcf2d01aa7271313df6
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* FMD Case Subsystem
*
* Diagnosis engines are expected to group telemetry events related to the
* diagnosis of a particular problem on the system into a set of cases. The
* diagnosis engine may have any number of cases open at a given point in time.
* Some cases may eventually be *solved* by associating a suspect list of one
* or more problems with the case, at which point fmd publishes a list.suspect
* event for the case and it becomes visible to administrators and agents.
*
* Every case is named using a UUID, and is globally visible in the case hash.
* Cases are reference-counted, except for the reference from the case hash
* itself. Consumers of case references include modules, which store active
* cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
*
* Cases obey the following state machine. In states UNSOLVED, SOLVED, and
* CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
* or transport) and the case is referenced by the mod_cases list. Once the
* case reaches the CLOSED or REPAIRED states, a case's module changes to refer
* to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
*
*                   +------------+
*        +----------|  UNSOLVED  |
*        |          +------------+
*      1 |               4 |
*        |                 |
*   +----v---+ /-2->+------v-----+    3     +--------+
*   | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
*   +--------+ \-5->+------------+          +--------+
*                          |                    |
*                        6 |                    | 7
*                   +------v-----+              |
*                   |  REPAIRED  |<-------------+
*                   +------------+
*
* The state machine changes are triggered by calls to fmd_case_transition()
* from various locations inside of fmd, as described below:
*
* [1] Called by: fmd_case_solve()
* Actions: FMD_CF_SOLVED flag is set in ci_flags
* conviction policy is applied to suspect list
* suspects convicted are marked faulty (F) in R$
* list.suspect event logged and dispatched
*
* [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
* Actions: FMD_CF_ISOLATED flag is set in ci_flags
* suspects convicted (F) are marked unusable (U) in R$
* diagnosis engine fmdo_close() entry point scheduled
* case transitions to CLOSED [3] upon exit from CLOSE_WAIT
*
* [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
* Actions: list.isolated event dispatched
* case deleted from module's list of open cases
*
* [4] Called by: fmd_case_close(), fmd_case_uuclose()
* Actions: diagnosis engine fmdo_close() entry point scheduled
* case is subsequently discarded by fmd_case_delete()
*
* [5] Called by: fmd_case_repair(), fmd_case_update()
* Actions: FMD_CF_REPAIR flag is set in ci_flags
* diagnosis engine fmdo_close() entry point scheduled
* case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
*
* [6] Called by: fmd_case_repair(), fmd_case_update()
* Actions: FMD_CF_REPAIR flag is set in ci_flags
* suspects convicted are marked non faulty (!F) in R$
* list.repaired event dispatched
*
* [7] Called by: fmd_case_repair(), fmd_case_update()
* Actions: FMD_CF_REPAIR flag is set in ci_flags
* suspects convicted are marked non faulty (!F) in R$
* list.repaired event dispatched
*/
#include <alloca.h>
#include <fmd_alloc.h>
#include <fmd_module.h>
#include <fmd_error.h>
#include <fmd_conf.h>
#include <fmd_case.h>
#include <fmd_string.h>
#include <fmd_subr.h>
#include <fmd_protocol.h>
#include <fmd_event.h>
#include <fmd_eventq.h>
#include <fmd_dispq.h>
#include <fmd_buf.h>
#include <fmd_log.h>
#include <fmd_asru.h>
#include <fmd_fmri.h>
#include <fmd_xprt.h>
#include <fmd.h>
/*
 * Printable names for the case states. Each entry is annotated with the
 * FMD_CASE_* state it names; the entries are presumably looked up by state
 * value, so keep them in the same order as the state constants (confirm
 * against the state definitions in fmd_case.h before reordering or adding).
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
};
extern volatile uint32_t fmd_asru_fake_not_present;
fmd_case_hash_create(void)
{
return (chp);
}
/*
* Destroy the case hash. Unlike most of our hash tables, no active references
* are kept by the case hash itself; all references come from other subsystems.
* The hash must be destroyed after all modules are unloaded; if anything was
* present in the hash it would be by definition a reference count leak.
*/
void
{
}
/*
* Take a snapshot of the case hash by placing an additional hold on each
* member in an auxiliary array, and then call 'func' for each case.
*/
void
{
for (i = 0; i < chp->ch_hashlen; i++) {
}
}
for (i = 0; i < cpc; i++) {
}
}
/*
* Look up the diagcode for this case and cache it in ci_code. If no suspects
* were defined for this case or if the lookup fails, the event dictionary or
* module code is broken, and we set the event code to a precomputed default.
*/
static const char *
{
const char *s;
keyp++;
}
}
}
nvlist_t *
{
boolean_t b;
/*
* For each suspect associated with the case, store its fault event
* nvlist in 'nva'. We also look to see if any of the suspect faults
* have asked not to be messaged. If any of them have made such a
* request, propagate that attribute to the composite list.* event.
* Finally, store each suspect's faulty status into the bitmap 'ba'.
*/
FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
*bp = 0;
if (fmd_asru_fake_not_present ||
*bp |= FM_SUSPECT_NOT_PRESENT;
if (fmd_asru_fake_not_present ||
*bp |= FM_SUSPECT_UNUSABLE;
*bp |= FM_SUSPECT_FAULTY;
bp++;
} else
*bp++ = 0;
}
(void) fmd_case_mkcode(cp);
return (nvl);
}
/*
* Convict suspects in a case by applying a conviction policy and updating the
* resource cache prior to emitting the list.suspect event for the given case.
* At present, our policy is very simple: convict every suspect in the case.
* In the future, this policy can be extended and made configurable to permit:
*
* - convicting the suspect with the highest FIT rate
* - convicting the suspect with the cheapest FRU
* - convicting the suspect with the FRU that is in a depot's inventory
* - convicting the suspect with the longest lifetime
*
* and so forth. A word to the wise: this problem is significantly harder than
* it seems at first glance. Future work should heed the following advice:
*
* Hacking the policy into C code here is a very bad idea. The policy needs to
* be decided upon very carefully and fundamentally encodes knowledge of what
* suspect list combinations can be emitted by what diagnosis engines. As such
* fmd's code is the wrong location, because that would require fmd itself to
* be updated for every diagnosis engine change, defeating the entire design.
* The FMA Event Registry knows the suspect list combinations: policy inputs
* can be derived from it and used to produce per-module policy configuration.
*
* If the policy needs to be dynamic and not statically fixed at either fmd
* startup or module load time, any implementation of dynamic policy retrieval
* must employ some kind of caching mechanism or be part of a built-in module.
* The fmd_case_convict() function is called with locks held inside of fmd and
* is not a place where unbounded blocking on some inter-process or inter-
* system communication to another service (e.g. another daemon) can occur.
*/
static void
{
(void) fmd_case_mkcode(cp);
continue; /* no ASRU provided by diagnosis engine */
continue;
}
(void) fmd_asru_clrflags(asru,
(void) fmd_asru_setflags(asru,
}
}
void
{
fmd_event_t *e;
char *class;
if (state == FMD_CASE_CURRENT)
switch (state) {
case FMD_CASE_SOLVED:
if (cip->ci_tv_valid == 0) {
}
break;
case FMD_CASE_CLOSE_WAIT:
break;
case FMD_CASE_CLOSED:
break;
case FMD_CASE_REPAIRED:
break;
}
}
{
uint_t h;
break;
}
/*
* If deleting bit is set, treat the case as if it doesn't exist.
*/
(void) fmd_set_errno(EFMD_CASE_INVAL);
return ((fmd_case_t *)cip);
}
static fmd_case_impl_t *
{
uint_t h;
return (eip); /* uuid already present */
}
}
return (cip);
}
static void
{
uint_t h;
else
break;
}
fmd_panic("case %p (%s) not found on hash chain %u\n",
}
}
{
/*
* Calling libuuid: get a clue. The library interfaces cleverly do not
* define any constant for the length of an unparse string, and do not
* permit the caller to specify a buffer length for safety. The spec
* says it will be 36 bytes, but we make it tunable just in case.
*/
/*
* We expect this loop to execute only once, but code it defensively
* against the possibility of libuuid bugs. Keep generating uuids and
* attempting to do a hash insert until we get a unique one.
*/
do {
return ((fmd_case_t *)cip);
}
static void
{
}
cip->ci_nsuspects = 0;
}
{
if (state > FMD_CASE_CLOSE_WAIT)
/*
* Insert the case into the global case hash. If the specified UUID is
* already present, check to see if it is an orphan: if so, reclaim it;
* otherwise if it is owned by a different module then return NULL.
*/
/*
* If the ASRU cache is trying to recreate an orphan, then just
* return the existing case that we found without changing it.
*/
return ((fmd_case_t *)cip);
}
/*
* If the existing case isn't an orphan or is being proxied,
* then we have a UUID conflict: return failure to the caller.
*/
return (NULL);
}
/*
* If the new module is reclaiming an orphaned case, remove
* the case from the root module, switch ci_mod, and then fall
* through to adding the case to the new owner module 'mp'.
*/
}
return ((fmd_case_t *)cip);
}
void
{
if (visible) {
}
}
}
void
{
}
void
{
fmd_panic("attempt to hold a deleting case %p (%s)\n",
}
static fmd_case_impl_t *
{
/*
* If the case's "deleting" bit is unset, hold and return case,
* otherwise, return NULL.
*/
} else {
}
return (cip);
}
void
{
else
}
int
{
int new;
else
break;
}
return (new);
}
int
{
int new;
else
break;
}
/*
* If the event is already in the case or the case is already solved,
* there is no reason to save it: just transition it appropriately.
*/
return (new);
}
return (new);
}
void
{
cip->ci_nsuspects++;
}
void
{
boolean_t b;
FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
cip->ci_nsuspects++;
}
void
{
}
/*
* Grab ci_lock and update the case state and set the dirty bit. Then perform
* whatever actions and emit whatever events are appropriate for the state.
* Refer to the topmost block comment explaining the state machine for details.
*/
void
{
fmd_event_t *e;
return; /* already in specified state */
}
switch (state) {
case FMD_CASE_SOLVED:
}
break;
case FMD_CASE_CLOSE_WAIT:
/*
* If the case was never solved, do not change ASRUs.
* If the case was never fmd_case_closed, do not change ASRUs.
* If the case was repaired, do not change ASRUs.
*/
goto close_wait_finish;
/*
* For each fault event in the suspect list, attempt to look up
* the corresponding ASRU in the ASRU dictionary. If the ASRU
* is found there and is marked faulty, we now mark it unusable
* and record the case meta-data and fault event with the ASRU.
*/
(void) fmd_asru_setflags(asru,
}
}
/*
* If an orphaned case transitions to CLOSE_WAIT, the owning
* module is no longer loaded: continue on to CASE_CLOSED.
*/
if (fmd_case_orphaned(cp))
break;
case FMD_CASE_REPAIRED:
break;
}
/*
* If the module has initialized, then publish the appropriate event
* for the new case state. If not, we are being called from the
* checkpoint code during module load, in which case the module's
* _fmd_init() routine hasn't finished yet, and our event dictionaries
* may not be open yet, which will prevent us from computing the event
* code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH
* event in our queue: this won't be processed until _fmd_init is done.
*/
else {
}
/*
* If we transitioned to REPAIRED, adjust the reference count to
* reflect our removal from fmd.d_rmod->mod_cases. If the caller has
* not placed an additional hold on the case, it will now be freed.
*/
if (state == FMD_CASE_REPAIRED)
}
/*
* Transition the specified case to *at least* the specified state by first
* re-validating the suspect list using the resource cache. This function is
* employed by the checkpoint code when restoring a saved, solved case to see
* if the state of the case has effectively changed while fmd was not running
* or the module was not loaded. If none of the suspects are present anymore,
* advance the state to REPAIRED. If none are usable, advance to CLOSE_WAIT.
*/
void
{
int faulty = 0; /* are any suspects faulty? */
int usable = 0; /* are any suspects usable? */
faulty++;
if (fmd_asru_fake_not_present == 0 &&
usable++;
}
}
/*
* If none of the suspects were faulty, it implies they were either
* repaired already or not present and the rsrc.age time has expired.
* We can move the state on to repaired.
*/
if (!faulty) {
flags |= FMD_CF_REPAIRED;
} else if (!usable) {
flags |= FMD_CF_ISOLATED;
}
}
void
{
}
void
{
}
void
{
}
}
/*
* Indicate that the case may need to change state because one or more of the
* ASRUs named as a suspect has changed state. We examine all the suspects
* and if none are still faulty, we initiate a case close transition.
*/
void
{
int astate = 0;
return; /* update is not appropriate */
}
}
}
if (astate & FMD_ASRU_FAULTY)
return; /* one or more suspects are still marked faulty */
if (cstate == FMD_CASE_CLOSED)
else
}
/*
* Delete a closed case from the module's case list once the fmdo_close() entry
* point has run to completion. If the case is owned by a transport module,
* tell the transport to proxy a case close on the other end of the transport.
* If not, transition to the appropriate next state based on ci_flags. This
* function represents the end of CLOSE_WAIT and transitions the case to either
* CLOSED or REPAIRED or discards it entirely because it was never solved;
* refer to the topmost block comment explaining the state machine for details.
*/
void
{
/*
* If the case is not proxied and it has been solved, then retain it
* on the root module's case list at least until we're transitioned.
* Otherwise free the case with our final fmd_case_rele() below.
*/
}
/*
* If a proxied case finishes CLOSE_WAIT, then it can be discarded
* rather than orphaned because by definition it can have no entries
* in the resource cache of the current fault manager.
*/
}
void
{
}
static void
{
}
/*
* Indicate that the problem corresponding to a case has been repaired by
* clearing the faulty bit on each ASRU named as a suspect. If the case hasn't
* already been closed, this function initiates the transition to CLOSE_WAIT.
* The caller must have the case held from fmd_case_hash_lookup(), so we can
* grab and drop ci_lock without the case being able to be freed in between.
*/
int
{
fmd_asru_t **aa;
return (fmd_set_errno(EFMD_CASE_OWNER));
}
return (fmd_set_errno(EFMD_CASE_STATE));
}
/*
* Take a snapshot of any ASRUs referenced by the case that are present
* in the resource cache. Then drop ci_lock and clear the faulty bit
* on each ASRU (we can't call fmd_asru_clrflags() with ci_lock held).
*/
FM_FAULT_ASRU, &nvl) == 0)
}
/*
* For each suspect ASRU, if the case associated with this ASRU matches
* case 'cp', close all ASRUs contained by 'ap' and clear FAULTY. Note
* that at present, we're assuming that when a given resource FMRI R1
* contains another R2, that any faults are related by a common
* diagnosis engine. This is true in our current architecture, but may
* not always be true, at which point we'll need more cleverness here.
*/
for (i = 0; i < an; i++) {
continue; /* no asru was found */
(void) fmd_asru_clrflags(aa[i],
}
}
if (cstate == FMD_CASE_CLOSED)
else
return (0);
}
int
{
int rv = 0;
else
break;
}
if (rv != 0)
return (rv);
}
int
{
}
void
{
}