fmd_asru.c revision 25c6ff4b77fcddf4097ce78a8277275ca603b46c
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <dirent.h>
#include <limits.h>
#include <unistd.h>
#include <alloca.h>
#include <stddef.h>
#include <fmd_alloc.h>
#include <fmd_string.h>
#include <fmd_error.h>
#include <fmd_subr.h>
#include <fmd_protocol.h>
#include <fmd_event.h>
#include <fmd_conf.h>
#include <fmd_fmri.h>
#include <fmd_dispq.h>
#include <fmd_case.h>
#include <fmd_module.h>
#include <fmd_asru.h>
#include <fmd.h>
/*
 * Table of strings that _fmd_asru_snames[] (defined next) is documented as
 * paralleling entry-for-entry.
 * NOTE(review): presumably these are the ASRU event names; no initializer
 * entries are visible in this excerpt -- confirm against the full file.
 */
static const char *const _fmd_asru_events[] = {
};
/*
 * Short two-letter state names.  The entries are kept in the same order as
 * the _fmd_asru_events[] table defined immediately above.
 * NOTE(review): the letter case presumably encodes the unusable (u/U) and
 * faulty (f/F) state bits -- confirm against the flag definitions.
 */
static const char *const _fmd_asru_snames[] = {
	"uf",
	"uF",
	"Uf",
	"UF"
};
/*
 * Global value that the presence-check routine later in this file can return
 * directly ("return (fmd_asru_fake_not_present)").  Initialised to 0.
 * NOTE(review): presumably a debug/test override for simulating resources
 * that are not present; the condition guarding that return is not visible
 * in this excerpt -- confirm.
 */
volatile uint32_t fmd_asru_fake_not_present = 0;
static uint_t
{
}
static boolean_t
{
}
static fmd_asru_t *
{
char *s;
if (fmri)
strcmp(s, FM_FMRI_SCHEME_FMD) == 0)
return (ap);
}
static void
{
}
static void
{
}
static fmd_asru_t *
{
return (ap);
}
/*
* Lookup an asru in the hash by name and place a hold on it. If the asru is
* not found, no entry is created and NULL is returned. This internal function
* is for callers who have the ah_lock held and is used by lookup_name below.
*/
{
fmd_asru_t *ap;
uint_t h;
break;
}
(void) fmd_asru_hold(ap);
else
(void) fmd_set_errno(EFMD_ASRU_NOENT);
return (ap);
}
static int
{
int ps = -1;
/*
* Check if there is evidence that this object is no longer present.
* In general fmd_fmri_present() should be supported on resources and/or
* frus, as those are the things that are physically present or not
* present - an asru can be spread over a number of frus some of which
* are present and some not, so fmd_fmri_present() is not generally
* meaningful. However retain a check for asru first for compatibility.
* If we have checked all three and we still get -1 then nothing knows
* whether it's present or not, so err on the safe side and treat it
* as still present.
*/
return (fmd_asru_fake_not_present);
if (ps == -1) {
} else if (ps == FMD_OBJ_STATE_UNKNOWN) {
/* see if we can improve on UNKNOWN */
&rsrc) == 0) {
if (ps2 == FMD_OBJ_STATE_STILL_PRESENT ||
}
}
if (ps == -1) {
} else if (ps == FMD_OBJ_STATE_UNKNOWN) {
/* see if we can improve on UNKNOWN */
if (ps2 == FMD_OBJ_STATE_STILL_PRESENT ||
}
}
if (ps == -1)
return (ps);
}
static void
char *name)
{
ahp->ah_al_count++;
}
static void
char *name)
{
}
static void
{
}
static void
char *name)
{
}
static void
char *name)
{
}
static void
{
}
static fmd_asru_link_t *
{
return (alp);
}
/*ARGSUSED*/
static void
{
else
}
static int
{
return (EFMD_ASRU_FMRI);
return (EFMD_ASRU_FMRI);
}
return (0);
}
static fmd_asru_link_t *
const char *al_uuid)
{
fmd_asru_t *ap;
got_asru = 1;
got_fru = 1;
got_rsrc = 1;
label = "";
/*
* Grab the rwlock as a writer, then create and insert the asru with
* ahp->ah_lock held and hash it in. We'll then drop the rwlock and
* proceed to initializing the asru.
*/
/*
* Create and initialise the per-fault "link" structure.
*/
if (got_asru)
/*
* If this is the first fault for this asru, then create the per-asru
* structure and link into the hash.
*/
NULL);
} else
/*
* Put the link structure on the list associated with the per-asru
* structure. Then put the link structure on the various hashes.
*/
return (alp);
}
static void
{
int ps;
fmd_asru_t *ap;
fmd_case_t *cp;
char *class;
int err;
/*
* Extract the most recent values of 'faulty' from the event log.
*/
&faulty) != 0) {
return;
}
return;
}
&unusable);
&repaired);
&replaced);
&acquitted);
/*
* Attempt to recreate the case in either the CLOSED or REPAIRED state
* (depending on whether the faulty bit is still set).
* If the case is already present, fmd_case_recreate() will return it.
* If not, we'll create a new orphaned case. Either way, we use the
* ASRU event to insert a suspect into the partially-restored case.
*/
else
/*
* For faults with a resource, re-evaluate the asru from the resource.
*/
}
/*
* Now create the resource cache entries.
*/
/*
* Check to see if the resource is still present in the system.
*/
else if (ps == FMD_OBJ_STATE_REPLACED)
if (faulty) {
}
if (unusable) {
}
if (replaced)
else if (repaired)
else if (acquitted)
}
static void
{
if (err != 0)
else
}
/*
* Open a saved log file and restore it into the ASRU hash. If we can't even
* open the log, rename the log file to <uuid>- to indicate it is corrupt. If
* fmd_log_replay() fails, we either delete the file (if it has reached the
* upper limit on cache age) or rename it for debugging if it was corrupted.
*/
static void
{
uint_t n;
return;
}
n = ahp->ah_al_count;
if (ahp->ah_al_count == n)
}
void
{
int zero;
return;
}
continue; /* skip "." and ".." */
if (zero)
}
}
/*
* If the resource is present and faulty but not unusable, replay the fault
* event that caused it to be marked faulty. This will cause the agent
* subscribing to this fault class to again disable the resource.
*/
/*ARGSUSED*/
static void
{
fmd_event_t *e;
char *class;
}
}
void
{
}
/*
* Check if the resource is still present. If not, and if the rsrc.age time
* has expired, then do an implicit repair on the resource.
*/
/*ARGSUSED*/
static void
{
int ps;
int err;
if (ps == FMD_OBJ_STATE_REPLACED) {
} else if (ps == FMD_OBJ_STATE_NOT_PRESENT) {
}
}
void
{
}
{
ahp->ah_al_count = 0;
return (ahp);
}
void
{
uint_t i;
for (i = 0; i < ahp->ah_hashlen; i++) {
}
}
}
/*
* Take a snapshot of the ASRU database by placing an additional hold on each
* member in an auxiliary array, and then call 'func' for each ASRU.
*/
void
{
for (i = 0; i < ahp->ah_hashlen; i++) {
}
for (i = 0; i < apc; i++) {
}
}
void
{
for (i = 0; i < ahp->ah_hashlen; i++) {
}
for (i = 0; i < alpc; i++) {
}
}
static void
{
uint_t h;
/* LINTED pointer alignment */
if (fmd_asru_strcmp(ahp,
/* LINTED pointer alignment */
alpc++;
/* LINTED pointer alignment */
if (fmd_asru_strcmp(ahp,
/* LINTED pointer alignment */
for (i = 0; i < alpc; i++) {
}
}
void
{
}
void
{
}
void
{
}
void
{
}
void
{
}
/*
* Lookup an asru in the hash by name and place a hold on it. If the asru is
* not found, no entry is created and NULL is returned.
*/
{
fmd_asru_t *ap;
return (ap);
}
/*
* Create a resource cache entry using the fault event "nvl" for one of the
* suspects from the case "cp".
*
* The fault event can have the following components : FM_FAULT_ASRU,
* FM_FAULT_FRU, FM_FAULT_RESOURCE. These should be set by the Diagnosis Engine
* when calling fmd_nvl_create_fault(). In the general case, these are all
* optional and an entry will always be added into the cache even if one or all
* of these fields is missing.
*
* However, for hardware faults the recommended practice is that the fault
* event should always have the FM_FAULT_RESOURCE field present and that this
* should be represented in hc-scheme.
*
* Currently the DE should also add the FM_FAULT_ASRU and FM_FAULT_FRU fields
* where known, though at some future stage fmd might be able to fill these
* in automatically from the topology.
*/
{
char *parsed_uuid;
int uuidlen;
/*
* Generate a UUID for the ASRU. libuuid cleverly gives us no
* interface for specifying or learning the buffer size. Sigh.
* The spec says 36 bytes but we use a tunable just to be safe.
*/
/*
* Now create the resource cache entries.
*/
return (alp);
}
/*
* Release the reference count on an asru obtained using fmd_asru_hash_lookup.
* We take 'ahp' for symmetry and in case we need to use it in future work.
*/
/*ARGSUSED*/
void
{
else
}
static void
{
uint_t h;
/* LINTED pointer alignment */
} else
}
}
static void
{
name);
} else
}
void
{
fmd_asru_t *ap;
char *label;
uint_t h;
/*
* first delete hash entries for each suspect
*/
&label) != 0)
label = "";
}
/*
* then delete associated case hash entries
*/
ahp->ah_al_count--;
/*
* decrement case ref.
*/
/*
* If we found a matching ASRU, unlink its log file and
* then release the hash entry. Note that it may still
* be referenced if another thread is manipulating it;
* this is ok because once we unlink, the log file will
* not be restored, and the log data will be freed when
* all of the referencing threads release their
* respective references.
*/
"failed to unlink asru %s", path);
/*
* Now unlink from the global per-resource cache
* and if this is the last link then remove that from
* its own hash too.
*/
uint_t h;
} else
}
}
} else
}
}
static void
{
}
void
{
int flags;
int rval;
/*
* repair this asru cache entry
*/
/*
* now check if all entries associated with this asru are repaired and
* if so repair containees
*/
if (!(flags & FMD_ASRU_FAULTY))
alp->al_asru_fmri);
/*
* if called from fmd_adm_repair() and we really did clear the bit then
* we need to do a case update to see if the associated case can be
* repaired. No need to do this if called from fmd_case_repair() (ie
* when er is NULL) as the case will be explicitly repaired anyway.
*/
if (er) {
*(int *)er = 0;
if (rval)
}
}
static void
{
}
void
{
int flags;
int rval;
/*
* acquit this asru cache entry
*/
/*
* now check if all entries associated with this asru are acquitted and
* if so acquit containees
*/
if (!(flags & FMD_ASRU_FAULTY))
alp->al_asru_fmri);
/*
* if called from fmd_adm_acquit() and we really did clear the bit then
* we need to do a case update to see if the associated case can be
* repaired. No need to do this if called from fmd_case_acquit() (ie
* when er is NULL) as the case will be explicitly repaired anyway.
*/
if (er) {
*(int *)er = 0;
if (rval)
}
}
static void
{
}
void
{
int flags;
int rval;
int ps;
if (ps == FMD_OBJ_STATE_STILL_PRESENT)
return;
/*
* mark this cache entry as replaced
*/
/*
* now check if all entries associated with this asru are replaced and
* if so replace containees
*/
if (!(flags & FMD_ASRU_FAULTY))
alp->al_asru_fmri);
*(int *)er = 0;
if (rval)
}
static void
{
0))
}
void
{
int flags;
int rval;
/*
* mark this cache entry as replaced
*/
/*
* now check if all entries associated with this asru are removed and
* if so replace containees
*/
if (!(flags & FMD_ASRU_FAULTY))
alp->al_asru_fmri);
if (rval)
}
static void
{
fmd_event_t *e;
char *class;
return; /* can't log events if we can't open the log */
fmd_event_hold(e);
fmd_event_rele(e);
/*
* For now, we close the log file after every update to conserve file
* descriptors and daemon overhead. If this becomes a performance
* issue this code can change to keep a fixed-size LRU cache of logs.
*/
}
int
{
return (0);
}
return (1);
}
int
{
}
return (0);
}
if (sflag == FMD_ASRU_UNUSABLE)
else if (sflag == FMD_ASRU_FAULTY) {
/*
* only clear the faulty bit if all links are clear
*/
if (!(flags & FMD_ASRU_FAULTY))
}
return (1);
}
/*
* Report the current known state of the link entry (ie this particular fault
* affecting this particular ASRU).
*/
int
{
int ps;
if (ps == FMD_OBJ_STATE_NOT_PRESENT)
if (ps == FMD_OBJ_STATE_REPLACED) {
}
/* not supported by scheme - try fmd_fmri_unusable */
} else if (us == FMD_SERVICE_STATE_UNUSABLE) {
st |= FMD_ASRU_UNUSABLE;
return (st);
} else if (us == FMD_SERVICE_STATE_OK) {
st &= ~FMD_ASRU_UNUSABLE;
return (st);
} else if (us == FMD_SERVICE_STATE_DEGRADED) {
st &= ~FMD_ASRU_UNUSABLE;
st |= FMD_ASRU_DEGRADED;
return (st);
}
} else
if (us > 0)
st |= FMD_ASRU_UNUSABLE;
else if (us == 0)
st &= ~FMD_ASRU_UNUSABLE;
return (st);
}
/*
* Report the current known state of the ASRU by refreshing its unusable status
* based upon the routines provided by the scheme module. If the unusable bit
* is different, we do *not* generate a state change here because that change
* may be unrelated to fmd activities and therefore we have no case or event.
* The absence of the transition is harmless as this function is only provided
* for RPC observability and fmd's clients are only concerned with ASRU_FAULTY.
*/
int
{
return (0); /* do not report non-fmd non-present resources */
if (us > 0)
st |= FMD_ASRU_UNUSABLE;
else if (us == 0)
st &= ~FMD_ASRU_UNUSABLE;
return (st);
}