/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* SMF software-diagnosis subsidiary
*
* We model service instances in maintenance state as a defect diagnosis
* in FMA. When an instance transitions to maintenance state the SMF
* graph engine publishes an event which we subscribe to here, and diagnose
* a corresponding defect.
*
* We always solve a case immediately after opening it. But we leave the
* case close action to the response agent which needs to cache case UUIDs.
* So in the normal case, where software-response is loaded and operational,
* our cases will transition to CLOSED state moments after we solve them.
* But if fmd restarts in the interim or if software-response is not loaded
* then our cases may hang around in SOLVED state for a while, which means
* we could iterate over them on receipt of new events. But we don't -
* we blindly solve a new case for every new maintenance event received,
* and leave it to the fmd duplicate detection and history-based diagnosis
* logic to do the right thing.
*
* Our sibling SMF response subsidiary propagates fmadm-initiated repairs
* into SMF, and svcadm-initiated clears back into FMA. In both cases
* the case is moved on to the RESOLVED state, even if fmd is unable to
* verify that the service is out of maintenance state (i.e., no longer
* isolated). If the service immediately re-enters maintenance state then
* we diagnose a fresh case. The history-based diagnosis changes in fmd
* "do the right thing" and avoid throwing away new cases as duplicates
* of old ones hanging around in the "resolved but not all usable again"
* state.
*/
#include <strings.h>
#include <fm/fmd_fmri.h>
#include "smf.h"
/*
 * Statistics table for the SMF diagnosis subsidiary.
 *
 * Each initializer entry below supplies three things: the statistic name,
 * its type (FMD_TYPE_UINT64 for every entry here) and a short
 * human-readable description of what is being counted.
 *
 * NOTE(review): the member declarations of this struct are not visible
 * in this view; presumably there is one statistic member per initializer
 * entry, in the same order - confirm against the full file.
 */
static struct {
} swde_smf_stats = {
{ "swde_smf_diagnosed", FMD_TYPE_UINT64,
"maintenance state defects published" },
{ "swde_smf_bad_class", FMD_TYPE_UINT64,
"incorrect event class received" },
{ "swde_smf_no_attr", FMD_TYPE_UINT64,
"malformed event - missing attr nvlist" },
{ "swde_smf_bad_attr", FMD_TYPE_UINT64,
"malformed event - invalid attr list" },
{ "swde_smf_bad_fmri", FMD_TYPE_UINT64,
"malformed event - fmri2str fails" },
{ "swde_smf_no_uuid", FMD_TYPE_UINT64,
"malformed event - missing uuid" },
{ "swde_smf_no_reason_short", FMD_TYPE_UINT64,
"SMF transition event had no reason-short" },
{ "swde_smf_no_reason_long", FMD_TYPE_UINT64,
"SMF transition event had no reason-long" },
{ "swde_smf_no_svcname", FMD_TYPE_UINT64,
"SMF transition event had no svc-string" },
{ "swde_smf_admin_maint_drop", FMD_TYPE_UINT64,
"maintenance transitions requested by admin - no diagnosis" },
{ "swde_smf_bad_nvlist_pack", FMD_TYPE_UINT64,
"failed nvlist_size or nvlist_pack" },
{ "swde_smf_dupuuid", FMD_TYPE_UINT64,
"duplicate events received" },
};
typedef struct swde_smf_casedata {
/* packed fmri nvlist follows */
/*ARGSUSED*/
void
{
char *fmribuf;
char *uuid;
return;
}
return;
}
return;
}
return;
}
return;
}
return;
}
return;
}
return;
}
/*
* Our case checkpoint data, version 1.
*/
return;
}
/*
* Open a case with UUID matching the originating event, and no
* associated serialization data. Create a defect and add it to
* the case, and link the originating event to the case. This
* call will return NULL if a case with the requested UUID already
* exists, which would mean we are processing an event twice so
* we can discard.
*/
return;
}
/*
* Now solve the case, but do not close it. Although the
* resource is already isolated (SMF put it in maintenance state)
* we do not immediately close the case here - our sibling response
* logic will do that after caching the case UUID.
*/
}
/*
* In the normal course of events we keep in sync with SMF through the
* events it publishes. Even if a maintenance
* state is cleared using svcadm while fmd is not running, the event
* will pend and be consumed when fmd does start and we'll close the
* case (in the response agent).
*
* But it is possible for discontinuities to produce some confusion:
*
* - if an instance is in maintenance state (and so shown in svcs -x
* and fmadm faulty output) at the time we clone a new boot
* environment then when we boot the new BE we can be out of
* sync if the instance is cleared when we boot there
*
* or restore of old state
*
* So as an extra guard we have a case verify function which is called
* at fmd restart (module load for software-diagnosis). We must
* return 0 to close the case, non-zero to retain it.
*/
int
{
uint32_t v;
int rv;
return (0); /* bad or damaged - just close */
return (0); /* ditto */
/*
* Keep case iff in maintenance state
*/
rv = 1;
break;
default:
/*
* Discard the case for all other states - cleared,
* service no longer exists, ... whatever.
*/
rv = 0;
break;
}
return (rv);
}
};
/*ARGSUSED*/
int
{
*dpp = &swde_smf_disp[0];
return (SW_SUB_INIT_SUCCESS);
}
"smf diagnosis", /* swsub_name */
SW_CASE_SMF, /* swsub_casetype */
swde_smf_init, /* swsub_init */
NULL, /* swsub_fini */
NULL, /* swsub_timeout */
NULL, /* swsub_case_close */
swde_smf_vrfy, /* swsub_case_vrfy */
};