/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 1987 Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
#include "mpd_defs.h"
#include "mpd_tables.h"
/*
* Probe types for probe()
*/
/*
* Format of probe / probe response packets. This is an ICMP Echo request
* or ICMP Echo reply. Packet format is same for both IPv4 and IPv6
*/
struct pr_icmp
{
};
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x1 } };
int cmsg_type);
static void reset_snxt_basetimes(void);
/*
* CRTT - Conservative Round Trip Time Estimate
* Probe success - A matching probe reply received before CRTT ms has elapsed
* after sending the probe.
* Probe failure - No probe reply received and more than CRTT ms has elapsed
* after sending the probe.
*
* TLS - Time last success. Most recent probe ack received at this time.
* TFF - Time first fail. The time of the earliest probe failure in
* a consecutive series of probe failures.
* NUM_PROBE_REPAIRS - Number of consecutive successful probes required
* before declaring phyint repair.
* NUM_PROBE_FAILS - Number of consecutive probe failures required to
* declare a phyint failure.
*
* Phyint state diagram
*
* The state of a phyint that is capable of being probed, is completely
* specified by the 3-tuple <pi_state, pg_state, I>.
*
* A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
* IFF_OFFLINE is set. If the phyint is also configured with a test address
* (the common case) and probe targets, then a phyint must also successfully
* be able to send and receive probes in order to remain in the PI_RUNNING
* state (otherwise, it transitions to PI_FAILED).
*
* Further, if a PI_RUNNING phyint is configured with a test address but is
* unable to find any probe targets, it will transition to the PI_NOTARGETS
* state, which indicates that the link is apparently functional but that
* in.mpathd is unable to send probes to verify functionality (in this case,
* in.mpathd makes the optimistic assumption that the interface is working
* correctly and thus does not mark the interface FAILED, but reports it as
* IPMP_IF_UNKNOWN through the async events and query interfaces).
*
* At any point, a phyint may be administratively marked offline via if_mpadm.
* In this case, the interface always transitions to PI_OFFLINE, regardless
* of its previous state. When the interface is later brought back online,
* in.mpathd acts as if the interface is new (and thus it transitions to
* PI_RUNNING or PI_FAILED based on the status of the link and the result of
* its probes, if probes are sent).
*
* pi_state - PI_RUNNING or PI_FAILED
* PI_RUNNING: The failure detection logic says the phyint is good.
* PI_FAILED: The failure detection logic says the phyint has failed.
*
* pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
* PG_OK: All interfaces in the group are OK.
* PG_DEGRADED: Some interfaces in the group are unusable.
* PG_FAILED: All interfaces in the group are unusable.
*
* In the case of router targets, we assume that the current list of
* targets obtained from the routing table, is still valid, so the
* phyint state is PI_FAILED. In the case of host targets, we delete the
* list of targets, and multicast to the all hosts, to reconstruct the
* target list. So the phyints are in the PI_NOTARGETS state.
*
* I - value of (pi_flags & IFF_INACTIVE)
* IFF_INACTIVE: This phyint will not send or receive packets.
* Usually, inactive is tied to standby interfaces that are not yet
* needed (e.g., no non-standby interfaces in the group have failed).
* When failback has been disabled (FAILBACK=no configured), phyint can
* also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
* subsequently recovers after a failure.
*
* Not all 9 possible combinations of the above 3-tuple are possible.
*
* I is tracked by IP. pi_state is tracked by mpathd.
*
* pi_state state machine
* ---------------------------------------------------------------------------
* Event State New State
* Action:
* ---------------------------------------------------------------------------
* IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
*
* IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
*
* IP interface repair (PI_FAILED, I == 0, FAILBACK=yes)
* detection -> (PI_RUNNING, I == 0)
* : clear IFF_FAILED on this phyint
*
* IP interface repair (PI_FAILED, I == 0, FAILBACK=no)
* detection -> (PI_RUNNING, I == 1)
* : clear IFF_FAILED on this phyint
* : if failback is disabled set I == 1
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_FAILED
* (Router targets) : set IFF_FAILED
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_NOTARGETS
* (Host targets) : set IFF_FAILED
* : delete the target list on all phyints
* ---------------------------------------------------------------------------
*/
/*
* Compose and transmit an ICMP ECHO REQUEST packet. The IP header
* will be added on by the kernel. The id field identifies this phyint,
* and the sequence number is an increasing (modulo 2^16) integer. The data
* portion holds the time value when the packet is sent. On echo this is
* extracted to compute the round-trip time. Three different types of
* probe packets are used.
*
* PROBE_UNI: This type is used to do failure detection / failure recovery
* and RTT calculation. PROBE_UNI probes are spaced apart in time,
* not less than the current CRTT. pii_probes[] stores data
* about these probes. These packets consume sequence number space.
*
* PROBE_RTT: This type is used to make only rtt measurements. Normally these
* are not used. Under heavy network load, the rtt may go up very high,
* due to a spike, or may appear to go high, due to extreme scheduling
* delays. Once the network stress is removed, mpathd takes a long time to
* recover, because the probe_interval is already high, and it takes
* a long time to send out sufficient number of probes to bring down the
* rtt. To avoid this problem, PROBE_RTT probes are sent out every
* user_probe_interval ms. and will cause only rtt updates. These packets
* do not consume sequence number space nor is information about these
* packets stored in the pii_probes[]
*
* PROBE_MULTI: This type is only used to construct a list of targets, when
* no targets are known. The packet is multicast to the all hosts addr.
*/
static void
{
int rval;
}
probe_type == PROBE_RTT);
probe_pkt.pr_icmp_code = 0;
probe_pkt.pr_icmp_cksum = 0;
/*
* Since there is no need to do arithmetic on the icmpid,
* (only equality check is done) pii_icmpid is stored in
* network byte order at initialization itself.
*/
/*
* If probe_type is PROBE_MULTI, this packet will be multicast to
* the all hosts address. Otherwise it is unicast to the next target.
*/
targaddrlen = sizeof (struct sockaddr_in6);
if (probe_type == PROBE_MULTI) {
} else if (probe_type == PROBE_UNI) {
} else { /* type is PROBE_RTT */
}
} else {
targaddrlen = sizeof (struct sockaddr_in);
if (probe_type == PROBE_MULTI) {
} else if (probe_type == PROBE_UNI) {
} else { /* type is PROBE_RTT */
}
/*
* Compute the IPv4 icmp checksum. Does not cover the IP header.
*/
}
/*
* Use the current time as the time we sent. Not atomic, but the best
* we can do from here.
*/
sent_hrtime = gethrtime();
/*
* If the send would block, this may either be transient or a hang in a
* lower layer. We pretend the probe was actually sent, the daemon will
* not see a reply to the probe and will fail the interface if normal
* failure detection criteria are met.
*/
} else {
}
/*
* If this is a PROBE_UNI probe packet being unicast to a target, then
* update our tables. We will need this info in processing the probe
* response. PROBE_MULTI and PROBE_RTT packets are not used for
* the purpose of failure or recovery detection. PROBE_MULTI packets
* are only used to construct a list of targets. PROBE_RTT packets are
* used only for updating the rtt and not for failure detection.
*/
/* Collect statistics, before we reuse the last slot. */
/*
* If we have a single variable to denote the next target to
* probe for both rtt probes and failure detection probes, we
* could end up with a situation where the failure detection
* probe targets become disjoint from the rtt probe targets.
* Eg. if 2 targets and the actual fdt is double the user
* specified fdt. So we have 2 variables. In this scheme
* we also reset pii_rtt_target_next for every fdt probe,
* though that may not be necessary.
*/
} else if (probe_type == PROBE_RTT) {
}
}
/*
* Incoming IPv4 data from wire, is received here. Called from main.
*/
void
{
int iphlen;
int len;
logdebug("in_data(%s %s)\n",
}
/*
* Poll has already told us that a message is waiting,
* on this socket. Read it now. We should not block.
*/
return;
}
/*
* If the datalink has indicated the link is down, don't go
* any further.
*/
return;
/* Get the printable address for error reporting */
/* Ignore packets > 64k or control buffers that don't fit */
logdebug("Truncated message: msg_flags 0x%x from %s\n",
}
return;
}
/* Make sure packet contains at least minimum ICMP header */
logdebug("in_data: packet too short (%d bytes)"
}
return;
}
/*
* Subtract the IP hdr length, 'len' will be length of the probe
* reply, starting from the icmp hdr.
*/
/* LINTED */
/* Probe replies are icmp echo replies. Ignore anything else */
return;
/*
* The icmp id should match what we sent, which is stored
* in pii_icmpid. The icmp code for reply must be 0.
* The reply content must be a struct pr_icmp
*/
/* Not in response to our probe */
return;
}
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code %d from %s on %s\n",
return;
}
logtrace("probe reply too short: %d bytes from %s on %s\n",
return;
}
logtrace("message without timestamp from %s on %s\n",
return;
}
/* Unicast probe reply */
/* Multicast reply */
} else {
/* Probably not in response to our probe */
logtrace("probe reply type: %d from %s on %s\n",
return;
}
}
/*
* Incoming IPv6 data from wire is received here. Called from main.
*/
void
{
int len;
void *opt;
logdebug("in6_data(%s %s)\n",
}
return;
}
/*
* If the datalink has indicated that the link is down, don't go
* any further.
*/
return;
/* Get the printable address for error reporting */
if (len < ICMP_MINLEN) {
logdebug("Truncated message: msg_flags 0x%x from %s\n",
}
return;
}
/* Ignore packets > 64k or control buffers that don't fit */
logdebug("Truncated message: msg_flags 0x%x from %s\n",
}
return;
}
return;
/* Not in response to our probe */
return;
}
/*
* The kernel has already verified the ICMP checksum.
*/
logtrace("ICMPv6 echo reply source address not linklocal from "
return;
}
/* Can't allow routing headers in probe replies */
logtrace("message with routing header from %s on %s\n",
return;
}
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code: %d from %s on %s\n",
return;
}
logtrace("probe reply too short: %d bytes from %s on %s\n",
return;
}
logtrace("message without timestamp from %s on %s\n",
return;
}
} else {
/* Probably not in response to our probe */
logtrace("probe reply type: %d from %s on %s\n",
}
}
/*
* Process the incoming rtt reply, in response to our rtt probe.
* Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
* have any stored information about the probe we sent. So we don't log
* any errors if we receive bad replies.
*/
static void
{
int64_t m; /* rtt measurement in ns */
/* Get the printable address for error reporting */
logdebug("incoming_rtt_reply: %s %s %s\n",
}
/* Do we know this target ? */
return;
/* Invalid rtt. It has wrapped around */
if (m < 0)
return;
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
*/
return;
/*
* Update rtt only if the new rtt is lower than the current rtt.
* (specified by the 3rd parameter to pi_set_crtt).
* If a spike has caused the current probe_interval to be >
* user_probe_interval, then this mechanism is used to bring down
* the rtt rapidly once the network stress is removed.
* If the new rtt is higher than the current rtt, we don't want to
* update the rtt. We are having more than 1 outstanding probe and
* the increase in rtt we are seeing is being unnecessarily weighted
* many times. The regular rtt update will be handled by
* incoming_echo_reply() and will take care of any rtt increase.
*/
/*
* If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER,
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
logerr("Improved failure detection time %d ms "
"on (%s %s) for group \"%s\"\n",
}
/* Avoid any truncation or rounding errors */
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
}
}
}
}
/*
* Process the incoming echo reply, in response to our unicast probe.
* Common for both IPv4 and IPv6
*/
static void
{
int64_t m; /* rtt measurement in ns */
int pr_ndx;
/* Get the printable address for error reporting */
logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
}
/* Reject out of window probe replies */
logtrace("out of window probe seq %u snxt %u on %s from %s\n",
return;
}
cur_hrtime = gethrtime();
if (m < 0) {
/*
* This is a ridiculously high value of rtt. rtt has wrapped
* around. Log a message, and ignore the rtt.
*/
logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
}
/*
* Get the probe index pr_ndx corresponding to the received icmp seq.
* number in our pii->pii_probes[] array. The icmp sequence number
* pii_snxt corresponds to the probe index pii->pii_probe_next
*/
/*
* Perform sanity checks, whether this probe reply that we
* have received is genuine
*/
/*
* Compare the src. addr of the received ICMP or ICMPv6
* probe reply with the target address in our tables.
*/
/*
* We don't have any record of having sent a probe to
* this target. This is a fake probe reply. Log an error
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
return;
/*
* The address matches, but our tables indicate that
* this probe reply has been acked already. So this
* is a duplicate probe reply. Log an error
*/
logtrace("probe status %d Duplicate probe reply seq %u "
"snxt %u on %s from %s\n",
return;
}
} else {
/*
* Target must not be NULL in the PR_UNACKED state
*/
/*
* The probe stats slot is unused. So we didn't
* send out any probe to this target. This is a fake.
* Log an error.
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
}
return;
}
/*
* If the rtt does not appear to be right, don't update the
* rtt stats. This can happen if the system dropped into the
* debugger, or the system was hung or too busy for a
* substantial time that we didn't get a chance to run.
*/
/*
* If the probe corresponding to this received response
* was truly sent 'm' ns. ago, then this response must
* have been rejected by the sequence number checks. The
* fact that it has passed the sequence number checks
* means that the measured rtt is wrong. We were probably
* scheduled long after the packet was received.
*/
goto out;
}
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
*/
goto out;
/*
* Don't update the Conservative Round Trip Time estimate for this
* (phyint, target) pair if this is not the highest ack seq seen
* thus far on this target.
*/
goto out;
/*
* Always update the rtt. This is a failure detection probe
* and we want to measure both increase / decrease in rtt.
*/
/*
* If the crtt exceeds the average time between probes,
* investigate if this slow target is an exception. If so we
* can avoid this target and still meet the failure detection
* time. Otherwise we can't meet the failure detection time.
*/
if (exception) {
/*
* This target is exceptionally slow. Don't use it
* for future probes. check_exception_target() has
* made sure that we have at least MIN_PROBE_TARGETS
* other active targets
*/
if (pii->pii_targets_are_routers) {
/*
* This is a slow router, mark it as slow
* and don't use it for further probes. We
* don't delete it, since it will be populated
* again when we do a router scan. Hence we
* need to maintain extra state (unlike the
* host case below). Mark it as TG_SLOW.
*/
pii->pii_ntargets--;
}
} else {
/*
* the slow target is not a router, we can
* just delete it. Send an icmp multicast and
* pick the fastest responder that is not
* already an active target. target_delete()
* adjusts pii->pii_target_next
*/
}
} else {
/*
* We can't meet the failure detection time.
* Log a message, and update the detection time to
* whatever we can achieve.
*/
if (pg != phyint_anongroup) {
logtrace("Cannot meet requested failure"
" detection time of %d ms on (%s %s) new"
" failure detection time for group \"%s\""
" is %d ms\n", user_failure_detection_time,
}
}
/*
* If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
if (pg != phyint_anongroup) {
logtrace("Improved failure detection time %d ms"
" on (%s %s) for group \"%s\"\n",
}
/* Avoid any truncation or rounding errors */
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
}
}
}
out:
/*
* Update pii->pii_rack, i.e. the sequence number of the last received
* probe response, based on the echo reply we have received now, if
* either of the following conditions are satisfied.
* a. pii_rack is outside the current receive window of
* [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
* This means we have not received probe responses for a
* long time, and the sequence number has wrapped around.
* b. pii_rack is within the current receive window and this echo
* reply corresponds to the highest sequence number we have seen
* so far.
*/
}
}
/*
* Returns true if seq is the highest unacknowledged seq for target tg
* else returns false
*/
static boolean_t
{
int pr_ndx;
/*
* Get the seq number of the most recent probe sent so far,
* and also get the corresponding probe index in the probe stats
* array.
*/
pr_seq--;
/*
* Start from the most recent probe and walk back, trying to find
* an acked probe corresponding to target tg.
*/
return (_B_FALSE);
}
}
return (_B_TRUE);
}
/*
* Check whether the crtt for the group has improved by a factor of
* LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure
* detection time flapping in the face of small crtt changes.
*/
static boolean_t
{
logdebug("check_pg_crtt_improved()\n");
/*
* The crtt for the group is only improved if each phyint_instance
* for both ipv4 and ipv6 is improved.
*/
return (_B_FALSE);
}
return (_B_TRUE);
}
/*
* Check whether the crtt has improved substantially on this phyint_instance.
* Returns _B_TRUE if there's no crtt information available, because pii
* is NULL or the phyint_instance is not capable of probing.
*/
return (_B_TRUE);
if (!PROBE_CAPABLE(pii) ||
return (_B_TRUE);
continue;
return (_B_FALSE);
}
}
return (_B_TRUE);
}
/*
* This target responds very slowly to probes. The target's crtt exceeds
* the probe interval of its group. Compare against other targets
* and determine if this target is an exception, if so return true, else false
*/
static boolean_t
{
logdebug("check_exception_target(%s %s target %s)\n",
}
/*
* We should have at least MIN_PROBE_TARGETS + 1 good targets now,
* to make a good judgement. Otherwise don't drop this target.
*/
return (_B_FALSE);
/*
* Determine whether only this particular target is slow.
* We know that this target's crtt exceeds the group's probe interval.
* If all other active targets have a
* crtt < (this group's probe interval) / EXCEPTION_FACTOR,
* then this target is considered slow.
*/
return (_B_FALSE);
}
}
}
return (_B_TRUE);
}
/*
* Update the target list. The icmp all hosts multicast has given us
* some host to which we can send probes. If we already have sufficient
* targets, discard it.
*/
static void
/* ARGSUSED */
{
int af;
logdebug("incoming_mcast_reply(%s %s %s)\n",
}
/*
* Using host targets is a fallback mechanism. If we have
* found a router, don't add this host target. If we already
* know MAX_PROBE_TARGETS, don't add another target.
*/
if (pii->pii_targets_are_routers ||
return;
}
}
if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
/*
* Guard against response from 0.0.0.0
* and ::. Log a trace message
*/
logtrace("probe response from %s on %s\n",
return;
}
/*
* This address is one of our own, so reject this address as a
* valid probe target.
*/
if (own_address(fromaddr))
return;
/*
* If the phyint is part of a named group, then add the address to all
* members of the group. Otherwise, add the address only to the
* phyint itself, since other phyints in the anongroup may not be on
* the same subnet.
*/
} else {
}
}
/*
* Compute CRTT given an existing scaled average, scaled deviation estimate
* and a new rtt time. The formula is from Jacobson and Karels'
* "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
* are the same as those in Appendix A.2 of that paper.
*
* m = new measurement
* sa = scaled RTT average (8 * average estimates)
* sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
* crtt = Conservative round trip time. Used to determine whether probe
* has timed out.
*
* New scaled average and deviation are passed back via sap and svp
*/
static int64_t
{
if (sa != -1) {
/*
* Update average estimator:
* new rtt = old rtt + 1/8 Error
* where Error = m - old rtt
* i.e. 8 * new rtt = 8 * old rtt + Error
* i.e. new sa = old sa + Error
*/
if ((sa += m) < 0) {
/* Don't allow the smoothed average to be negative. */
sa = 0;
}
/*
* Update deviation estimator:
* new mdev = old mdev + 1/4 (abs(Error) - old mdev)
* i.e. 4 * new mdev = 4 * old mdev +
* (abs(Error) - old mdev)
* i.e. new sv = old sv + (abs(Error) - old mdev)
*/
if (m < 0)
m = -m;
m -= sv >> 2;
sv += m;
} else {
/* Initialization. This is the first response received. */
sa = (m << 3);
sv = (m << 1);
}
logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
}
/*
* CRTT = average estimates + 4 * deviation estimates
* = sa / 8 + sv
*/
return (crtt);
}
static void
{
int new_crtt;
int i;
logdebug("pi_set_crtt: target - m %lld\n", m);
/* store the round trip time, in case we need to defer computation */
/*
* If this probe's round trip time would singlehandedly cause an
* increase in the group's probe interval consider it suspect.
*/
logdebug("Received a suspect probe on %s, new_crtt ="
" %d, probe_interval = %d, num_deferred = %d\n",
}
/*
* If we've deferred as many rtts as we plan on deferring, then
* assume the link really did slow down and process all queued
* rtts
*/
logdebug("Received MAXDEFERREDRTT probes which "
"would cause an increased probe_interval. "
"Integrating queued rtt data points.\n");
}
for (i = 0; i <= tg->tg_num_deferred; i++) {
}
tg->tg_num_deferred = 0;
} else {
tg->tg_num_deferred++;
}
return;
}
/*
* If this is a normal probe, or an RTT probe that would lead to a
* reduced CRTT, then update our CRTT data. Further, if this was
* a normal probe, pitch any deferred probes since our probes are
* again being answered within our CRTT estimates.
*/
if (is_probe_uni)
tg->tg_num_deferred = 0;
}
}
/*
* Return a pointer to the specified option buffer.
* If not found return NULL.
*/
static void *
{
}
}
return (NULL);
}
/*
* Try to activate another INACTIVE interface in the same group as `pi'.
* Prefer STANDBY INACTIVE to just INACTIVE.
*/
void
{
return;
continue;
inactivepi = pi2;
break;
}
if (inactivepi != NULL)
}
/*
* Transition a phyint to PI_RUNNING. The caller must ensure that the
* transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if
* appropriate. Also sets IFF_INACTIVE on this or other interfaces as
* appropriate (see comment below). Finally, also updates the phyint's group
* state to account for the change.
*/
void
{
/*
* The interface is running again, but should it or another interface
* in the group end up INACTIVE? There are three cases:
*
* 1. If it's a STANDBY interface, it should end up INACTIVE if
* the group is operating at capacity (i.e., there are at least as
* many active interfaces as non-STANDBY interfaces in the group).
* No other interfaces should be changed.
*
* 2. If it's a non-STANDBY interface and we're onlining it or
* FAILBACK is enabled, then it should *not* end up INACTIVE.
* Further, if the group is above capacity as a result of this
* interface, then an active STANDBY interface in the group should
* end up INACTIVE.
*
* 3. If it's a non-STANDBY interface, we're repairing it, and
* FAILBACK is disabled, then it should end up INACTIVE *unless*
* the group was failed (in which case we have no choice but to
* use it). No other interfaces should be changed.
*/
nnonstandby++;
if (phyint_is_functioning(pi2) &&
nactive++;
actstandbypi = pi2;
}
}
}
set = 0;
if (nactive >= nnonstandby)
set |= IFF_INACTIVE;
else
clear |= IFF_INACTIVE;
set |= IFF_INACTIVE;
}
/*
* Update the group state to account for the change.
*/
}
/*
* Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
* to have at least one active interface and as many active interfaces as
* non-standby interfaces.
*/
void
{
/*
* All phyints in the anonymous group are effectively in their own
* group and thus active regardless of whether they're marked standby.
*/
return;
}
/*
* If the phyint isn't functioning we can't consider it.
*/
if (!phyint_is_functioning(pi))
return;
nnonstandby++;
if (phyint_is_functioning(pi2) &&
nactive++;
}
else if (nactive > nnonstandby)
}
/*
* See if a previously failed interface has started working again.
*/
void
{
if (!phyint_repaired(pi))
return;
} else {
logerr("IP interface repair detected on %s of group %s\n",
}
/*
* If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
* So just clear IFF_OFFLINE and defer phyint_transition_to_running()
* until it is brought back online.
*/
return;
}
}
/*
* See if an interface has failed, or if the whole group of interfaces has
* failed.
*/
static void
{
switch (failure_state(pii)) {
case PHYINT_FAILURE:
logerr("IP interface failure detected on %s\n",
} else {
logerr("IP interface failure detected on %s of group"
}
/*
* If the failed interface was active, activate another
* INACTIVE interface in the group if possible.
*/
if (was_active)
/*
* If the interface is offline, the state change will be
* noted when it comes back online.
*/
}
break;
case GROUP_FAILURE:
continue;
/*
* In the case of host targets, we would have flushed
* the targets, and gone to PI_NOTARGETS state.
*/
}
break;
default:
break;
}
}
/*
* Determines if any timeout event has occurred and returns the number of
* milliseconds until the next timeout event for the phyint. Returns
* TIMER_INFINITY for "never".
*/
{
int pr_ndx;
int valid_unack_count;
int i;
int interval;
cur_hrtime = gethrtime();
logdebug("phyint_inst_timer(%s %s)\n",
}
/*
* enough time has passed, then try to bring the interface
* back up; otherwise, schedule a timer to bring it back up
* when enough time *has* elapsed.
*/
if (check_time > cur_time)
return (check_time - cur_time);
}
}
/*
* If probing is not enabled on this phyint instance, don't proceed.
*/
if (!PROBE_ENABLED(pii))
return (TIMER_INFINITY);
/*
* If the timer has fired too soon, probably triggered
* by some other phyint instance, return the remaining
* time
*/
/*
* If the link is down, don't send any probes for now.
*/
return (TIMER_INFINITY);
/*
* Randomize the next probe time, between MIN_RANDOM_FACTOR
* and MAX_RANDOM_FACTOR with respect to the base probe time.
* Base probe time is strictly periodic.
*/
(int)(MIN_RANDOM_FACTOR * user_probe_interval),
(int)(MAX_RANDOM_FACTOR * user_probe_interval));
/*
* Check if the current time > next time to probe. If so, we missed
* sending 1 or more probes, probably due to heavy system load. At least
* 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
* were scheduled. Make adjustments to the times, in multiples of
* user_probe_interval.
*/
int n;
logtrace("missed sending %d probes cur_time %u snxt_time %u"
/* Collect statistics about missed probes */
}
logdebug("cur_time %u snxt_time %u snxt_basetime %u"
}
/*
* If no targets are known, we need to send an ICMP multicast. The
* probe type is PROBE_MULTI. We'll check back in 'interval' msec
* to see if we found a target.
*/
return (interval);
}
if ((user_probe_interval != probe_interval) &&
/*
* the failure detection (fd) probe timer has not yet fired.
* Need to send only an rtt probe. The probe type is PROBE_RTT.
*/
return (interval);
}
/*
* the fd probe timer has fired. Need to do all failure
* detection / recovery calculations, and then send an fd probe
* of type PROBE_UNI.
*/
if (user_probe_interval == probe_interval) {
/*
* We could have missed some probes, and then adjusted
* pii_snxt_basetime above. Otherwise we could have
* blindly added probe_interval to pii_fd_snxt_basetime.
*/
} else {
int n;
}
}
/*
* We can have at most, the latest 2 probes that we sent, in
* the PR_UNACKED state. All previous probes sent, are either
* PR_LOST or PR_ACKED. An unacknowledged probe is considered
* timed out if the probe's time_start + the CRTT < currenttime.
* For each of the last 2 probes, examine whether it has timed
* out. If so, mark it PR_LOST. The probe stats is a circular array.
*/
valid_unack_count = 0;
for (i = 0; i < 2; i++) {
case PR_ACKED:
/*
* We received back an ACK, so the switch clearly
* is not dropping our traffic, and thus we can
* enable failure detection immediately.
*/
logdebug("successful probe on %s; "
"ending quiet period\n",
}
}
break;
case PR_UNACKED:
/*
* The crtt could be zero for some reason,
* Eg. the phyint could be failed. If the crtt is
* not available use group's probe interval,
* which is a worst case estimate.
*/
} else {
}
} else if (i == 1) {
/*
* We are forced to consider this probe
* lost, as we can have at most 2 unack.
* probes any time, and we will be sending a
* probe at the end of this function.
* Normally, we should not be here, but
* this can happen if an incoming response
* that was considered lost has increased
* the crtt for this target, and also bumped
* up the FDT. Note that we never cancel or
* increase the current pii_time_left, so
* when the timer fires, we find 2 valid
* unacked probes, and they are yet to timeout
*/
} else {
/*
* Only the most recent probe can enter
* this 'else' arm. The second most recent
* probe must take either of the above arms,
* if it is unacked.
*/
}
break;
}
}
/*
* We send out 1 probe randomly in the interval between one half
* and one probe interval for the group. Given that the CRTT is always
* less than the group's probe interval, we can have at most 1
* unacknowledged probe now. All previous probes are either lost or
* acked.
*/
/*
* The timer has fired. Take appropriate action depending
* on the current state of the phyint.
*
* PI_RUNNING state - Failure detection
* PI_FAILED state - Repair detection
*/
case PI_FAILED:
/*
* If the most recent probe (excluding unacked probes that
* are yet to time out) has been acked, check whether the
* phyint is now repaired.
*/
}
break;
case PI_RUNNING:
/*
* It's possible our probes have been lost because of a
* spanning-tree mandated quiet period on the switch. If so,
* ignore the lost probes.
*/
break;
/*
* We have 1 or more failed probes (excluding unacked
* probes that are yet to time out). Determine if the
* phyint has failed.
*/
}
break;
default:
logerr("phyint_inst_timer: invalid state %d\n",
abort();
}
/*
* Start the next probe. probe() will also set pii->pii_probe_time_left
* to the group's probe interval. If phyint_failed -> target_flush_hosts
* was called, the target list may be empty.
*/
/*
* If we have just the one probe target, and we're not using
* router targets, try to find another as we presently have
* no resilience.
*/
} else {
}
return (interval);
}
/*
* Start the probe timer for an interface instance.
*/
void
{
/*
* Spread the base probe times (pi_snxt_basetime) across phyints
* uniformly over the (curtime..curtime + the group's probe_interval).
* pi_snxt_basetime is strictly periodic with a frequency of
* the group's probe interval. The actual probe time pi_snxt_time
* adds some randomness to pi_snxt_basetime and happens in probe().
* For the 1st probe on each phyint after the timer is started,
* pi_snxt_time and pi_snxt_basetime are the same.
*/
interval = GET_RANDOM(0,
}
/*
* Restart the probe timer on an interface instance.
*
* This is a no-op unless the instance's base time has already been
* initialized (pii->pii_basetime_inited != 0).
*/
static void
{
/*
* We don't need to restart the timer if it was never started in
* the first place (pii->pii_basetime_inited not set), as the timer
* won't have gone off yet.
*/
if (pii->pii_basetime_inited != 0) {
/* Trace the restart when the D_LINKNOTE debug flag is set. */
if (debug & D_LINKNOTE)
logdebug("restart timer: restarting timer on %s, "
}
}
/*
* Handle a link that has just gone down (the debug message in the body
* identifies this routine as process_link_state_down).
*
* The probe statistics are discarded and the interface is then checked
* for failure, as described by the comments in the body.
*/
static void
{
/*
* Clear the probe statistics arrays; we don't want the repair
* detection logic relying on probes that were successful prior
* to the link going down.
*/
/*
* Check for interface failure. Although we know the interface
* has failed, we don't know if all the other interfaces in the
* group have failed as well.
*/
/* Trace the outcome when the D_LINKNOTE debug flag is set. */
if (debug & D_LINKNOTE) {
logdebug("process_link_state_down:"
}
}
}
/*
* Handle a link that has come back up by restarting the per-instance
* timers.
* NOTE(review): presumably this is the link-up counterpart of the
* link-down handling -- confirm against the full source.
*/
static void
{
/*
* We stopped any running timers on each instance when the link
* went down, so restart them.
*/
}
/*
* Process any changes in link state passed up from the interfaces.
*
* The previous link state (old_link_state_up) is compared with the
* current one, so that only transitions -- up to down, or down to up --
* are acted upon, rather than the steady state.
*/
void
{
/* Look for interfaces where the link state has just changed. */
/*
* Except when the "phyint" structure is created, this is
* the only place the link state is updated. This allows
* this routine to detect changes in link state, rather
* than just the current state.
*/
/*
* Has the link just gone down? (It was up before.)
*/
if (old_link_state_up)
} else {
/*
* Has the link just gone back up? (It was down before.)
*/
if (!old_link_state_up)
}
}
}
void
{
}
}
}
}
}
/*
* Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
* probes on both the IPv4 and the IPv6 instance.
* If the interface has failed, return the time of the first probe failure
* in "tff".
*
* Returns PHYINT_FAILURE if the failure criterion is met, and PHYINT_OK
* otherwise.
*/
static int
{
int pr_ndx;
/*
* Get the number of consecutive failed probes on
* this phyint across all targets. Also get the number
* of consecutive failed probes on this target only.
*/
/* Get the time of first failure, for later use. */
/*
* If the current target has not responded to the
* last NUM_PROBE_FAILS probes, and other targets are
* responding, delete this target. Dead gateway detection
* will eventually remove this target (if it is a router) from
* the routing tables. If that does not occur, we may end
* up adding this target to our list again.
*/
/* With router targets, the target count is decremented here. */
if (pii->pii_targets_are_routers) {
pii->pii_ntargets--;
} else {
}
return (PHYINT_OK);
}
/*
* If the phyint has lost NUM_PROBE_FAILS or more
* consecutive probes on both the IPv4 and the IPv6 protocol
* instance of the phyint, then trigger failure
* detection; otherwise return PHYINT_OK.
*/
return (PHYINT_OK);
if (PROBE_CAPABLE(pii_other)) {
/*
* We have NUM_PROBE_FAILS or more failures
* on both IPv4 and IPv6. Get the earliest
* time when failure was detected on this
* phyint across IPv4 and IPv6.
*/
} else {
/*
* This instance has fewer than NUM_PROBE_FAILS
* failures, so return PHYINT_OK.
*/
return (PHYINT_OK);
}
}
return (PHYINT_FAILURE);
}
/*
* Check if the link has gone down on this phyint, or it has failed the
* last NUM_PROBE_FAILS consecutive probes on both the IPv4 and the IPv6
* instance. Also look at the other phyints of this group, to detect
* group failures.
*
* Returns PHYINT_OK when there is nothing to report, PHYINT_FAILURE
* immediately for a phyint in the anonymous group, and otherwise the
* value computed in retval (PHYINT_FAILURE or GROUP_FAILURE, per the
* comments in the body).
*/
int
{
int retval;
return (PHYINT_OK);
/*
* At this point, the link is down, or the phyint is suspect, as it
* has lost NUM_PROBE_FAILS or more probes. If the phyint does not
* belong to any group, this is a PHYINT_FAILURE. Otherwise, continue
* on to determine whether this should be considered a PHYINT_FAILURE
* or GROUP_FAILURE.
*/
if (pg == phyint_anongroup)
return (PHYINT_FAILURE);
/*
* Need to compare against other phyints of the same group
* to exclude group failures. If the failure was detected via
* probing, then if the time of last success (tls) of any other
* phyint is more recent than the time of first fail (tff) of the
* phyint in question, and the link is up on that phyint,
* then it is a phyint failure. Otherwise it is a group failure.
* If the failure was detected via a link down notification sent from
* the driver to IP, we see if any phyints in the group are still
* running and haven't received a link down notification. We
* will usually be processing the link down notification shortly
* after it was received, so there is no point looking at the tls
* of other phyints.
*/
/* Exclude ourselves from the comparison. */
continue;
/*
* We use FLAGS_TO_LINK_STATE() to test the flags
* directly, rather than LINK_UP() or LINK_DOWN(), as
* we may not have got round to processing the link
* state for the other phyints in the group yet.
*
* The check for PI_RUNNING and group failure handles
* the case when the group begins to recover.
* PI_RUNNING will be set, and group failure cleared,
* only after receipt of NUM_PROBE_REPAIRS, by which
* time the other phyints should have received at
* least 1 packet, and so will not have NUM_PROBE_FAILS.
*/
break;
}
continue;
}
continue;
/*
* If there's no probe-based failure detection on this
* interface, and its link is still up, then it's still
* working and thus the group has not failed.
*/
break;
}
/*
* Need to compare against both IPv4 and IPv6 instances.
*/
if (psinfo.ps_tls_valid) {
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
!GROUP_FAILED(pg) &&
break;
}
}
}
if (psinfo.ps_tls_valid) {
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
!GROUP_FAILED(pg) &&
break;
}
}
}
}
/*
* Update the group state to account for the changes.
*/
return (retval);
}
/*
* Return the information associated with consecutive probe successes,
* starting with the most recent probe. At most the last 2 probes can be
* in the unacknowledged state. All previous probes have either failed
* or succeeded.
*
* The results are written through `psinfo' (for example the per-target
* success count ps_nsucc_tg and the ps_tls_valid flag). The probes are
* walked backwards, one PROBE_INDEX_PREV() step at a time.
*/
static void
struct probe_success_count *psinfo)
{
uint_t i;
now = getcurrenttime();
/*
* Start with the most recent probe, and count the number
* of consecutive probe successes. Latch the number of successes
* on hitting a failure.
*/
i = PROBE_INDEX_PREV(i)) {
case PR_UNACKED:
/*
* Only the most recent 2 probes can be unacknowledged.
*/
/*
* The crtt could be zero for some reason,
* e.g. the phyint could be failed. If the crtt is
* not available, use the value of the group's probe
* interval, which is a worst case estimate.
*/
} else {
timeout +=
}
/*
* We hit a failure. Latch the total number of
* recent consecutive successes.
*/
/*
* We hit a failure for the desired
* target. Latch the number of recent
* consecutive successes for this target.
*/
}
}
break;
case PR_ACKED:
/*
* Bump up the count of probe successes, if we
* have not seen any failure so far.
*/
if (!pi_found_failure)
!tg_found_failure) {
psinfo->ps_nsucc_tg++;
}
/*
* Record the time of last success, if this is
* the most recent probe success.
*/
if (!psinfo->ps_tls_valid) {
}
break;
case PR_LOST:
/*
* We hit a failure. Latch the total number of
* recent consecutive successes.
*/
/*
* We hit a failure for the desired target.
* Latch the number of recent consecutive
* successes for this target.
*/
}
break;
default:
/* Any other probe state ends the scan. */
return;
}
}
}
/*
* Return the information associated with consecutive probe failures,
* starting with the most recent probe. Only the last 2 probes can be in
* the unacknowledged state. All previous probes have either failed or
* succeeded.
*
* The results are written through `pfinfo' (for example the per-target
* failure count pf_nfail_tg). The probes are walked backwards, one
* PROBE_INDEX_PREV() step at a time.
*/
static void
struct probe_fail_count *pfinfo)
{
int i;
int most_recent;
int second_most_recent;
now = getcurrenttime();
/*
* Start with the most recent probe, and count the number
* of consecutive probe failures. Latch the number of failures
* on hitting a probe success.
*/
i = PROBE_INDEX_PREV(i)) {
case PR_UNACKED:
/*
* Only the most recent 2 probes can be unacknowledged.
*/
/*
* The target is guaranteed to exist in the
* unacknowledged state.
*/
/*
* The crtt could be zero for some reason,
* e.g. the phyint could be failed. If the crtt is
* not available, use the group's probe interval,
* which is a worst case estimate.
*/
} else {
timeout +=
}
break;
/* FALLTHRU */
case PR_LOST:
/* Count the failure unless a success has already been seen. */
if (!pi_found_success) {
}
!tg_found_success) {
pfinfo->pf_nfail_tg++;
}
break;
default:
/*
* We hit a success or unused slot. Latch the
* total number of recent consecutive failures.
*/
/*
* We hit a success for the desired target.
* Latch the number of recent consecutive
* failures for this target.
*/
}
}
}
}
/*
* Change the state of probe `pr' on phyint_instance `pii' to state `state'.
*/
void
{
return;
}
/*
* Check if the phyint has been repaired. If no test address has been
* configured, then consider the interface repaired if the link is up (unless
* the link is flapping; see below). Otherwise, look for proof of probes
* being sent and received. If the last NUM_PROBE_REPAIRS probes are fine on
* either the IPv4 or the IPv6 instance, the phyint can be considered
* repaired.
*
* Returns _B_TRUE if the phyint is considered repaired, _B_FALSE otherwise.
*/
static boolean_t
{
int pr_ndx;
return (_B_FALSE);
/*
* If we don't have any test addresses and the link is up, then
* consider the interface repaired, unless we've received more than
* LINK_UP_PERMIN link up notifications in the last minute, in
* which case we keep the link down until we drop back below
* the threshold.
*/
cur_time = getcurrenttime();
/*
* Clear the message-printed flag (it gates the logerr() below)
* and report the interface as repaired.
*/
pi->pi_lfmsg_printed = 0;
return (_B_TRUE);
}
/*
* pi_lfmsg_printed is checked so that this warning is not logged
* on every call.
* NOTE(review): presumably the flag is set once the message has
* been printed -- the assignment is not visible here; confirm.
*/
if (!pi->pi_lfmsg_printed) {
logerr("The link has come up on %s more than %d times "
"in the last minute; disabling repair until it "
}
return (_B_FALSE);
}
/*
* Probe-based repair detection: a probe-capable instance that
* satisfies the repair criterion yields _B_TRUE.
* NOTE(review): presumably the two checks below cover the IPv4 and
* the IPv6 instance in turn -- confirm.
*/
if (PROBE_CAPABLE(pii)) {
return (_B_TRUE);
}
if (PROBE_CAPABLE(pii)) {
return (_B_TRUE);
}
return (_B_FALSE);
}
/*
* Set and/or clear flags on a phyint (the log messages in the body
* identify this routine as change_pif_flags).
*
* Returns _B_TRUE on success, including the case where the flags need
* no change; returns _B_FALSE if either the "get flags" or the
* "set flags" ioctl fails.
*/
{
int ifsock;
logdebug("change_pif_flags(%s): set %llx clear %llx\n",
}
else
/*
* Get the current flags, and then set/clear the
* desired phyint flags. Since we set only phyint flags, we can
* do it on either the IPv4 or the IPv6 instance.
*/
logperror("change_pif_flags: ioctl (get flags)");
return (_B_FALSE);
}
/* No change in the flags. No need to send the ioctl. */
return (_B_TRUE);
}
logperror("change_pif_flags: ioctl (set flags)");
return (_B_FALSE);
}
/*
* Keep pi_flags in sync with the actual flags. Assumes the flags
* are phyint flags.
*/
return (_B_TRUE);
}
/*
* ICMP checksum computation for IPv4.
*
* Sums the input as a sequence of 16-bit words and returns the result
* in `answer'; see the comments in the body for the individual steps.
*/
static int
{
register int sum = 0;
/*
* Our algorithm is simple: using a 32 bit accumulator (sum),
* we add sequential 16 bit words to it, and at the end, fold
* back all the carry bits from the top 16 bits into the lower
* 16 bits.
*/
/* Consume the input one 16-bit word (two bytes) at a time. */
while (nleft > 1) {
sum += *w++;
nleft -= 2;
}
/* Mop up an odd trailing byte, if necessary. */
if (nleft == 1) {
}
/*
* Add back the carry outs from the top 16 bits to the low 16 bits.
*/
return (answer);
}
static void
reset_snxt_basetimes(void)
{
}
}
/*
* Is the address one of our own addresses? Unfortunately,
* we cannot check our phyint tables to determine if the address
* is our own. This is because we don't track interfaces that
* are not part of any group. We have to either use a 'bind' or
* get the complete list of all interfaces using SIOCGLIFCONF,
* to do this check. We could also use SIOCTMYADDR.
* Bind fails for the local zone address, so we might include a local zone
* address as a target address. If a local zone address is a target address
* and it is up, it is not possible to detect the interface failure.
* SIOCTMYADDR also doesn't consider a local zone address as our own address.
* So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
* are stored in `localaddrs'.
*
* Returns _B_TRUE when a match is found, and _B_FALSE otherwise.
*/
{
return (_B_TRUE);
}
return (_B_FALSE);
}
static int
{
}
static int64_t
{
}