mpd_probe.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 1987 Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include "mpd_defs.h"
#include "mpd_tables.h"
/*
* Probe types for probe()
*/
/*
* Format of probe / probe response packets. This is an ICMP Echo request
* or ICMP Echo reply. The packet format is the same for both IPv4 and IPv6.
*/
struct pr_icmp
{
};
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x1 } };
static void reset_snxt_basetimes(void);
/*
* CRTT - Conservative Round Trip Time Estimate
* Probe success - A matching probe reply received before CRTT ms has elapsed
* after sending the probe.
* Probe failure - No probe reply received and more than CRTT ms has elapsed
* after sending the probe.
*
* TLS - Time last success. Most recent probe ack received at this time.
* TFF - Time first fail. The time of the earliest probe failure in
* a consecutive series of probe failures.
* NUM_PROBE_REPAIRS - Number of consecutive successful probes required
* before declaring phyint repair.
* NUM_PROBE_FAILS - Number of consecutive probe failures required to
* declare a phyint failure.
*
* Phyint state diagram
*
* The state of a phyint that is capable of being probed, is completely
* specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
*
* A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
* of the link (according to the driver). If the phyint is also configured
* with a test address (the common case) and probe targets, then a phyint must
* also successfully be able to send and receive probes in order to remain in
* the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
*
* Further, if a PI_RUNNING phyint is configured with a test address but is
* unable to find any probe targets, it will transition to the PI_NOTARGETS
* state, which indicates that the link is apparently functional but that
* in.mpathd is unable to send probes to verify functionality (in this case,
* in.mpathd makes the optimistic assumption that the interface is working
* correctly and thus does not perform a failover, but reports the interface
* as IPMP_IF_UNKNOWN through the async events and query interfaces).
*
* At any point, a phyint may be administratively marked offline via if_mpadm.
* In this case, the interface always transitions to PI_OFFLINE, regardless
* of its previous state. When the interface is later brought back online,
* in.mpathd acts as if the interface is new (and thus it transitions to
* PI_RUNNING or PI_FAILED based on the status of the link and the result of
* its probes, if probes are sent).
*
* pi_state - PI_RUNNING or PI_FAILED
* PI_RUNNING: The failure detection logic says the phyint is good.
* PI_FAILED: The failure detection logic says the phyint has failed.
*
* pg_groupfailed - Group failure, all interfaces in the group have failed.
* The pi_state may be either PI_FAILED or PI_NOTARGETS.
* In the case of router targets, we assume that the current list of
* targets obtained from the routing table, is still valid, so the
* phyint state is PI_FAILED. In the case of host targets, we delete the
* list of targets, and multicast to the all hosts, to reconstruct the
* target list. So the phyints are in the PI_NOTARGETS state.
*
* I - value of (pi_flags & IFF_INACTIVE)
* IFF_INACTIVE: No failovers have been done to the standby, from
* other phyints. This phyint is an inactive standby.
*
* pi_empty
* This phyint has failed over successfully to another phyint, and
* this phyint is currently "empty". It does not host any addresses or
* multicast membership etc. This is the state of a phyint after a
* failover from the phyint has completed successfully and no subsequent
* 'failover to' or 'failback to' has occurred on the phyint.
* IP guarantees that no new logicals will be hosted nor any multicast
* joins permitted on the phyint, since the phyint is either failed or
* inactive. That is, pi_empty being set implies that the phyint is either
* failed or inactive.
*
* pi_full
* The phyint hosts all of its own addresses that it "owns". If the
* phyint was previously failed or inactive, failbacks to the phyint
* have completed successfully, i.e. no more failbacks to this phyint
* can produce any change in system state whatsoever.
*
* Not all 32 possible combinations of the above 5-tuple are possible.
* Furthermore some of the above combinations are transient. They may occur
* only because the failover or failback did not complete successfully. The
* invariants described below retry the incomplete operation until a stable
* state is reached.
*
* I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
* The following are the state machines. 'from' and 'to' are the src and
* dst phyints of the failover or failback.
*
* pi_empty state machine
* ---------------------------------------------------------------------------
* Event State -> New State
* ---------------------------------------------------------------------------
* successful completion from.pi_empty = 0 -> from.pi_empty = 1
* of failover
*
* Initiate failover to.pi_empty = X -> to.pi_empty = 0
*
* Initiate failback to.pi_empty = X -> to.pi_empty = 0
*
* group failure pi_empty = X -> pi_empty = 0
* ---------------------------------------------------------------------------
*
* pi_full state machine
* ---------------------------------------------------------------------------
* Event State -> New State
* ---------------------------------------------------------------------------
* successful completion to.pi_full = 0 -> to.pi_full = 1
* of failback from
* each of the other phyints
*
* Initiate failover from.pi_full = X -> from.pi_full = 0
*
* group failure pi_full = X -> pi_full = 0
* ---------------------------------------------------------------------------
*
* pi_state state machine
* ---------------------------------------------------------------------------
* Event State New State
* Action:
* ---------------------------------------------------------------------------
* NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
* : failover from this phyint to another
*
* NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 1)
* detection : set IFF_FAILED on this phyint
*
* NIC repair (PI_FAILED, I == 0) -> (PI_RUNNING, I == 0)
* detection : to.pi_empty = 0
* : failback to this phyint if enabled
* : clear IFF_FAILED on this phyint
*
* NIC repair (PI_FAILED, I == 1) -> (PI_RUNNING, I == 1)
* detection : clear IFF_FAILED on this phyint
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_FAILED
* (Router targets) : set IFF_FAILED
* : clear pi_empty and pi_full
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_NOTARGETS
* (Host targets) : set IFF_FAILED
* : clear pi_empty and pi_full
* : delete the target list on all phyints
* ---------------------------------------------------------------------------
*
* I state machine
* ---------------------------------------------------------------------------
* Event State Action:
* ---------------------------------------------------------------------------
* Turn on I pi_empty == 0 : failover from standby
*
* Turn off I PI_RUNNING, : pi_empty = 0
* pi_full == 0 : failback to this if enabled
* ---------------------------------------------------------------------------
*
* Assertions: (Read '==>' as implies)
*
* (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
* (pi_empty == 1) ==> (pi_full == 0)
* (pi_full == 1) ==> (pi_empty == 0)
*
* Invariants
*
* pg_groupfailed = 0 &&
* 1. (I == 1, pi_empty == 0) ==> initiate failover from standby
* 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
* 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
*
* 1. says that an inactive standby, that is not empty, has to be failed
* over. For a standby to be truly inactive, it should not host any
* addresses. So we move them to some other phyint. Usually we catch the
* turn on of IFF_INACTIVE, and perform this action. However if the failover
* did not complete successfully, then subsequently we have lost the edge
* trigger, and this invariant kicks in and completes the action.
*
* 2. says that any failed phyint that is not empty must be failed over.
* Usually we do the failover when we detect NIC failure. However if the
* failover does not complete successfully, this invariant kicks in and
* completes the failover. We exclude inactive standby which is covered by 1.
*
* 3. says that any running phyint that is not full must be failed back.
* Usually we do the failback when we detect NIC repair. However if the
* failback does not complete successfully, this invariant kicks in and
* completes the failback. Note that we don't want to failback to an inactive
* standby.
*
* The invariants 1 - 3 and the actions are in initifs().
*/
struct probes_missed probes_missed;
/*
* Compose and transmit an ICMP ECHO REQUEST packet. The IP header
* will be added on by the kernel. The id field identifies this phyint,
* and the sequence number is an increasing (modulo 2^16) integer. The data
* portion holds the time value when the packet is sent. On echo this is
* extracted to compute the round-trip time. Three different types of
* probe packets are used.
*
* PROBE_UNI: This type is used to do failure detection / failure recovery
* and RTT calculation. PROBE_UNI probes are spaced apart in time,
* not less than the current CRTT. pii_probes[] stores data
* about these probes. These packets consume sequence number space.
*
* PROBE_RTT: This type is used to make only rtt measurements. Normally these
* are not used. Under heavy network load, the rtt may go up very high,
* due to a spike, or may appear to go high, due to extreme scheduling
* delays. Once the network stress is removed, mpathd takes a long time to
* recover, because the probe_interval is already high, and it takes
* a long time to send out sufficient number of probes to bring down the
* rtt. To avoid this problem, PROBE_RTT probes are sent out every
* user_probe_interval ms. and will cause only rtt updates. These packets
* do not consume sequence number space nor is information about these
* packets stored in the pii_probes[]
*
* PROBE_MULTI: This type is only used to construct a list of targets, when
* no targets are known. The packet is multicast to the all hosts addr.
*/
static void
{
int pr_ndx; /* probe index in pii->pii_probes[] */
}
probe_type == PROBE_RTT);
probe_pkt.pr_icmp_code = 0;
probe_pkt.pr_icmp_cksum = 0;
/*
* Since there is no need to do arithmetic on the icmpid,
* (only equality check is done) pii_icmpid is stored in
* network byte order at initialization itself.
*/
/*
* If probe_type is PROBE_MULTI, this packet will be multicast to
* the all hosts address. Otherwise it is unicast to the next target.
*/
if (probe_type == PROBE_MULTI) {
} else if (probe_type == PROBE_UNI) {
} else {
/* type is PROBE_RTT */
}
}
} else {
if (probe_type == PROBE_MULTI) {
} else if (probe_type == PROBE_UNI) {
} else {
/* type is PROBE_RTT */
}
/*
* Compute the IPv4 icmp checksum. Does not cover the IP header.
*/
}
}
/*
* If this is a PROBE_UNI probe packet being unicast to a target, then
* update our tables. We will need this info in processing the probe
* response. PROBE_MULTI and PROBE_RTT packets are not used for
* the purpose of failure or recovery detection. PROBE_MULTI packets
* are only used to construct a list of targets. PROBE_RTT packets are
* used only for updating the rtt and not for failure detection.
*/
/* Collect statistics, before we reuse the last slot. */
/*
* If we have a single variable to denote the next target to
* probe for both rtt probes and failure detection probes, we
* could end up with a situation where the failure detection
* probe targets become disjoint from the rtt probe targets.
* Eg. if 2 targets and the actual fdt is double the user
* specified fdt. So we have 2 variables. In this scheme
* we also reset pii_rtt_target_next for every fdt probe,
* though that may not be necessary.
*/
} else if (probe_type == PROBE_RTT) {
}
}
/*
* Incoming IPv4 data from wire, is received here. Called from main.
*/
void
{
struct sockaddr_in from;
int iphlen;
int len;
char abuf[INET_ADDRSTRLEN];
logdebug("in_data(%s %s)\n",
}
/*
* Poll has already told us that a message is waiting,
* on this socket. Read it now. We should not block.
*/
if (len < 0) {
return;
}
/*
* If the NIC has indicated the link is down, don't go
* any further.
*/
return;
/* Get the printable address for error reporting */
/* Make sure packet contains at least minimum ICMP header */
logdebug("in_data: packet too short (%d bytes)"
}
return;
}
/*
* Subtract the IP hdr length, 'len' will be length of the probe
* reply, starting from the icmp hdr.
*/
/* LINTED */
/* Probe replies are icmp echo replies. Ignore anything else */
return;
/*
* The icmp id should match what we sent, which is stored
* in pii_icmpid. The icmp code for reply must be 0.
* The reply content must be a struct pr_icmp
*/
/* Not in response to our probe */
return;
}
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code %d from %s on %s\n",
return;
}
logtrace("probe reply too short: %d bytes from %s on %s\n",
return;
}
/* Unicast probe reply */
/* Multicast reply */
} else {
/* Probably not in response to our probe */
logtrace("probe reply type: %d from %s on %s\n",
return;
}
}
/*
* Incoming IPv6 data from wire is received here. Called from main.
*/
void
{
struct sockaddr_in6 from;
int len;
char abuf[INET6_ADDRSTRLEN];
logdebug("in6_data(%s %s)\n",
}
return;
}
/*
* If the NIC has indicated that the link is down, don't go
* any further.
*/
return;
/* Get the printable address for error reporting */
if (len < ICMP_MINLEN) {
logdebug("Truncated message: msg_flags 0x%x from %s\n",
}
return;
}
/* Ignore packets > 64k or control buffers that don't fit */
logdebug("Truncated message: msg_flags 0x%x from %s\n",
}
return;
}
return;
/* Not in response to our probe */
return;
}
/*
* The kernel has already verified the ICMP checksum.
*/
logtrace("ICMPv6 echo reply source address not linklocal from "
return;
}
/* Can't allow routing headers in probe replies */
logtrace("message with routing header from %s on %s\n",
return;
}
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code: %d from %s on %s\n",
return;
}
logtrace("probe reply too short: %d bytes from %s on %s\n",
return;
}
} else {
/* Probably not in response to our probe */
logtrace("probe reply type: %d from %s on %s\n",
}
}
/*
* Process the incoming rtt reply, in response to our rtt probe.
* Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
* have any stored information about the probe we sent. So we don't log
* any errors if we receive bad replies.
*/
static void
{
int m; /* rtt measurment in ms */
char abuf[INET6_ADDRSTRLEN];
struct phyint_group *pg;
/* Get the printable address for error reporting */
logdebug("incoming_rtt_reply: %s %s %s\n",
}
/* Do we know this target ? */
return;
cur_time = getcurrenttime();
m = (int)(cur_time - pr_icmp_timestamp);
/* Invalid rtt. It has wrapped around */
if (m < 0)
return;
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
*/
return;
/*
* Update rtt only if the new rtt is lower than the current rtt.
* (specified by the 3rd parameter to pi_set_crtt).
* If a spike has caused the current probe_interval to be >
* user_probe_interval, then this mechanism is used to bring down
* the rtt rapidly once the network stress is removed.
* If the new rtt is higher than the current rtt, we don't want to
* update the rtt. We are having more than 1 outstanding probe and
* the increase in rtt we are seeing is being unnecessarily weighted
* many times. The regular rtt update will be handled by
* incoming_echo_reply() and will take care of any rtt increase.
*/
/*
* If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER,
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
logerr("Improved failure detection time %d ms "
"on (%s %s) for group \"%s\"\n",
}
/* Avoid any truncation or rounding errors */
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
}
}
}
}
/*
* Process the incoming echo reply, in response to our unicast probe.
* Common for both IPv4 and IPv6
*/
static void
{
int m; /* rtt measurment in ms */
char abuf[INET6_ADDRSTRLEN];
int pr_ndx;
/* Get the printable address for error reporting */
logdebug("incoming_echo_reply: %s %s %s seq %u\n",
}
/* Reject out of window probe replies */
logtrace("out of window probe seq %u snxt %u on %s from %s\n",
return;
}
cur_time = getcurrenttime();
m = (int)(cur_time - pr_icmp_timestamp);
if (m < 0) {
/*
* This is a ridiculously high value of rtt. rtt has wrapped
* around. Log a message, and ignore the rtt.
*/
logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
}
/*
* Get the probe index pr_ndx corresponding to the received icmp seq.
* number in our pii->pii_probes[] array. The icmp sequence number
* pii_snxt corresponds to the probe index pii->pii_probe_next
*/
/*
* Perform sanity checks, whether this probe reply that we
* have received is genuine
*/
/*
* Compare the src. addr of the received ICMP or ICMPv6
* probe reply with the target address in our tables.
*/
/*
* We don't have any record of having sent a probe to
* this target. This is a fake probe reply. Log an error
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
return;
/*
* The address matches, but our tables indicate that
* this probe reply has been acked already. So this
* is a duplicate probe reply. Log an error
*/
logtrace("probe status %d Duplicate probe reply seq %u "
"snxt %u on %s from %s\n",
return;
}
} else {
/*
* Target must not be NULL in the PR_UNACKED state
*/
/*
* The probe stats slot is unused. So we didn't
* send out any probe to this target. This is a fake.
* Log an error.
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
}
return;
}
/*
* If the rtt does not appear to be right, don't update the
* rtt stats. This can happen if the system dropped into the
* debugger, or the system was hung or too busy for a
* substantial time that we didn't get a chance to run.
*/
/*
* If the probe corresponding to this received response
* was truly sent 'm' ms. ago, then this response must
* have been rejected by the sequence number checks. The
* fact that it has passed the sequence number checks
* means that the measured rtt is wrong. We were probably
* scheduled long after the packet was received.
*/
goto out;
}
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
*/
goto out;
/*
* Don't update the Conservative Round Trip Time estimate for this
* (phyint, target) pair if this is not the highest ack seq seen
* thus far on this target.
*/
goto out;
/*
* Always update the rtt. This is a failure detection probe
* and we want to measure both increase / decrease in rtt.
*/
/*
* If the crtt exceeds the average time between probes,
* investigate if this slow target is an exception. If so we
* can avoid this target and still meet the failure detection
* time. Otherwise we can't meet the failure detection time.
*/
if (exception) {
/*
* This target is exceptionally slow. Don't use it
* for future probes. check_exception_target() has
* made sure that we have at least MIN_PROBE_TARGETS
* other active targets
*/
if (pii->pii_targets_are_routers) {
/*
* This is a slow router, mark it as slow
* and don't use it for further probes. We
* don't delete it, since it will be populated
* again when we do a router scan. Hence we
* need to maintain extra state (unlike the
* host case below). Mark it as TG_SLOW.
*/
pii->pii_ntargets--;
}
} else {
/*
* the slow target is not a router, we can
* just delete it. Send an icmp multicast and
* pick the fastest responder that is not
* already an active target. target_delete()
* adjusts pii->pii_target_next
*/
}
} else {
/*
* We can't meet the failure detection time.
* Log a message, and update the detection time to
* whatever we can achieve.
*/
if (pg != phyint_anongroup) {
logerr("Cannot meet requested failure detection"
" time of %d ms on (%s %s) new failure"
" detection time for group \"%s\" is %d"
" ms\n", user_failure_detection_time,
}
}
/*
* If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
if (pg != phyint_anongroup) {
logerr("Improved failure detection time %d ms "
}
/* Avoid any truncation or rounding errors */
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
}
}
}
out:
/*
* Update pii->pii_rack, i.e. the sequence number of the last received
* probe response, based on the echo reply we have received now, if
* either of the following conditions are satisfied.
* a. pii_rack is outside the current receive window of
* [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
* This means we have not received probe responses for a
* long time, and the sequence number has wrapped around.
* b. pii_rack is within the current receive window and this echo
* reply corresponds to the highest sequence number we have seen
* so far.
*/
}
}
/*
* Returns true if seq is the highest unacknowledged seq for target tg
* else returns false
*/
static boolean_t
{
struct phyint_instance *pii;
int pr_ndx;
/*
* Get the seq number of the most recent probe sent so far,
* and also get the corresponding probe index in the probe stats
* array.
*/
pr_seq--;
/*
* Start from the most recent probe and walk back, trying to find
* an acked probe corresponding to target tg.
*/
return (_B_FALSE);
}
}
return (_B_TRUE);
}
/*
* Check whether the crtt for the group has improved by a factor of
* LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure
* detection time flapping in the face of small crtt changes.
*/
static boolean_t
{
logdebug("check_pg_crtt_improved()\n");
/*
* The crtt for the group is only improved if each phyint_instance
* for both ipv4 and ipv6 is improved.
*/
return (_B_FALSE);
}
return (_B_TRUE);
}
/*
* Check whether the crtt has improved substantially on this phyint_instance.
* Returns _B_TRUE if there's no crtt information available, because pii
* is NULL or the phyint_instance is not capable of probing.
*/
return (_B_TRUE);
if (!PROBE_CAPABLE(pii) ||
return (_B_TRUE);
continue;
return (_B_FALSE);
}
}
return (_B_TRUE);
}
/*
* This target responds very slowly to probes. The target's crtt exceeds
* the probe interval of its group. Compare against other targets
* and determine if this target is an exception, if so return true, else false
*/
static boolean_t
{
char abuf[INET6_ADDRSTRLEN];
logdebug("check_exception_target(%s %s target %s)\n",
}
/*
* We should have at least MIN_PROBE_TARGETS + 1 good targets now,
* to make a good judgement. Otherwise don't drop this target.
*/
return (_B_FALSE);
/*
* Determine whether only this particular target is slow.
* We know that this target's crtt exceeds the group's probe interval.
* If all other active targets have a
* crtt < (this group's probe interval) / EXCEPTION_FACTOR,
* then this target is considered slow.
*/
return (_B_FALSE);
}
}
}
return (_B_TRUE);
}
/*
* Update the target list. The icmp all hosts multicast has given us
* some host to which we can send probes. If we already have sufficient
* targets, discard it.
*/
static void
/* ARGSUSED */
{
int af;
char abuf[INET6_ADDRSTRLEN];
logdebug("incoming_mcast_reply(%s %s %s)\n",
}
/*
* Using host targets is a fallback mechanism. If we have
* found a router, don't add this host target. If we already
* know MAX_PROBE_TARGETS, don't add another target.
*/
if (pii->pii_targets_are_routers ||
return;
}
}
if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
/*
* Guard against response from 0.0.0.0
* and ::. Log a trace message
*/
logtrace("probe response from %s on %s\n",
return;
}
/*
* This address is one of our own, so reject this address as a
* valid probe target.
*/
return;
/*
* If the phyint is part of a named group, then add the address to all
* members of the group. Otherwise, add the address only to the
* phyint itself, since other phyints in the anongroup may not be on
* the same subnet.
*/
} else {
}
}
/*
* Compute CRTT given an existing scaled average, scaled deviation estimate
* and a new rtt time. The formula is from Jacobson and Karels'
* "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
* are the same as those in Appendix A.2 of that paper.
*
* m = new measurement
* sa = scaled RTT average (8 * average estimates)
* sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
* crtt = Conservative round trip time. Used to determine whether probe
* has timed out.
*
* New scaled average and deviation are passed back via sap and svp
*/
static int
{
int crtt;
int saved_m = m;
if (sa != -1) {
/*
* Update average estimator:
* new rtt = old rtt + 1/8 Error
* where Error = m - old rtt
* i.e. 8 * new rtt = 8 * old rtt + Error
* i.e. new sa = old sa + Error
*/
if ((sa += m) < 0) {
/* Don't allow the smoothed average to be negative. */
sa = 0;
}
/*
* Update deviation estimator:
* new mdev = old mdev + 1/4 (abs(Error) - old mdev)
* i.e. 4 * new mdev = 4 * old mdev +
* (abs(Error) - old mdev)
* i.e. new sv = old sv + (abs(Error) - old mdev)
*/
if (m < 0)
m = -m;
m -= sv >> 2;
sv += m;
} else {
/* Initialization. This is the first response received. */
sa = (m << 3);
sv = (m << 1);
}
logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
}
/*
* CRTT = average estimates + 4 * deviation estimates
* = sa / 8 + sv
*/
return (crtt);
}
static void
{
int new_crtt;
int i;
logdebug("pi_set_crtt: target - m %d\n", m);
/* store the round trip time, in case we need to defer computation */
/*
* If this probe's round trip time would singlehandedly cause an
* increase in the group's probe interval consider it suspect.
*/
logdebug("Received a suspect probe on %s, new_crtt ="
" %d, probe_interval = %d, num_deferred = %d\n",
}
/*
* If we've deferred as many rtts as we plan on deferring, then
* assume the link really did slow down and process all queued
* rtts
*/
logdebug("Received MAXDEFERREDRTT probes which "
"would cause an increased probe_interval. "
"Integrating queued rtt data points.\n");
}
for (i = 0; i <= tg->tg_num_deferred; i++) {
}
tg->tg_num_deferred = 0;
} else {
tg->tg_num_deferred++;
}
return;
}
/*
* If this is a normal probe, or an RTT probe that would lead to a
* reduced CRTT, then update our CRTT data. Further, if this was
* a normal probe, pitch any deferred probes since our probes are
* again being answered within our CRTT estimates.
*/
if (is_probe_uni)
tg->tg_num_deferred = 0;
}
}
/*
* Return a pointer to the specified option buffer.
* If not found return NULL.
*/
static void *
{
}
}
return (NULL);
}
/*
* See if a previously failed interface has started working again.
*/
void
{
if (phyint_repaired(pi)) {
} else {
logerr("NIC repair detected on %s of group %s\n",
}
/*
* If the interface is offline, just clear the FAILED flag,
* delaying the state change and failback operation until it
* is brought back online.
*/
return;
}
} else {
(void) change_lif_flags(pi,
/* Per state diagram */
}
}
/*
* This is the 1st phyint to receive a response
* after group failure.
*/
logerr("At least 1 interface (%s) of group %s has "
}
}
}
/*
* See if a previously functioning interface has failed, or if the
* whole group of interfaces has failed.
*/
static void
{
switch (failure_state(pii)) {
case PHYINT_FAILURE:
} else {
logerr("NIC failure detected on %s of group %s\n",
}
/*
* Do the failover, unless the interface is offline (in
* which case we've already failed over).
*/
}
break;
case GROUP_FAILURE:
logerr("All Interfaces in group %s have failed\n",
continue;
/*
* In the case of host targets, we
* would have flushed the targets,
* and gone to PI_NOTARGETS state.
*/
}
break;
default:
break;
}
}
/*
* Determines if any timeout event has occurred and returns the number of
* milliseconds until the next timeout event for the phyint. Returns
* TIMER_INFINITY for "never".
*/
{
int pr_ndx;
struct probe_stats *pr_statp;
struct phyint_instance *pii_other;
int valid_unack_count;
int i;
int interval;
cur_time = getcurrenttime();
logdebug("phyint_inst_timer(%s %s)\n",
}
/*
* enough time has passed, then try to bring the interface
* back up; otherwise, schedule a timer to bring it back up
* when enough time *has* elapsed.
*/
if (check_time > cur_time)
return (check_time - cur_time);
}
}
/*
* If this phyint is not yet initialized for probes,
* don't proceed further
*/
return (TIMER_INFINITY);
/*
* If the timer has fired too soon, probably triggered
* by some other phyint instance, return the remaining
* time
*/
/*
* If the link is down, don't send any probes for now.
*/
return (TIMER_INFINITY);
/*
* Randomize the next probe time, between MIN_RANDOM_FACTOR
* and MAX_RANDOM_FACTOR with respect to the base probe time.
* Base probe time is strictly periodic.
*/
(int)(MIN_RANDOM_FACTOR * user_probe_interval),
(int)(MAX_RANDOM_FACTOR * user_probe_interval));
/*
* Check if the current time > next time to probe. If so, we missed
* sending 1 or more probes, probably due to heavy system load. At least
* 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
* were scheduled. Make adjustments to the times, in multiples of
* user_probe_interval.
*/
int n;
logtrace("missed sending %d probes cur_time %u snxt_time %u"
/* Collect statistics about missed probes */
}
logdebug("cur_time %u snxt_time %u snxt_basetime %u"
}
/*
* If no targets are known, we need to send an ICMP multicast. The
* probe type is PROBE_MULTI. We'll check back in 'interval' msec
* to see if we found a target.
*/
return (interval);
}
if ((user_probe_interval != probe_interval) &&
/*
* the failure detection (fd) probe timer has not yet fired.
* Need to send only an rtt probe. The probe type is PROBE_RTT.
*/
return (interval);
}
/*
* the fd probe timer has fired. Need to do all failure
* detection / recovery calculations, and then send an fd probe
* of type PROBE_UNI.
*/
if (user_probe_interval == probe_interval) {
/*
* We could have missed some probes, and then adjusted
* pii_snxt_basetime above. Otherwise we could have
* blindly added probe_interval to pii_fd_snxt_basetime.
*/
} else {
int n;
}
}
/*
* We can have at most, the latest 2 probes that we sent, in
* the PR_UNACKED state. All previous probes sent, are either
* PR_LOST or PR_ACKED. An unacknowledged probe is considered
* timed out if the probe's time_sent + the CRTT < currenttime.
* For each of the last 2 probes, examine whether it has timed
* out. If so, mark it PR_LOST. The probe stats is a circular array.
*/
valid_unack_count = 0;
for (i = 0; i < 2; i++) {
case PR_ACKED:
/*
* We received back an ACK, so the switch clearly
* is not dropping our traffic, and thus we can
* enable failure detection immediately.
*/
logdebug("successful probe on %s; "
"ending quiet period\n",
}
}
break;
case PR_UNACKED:
/*
* The crtt could be zero for some reason,
* Eg. the phyint could be failed. If the crtt is
* not available use group's probe interval,
* which is a worst case estimate.
*/
} else {
}
} else if (i == 1) {
/*
* We are forced to consider this probe
* lost, as we can have at most 2 unack.
* probes any time, and we will be sending a
* probe at the end of this function.
* Normally, we should not be here, but
* this can happen if an incoming response
* that was considered lost has increased
* the crtt for this target, and also bumped
* up the FDT. Note that we never cancel or
* increase the current pii_time_left, so
* when the timer fires, we find 2 valid
* unacked probes, and they are yet to timeout
*/
} else {
/*
* Only the most recent probe can enter
* this 'else' arm. The second most recent
* probe must take either of the above arms,
* if it is unacked.
*/
}
break;
}
}
/*
* We send out 1 probe randomly in the interval between one half
* and one probe interval for the group. Given that the CRTT is always
* less than the group's probe interval, we can have at most 1
* unacknowledged probe now. All previous probes are either lost or
* acked.
*/
/*
* The timer has fired. Take appropriate action depending
* on the current state of the phyint.
*
* PI_RUNNING state - Failure detection and failover
* PI_FAILED state - Repair detection and failback
*/
case PI_FAILED:
/*
* If the most recent probe (excluding unacked probes that
* are yet to time out) has been acked, check whether the
* phyint is now repaired. If the phyint is repaired, then
* attempt failback, unless it is an inactive standby.
*/
}
break;
case PI_RUNNING:
/*
* It's possible our probes have been lost because of a
* spanning-tree mandated quiet period on the switch. If so,
* ignore the lost probes and consider the interface to still
* be functioning.
*/
cur_hrtime = gethrtime();
break;
/*
* We have 1 or more failed probes (excluding unacked
* probes that are yet to time out). Determine if the
* phyint has failed. If so attempt a failover,
* unless it is an inactive standby
*/
}
break;
default:
logerr("phyint_inst_timer: invalid state %d\n",
abort();
}
/*
* Start the next probe. probe() will also set pii->pii_probe_time_left
* to the group's probe interval. If phyint_failed -> target_flush_hosts
* was called, the target list may be empty.
*/
/*
* If we have just the one probe target, and we're not using
* router targets, try to find another as we presently have
* no resilience.
*/
} else {
}
return (interval);
}
/*
* Start the probe timer for an interface instance.
*
* The initial delay is drawn with GET_RANDOM() using a lower bound of 0,
* so that the first probes of the different phyints are staggered rather
* than all firing at the same moment.
*/
void
{
/*
* Spread the base probe times (pi_snxt_basetime) across phyints
* uniformly over the (curtime..curtime + the group's probe_interval).
* pi_snxt_basetime is strictly periodic with a frequency of
* the group's probe interval. The actual probe time pi_snxt_time
* adds some randomness to pi_snxt_basetime and happens in probe().
* For the 1st probe on each phyint after the timer is started,
* pi_snxt_time and pi_snxt_basetime are the same.
*/
interval = GET_RANDOM(0,
}
/*
* Restart the probe timer on an interface instance.
*
* This is a no-op unless pii->pii_basetime_inited is non-zero, i.e. unless
* the timer had been started before.
*/
static void
{
/*
* We don't need to restart the timer if it was never started in
* the first place (pii->pii_basetime_inited not set), as the timer
* won't have gone off yet.
*/
if (pii->pii_basetime_inited != 0) {
/* Trace the restart only when link-note debugging is enabled. */
if (debug & D_LINKNOTE)
logdebug("restart timer: restarting timer on %s, "
}
}
/*
* Handle a link-down transition on a phyint: discard the accumulated
* probe statistics and then check whether the interface (or the whole
* group) must be treated as failed.
*/
static void
{
/*
* Clear the probe statistics arrays; we don't want the repair
* detection logic relying on probes that were successful prior
* to the link going down.
*/
/*
* Check for interface failure. Although we know the interface
* has failed, we don't know if all the other interfaces in the
* group have failed as well.
*/
/* Trace the outcome only when link-note debugging is enabled. */
if (debug & D_LINKNOTE) {
logdebug("process_link_state_down:"
}
}
}
/*
* Handle a link-up transition on a phyint: the probe timers that were
* stopped when the link went down are restarted here.
*/
static void
{
/*
* We stopped any running timers on each instance when the link
* went down, so restart them.
*/
}
/*
* Process any changes in link state passed up from the interfaces.
*
* The previous link state (old_link_state_up) is compared with the
* current one so that only transitions, down->up or up->down, trigger
* any processing.
*/
void
{
/* Look for interfaces where the link state has just changed */
/*
* Except when the "phyint" structure is created, this is
* the only place the link state is updated. This allows
* this routine to detect changes in link state, rather
* than just the current state.
*/
/*
* Has link just gone down?
*/
if (old_link_state_up)
} else {
/*
* Has link just gone back up?
*/
if (!old_link_state_up)
}
}
}
/*
* NOTE(review): this routine has no header comment. It operates on the
* phyint instances of an interface (see the local pii below); please add a
* one-line statement of what it resets or updates - TODO confirm the
* intended contract with the author.
*/
void
{
struct phyint_instance *pii;
}
}
}
}
}
/*
* Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
* probes on both instances IPv4 and IPv6.
* If the interface has failed, return the time of the first probe failure
* in "tff".
*
* Returns PHYINT_FAILURE if the phyint is considered failed, and
* PHYINT_OK otherwise (including the case where only the current target
* is unresponsive and is simply dropped).
*/
static int
{
struct probe_fail_count pfinfo;
struct phyint_instance *pii_other;
int pr_ndx;
/*
* Get the number of consecutive failed probes on
* this phyint across all targets. Also get the number
* of consecutive failed probes on this target only
*/
/* Get the time of first failure, for later use */
/*
* If the current target has not responded to the
* last NUM_PROBE_FAILS probes, and other targets are
* responding delete this target. Dead gateway detection
* will eventually remove this target (if router) from the
* routing tables. If that does not occur, we may end
* up adding this to our list again.
*/
if (pii->pii_targets_are_routers) {
/* Router targets: account for the removed target. */
pii->pii_ntargets--;
} else {
}
return (PHYINT_OK);
}
/*
* If the phyint has lost NUM_PROBE_FAILS or more
* consecutive probes, on both IPv4 and IPv6 protocol
* instances of the phyint, then trigger failure
* detection, else return PHYINT_OK
*/
return (PHYINT_OK);
if (PROBE_CAPABLE(pii_other)) {
/*
* We have NUM_PROBE_FAILS or more failures
* on both IPv4 and IPv6. Get the earliest
* time when failure was detected on this
* phyint across IPv4 and IPv6.
*/
} else {
/*
* This instance has < NUM_PROBE_FAILS failures.
* So return PHYINT_OK
*/
return (PHYINT_OK);
}
}
return (PHYINT_FAILURE);
}
/*
* Check if the link has gone down on this phyint, or it has failed the
* last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
* Also look at other phyints of this group, for group failures.
*
* Returns one of:
* PHYINT_OK - the phyint is not considered failed.
* PHYINT_FAILURE - this phyint alone has failed.
* GROUP_FAILURE - no other phyint in the group shows evidence of
* being healthy, so the whole group is deemed failed.
*/
int
{
struct probe_success_count psinfo;
struct phyint_instance *pii2;
struct phyint_group *pg;
if (debug & D_FAILOVER)
return (PHYINT_OK);
/*
* At this point, the link is down, or the phyint is suspect,
* as it has lost NUM_PROBE_FAILS or more probes. If the phyint
* does not belong to any group, or is the only member of the
* group capable of being probed, return PHYINT_FAILURE.
*/
if (pg != phyint_anongroup) {
continue;
break;
}
}
}
if (alone)
return (PHYINT_FAILURE);
/*
* Need to compare against other phyints of the same group
* to exclude group failures. If the failure was detected via
* probing, then if the time of last success (tls) of any
* phyint is more recent than the time of first fail (tff) of the
* phyint in question, and the link is up on the phyint,
* then it is a phyint failure. Otherwise it is a group failure.
* If failure was detected via a link down notification sent from
* the driver to IP, we see if any phyints in the group are still
* running and haven't received a link down notification. We
* will usually be processing the link down notification shortly
* after it was received, so there is no point looking at the tls
* of other phyints.
*/
/* Exclude ourself from comparison */
continue;
/*
* We use FLAGS_TO_LINK_STATE() to test the
* flags directly, rather than LINK_UP() or
* LINK_DOWN(), as we may not have got round
* to processing the link state for the other
* phyints in the group yet.
*
* The check for PI_RUNNING and group
* failure handles the case when the
* group begins to recover. The first
* phyint to recover should not trigger
* a failover from the soon-to-recover
* other phyints to the first recovered
* phyint. PI_RUNNING will be set, and
* pg_groupfailed cleared only after
* receipt of NUM_PROBE_REPAIRS, by
* which time the other phyints should
* have received at least 1 packet,
* and so will not have NUM_PROBE_FAILS.
*/
return (PHYINT_FAILURE);
} else {
/*
* Need to compare against both IPv4 and
* IPv6 instances.
*/
if (psinfo.ps_tls_valid) {
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
!GROUP_FAILED(pg) &&
return (PHYINT_FAILURE);
}
}
if (psinfo.ps_tls_valid) {
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
!GROUP_FAILED(pg) &&
return (PHYINT_FAILURE);
}
}
}
}
/*
* Change the group state to PG_FAILED if it's not already.
*/
if (!GROUP_FAILED(pg))
return (GROUP_FAILURE);
}
/*
* Return the information associated with consecutive probe successes
* starting with the most recent probe. At most the last 2 probes can be
* in the unacknowledged state. All previous probes have either failed
* or succeeded.
*
* The results are written into *psinfo: among others the per-target
* success count (ps_nsucc_tg) and the time of last success, which is
* recorded only once, guarded by ps_tls_valid.
*/
static void
struct probe_success_count *psinfo)
{
uint_t i;
struct probe_stats *pr_statp;
if (debug & D_FAILOVER)
now = getcurrenttime();
/*
* Start with the most recent probe, and count the number
* of consecutive probe successes. Latch the number of successes
* on hitting a failure.
*/
i = PROBE_INDEX_PREV(i)) {
case PR_UNACKED:
/*
* Only the most recent 2 probes can be unacknowledged
*/
/*
* The crtt could be zero for some reason,
* e.g. the phyint could be failed. If the crtt is
* not available use the value of the group's probe
* interval which is a worst case estimate.
*/
} else {
}
/*
* We hit a failure. Latch the total number of
* recent consecutive successes.
*/
/*
* We hit a failure for the desired
* target. Latch the number of recent
* consecutive successes for this target
*/
}
}
break;
case PR_ACKED:
/*
* Bump up the count of probe successes, if we
* have not seen any failure so far.
*/
if (!pi_found_failure)
!tg_found_failure) {
psinfo->ps_nsucc_tg++;
}
/*
* Record the time of last success, if this is
* the most recent probe success.
*/
if (!psinfo->ps_tls_valid) {
}
break;
case PR_LOST:
/*
* We hit a failure. Latch the total number of
* recent consecutive successes.
*/
/*
* We hit a failure for the desired target.
* Latch the number of recent consecutive
* successes for this target
*/
}
break;
default:
/* Any other probe status ends the scan. */
return;
}
}
}
/*
* Return the information associated with consecutive probe failures
* starting with the most recent probe. Only the last 2 probes can be in the
* unacknowledged state. All previous probes have either failed or succeeded.
*
* The results are written into *pfinfo, including the per-target
* consecutive failure count (pf_nfail_tg).
*/
static void
struct probe_fail_count *pfinfo)
{
int i;
struct probe_stats *pr_statp;
int most_recent;
int second_most_recent;
if (debug & D_FAILOVER)
now = getcurrenttime();
/*
* Start with the most recent probe, and count the number
* of consecutive probe failures. Latch the number of failures
* on hitting a probe success.
*/
i = PROBE_INDEX_PREV(i)) {
case PR_UNACKED:
/*
* Only the most recent 2 probes can be unacknowledged
*/
/*
* Target is guaranteed to exist in the unack. state
*/
/*
* The crtt could be zero for some reason,
* e.g. the phyint could be failed. If the crtt is
* not available use the group's probe interval,
* which is a worst case estimate.
*/
} else {
}
break;
/* FALLTHRU */
case PR_LOST:
if (!pi_found_success) {
}
!tg_found_success) {
pfinfo->pf_nfail_tg++;
}
break;
default:
/*
* We hit a success or unused slot. Latch the
* total number of recent consecutive failures.
*/
/*
* We hit a success for the desired target.
* Latch the number of recent consecutive
* failures for this target
*/
}
}
}
}
/*
* Check if the phyint has been repaired. If no test address has been
* configured, then consider the interface repaired if the link is up (unless
* the link is flapping; see below). Otherwise, look for proof of probes
* being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
* either IPv4 or IPv6 instance, the phyint can be considered repaired.
*
* Returns _B_TRUE if the phyint is considered repaired, _B_FALSE otherwise.
*/
static boolean_t
{
struct probe_success_count psinfo;
struct phyint_instance *pii;
int pr_ndx;
if (debug & D_FAILOVER)
return (_B_FALSE);
/*
* If we don't have any test addresses and the link is up, then
* consider the interface repaired, unless we've received more than
* LINK_UP_PERMIN link up notifications in the last minute, in
* which case we keep the link down until we drop back below
* the threshold.
*/
cur_time = getcurrenttime();
/* Re-arm the flapping message so it can be printed again later. */
pi->pi_lfmsg_printed = 0;
return (_B_TRUE);
}
/* Report link flapping only once per episode. */
if (!pi->pi_lfmsg_printed) {
logerr("The link has come up on %s more than %d times "
"in the last minute; disabling failback until it "
}
return (_B_FALSE);
}
/* Either protocol instance can supply the proof of repair. */
if (PROBE_CAPABLE(pii)) {
return (_B_TRUE);
}
if (PROBE_CAPABLE(pii)) {
return (_B_TRUE);
}
return (_B_FALSE);
}
/*
* Try failover from phyint 'pi' to a suitable destination.
*
* Returns 0 if the failover succeeded, IPMP_EMINRED if the search for a
* failover destination ends without one (see below), and IPMP_FAILURE if
* the failover attempt itself failed.
*/
int
{
int err;
if (debug & D_FAILOVER)
/*
* Attempt to find a failover destination 'dst'.
* dst will be null if any of the following is true
* Phyint is not part of a group OR
* Phyint is the only member of a group OR
* No suitable failover dst was available
*/
return (IPMP_EMINRED);
if (debug & D_FAILOVER) {
logdebug("failed over from %s to %s ret %d\n",
}
if (err == 0) {
/*
* we don't want to print out this message if a
* phyint is leaving the group, nor for failover from
* standby
*/
if (failover_type == FAILOVER_NORMAL) {
logerr("Successfully failed over from NIC %s to NIC "
}
return (0);
} else {
/*
* The failover did not succeed. We must retry the failover
* only after resyncing our state based on the kernel's.
* For example, either the src or the dst might have been
* unplumbed, causing this failure. initifs() will be called
* again, from main, since full_scan_required has been set to
* true by failover().
*/
return (IPMP_FAILURE);
}
}
/*
* global_errno captures the errno value when failover() or failback()
* fails, so that it can be reported to if_mpadm(1M).
*/
int global_errno;
/*
* Attempt failover from phyint 'from' to phyint 'to'.
* IP moves everything from phyint 'from' to phyint 'to'.
*
* Returns 'ret', the result of the failover ioctl; a negative value
* indicates failure and is logged with logperror().
*/
static int
{
int ret;
if (debug & D_FAILOVER) {
logdebug("failing over from %s to %s\n",
}
/*
* Perform the failover. Both IPv4 and IPv6 are failed over
* using a single ioctl by passing in AF_UNSPEC family.
*/
if (ret < 0) {
logperror("failover: ioctl (failover)");
}
/*
* Set full_scan_required to true. This will make us read
* the state from the kernel in initifs() and update our tables,
* to reflect the current state after the failover. If the
* failover has failed it will then reissue the failover.
*/
return (ret);
}
/*
* phyint 'pi' has recovered. Attempt failback from every phyint in the same
* group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
* Return values:
* IPMP_SUCCESS: Failback successful from each of the other
* phyints in the group.
* IPMP_EFBPARTIAL: Failback successful from some of the other
* phyints in the group.
* IPMP_FAILURE: Failback syscall failed with some error.
*
* Note that failback is attempted regardless of the setting of the
* failback_enabled flag.
*/
int
{
if (debug & D_FAILOVER)
/* If this phyint is not part of a named group, return. */
return (IPMP_SUCCESS);
}
/*
* Attempt failback from every phyint in the group to 'pi'.
* The reason for doing this, instead of only from the
* phyint to which we did the failover is given below.
*
* After 'pi' failed, if any application tries to join a multicast
* address (IPv6) on the failed phyint, IP picks any arbitrary
* non-failed phyint in the group, instead of the failed phyint.
* in.mpathd is not aware of this. Thus failing back only from the
* interface to which 'pi' failed over, will failback the ipif's
* but not the ilm's. So we need to failback from all members of
* the phyint group
*/
/* Exclude ourself as a failback src */
continue;
/*
* If the 'from' phyint has IPv4 plumbed, the 'to'
* phyint must also have IPv4 plumbed. Similar check
* for IPv6. IP makes the same check. Otherwise the
* failback will fail.
*/
continue;
}
if (!check_only) {
break;
}
}
}
if (check_only) {
}
/*
* We are done. No more phyint from which we can src the failback
*/
if (done) {
if (!partial)
/*
* Don't print out a message unless there is a
* transition from FAILED to RUNNING. For example,
* we don't want to print out this message if a
* phyint is leaving the group, or at startup
*/
(IFF_FAILED | IFF_OFFLINE))) {
logerr("Successfully failed back to NIC %s\n",
}
}
return (IPMP_FAILURE);
}
/*
* This function is similar to do_failback() above, but respects the
* failback_enabled flag for phyints in named groups.
*
* IPMP_EFBDISABLED is among its return values; presumably it is returned
* when failback is disabled - confirm against the full condition.
*/
int
{
if (debug & D_FAILOVER)
return (IPMP_EFBDISABLED);
}
/*
* Failback everything from phyint 'from' that has the same ifindex
* as phyint to's ifindex.
*
* Returns 'ret', the result of the failback ioctl; a negative value
* indicates failure and is logged with logperror().
*/
static int
{
int ret;
if (debug & D_FAILOVER)
if (ret < 0) {
logperror("failback: ioctl (failback)");
}
/*
* Set full_scan_required to true. This will make us read
* the state from the kernel in initifs() and update our tables,
* to reflect the current state after the failback. If the
* failback has failed it will then reissue the failback.
*/
return (ret);
}
/*
* Select a target phyint for failing over from 'pi'.
* In the normal case, i.e. failover_type is FAILOVER_NORMAL, the preferred
* target phyint is chosen as follows:
* 1. Pick any inactive standby interface.
* 2. If no inactive standby is available, select any phyint in the
* same group that has the least number of logints (excluding
* IFF_NOFAILOVER and !IFF_UP logints).
* If we are failing over from a standby, failover_type is
* FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
* If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
* and we won't return NULL, as long as there is at least 1 other phyint
* in the group.
*
* Returns the chosen phyint, or NULL if there is no candidate.
*/
static struct phyint *
{
return (NULL);
/*
* Loop thru the phyints in the group, and pick the preferred
* phyint for the target.
*/
/* Exclude ourself and offlined interfaces */
continue;
/*
* The chosen target phyint must have IPv4 instance
* plumbed, if the src phyint has IPv4 plumbed. Similarly
* for IPv6.
*/
continue;
/* The chosen target must be PI_RUNNING. */
/* Otherwise remember it only as a fallback candidate. */
last_choice = pi2;
continue;
}
(failover_type != FAILOVER_TO_NONSTANDBY)) {
return (pi2);
} else {
}
}
return (last_choice);
else
return (maybe);
}
/*
* Set or clear (as selected by 'setfl') the given phyint flags on an
* interface, using a get-flags ioctl followed by a set-flags ioctl.
*
* Returns _B_TRUE on success, and _B_FALSE if either the get-flags or the
* set-flags ioctl fails (the failure is logged with logperror()).
*/
{
int ifsock;
if (debug & D_FAILOVER) {
logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
}
} else {
}
/*
* Get the current flags, and then set or clear the
* desired phyint flags. Since we set only phyint flags, we can
* do it on either IPv4 or IPv6 instance.
*/
logperror("change_lif_flags: ioctl (get flags)");
return (_B_FALSE);
}
if (setfl)
else
logperror("change_lif_flags: ioctl (set flags)");
return (_B_FALSE);
}
/*
* Keep pi_flags in sync with the actual flags. Assumes flags are
* phyint flags.
*/
if (setfl)
else
return (_B_TRUE);
}
/*
* ICMP checksum computation for IPv4.
*
* 16-bit words are summed into 'sum', 'nleft' counts the bytes still to be
* processed, and the folded result is returned in 'answer'.
*/
static int
{
register int sum = 0;
/*
* Our algorithm is simple: using a 32 bit accumulator (sum),
* we add sequential 16 bit words to it, and at the end, fold
* back all the carry bits from the top 16 bits into the lower
* 16 bits.
*/
while (nleft > 1) {
sum += *w++;
nleft -= 2;
}
/* mop up an odd byte, if necessary */
if (nleft == 1) {
}
/*
* add back carry outs from top 16 bits to low 16 bits
*/
return (answer);
}
/*
* Takes no arguments and returns nothing; works on phyint instances
* (see the local pii below).
* NOTE(review): judging by the name, this presumably resets the next-probe
* base times of the instances - confirm and document which fields change.
*/
static void
reset_snxt_basetimes(void)
{
struct phyint_instance *pii;
}
}
/*
* Is the address one of our own addresses? Unfortunately,
* we cannot check our phyint tables to determine if the address
* is our own. This is because we don't track interfaces that
* are not part of any group. We have to either use a 'bind' or
* get the complete list of all interfaces using SIOCGLIFCONF,
* to do this check. We choose to use 'bind'. We could use
* SIOCTMYADDR, but bind is preferred, since it is stronger.
* SIOCTMYADDR excludes down interfaces, while bind includes even
* down interfaces.
*
* Returns 'ours', the verdict on whether the address belongs to this
* host. On any unexpected error the function errs on the side of
* caution and reports the address as one of our own.
*/
{
int sock;
if (sock == -1) {
logperror("own_address: socket");
/*
* If the socket call fails, err on the side of caution,
* and return true.
*/
} else {
struct sockaddr_in6 sin6;
/*
* If the bind succeeds, then this address is one of our
* addresses.
* If bind returns error EADDRNOTAVAIL, the address is
* not one of ours.
* If bind returns an error other than EADDRNOTAVAIL, err
* on the side of caution and report the address as one of
* our own.
*/
sizeof (struct sockaddr_in6)) == -1) {
if (errno == EADDRNOTAVAIL)
else
logperror("own_address: bind");
}
}
char abuf[INET6_ADDRSTRLEN];
logdebug("own_address: addr %s is %s ours\n",
}
return (ours);
}