mpd_probe.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 1987 Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include "mpd_defs.h"
#include "mpd_tables.h"
/*
* Probe types for probe()
*/
#define PROBE_UNI 0x1234 /* Unicast probe packet */
#define PROBE_MULTI 0x5678 /* Multicast probe packet */
#define PROBE_RTT 0x9abc /* RTT only probe packet */
#define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */
/*
* Format of probe / probe response packets. This is an ICMP Echo request
* or ICMP Echo reply. Packet format is same for both IPv4 and IPv6
*/
struct pr_icmp
{
uint8_t pr_icmp_type; /* type field */
uint8_t pr_icmp_code; /* code field */
uint16_t pr_icmp_cksum; /* checksum field */
uint16_t pr_icmp_id; /* Identification */
uint16_t pr_icmp_seq; /* sequence number */
uint32_t pr_icmp_timestamp; /* Time stamp */
uint32_t pr_icmp_mtype; /* Message type */
};
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x1 } };
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */
static void *find_ancillary(struct msghdr *msg, int cmsg_type);
static void pi_set_crtt(struct target *tg, int m,
boolean_t is_probe_uni);
static void incoming_echo_reply(struct phyint_instance *pii,
struct pr_icmp *reply, struct in6_addr fromaddr);
static void incoming_rtt_reply(struct phyint_instance *pii,
struct pr_icmp *reply, struct in6_addr fromaddr);
static void incoming_mcast_reply(struct phyint_instance *pii,
struct pr_icmp *reply, struct in6_addr fromaddr);
static boolean_t check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t check_exception_target(struct phyint_instance *pii,
struct target *target);
static void probe_fail_info(struct phyint_instance *pii,
struct target *cur_tg, struct probe_fail_count *pfinfo);
static void probe_success_info(struct phyint_instance *pii,
struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t phyint_repaired(struct phyint *pi);
static int failover(struct phyint *from, struct phyint *to);
static int failback(struct phyint *from, struct phyint *to);
static struct phyint *get_failover_dst(struct phyint *pi, int failover_type);
static boolean_t highest_ack_tg(uint16_t seq, struct target *tg);
static int in_cksum(ushort_t *addr, int len);
static void reset_snxt_basetimes(void);
/*
* CRTT - Conservative Round Trip Time Estimate
* Probe success - A matching probe reply received before CRTT ms has elapsed
* after sending the probe.
* Probe failure - No probe reply received and more than CRTT ms has elapsed
* after sending the probe.
*
* TLS - Time last success. Most recent probe ack received at this time.
* TFF - Time first fail. The time of the earliest probe failure in
* a consecutive series of probe failures.
* NUM_PROBE_REPAIRS - Number of consecutive successful probes required
* before declaring phyint repair.
* NUM_PROBE_FAILS - Number of consecutive probe failures required to
* declare a phyint failure.
*
* Phyint state diagram
*
* The state of a phyint that is capable of being probed, is completely
* specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
*
* A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
* of the link (according to the driver). If the phyint is also configured
* with a test address (the common case) and probe targets, then a phyint must
* also successfully be able to send and receive probes in order to remain in
* the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
*
* Further, if a PI_RUNNING phyint is configured with a test address but is
* unable to find any probe targets, it will transition to the PI_NOTARGETS
* state, which indicates that the link is apparently functional but that
* in.mpathd is unable to send probes to verify functionality (in this case,
* in.mpathd makes the optimistic assumption that the interface is working
* correctly and thus does not perform a failover, but reports the interface
* as IPMP_IF_UNKNOWN through the async events and query interfaces).
*
* At any point, a phyint may be administratively marked offline via if_mpadm.
* In this case, the interface always transitions to PI_OFFLINE, regardless
* of its previous state. When the interface is later brought back online,
* in.mpathd acts as if the interface is new (and thus it transitions to
* PI_RUNNING or PI_FAILED based on the status of the link and the result of
* its probes, if probes are sent).
*
* pi_state - PI_RUNNING or PI_FAILED
* PI_RUNNING: The failure detection logic says the phyint is good.
* PI_FAILED: The failure detection logic says the phyint has failed.
*
* pg_groupfailed - Group failure, all interfaces in the group have failed.
* The pi_state may be either PI_FAILED or PI_NOTARGETS.
* In the case of router targets, we assume that the current list of
* targets obtained from the routing table, is still valid, so the
* phyint stat is PI_FAILED. In the case of host targets, we delete the
* list of targets, and multicast to the all hosts, to reconstruct the
* target list. So the phyints are in the PI_NOTARGETS state.
*
* I - value of (pi_flags & IFF_INACTIVE)
* IFF_INACTIVE: No failovers have been done to the standby, from
* other phyints. This phyint is an inactive standby.
*
* pi_empty
* This phyint has failed over successfully to another phyint, and
* this phyint is currently "empty". It does not host any addresses or
* multicast membership etc. This is the state of a phyint after a
* failover from the phyint has completed successfully and no subsequent
* 'failover to' or 'failback to' has occurred on the phyint.
* IP guarantees that no new logicals will be hosted nor any multicast
* joins permitted on the phyint, since the phyint is either failed or
* inactive. pi_empty is set implies the phyint is either failed or
* inactive.
*
* pi_full
* The phyint hosts all of its own addresses that it "owns". If the
* phyint was previously failed or inactive, failbacks to the phyint
* has completed successfully. i.e. No more failbacks to this phyint
* can produce any change in system state whatsoever.
*
* Not all 32 possible combinations of the above 5-tuple are possible.
* Furthermore some of the above combinations are transient. They may occur
* only because the failover or failback did not complete successfully. The
* failover/failback will be retried and eventually a stable state will be
* reached.
*
* I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
* The following are the state machines. 'from' and 'to' are the src and
* dst of the failover/failback, below
*
* pi_empty state machine
* ---------------------------------------------------------------------------
* Event State -> New State
* ---------------------------------------------------------------------------
* successful completion from.pi_empty = 0 -> from.pi_empty = 1
* of failover
*
* Initiate failover to.pi_empty = X -> to.pi_empty = 0
*
* Initiate failback to.pi_empty = X -> to.pi_empty = 0
*
* group failure pi_empty = X -> pi_empty = 0
* ---------------------------------------------------------------------------
*
* pi_full state machine
* ---------------------------------------------------------------------------
* Event State -> New State
* ---------------------------------------------------------------------------
* successful completion to.pi_full = 0 -> to.pi_full = 1
* of failback from
* each of the other phyints
*
* Initiate failover from.pi_full = X -> from.pi_full = 0
*
* group failure pi_full = X -> pi_full = 0
* ---------------------------------------------------------------------------
*
* pi_state state machine
* ---------------------------------------------------------------------------
* Event State New State
* Action:
* ---------------------------------------------------------------------------
* NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
* : failover from this phyint to another
*
* NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 1)
* detection : set IFF_FAILED on this phyint
*
* NIC repair (PI_FAILED, I == 0) -> (PI_RUNNING, I == 0)
* detection : to.pi_empty = 0
* : failback to this phyint if enabled
* : clear IFF_FAILED on this phyint
*
* NIC repair (PI_FAILED, I == 1) -> (PI_RUNNING, I == 1)
* detection : clear IFF_FAILED on this phyint
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_FAILED
* (Router targets) : set IFF_FAILED
* : clear pi_empty and pi_full
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_NOTARGETS
* (Host targets) : set IFF_FAILED
* : clear pi_empty and pi_full
* : delete the target list on all phyints
* ---------------------------------------------------------------------------
*
* I state machine
* ---------------------------------------------------------------------------
* Event State Action:
* ---------------------------------------------------------------------------
* Turn on I pi_empty == 0 : failover from standby
*
* Turn off I PI_RUNNING, : pi_empty = 0
* pi_full == 0 : failback to this if enabled
* ---------------------------------------------------------------------------
*
* Assertions: (Read '==>' as implies)
*
* (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
* (pi_empty == 1) ==> (pi_full == 0)
* (pi_full == 1) ==> (pi_empty == 0)
*
* Invariants
*
* pg_groupfailed = 0 &&
* 1. (I == 1, pi_empty == 0) ==> initiate failover from standby
* 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
* 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
*
* 1. says that an inactive standby, that is not empty, has to be failed
* over. For a standby to be truly inactive, it should not host any
* addresses. So we move them to some other phyint. Usually we catch the
* turn on of IFF_INACTIVE, and perform this action. However if the failover
* did not complete successfully, then subsequently we have lost the edge
* trigger, and this invariant kicks in and completes the action.
*
* 2. says that any failed phyint that is not empty must be failed over.
* Usually we do the failover when we detect NIC failure. However if the
* failover does not complete successfully, this invariant kicks in and
* completes the failover. We exclude inactive standby which is covered by 1.
*
* 3. says that any running phyint that is not full must be failed back.
* Usually we do the failback when we detect NIC repair. However if the
* failback does not complete successfully, this invariant kicks in and
* completes the failback. Note that we don't want to failback to an inactive
* standby.
*
* The invariants 1 - 3 and the actions are in initifs().
*/
struct probes_missed probes_missed;
/*
* Compose and transmit an ICMP ECHO REQUEST packet. The IP header
* will be added on by the kernel. The id field identifies this phyint.
* and the sequence number is an increasing (modulo 2^^16) integer. The data
* portion holds the time value when the packet is sent. On echo this is
* extracted to compute the round-trip time. Three different types of
* probe packets are used.
*
* PROBE_UNI: This type is used to do failure detection / failure recovery
* and RTT calculation. PROBE_UNI probes are spaced apart in time,
* not less than the current CRTT. pii_probes[] stores data
* about these probes. These packets consume sequence number space.
*
* PROBE_RTT: This type is used to make only rtt measurments. Normally these
* are not used. Under heavy network load, the rtt may go up very high,
* due to a spike, or may appear to go high, due to extreme scheduling
* delays. Once the network stress is removed, mpathd takes long time to
* recover, because the probe_interval is already high, and it takes
* a long time to send out sufficient number of probes to bring down the
* rtt. To avoid this problem, PROBE_RTT probes are sent out every
* user_probe_interval ms. and will cause only rtt updates. These packets
* do not consume sequence number space nor is information about these
* packets stored in the pii_probes[]
*
* PROBE_MULTI: This type is only used to construct a list of targets, when
* no targets are known. The packet is multicast to the all hosts addr.
*/
static void
probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
{
struct pr_icmp probe_pkt; /* Probe packet */
struct sockaddr_in6 whereto6; /* target address IPv6 */
struct sockaddr_in whereto; /* target address IPv4 */
int pr_ndx; /* probe index in pii->pii_probes[] */
boolean_t sent = _B_TRUE;
if (debug & D_TARGET) {
logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
pii->pii_name, probe_type, cur_time);
}
assert(pii->pii_probe_sock != -1);
assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
probe_type == PROBE_RTT);
probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
probe_pkt.pr_icmp_code = 0;
probe_pkt.pr_icmp_cksum = 0;
probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
/*
* Since there is no need to do arithmetic on the icmpid,
* (only equality check is done) pii_icmpid is stored in
* network byte order at initialization itself.
*/
probe_pkt.pr_icmp_id = pii->pii_icmpid;
probe_pkt.pr_icmp_timestamp = htonl(cur_time);
probe_pkt.pr_icmp_mtype = htonl(probe_type);
/*
* If probe_type is PROBE_MULTI, this packet will be multicast to
* the all hosts address. Otherwise it is unicast to the next target.
*/
assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
pii->pii_rtt_target_next != NULL));
if (pii->pii_af == AF_INET6) {
bzero(&whereto6, sizeof (whereto6));
whereto6.sin6_family = AF_INET6;
if (probe_type == PROBE_MULTI) {
whereto6.sin6_addr = all_nodes_mcast_v6;
} else if (probe_type == PROBE_UNI) {
whereto6.sin6_addr = pii->pii_target_next->tg_address;
} else {
/* type is PROBE_RTT */
whereto6.sin6_addr =
pii->pii_rtt_target_next->tg_address;
}
if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
sizeof (whereto6)) != sizeof (probe_pkt)) {
logperror_pii(pii, "probe: probe sendto");
sent = _B_FALSE;
}
} else {
bzero(&whereto, sizeof (whereto));
whereto.sin_family = AF_INET;
if (probe_type == PROBE_MULTI) {
whereto.sin_addr = all_nodes_mcast_v4;
} else if (probe_type == PROBE_UNI) {
IN6_V4MAPPED_TO_INADDR(
&pii->pii_target_next->tg_address,
&whereto.sin_addr);
} else {
/* type is PROBE_RTT */
IN6_V4MAPPED_TO_INADDR(
&pii->pii_rtt_target_next->tg_address,
&whereto.sin_addr);
}
/*
* Compute the IPv4 icmp checksum. Does not cover the IP header.
*/
probe_pkt.pr_icmp_cksum =
in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
sizeof (whereto)) != sizeof (probe_pkt)) {
logperror_pii(pii, "probe: probe sendto");
sent = _B_FALSE;
}
}
/*
* If this is a PROBE_UNI probe packet being unicast to a target, then
* update our tables. We will need this info in processing the probe
* response. PROBE_MULTI and PROBE_RTT packets are not used for
* the purpose of failure or recovery detection. PROBE_MULTI packets
* are only used to construct a list of targets. PROBE_RTT packets are
* used only for updating the rtt and not for failure detection.
*/
if (probe_type == PROBE_UNI && sent) {
pr_ndx = pii->pii_probe_next;
assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
/* Collect statistics, before we reuse the last slot. */
if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
pii->pii_cum_stats.lost++;
else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
pii->pii_cum_stats.acked++;
pii->pii_cum_stats.sent++;
pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
pii->pii_target_next = target_next(pii->pii_target_next);
assert(pii->pii_target_next != NULL);
/*
* If we have a single variable to denote the next target to
* probe for both rtt probes and failure detection probes, we
* could end up with a situation where the failure detection
* probe targets become disjoint from the rtt probe targets.
* Eg. if 2 targets and the actual fdt is double the user
* specified fdt. So we have 2 variables. In this scheme
* we also reset pii_rtt_target_next for every fdt probe,
* though that may not be necessary.
*/
pii->pii_rtt_target_next = pii->pii_target_next;
pii->pii_snxt++;
} else if (probe_type == PROBE_RTT) {
pii->pii_rtt_target_next =
target_next(pii->pii_rtt_target_next);
assert(pii->pii_rtt_target_next != NULL);
}
}
/*
* Incoming IPv4 data from wire, is received here. Called from main.
*/
void
in_data(struct phyint_instance *pii)
{
struct sockaddr_in from;
struct in6_addr fromaddr;
uint_t fromlen;
static uint_t in_packet[(IP_MAXPACKET + 1)/4];
struct ip *ip;
int iphlen;
int len;
char abuf[INET_ADDRSTRLEN];
struct pr_icmp *reply;
if (debug & D_PROBE) {
logdebug("in_data(%s %s)\n",
AF_STR(pii->pii_af), pii->pii_name);
}
/*
* Poll has already told us that a message is waiting,
* on this socket. Read it now. We should not block.
*/
fromlen = sizeof (from);
len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
if (len < 0) {
logperror_pii(pii, "in_data: recvfrom");
return;
}
/*
* If the NIC has indicated the link is down, don't go
* any further.
*/
if (LINK_DOWN(pii->pii_phyint))
return;
/* Get the printable address for error reporting */
(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
/* Make sure packet contains at least minimum ICMP header */
ip = (struct ip *)in_packet;
iphlen = ip->ip_hl << 2;
if (len < iphlen + ICMP_MINLEN) {
if (debug & D_PKTBAD) {
logdebug("in_data: packet too short (%d bytes)"
" from %s\n", len, abuf);
}
return;
}
/*
* Subtract the IP hdr length, 'len' will be length of the probe
* reply, starting from the icmp hdr.
*/
len -= iphlen;
/* LINTED */
reply = (struct pr_icmp *)((char *)in_packet + iphlen);
/* Probe replies are icmp echo replies. Ignore anything else */
if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
return;
/*
* The icmp id should match what we sent, which is stored
* in pi_icmpid. The icmp code for reply must be 0.
* The reply content must be a struct pr_icmp
*/
if (reply->pr_icmp_id != pii->pii_icmpid) {
/* Not in response to our probe */
return;
}
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code %d from %s on %s\n",
reply->pr_icmp_code, abuf, pii->pii_name);
return;
}
if (len < sizeof (struct pr_icmp)) {
logtrace("probe reply too short: %d bytes from %s on %s\n",
len, abuf, pii->pii_name);
return;
}
IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
/* Unicast probe reply */
incoming_echo_reply(pii, reply, fromaddr);
else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
/* Multicast reply */
incoming_mcast_reply(pii, reply, fromaddr);
} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
incoming_rtt_reply(pii, reply, fromaddr);
} else {
/* Probably not in response to our probe */
logtrace("probe reply type: %d from %s on %s\n",
reply->pr_icmp_mtype, abuf, pii->pii_name);
return;
}
}
/*
* Incoming IPv6 data from wire is received here. Called from main.
*/
void
in6_data(struct phyint_instance *pii)
{
struct sockaddr_in6 from;
static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
int len;
char abuf[INET6_ADDRSTRLEN];
struct msghdr msg;
struct iovec iov;
uchar_t *opt;
struct pr_icmp *reply;
if (debug & D_PROBE) {
logdebug("in6_data(%s %s)\n",
AF_STR(pii->pii_af), pii->pii_name);
}
iov.iov_base = (char *)in_packet;
iov.iov_len = sizeof (in_packet);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_name = (struct sockaddr *)&from;
msg.msg_namelen = sizeof (from);
msg.msg_control = ancillary_data;
msg.msg_controllen = sizeof (ancillary_data);
if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
logperror_pii(pii, "in6_data: recvfrom");
return;
}
/*
* If the NIC has indicated that the link is down, don't go
* any further.
*/
if (LINK_DOWN(pii->pii_phyint))
return;
/* Get the printable address for error reporting */
(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
if (len < ICMP_MINLEN) {
if (debug & D_PKTBAD) {
logdebug("Truncated message: msg_flags 0x%x from %s\n",
msg.msg_flags, abuf);
}
return;
}
/* Ignore packets > 64k or control buffers that don't fit */
if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
if (debug & D_PKTBAD) {
logdebug("Truncated message: msg_flags 0x%x from %s\n",
msg.msg_flags, abuf);
}
return;
}
reply = (struct pr_icmp *)in_packet;
if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
return;
if (reply->pr_icmp_id != pii->pii_icmpid) {
/* Not in response to our probe */
return;
}
/*
* The kernel has already verified the the ICMP checksum.
*/
if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
logtrace("ICMPv6 echo reply source address not linklocal from "
"%s on %s\n", abuf, pii->pii_name);
return;
}
opt = find_ancillary(&msg, IPV6_RTHDR);
if (opt != NULL) {
/* Can't allow routing headers in probe replies */
logtrace("message with routing header from %s on %s\n",
abuf, pii->pii_name);
return;
}
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code: %d from %s on %s\n",
reply->pr_icmp_code, abuf, pii->pii_name);
return;
}
if (len < (sizeof (struct pr_icmp))) {
logtrace("probe reply too short: %d bytes from %s on %s\n",
len, abuf, pii->pii_name);
return;
}
if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
incoming_echo_reply(pii, reply, from.sin6_addr);
} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
incoming_mcast_reply(pii, reply, from.sin6_addr);
} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
incoming_rtt_reply(pii, reply, from.sin6_addr);
} else {
/* Probably not in response to our probe */
logtrace("probe reply type: %d from %s on %s\n",
reply->pr_icmp_mtype, abuf, pii->pii_name);
}
}
/*
* Process the incoming rtt reply, in response to our rtt probe.
* Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
* have any stored information about the probe we sent. So we don't log
* any errors if we receive bad replies.
*/
static void
incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
struct in6_addr fromaddr)
{
int m; /* rtt measurment in ms */
uint32_t cur_time; /* in ms from some arbitrary point */
char abuf[INET6_ADDRSTRLEN];
struct target *target;
uint32_t pr_icmp_timestamp;
struct phyint_group *pg;
/* Get the printable address for error reporting */
(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
if (debug & D_PROBE) {
logdebug("incoming_rtt_reply: %s %s %s\n",
AF_STR(pii->pii_af), pii->pii_name, abuf);
}
/* Do we know this target ? */
target = target_lookup(pii, fromaddr);
if (target == NULL)
return;
pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp);
cur_time = getcurrenttime();
m = (int)(cur_time - pr_icmp_timestamp);
/* Invalid rtt. It has wrapped around */
if (m < 0)
return;
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
* for ARP/NDP resolution on a failed interface.
*/
pg = pii->pii_phyint->pi_group;
if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
return;
/*
* Update rtt only if the new rtt is lower than the current rtt.
* (specified by the 3rd parameter to pi_set_crtt).
* If a spike has caused the current probe_interval to be >
* user_probe_interval, then this mechanism is used to bring down
* the rtt rapidly once the network stress is removed.
* If the new rtt is higher than the current rtt, we don't want to
* update the rtt. We are having more than 1 outstanding probe and
* the increase in rtt we are seeing is being unnecessarily weighted
* many times. The regular rtt update will be handled by
* incoming_echo_reply() and will take care of any rtt increase.
*/
pi_set_crtt(target, m, _B_FALSE);
if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
(user_failure_detection_time < pg->pg_fdt) &&
(last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
/*
* If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
user_failure_detection_time);
pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
if (pii->pii_phyint->pi_group != phyint_anongroup) {
logerr("Improved failure detection time %d ms "
"on (%s %s) for group \"%s\"\n",
pg->pg_fdt, AF_STR(pii->pii_af),
pii->pii_name,
pii->pii_phyint->pi_group->pg_name);
}
if (user_failure_detection_time == pg->pg_fdt) {
/* Avoid any truncation or rounding errors */
pg->pg_probeint = user_probe_interval;
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
reset_snxt_basetimes();
}
}
}
}
/*
* Process the incoming echo reply, in response to our unicast probe.
* Common for both IPv4 and IPv6
*/
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
struct in6_addr fromaddr)
{
int m; /* rtt measurment in ms */
uint32_t cur_time; /* in ms from some arbitrary point */
char abuf[INET6_ADDRSTRLEN];
int pr_ndx;
struct target *target;
boolean_t exception;
uint32_t pr_icmp_timestamp;
uint16_t pr_icmp_seq;
struct phyint_group *pg = pii->pii_phyint->pi_group;
/* Get the printable address for error reporting */
(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
if (debug & D_PROBE) {
logdebug("incoming_echo_reply: %s %s %s seq %u\n",
AF_STR(pii->pii_af), pii->pii_name, abuf,
ntohs(reply->pr_icmp_seq));
}
pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp);
pr_icmp_seq = ntohs(reply->pr_icmp_seq);
/* Reject out of window probe replies */
if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
logtrace("out of window probe seq %u snxt %u on %s from %s\n",
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
pii->pii_cum_stats.unknown++;
return;
}
cur_time = getcurrenttime();
m = (int)(cur_time - pr_icmp_timestamp);
if (m < 0) {
/*
* This is a ridiculously high value of rtt. rtt has wrapped
* around. Log a message, and ignore the rtt.
*/
logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
"timestamp %u\n", cur_time, pr_icmp_timestamp);
}
/*
* Get the probe index pr_ndx corresponding to the received icmp seq.
* number in our pii->pii_probes[] array. The icmp sequence number
* pii_snxt corresponds to the probe index pii->pii_probe_next
*/
pr_ndx = MOD_SUB(pii->pii_probe_next,
(uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
target = pii->pii_probes[pr_ndx].pr_target;
/*
* Perform sanity checks, whether this probe reply that we
* have received is genuine
*/
if (target != NULL) {
/*
* Compare the src. addr of the received ICMP or ICMPv6
* probe reply with the target address in our tables.
*/
if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
/*
* We don't have any record of having sent a probe to
* this target. This is a fake probe reply. Log an error
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
pii->pii_probes[pr_ndx].pr_status,
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
pii->pii_cum_stats.unknown++;
return;
} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
/*
* The address matches, but our tables indicate that
* this probe reply has been acked already. So this
* is a duplicate probe reply. Log an error
*/
logtrace("probe status %d Duplicate probe reply seq %u "
"snxt %u on %s from %s\n",
pii->pii_probes[pr_ndx].pr_status,
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
pii->pii_cum_stats.unknown++;
return;
}
} else {
/*
* Target must not be NULL in the PR_UNACKED state
*/
assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
/*
* The probe stats slot is unused. So we didn't
* send out any probe to this target. This is a fake.
* Log an error.
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
pii->pii_probes[pr_ndx].pr_status,
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
}
pii->pii_cum_stats.unknown++;
return;
}
/*
* If the rtt does not appear to be right, don't update the
* rtt stats. This can happen if the system dropped into the
* debugger, or the system was hung or too busy for a
* substantial time that we didn't get a chance to run.
*/
if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
/*
* If the probe corresponding to this receieved response
* was truly sent 'm' ms. ago, then this response must
* have been rejected by the sequence number checks. The
* fact that it has passed the sequence number checks
* means that the measured rtt is wrong. We were probably
* scheduled long after the packet was received.
*/
goto out;
}
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
* for ARP/NDP resolution on a failed interface.
*/
if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
goto out;
/*
* Don't update the Conservative Round Trip Time estimate for this
* (phint, target) pair if this is the not the highest ack seq seen
* thus far on this target.
*/
if (!highest_ack_tg(pr_icmp_seq, target))
goto out;
/*
* Always update the rtt. This is a failure detection probe
* and we want to measure both increase / decrease in rtt.
*/
pi_set_crtt(target, m, _B_TRUE);
/*
* If the crtt exceeds the average time between probes,
* investigate if this slow target is an exception. If so we
* can avoid this target and still meet the failure detection
* time. Otherwise we can't meet the failure detection time.
*/
if (target->tg_crtt > pg->pg_probeint) {
exception = check_exception_target(pii, target);
if (exception) {
/*
* This target is exceptionally slow. Don't use it
* for future probes. check_exception_target() has
* made sure that we have at least MIN_PROBE_TARGETS
* other active targets
*/
if (pii->pii_targets_are_routers) {
/*
* This is a slow router, mark it as slow
* and don't use it for further probes. We
* don't delete it, since it will be populated
* again when we do a router scan. Hence we
* need to maintain extra state (unlike the
* host case below). Mark it as TG_SLOW.
*/
if (target->tg_status == TG_ACTIVE)
pii->pii_ntargets--;
target->tg_status = TG_SLOW;
target->tg_latime = gethrtime();
target->tg_rtt_sa = -1;
target->tg_crtt = 0;
target->tg_rtt_sd = 0;
if (pii->pii_target_next == target) {
pii->pii_target_next =
target_next(target);
}
} else {
/*
* the slow target is not a router, we can
* just delete it. Send an icmp multicast and
* pick the fastest responder that is not
* already an active target. target_delete()
* adjusts pii->pii_target_next
*/
target_delete(target);
probe(pii, PROBE_MULTI, cur_time);
}
} else {
/*
* We can't meet the failure detection time.
* Log a message, and update the detection time to
* whatever we can achieve.
*/
pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
last_fdt_bumpup_time = gethrtime();
if (pg != phyint_anongroup) {
logerr("Cannot meet requested failure detection"
" time of %d ms on (%s %s) new failure"
" detection time for group \"%s\" is %d"
" ms\n", user_failure_detection_time,
AF_STR(pii->pii_af), pii->pii_name,
pg->pg_name, pg->pg_fdt);
}
}
} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
(user_failure_detection_time < pg->pg_fdt) &&
(last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
/*
* If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
user_failure_detection_time);
pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
if (pg != phyint_anongroup) {
logerr("Improved failure detection time %d ms "
"on (%s %s) for group \"%s\"\n", pg->pg_fdt,
AF_STR(pii->pii_af), pii->pii_name,
pg->pg_name);
}
if (user_failure_detection_time == pg->pg_fdt) {
/* Avoid any truncation or rounding errors */
pg->pg_probeint = user_probe_interval;
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
reset_snxt_basetimes();
}
}
}
out:
pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
/*
* Update pii->pii_rack, i.e. the sequence number of the last received
* probe response, based on the echo reply we have received now, if
* either of the following conditions are satisfied.
* a. pii_rack is outside the current receive window of
* [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
* This means we have not received probe responses for a
* long time, and the sequence number has wrapped around.
* b. pii_rack is within the current receive window and this echo
* reply corresponds to the highest sequence number we have seen
* so far.
*/
if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
pii->pii_rack = pr_icmp_seq;
}
}
/*
* Returns true if seq is the highest unacknowledged seq for target tg
* else returns false
*/
static boolean_t
highest_ack_tg(uint16_t seq, struct target *tg)
{
struct phyint_instance *pii;
int pr_ndx;
uint16_t pr_seq;
pii = tg->tg_phyint_inst;
/*
* Get the seq number of the most recent probe sent so far,
* and also get the corresponding probe index in the probe stats
* array.
*/
pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
pr_seq = pii->pii_snxt;
pr_seq--;
/*
* Start from the most recent probe and walk back, trying to find
* an acked probe corresponding to target tg.
*/
for (; pr_ndx != pii->pii_probe_next;
pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
if (pii->pii_probes[pr_ndx].pr_target == tg &&
pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
if (SEQ_GT(pr_seq, seq))
return (_B_FALSE);
}
}
return (_B_TRUE);
}
/*
* Check whether the crtt for the group has improved by a factor of
* LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure
* detection time flapping in the face of small crtt changes.
*/
static boolean_t
check_pg_crtt_improved(struct phyint_group *pg)
{
struct phyint *pi;
if (debug & D_PROBE)
logdebug("check_pg_crtt_improved()\n");
/*
* The crtt for the group is only improved if each phyint_instance
* for both ipv4 and ipv6 is improved.
*/
for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
if (!check_pii_crtt_improved(pi->pi_v4) ||
!check_pii_crtt_improved(pi->pi_v6))
return (_B_FALSE);
}
return (_B_TRUE);
}
/*
* Check whether the crtt has improved substantially on this phyint_instance.
* Returns _B_TRUE if there's no crtt information available, because pii
* is NULL or the phyint_instance is not capable of probing.
*/
boolean_t
check_pii_crtt_improved(struct phyint_instance *pii) {
struct target *tg;
if (pii == NULL)
return (_B_TRUE);
if (!PROBE_CAPABLE(pii) ||
pii->pii_phyint->pi_state == PI_FAILED)
return (_B_TRUE);
for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
if (tg->tg_status != TG_ACTIVE)
continue;
if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
LOWER_FDT_TRIGGER)) {
return (_B_FALSE);
}
}
return (_B_TRUE);
}
/*
* This target responds very slowly to probes. The target's crtt exceeds
* the probe interval of its group. Compare against other targets
* and determine if this target is an exception, if so return true, else false
*/
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
struct target *tg;
char abuf[INET6_ADDRSTRLEN];
if (debug & D_PROBE) {
logdebug("check_exception_target(%s %s target %s)\n",
AF_STR(pii->pii_af), pii->pii_name,
pr_addr(pii->pii_af, target->tg_address,
abuf, sizeof (abuf)));
}
/*
* We should have at least MIN_PROBE_TARGETS + 1 good targets now,
* to make a good judgement. Otherwise don't drop this target.
*/
if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
return (_B_FALSE);
/*
* Determine whether only this particular target is slow.
* We know that this target's crtt exceeds the group's probe interval.
* If all other active targets have a
* crtt < (this group's probe interval) / EXCEPTION_FACTOR,
* then this target is considered slow.
*/
for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
if (tg != target && tg->tg_status == TG_ACTIVE) {
if (tg->tg_crtt >
pii->pii_phyint->pi_group->pg_probeint /
EXCEPTION_FACTOR) {
return (_B_FALSE);
}
}
}
return (_B_TRUE);
}
/*
* Update the target list. The icmp all hosts multicast has given us
* some host to which we can send probes. If we already have sufficient
* targets, discard it.
*/
static void
incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
struct in6_addr fromaddr)
/* ARGSUSED */
{
int af;
char abuf[INET6_ADDRSTRLEN];
struct phyint *pi;
if (debug & D_PROBE) {
logdebug("incoming_mcast_reply(%s %s %s)\n",
AF_STR(pii->pii_af), pii->pii_name,
pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
}
/*
* Using host targets is a fallback mechanism. If we have
* found a router, don't add this host target. If we already
* know MAX_PROBE_TARGETS, don't add another target.
*/
assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
if (pii->pii_targets != NULL) {
if (pii->pii_targets_are_routers ||
(pii->pii_ntargets == MAX_PROBE_TARGETS)) {
return;
}
}
if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
/*
* Guard against response from 0.0.0.0
* and ::. Log a trace message
*/
logtrace("probe response from %s on %s\n",
pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
pii->pii_name);
return;
}
/*
* This address is one of our own, so reject this address as a
* valid probe target.
*/
af = pii->pii_af;
if (own_address(af, fromaddr))
return;
/*
* If the phyint is part a named group, then add the address to all
* members of the group. Otherwise, add the address only to the
* phyint itself, since other phyints in the anongroup may not be on
* the same subnet.
*/
pi = pii->pii_phyint;
if (pi->pi_group == phyint_anongroup) {
target_add(pii, fromaddr, _B_FALSE);
} else {
pi = pi->pi_group->pg_phyint;
for (; pi != NULL; pi = pi->pi_pgnext)
target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
}
}
/*
* Compute CRTT given an existing scaled average, scaled deviation estimate
* and a new rtt time. The formula is from Jacobson and Karels'
* "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
* are the same as those in Appendix A.2 of that paper.
*
* m = new measurement
* sa = scaled RTT average (8 * average estimates)
* sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
* crtt = Conservative round trip time. Used to determine whether probe
* has timed out.
*
* New scaled average and deviation are passed back via sap and svp
*/
static int
compute_crtt(int *sap, int *svp, int m)
{
int sa = *sap;
int sv = *svp;
int crtt;
int saved_m = m;
assert(*sap >= -1);
assert(*svp >= 0);
if (sa != -1) {
/*
* Update average estimator:
* new rtt = old rtt + 1/8 Error
* where Error = m - old rtt
* i.e. 8 * new rtt = 8 * old rtt + Error
* i.e. new sa = old sa + Error
*/
m -= sa >> 3; /* m is now Error in estimate. */
if ((sa += m) < 0) {
/* Don't allow the smoothed average to be negative. */
sa = 0;
}
/*
* Update deviation estimator:
* new mdev = old mdev + 1/4 (abs(Error) - old mdev)
* i.e. 4 * new mdev = 4 * old mdev +
* (abs(Error) - old mdev)
* i.e. new sv = old sv + (abs(Error) - old mdev)
*/
if (m < 0)
m = -m;
m -= sv >> 2;
sv += m;
} else {
/* Initialization. This is the first response received. */
sa = (m << 3);
sv = (m << 1);
}
crtt = (sa >> 3) + sv;
if (debug & D_PROBE) {
logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
"%d\n", saved_m, sa, sv, crtt);
}
*sap = sa;
*svp = sv;
/*
* CRTT = average estimates + 4 * deviation estimates
* = sa / 8 + sv
*/
return (crtt);
}
static void
pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
{
struct phyint_instance *pii = tg->tg_phyint_inst;
int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
int sa = tg->tg_rtt_sa;
int sv = tg->tg_rtt_sd;
int new_crtt;
int i;
if (debug & D_PROBE)
logdebug("pi_set_crtt: target - m %d\n", m);
/* store the round trip time, in case we need to defer computation */
tg->tg_deferred[tg->tg_num_deferred] = m;
new_crtt = compute_crtt(&sa, &sv, m);
/*
* If this probe's round trip time would singlehandedly cause an
* increase in the group's probe interval consider it suspect.
*/
if ((new_crtt > probe_interval) && is_probe_uni) {
if (debug & D_PROBE) {
logdebug("Received a suspect probe on %s, new_crtt ="
" %d, probe_interval = %d, num_deferred = %d\n",
pii->pii_probe_logint->li_name, new_crtt,
probe_interval, tg->tg_num_deferred);
}
/*
* If we've deferred as many rtts as we plan on deferring, then
* assume the link really did slow down and process all queued
* rtts
*/
if (tg->tg_num_deferred == MAXDEFERREDRTT) {
if (debug & D_PROBE) {
logdebug("Received MAXDEFERREDRTT probes which "
"would cause an increased probe_interval. "
"Integrating queued rtt data points.\n");
}
for (i = 0; i <= tg->tg_num_deferred; i++) {
tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
&tg->tg_rtt_sd, tg->tg_deferred[i]);
}
tg->tg_num_deferred = 0;
} else {
tg->tg_num_deferred++;
}
return;
}
/*
* If this is a normal probe, or an RTT probe that would lead to a
* reduced CRTT, then update our CRTT data. Further, if this was
* a normal probe, pitch any deferred probes since our probes are
* again being answered within our CRTT estimates.
*/
if (is_probe_uni || new_crtt < tg->tg_crtt) {
tg->tg_rtt_sa = sa;
tg->tg_rtt_sd = sv;
tg->tg_crtt = new_crtt;
if (is_probe_uni)
tg->tg_num_deferred = 0;
}
}
/*
* Return a pointer to the specified option buffer.
* If not found return NULL.
*/
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
struct cmsghdr *cmsg;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
cmsg = CMSG_NXTHDR(msg, cmsg)) {
if (cmsg->cmsg_level == IPPROTO_IPV6 &&
cmsg->cmsg_type == cmsg_type) {
return (CMSG_DATA(cmsg));
}
}
return (NULL);
}
/*
* See if a previously failed interface has started working again.
*/
void
phyint_check_for_repair(struct phyint *pi)
{
if (phyint_repaired(pi)) {
if (pi->pi_group == phyint_anongroup) {
logerr("NIC repair detected on %s\n", pi->pi_name);
} else {
logerr("NIC repair detected on %s of group %s\n",
pi->pi_name, pi->pi_group->pg_name);
}
/*
* If the interface is offline, just clear the FAILED flag,
* delaying the state change and failback operation until it
* is brought back online.
*/
if (pi->pi_state == PI_OFFLINE) {
(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
return;
}
if (pi->pi_flags & IFF_INACTIVE) {
(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
} else {
if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) {
(void) change_lif_flags(pi,
IFF_FAILED, _B_FALSE);
/* Per state diagram */
pi->pi_empty = 0;
}
}
phyint_chstate(pi, PI_RUNNING);
if (GROUP_FAILED(pi->pi_group)) {
/*
* This is the 1st phyint to receive a response
* after group failure.
*/
logerr("At least 1 interface (%s) of group %s has "
"repaired\n", pi->pi_name, pi->pi_group->pg_name);
phyint_group_chstate(pi->pi_group, PG_RUNNING);
}
}
}
/*
* See if a previously functioning interface has failed, or if the
* whole group of interfaces has failed.
*/
static void
phyint_inst_check_for_failure(struct phyint_instance *pii)
{
struct phyint *pi;
struct phyint *pi2;
pi = pii->pii_phyint;
switch (failure_state(pii)) {
case PHYINT_FAILURE:
(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
if (pi->pi_group == phyint_anongroup) {
logerr("NIC failure detected on %s\n", pii->pii_name);
} else {
logerr("NIC failure detected on %s of group %s\n",
pii->pii_name, pi->pi_group->pg_name);
}
/*
* Do the failover, unless the interface is offline (in
* which case we've already failed over).
*/
if (pi->pi_state != PI_OFFLINE) {
phyint_chstate(pi, PI_FAILED);
reset_crtt_all(pi);
if (!(pi->pi_flags & IFF_INACTIVE))
(void) try_failover(pi, FAILOVER_NORMAL);
}
break;
case GROUP_FAILURE:
logerr("All Interfaces in group %s have failed\n",
pi->pi_group->pg_name);
for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
pi2 = pi2->pi_pgnext) {
if (pi2->pi_flags & IFF_OFFLINE)
continue;
(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
reset_crtt_all(pi2);
/*
* In the case of host targets, we
* would have flushed the targets,
* and gone to PI_NOTARGETS state.
*/
if (pi2->pi_state == PI_RUNNING)
phyint_chstate(pi, PI_FAILED);
pi2->pi_empty = 0;
pi2->pi_full = 0;
}
break;
default:
break;
}
}
/*
* Determines if any timeout event has occurred and returns the number of
* milliseconds until the next timeout event for the phyint. Returns
* TIMER_INFINITY for "never".
*/
uint_t
phyint_inst_timer(struct phyint_instance *pii)
{
int pr_ndx;
uint_t timeout;
struct target *cur_tg;
struct probe_stats *pr_statp;
struct phyint_instance *pii_other;
struct phyint *pi;
int valid_unack_count;
int i;
int interval;
uint_t check_time;
uint_t cur_time;
hrtime_t cur_hrtime;
int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
cur_time = getcurrenttime();
if (debug & D_TIMER) {
logdebug("phyint_inst_timer(%s %s)\n",
AF_STR(pii->pii_af), pii->pii_name);
}
pii_other = phyint_inst_other(pii);
if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
/*
* Check to see if we're here due to link up/down flapping; If
* enough time has passed, then try to bring the interface
* back up; otherwise, schedule a timer to bring it back up
* when enough time *has* elapsed.
*/
pi = pii->pii_phyint;
if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
if (check_time > cur_time)
return (check_time - cur_time);
phyint_check_for_repair(pi);
}
}
/*
* If this phyint is not yet initialized for probes,
* don't proceed further
*/
if (pii->pii_probe_sock == -1)
return (TIMER_INFINITY);
/*
* If the timer has fired too soon, probably triggered
* by some other phyint instance, return the remaining
* time
*/
if (TIME_LT(cur_time, pii->pii_snxt_time))
return (pii->pii_snxt_time - cur_time);
/*
* If the link is down, don't send any probes for now.
*/
if (LINK_DOWN(pii->pii_phyint))
return (TIMER_INFINITY);
/*
* Randomize the next probe time, between MIN_RANDOM_FACTOR
* and MAX_RANDOM_FACTOR with respect to the base probe time.
* Base probe time is strictly periodic.
*/
interval = GET_RANDOM(
(int)(MIN_RANDOM_FACTOR * user_probe_interval),
(int)(MAX_RANDOM_FACTOR * user_probe_interval));
pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
/*
* Check if the current time > next time to probe. If so, we missed
* sending 1 or more probes, probably due to heavy system load. At least
* 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
* were scheduled. Make adjustments to the times, in multiples of
* user_probe_interval.
*/
if (TIME_GT(cur_time, pii->pii_snxt_time)) {
int n;
n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
pii->pii_snxt_time += (n + 1) * user_probe_interval;
pii->pii_snxt_basetime += (n + 1) * user_probe_interval;
logtrace("missed sending %d probes cur_time %u snxt_time %u"
" snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
pii->pii_snxt_basetime);
/* Collect statistics about missed probes */
probes_missed.pm_nprobes += n + 1;
probes_missed.pm_ntimes++;
}
pii->pii_snxt_basetime += user_probe_interval;
interval = pii->pii_snxt_time - cur_time;
if (debug & D_TARGET) {
logdebug("cur_time %u snxt_time %u snxt_basetime %u"
" interval %u\n", cur_time, pii->pii_snxt_time,
pii->pii_snxt_basetime, interval);
}
/*
* If no targets are known, we need to send an ICMP multicast. The
* probe type is PROBE_MULTI. We'll check back in 'interval' msec
* to see if we found a target.
*/
if (pii->pii_target_next == NULL) {
assert(pii->pii_ntargets == 0);
pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
probe(pii, PROBE_MULTI, cur_time);
return (interval);
}
if ((user_probe_interval != probe_interval) &&
TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
/*
* the failure detection (fd) probe timer has not yet fired.
* Need to send only an rtt probe. The probe type is PROBE_RTT.
*/
probe(pii, PROBE_RTT, cur_time);
return (interval);
}
/*
* the fd probe timer has fired. Need to do all failure
* detection / recovery calculations, and then send an fd probe
* of type PROBE_UNI.
*/
if (user_probe_interval == probe_interval) {
/*
* We could have missed some probes, and then adjusted
* pii_snxt_basetime above. Otherwise we could have
* blindly added probe_interval to pii_fd_snxt_basetime.
*/
pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
} else {
pii->pii_fd_snxt_basetime += probe_interval;
if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
int n;
n = (cur_time - pii->pii_fd_snxt_basetime) /
probe_interval;
pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
}
}
/*
* We can have at most, the latest 2 probes that we sent, in
* the PR_UNACKED state. All previous probes sent, are either
* PR_LOST or PR_ACKED. An unacknowledged probe is considered
* timed out if the probe's time_sent + the CRTT < currenttime.
* For each of the last 2 probes, examine whether it has timed
* out. If so, mark it PR_LOST. The probe stats is a circular array.
*/
pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
valid_unack_count = 0;
for (i = 0; i < 2; i++) {
pr_statp = &pii->pii_probes[pr_ndx];
cur_tg = pii->pii_probes[pr_ndx].pr_target;
switch (pr_statp->pr_status) {
case PR_ACKED:
/*
* We received back an ACK, so the switch clearly
* is not dropping our traffic, and thus we can
* enable failure detection immediately.
*/
if (pii->pii_fd_hrtime > gethrtime()) {
if (debug & D_PROBE) {
logdebug("successful probe on %s; "
"ending quiet period\n",
pii->pii_phyint->pi_name);
}
pii->pii_fd_hrtime = gethrtime();
}
break;
case PR_UNACKED:
assert(cur_tg != NULL);
/*
* The crtt could be zero for some reason,
* Eg. the phyint could be failed. If the crtt is
* not available use group's probe interval,
* which is a worst case estimate.
*/
if (cur_tg->tg_crtt != 0) {
timeout = pr_statp->pr_time_sent +
cur_tg->tg_crtt;
} else {
timeout = pr_statp->pr_time_sent +
probe_interval;
}
if (TIME_LT(timeout, cur_time)) {
pr_statp->pr_status = PR_LOST;
pr_statp->pr_time_lost = timeout;
} else if (i == 1) {
/*
* We are forced to consider this probe
* lost, as we can have at most 2 unack.
* probes any time, and we will be sending a
* probe at the end of this function.
* Normally, we should not be here, but
* this can happen if an incoming response
* that was considered lost has increased
* the crtt for this target, and also bumped
* up the FDT. Note that we never cancel or
* increase the current pii_time_left, so
* when the timer fires, we find 2 valid
* unacked probes, and they are yet to timeout
*/
pr_statp->pr_status = PR_LOST;
pr_statp->pr_time_lost = cur_time;
} else {
/*
* Only the most recent probe can enter
* this 'else' arm. The second most recent
* probe must take either of the above arms,
* if it is unacked.
*/
valid_unack_count++;
}
break;
}
pr_ndx = PROBE_INDEX_PREV(pr_ndx);
}
/*
* We send out 1 probe randomly in the interval between one half
* and one probe interval for the group. Given that the CRTT is always
* less than the group's probe interval, we can have at most 1
* unacknowledged probe now. All previous probes are either lost or
* acked.
*/
assert(valid_unack_count == 0 || valid_unack_count == 1);
/*
* The timer has fired. Take appropriate action depending
* on the current state of the phyint.
*
* PI_RUNNING state - Failure detection and failover
* PI_FAILED state - Repair detection and failback
*/
switch (pii->pii_phyint->pi_state) {
case PI_FAILED:
/*
* If the most recent probe (excluding unacked probes that
* are yet to time out) has been acked, check whether the
* phyint is now repaired. If the phyint is repaired, then
* attempt failback, unless it is an inactive standby.
*/
if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
phyint_check_for_repair(pii->pii_phyint);
}
break;
case PI_RUNNING:
/*
* It's possible our probes have been lost because of a
* spanning-tree mandated quiet period on the switch. If so,
* ignore the lost probes and consider the interface to still
* be functioning.
*/
cur_hrtime = gethrtime();
if (pii->pii_fd_hrtime - cur_hrtime > 0)
break;
if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
/*
* We have 1 or more failed probes (excluding unacked
* probes that are yet to time out). Determine if the
* phyint has failed. If so attempt a failover,
* unless it is an inactive standby
*/
phyint_inst_check_for_failure(pii);
}
break;
default:
logerr("phyint_inst_timer: invalid state %d\n",
pii->pii_phyint->pi_state);
abort();
}
/*
* Start the next probe. probe() will also set pii->pii_probe_time_left
* to the group's probe interval. If phyint_failed -> target_flush_hosts
* was called, the target list may be empty.
*/
if (pii->pii_target_next != NULL) {
probe(pii, PROBE_UNI, cur_time);
/*
* If we have just the one probe target, and we're not using
* router targets, try to find another as we presently have
* no resilience.
*/
if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
probe(pii, PROBE_MULTI, cur_time);
} else {
probe(pii, PROBE_MULTI, cur_time);
}
return (interval);
}
/*
* Start the probe timer for an interface instance.
*/
void
start_timer(struct phyint_instance *pii)
{
uint32_t interval;
/*
* Spread the base probe times (pi_snxt_basetime) across phyints
* uniformly over the (curtime..curtime + the group's probe_interval).
* pi_snxt_basetime is strictly periodic with a frequency of
* the group's probe interval. The actual probe time pi_snxt_time
* adds some randomness to pi_snxt_basetime and happens in probe().
* For the 1st probe on each phyint after the timer is started,
* pi_snxt_time and pi_snxt_basetime are the same.
*/
interval = GET_RANDOM(0,
(int)pii->pii_phyint->pi_group->pg_probeint);
pii->pii_snxt_basetime = getcurrenttime() + interval;
pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
pii->pii_snxt_time = pii->pii_snxt_basetime;
timer_schedule(interval);
}
/*
* Restart the probe timer on an interface instance.
*/
static void
restart_timer(struct phyint_instance *pii)
{
/*
* We don't need to restart the timer if it was never started in
* the first place (pii->pii_basetime_inited not set), as the timer
* won't have gone off yet.
*/
if (pii->pii_basetime_inited != 0) {
if (debug & D_LINKNOTE)
logdebug("restart timer: restarting timer on %s, "
"address family %s\n", pii->pii_phyint->pi_name,
AF_STR(pii->pii_af));
start_timer(pii);
}
}
static void
process_link_state_down(struct phyint *pi)
{
logerr("The link has gone down on %s\n", pi->pi_name);
/*
* Clear the probe statistics arrays, we don't want the repair
* detection logic relying on probes that were succesful prior
* to the link going down.
*/
if (PROBE_CAPABLE(pi->pi_v4))
clear_pii_probe_stats(pi->pi_v4);
if (PROBE_CAPABLE(pi->pi_v6))
clear_pii_probe_stats(pi->pi_v6);
/*
* Check for interface failure. Although we know the interface
* has failed, we don't know if all the other interfaces in the
* group have failed as well.
*/
if ((pi->pi_state == PI_RUNNING) ||
(pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
if (debug & D_LINKNOTE) {
logdebug("process_link_state_down:"
" checking for failure on %s\n", pi->pi_name);
}
if (pi->pi_v4 != NULL)
phyint_inst_check_for_failure(pi->pi_v4);
else if (pi->pi_v6 != NULL)
phyint_inst_check_for_failure(pi->pi_v6);
}
}
static void
process_link_state_up(struct phyint *pi)
{
logerr("The link has come up on %s\n", pi->pi_name);
/*
* We stopped any running timers on each instance when the link
* went down, so restart them.
*/
if (pi->pi_v4)
restart_timer(pi->pi_v4);
if (pi->pi_v6)
restart_timer(pi->pi_v6);
phyint_check_for_repair(pi);
pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
if (pi->pi_whendx == LINK_UP_PERMIN)
pi->pi_whendx = 0;
}
/*
* Process any changes in link state passed up from the interfaces.
*/
void
process_link_state_changes(void)
{
struct phyint *pi;
/* Look for interfaces where the link state has just changed */
for (pi = phyints; pi != NULL; pi = pi->pi_next) {
boolean_t old_link_state_up = LINK_UP(pi);
/*
* Except when the "phyint" structure is created, this is
* the only place the link state is updated. This allows
* this routine to detect changes in link state, rather
* than just the current state.
*/
UPDATE_LINK_STATE(pi);
if (LINK_DOWN(pi)) {
/*
* Has link just gone down?
*/
if (old_link_state_up)
process_link_state_down(pi);
} else {
/*
* Has link just gone back up?
*/
if (!old_link_state_up)
process_link_state_up(pi);
}
}
}
void
reset_crtt_all(struct phyint *pi)
{
struct phyint_instance *pii;
struct target *tg;
pii = pi->pi_v4;
if (pii != NULL) {
for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
tg->tg_crtt = 0;
tg->tg_rtt_sa = -1;
tg->tg_rtt_sd = 0;
}
}
pii = pi->pi_v6;
if (pii != NULL) {
for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
tg->tg_crtt = 0;
tg->tg_rtt_sa = -1;
tg->tg_rtt_sd = 0;
}
}
}
/*
* Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
* probes on both instances IPv4 and IPv6.
* If the interface has failed, return the time of the first probe failure
* in "tff".
*/
static int
phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
{
uint_t pi_tff;
struct target *cur_tg;
struct probe_fail_count pfinfo;
struct phyint_instance *pii_other;
int pr_ndx;
/*
* Get the number of consecutive failed probes on
* this phyint across all targets. Also get the number
* of consecutive failed probes on this target only
*/
pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
cur_tg = pii->pii_probes[pr_ndx].pr_target;
probe_fail_info(pii, cur_tg, &pfinfo);
/* Get the time of first failure, for later use */
pi_tff = pfinfo.pf_tff;
/*
* If the current target has not responded to the
* last NUM_PROBE_FAILS probes, and other targets are
* responding delete this target. Dead gateway detection
* will eventually remove this target (if router) from the
* routing tables. If that does not occur, we may end
* up adding this to our list again.
*/
if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
if (pii->pii_targets_are_routers) {
if (cur_tg->tg_status == TG_ACTIVE)
pii->pii_ntargets--;
cur_tg->tg_status = TG_DEAD;
cur_tg->tg_crtt = 0;
cur_tg->tg_rtt_sa = -1;
cur_tg->tg_rtt_sd = 0;
if (pii->pii_target_next == cur_tg)
pii->pii_target_next = target_next(cur_tg);
} else {
target_delete(cur_tg);
probe(pii, PROBE_MULTI, getcurrenttime());
}
return (PHYINT_OK);
}
/*
* If the phyint has lost NUM_PROBE_FAILS or more
* consecutive probes, on both IPv4 and IPv6 protocol
* instances of the phyint, then trigger failure
* detection, else return false
*/
if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
return (PHYINT_OK);
pii_other = phyint_inst_other(pii);
if (PROBE_CAPABLE(pii_other)) {
probe_fail_info(pii_other, NULL, &pfinfo);
if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
/*
* We have NUM_PROBE_FAILS or more failures
* on both IPv4 and IPv6. Get the earliest
* time when failure was detected on this
* phyint across IPv4 and IPv6.
*/
if (TIME_LT(pfinfo.pf_tff, pi_tff))
pi_tff = pfinfo.pf_tff;
} else {
/*
* This instance has < NUM_PROBE_FAILS failure.
* So return false
*/
return (PHYINT_OK);
}
}
*tff = pi_tff;
return (PHYINT_FAILURE);
}
/*
* Check if the link has gone down on this phyint, or it has failed the
* last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
* Also look at other phyints of this group, for group failures.
*/
int
failure_state(struct phyint_instance *pii)
{
struct probe_success_count psinfo;
uint_t pi2_tls; /* time last success */
uint_t pi_tff; /* time first fail */
struct phyint *pi2;
struct phyint *pi;
struct phyint_instance *pii2;
struct phyint_group *pg;
boolean_t alone;
if (debug & D_FAILOVER)
logdebug("phyint_failed(%s)\n", pii->pii_name);
pi = pii->pii_phyint;
pg = pi->pi_group;
if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
PHYINT_OK)
return (PHYINT_OK);
/*
* At this point, the link is down, or the phyint is suspect,
* as it has lost NUM_PROBE_FAILS or more probes. If the phyint
* does not belong to any group, or is the only member of the
* group capable of being probed, return PHYINT_FAILURE.
*/
alone = _B_TRUE;
if (pg != phyint_anongroup) {
for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
if (pi2 == pi)
continue;
if (PROBE_CAPABLE(pi2->pi_v4) ||
PROBE_CAPABLE(pi2->pi_v6)) {
alone = _B_FALSE;
break;
}
}
}
if (alone)
return (PHYINT_FAILURE);
/*
* Need to compare against other phyints of the same group
* to exclude group failures. If the failure was detected via
* probing, then if the time of last success (tls) of any
* phyint is more recent than the time of first fail (tff) of the
* phyint in question, and the link is up on the phyint,
* then it is a phyint failure. Otherwise it is a group failure.
* If failure was detected via a link down notification sent from
* the driver to IP, we see if any phyints in the group are still
* running and haven't received a link down notification. We
* will usually be processing the link down notification shortly
* after it was received, so there is no point looking at the tls
* of other phyints.
*/
for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
/* Exclude ourself from comparison */
if (pi2 == pi)
continue;
if (LINK_DOWN(pi)) {
/*
* We use FLAGS_TO_LINK_STATE() to test the
* flags directly, rather then LINK_UP() or
* LINK_DOWN(), as we may not have got round
* to processing the link state for the other
* phyints in the group yet.
*
* The check for PI_RUNNING and group
* failure handles the case when the
* group begins to recover. The first
* phyint to recover should not trigger
* a failover from the soon-to-recover
* other phyints to the first recovered
* phyint. PI_RUNNING will be set, and
* pg_groupfailed cleared only after
* receipt of NUM_PROBE_REPAIRS, by
* which time the other phyints should
* have received at least 1 packet,
* and so will not have NUM_PROBE_FAILS.
*/
if ((pi2->pi_state == PI_RUNNING) &&
!GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
return (PHYINT_FAILURE);
} else {
/*
* Need to compare against both IPv4 and
* IPv6 instances.
*/
pii2 = pi2->pi_v4;
if (pii2 != NULL) {
probe_success_info(pii2, NULL, &psinfo);
if (psinfo.ps_tls_valid) {
pi2_tls = psinfo.ps_tls;
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
if (TIME_GT(pi2_tls, pi_tff) &&
(pi2->pi_state == PI_RUNNING) &&
!GROUP_FAILED(pg) &&
FLAGS_TO_LINK_STATE(pi2))
return (PHYINT_FAILURE);
}
}
pii2 = pi2->pi_v6;
if (pii2 != NULL) {
probe_success_info(pii2, NULL, &psinfo);
if (psinfo.ps_tls_valid) {
pi2_tls = psinfo.ps_tls;
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
if (TIME_GT(pi2_tls, pi_tff) &&
(pi2->pi_state == PI_RUNNING) &&
!GROUP_FAILED(pg) &&
FLAGS_TO_LINK_STATE(pi2))
return (PHYINT_FAILURE);
}
}
}
}
/*
* Change the group state to PG_FAILED if it's not already.
*/
if (!GROUP_FAILED(pg))
phyint_group_chstate(pg, PG_FAILED);
return (GROUP_FAILURE);
}
/*
* Return the information associated with consecutive probe successes
* starting with the most recent probe. At most the last 2 probes can be
* in the unacknowledged state. All previous probes have either failed
* or succeeded.
*/
static void
probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
struct probe_success_count *psinfo)
{
uint_t i;
struct probe_stats *pr_statp;
uint_t most_recent;
uint_t second_most_recent;
boolean_t pi_found_failure = _B_FALSE;
boolean_t tg_found_failure = _B_FALSE;
uint_t now;
uint_t timeout;
struct target *tg;
if (debug & D_FAILOVER)
logdebug("probe_success_info(%s)\n", pii->pii_name);
bzero(psinfo, sizeof (*psinfo));
now = getcurrenttime();
/*
* Start with the most recent probe, and count the number
* of consecutive probe successes. Latch the number of successes
* on hitting a failure.
*/
most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
second_most_recent = PROBE_INDEX_PREV(most_recent);
for (i = most_recent; i != pii->pii_probe_next;
i = PROBE_INDEX_PREV(i)) {
pr_statp = &pii->pii_probes[i];
switch (pr_statp->pr_status) {
case PR_UNACKED:
/*
* Only the most recent 2 probes can be unacknowledged
*/
assert(i == most_recent || i == second_most_recent);
tg = pr_statp->pr_target;
assert(tg != NULL);
/*
* The crtt could be zero for some reason,
* Eg. the phyint could be failed. If the crtt is
* not available use the value of the group's probe
* interval which is a worst case estimate.
*/
if (tg->tg_crtt != 0) {
timeout = pr_statp->pr_time_sent + tg->tg_crtt;
} else {
timeout = pr_statp->pr_time_sent +
pii->pii_phyint->pi_group->pg_probeint;
}
if (TIME_LT(timeout, now)) {
/*
* We hit a failure. Latch the total number of
* recent consecutive successes.
*/
pr_statp->pr_time_lost = timeout;
pr_statp->pr_status = PR_LOST;
pi_found_failure = _B_TRUE;
if (cur_tg != NULL && tg == cur_tg) {
/*
* We hit a failure for the desired
* target. Latch the number of recent
* consecutive successes for this target
*/
tg_found_failure = _B_TRUE;
}
}
break;
case PR_ACKED:
/*
* Bump up the count of probe successes, if we
* have not seen any failure so far.
*/
if (!pi_found_failure)
psinfo->ps_nsucc++;
if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
!tg_found_failure) {
psinfo->ps_nsucc_tg++;
}
/*
* Record the time of last success, if this is
* the most recent probe success.
*/
if (!psinfo->ps_tls_valid) {
psinfo->ps_tls = pr_statp->pr_time_acked;
psinfo->ps_tls_valid = _B_TRUE;
}
break;
case PR_LOST:
/*
* We hit a failure. Latch the total number of
* recent consecutive successes.
*/
pi_found_failure = _B_TRUE;
if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
/*
* We hit a failure for the desired target.
* Latch the number of recent consecutive
* successes for this target
*/
tg_found_failure = _B_TRUE;
}
break;
default:
return;
}
}
}
/*
* Return the information associated with consecutive probe failures
* starting with the most recent probe. Only the last 2 probes can be in the
* unacknowledged state. All previous probes have either failed or succeeded.
*/
static void
probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
struct probe_fail_count *pfinfo)
{
int i;
struct probe_stats *pr_statp;
boolean_t tg_found_success = _B_FALSE;
boolean_t pi_found_success = _B_FALSE;
int most_recent;
int second_most_recent;
uint_t now;
uint_t timeout;
struct target *tg;
if (debug & D_FAILOVER)
logdebug("probe_fail_info(%s)\n", pii->pii_name);
bzero(pfinfo, sizeof (*pfinfo));
now = getcurrenttime();
/*
* Start with the most recent probe, and count the number
* of consecutive probe failures. Latch the number of failures
* on hitting a probe success.
*/
most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
second_most_recent = PROBE_INDEX_PREV(most_recent);
for (i = most_recent; i != pii->pii_probe_next;
i = PROBE_INDEX_PREV(i)) {
pr_statp = &pii->pii_probes[i];
assert(PR_STATUS_VALID(pr_statp->pr_status));
switch (pr_statp->pr_status) {
case PR_UNACKED:
/*
* Only the most recent 2 probes can be unacknowledged
*/
assert(i == most_recent || i == second_most_recent);
tg = pr_statp->pr_target;
/*
* Target is guaranteed to exist in the unack. state
*/
assert(tg != NULL);
/*
* The crtt could be zero for some reason,
* Eg. the phyint could be failed. If the crtt is
* not available use the group's probe interval,
* which is a worst case estimate.
*/
if (tg->tg_crtt != 0) {
timeout = pr_statp->pr_time_sent + tg->tg_crtt;
} else {
timeout = pr_statp->pr_time_sent +
pii->pii_phyint->pi_group->pg_probeint;
}
if (TIME_GT(timeout, now))
break;
pr_statp->pr_time_lost = timeout;
pr_statp->pr_status = PR_LOST;
/* FALLTHRU */
case PR_LOST:
if (!pi_found_success) {
pfinfo->pf_nfail++;
pfinfo->pf_tff = pr_statp->pr_time_lost;
}
if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
!tg_found_success) {
pfinfo->pf_nfail_tg++;
}
break;
default:
/*
* We hit a success or unused slot. Latch the
* total number of recent consecutive failures.
*/
pi_found_success = _B_TRUE;
if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
/*
* We hit a success for the desired target.
* Latch the number of recent consecutive
* failures for this target
*/
tg_found_success = _B_TRUE;
}
}
}
}
/*
* Check if the phyint has been repaired. If no test address has been
* configured, then consider the interface repaired if the link is up (unless
* the link is flapping; see below). Otherwise, look for proof of probes
* being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
* either IPv4 or IPv6 instance, the phyint can be considered repaired.
*/
static boolean_t
phyint_repaired(struct phyint *pi)
{
struct probe_success_count psinfo;
struct phyint_instance *pii;
struct target *cur_tg;
int pr_ndx;
uint_t cur_time;
if (debug & D_FAILOVER)
logdebug("phyint_repaired(%s)\n", pi->pi_name);
if (LINK_DOWN(pi))
return (_B_FALSE);
/*
* If we don't have any test addresses and the link is up, then
* consider the interface repaired, unless we've received more than
* LINK_UP_PERMIN link up notifications in the last minute, in
* which case we keep the link down until we drop back below
* the threshold.
*/
if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
cur_time = getcurrenttime();
if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
(cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
pi->pi_lfmsg_printed = 0;
return (_B_TRUE);
}
if (!pi->pi_lfmsg_printed) {
logerr("The link has come up on %s more than %d times "
"in the last minute; disabling failback until it "
"stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
pi->pi_lfmsg_printed = 1;
}
return (_B_FALSE);
}
pii = pi->pi_v4;
if (PROBE_CAPABLE(pii)) {
pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
cur_tg = pii->pii_probes[pr_ndx].pr_target;
probe_success_info(pii, cur_tg, &psinfo);
if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
return (_B_TRUE);
}
pii = pi->pi_v6;
if (PROBE_CAPABLE(pii)) {
pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
cur_tg = pii->pii_probes[pr_ndx].pr_target;
probe_success_info(pii, cur_tg, &psinfo);
if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
return (_B_TRUE);
}
return (_B_FALSE);
}
/*
* Try failover from phyint 'pi' to a suitable destination.
*/
int
try_failover(struct phyint *pi, int failover_type)
{
struct phyint *dst;
int err;
if (debug & D_FAILOVER)
logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
/*
* Attempt to find a failover destination 'dst'.
* dst will be null if any of the following is true
* Phyint is not part of a group OR
* Phyint is the only member of a group OR
* No suitable failover dst was available
*/
dst = get_failover_dst(pi, failover_type);
if (dst == NULL)
return (IPMP_EMINRED);
dst->pi_empty = 0; /* Per state diagram */
pi->pi_full = 0; /* Per state diagram */
err = failover(pi, dst);
if (debug & D_FAILOVER) {
logdebug("failed over from %s to %s ret %d\n",
pi->pi_name, dst->pi_name, err);
}
if (err == 0) {
pi->pi_empty = 1; /* Per state diagram */
/*
* we don't want to print out this message if a
* phyint is leaving the group, nor for failover from
* standby
*/
if (failover_type == FAILOVER_NORMAL) {
logerr("Successfully failed over from NIC %s to NIC "
"%s\n", pi->pi_name, dst->pi_name);
}
return (0);
} else {
/*
* The failover did not succeed. We must retry the failover
* only after resyncing our state based on the kernel's.
* For eg. either the src or the dst might have been unplumbed
* causing this failure. initifs() will be called again,
* from main, since full_scan_required has been set to true
* by failover();
*/
return (IPMP_FAILURE);
}
}
/*
* global_errno captures the errno value, if failover() or failback()
* fails. This is sent to if_mpadm(1M).
*/
int global_errno;
/*
* Attempt failover from phyint 'from' to phyint 'to'.
* IP moves everything from phyint 'from' to phyint 'to'.
*/
static int
failover(struct phyint *from, struct phyint *to)
{
struct lifreq lifr;
int ret;
if (debug & D_FAILOVER) {
logdebug("failing over from %s to %s\n",
from->pi_name, to->pi_name);
}
/*
* Perform the failover. Both IPv4 and IPv6 are failed over
* using a single ioctl by passing in AF_UNSPEC family.
*/
lifr.lifr_addr.ss_family = AF_UNSPEC;
(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
lifr.lifr_movetoindex = to->pi_ifindex;
ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
if (ret < 0) {
global_errno = errno;
logperror("failover: ioctl (failover)");
}
/*
* Set full_scan_required to true. This will make us read
* the state from the kernel in initifs() and update our tables,
* to reflect the current state after the failover. If the
* failover has failed it will then reissue the failover.
*/
full_scan_required = _B_TRUE;
return (ret);
}
/*
* phyint 'pi' has recovered. Attempt failback from every phyint in the same
* group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
* Return values:
* IPMP_SUCCESS: Failback successful from each of the other
* phyints in the group.
* IPMP_EFBPARTIAL: Failback successful from some of the other
* phyints in the group.
* IPMP_FAILURE: Failback syscall failed with some error.
*
* Note that failback is attempted regardless of the setting of the
* failback_enabled flag.
*/
int
do_failback(struct phyint *pi, boolean_t check_only)
{
struct phyint *from;
boolean_t done;
boolean_t partial;
boolean_t attempted_failback = _B_FALSE;
if (debug & D_FAILOVER)
logdebug("do_failback(%s)\n", pi->pi_name);
/* If this phyint is not part of a named group, return. */
if (pi->pi_group == phyint_anongroup) {
pi->pi_full = 1;
return (IPMP_SUCCESS);
}
/*
* Attempt failback from every phyint in the group to 'pi'.
* The reason for doing this, instead of only from the
* phyint to which we did the failover is given below.
*
* After 'pi' failed, if any app. tries to join on a multicast
* address (IPv6), on the failed phyint, IP picks any arbitrary
* non-failed phyint in the group, instead of the failed phyint,
* in.mpathd is not aware of this. Thus failing back only from the
* interface to which 'pi' failed over, will failback the ipif's
* but not the ilm's. So we need to failback from all members of
* the phyint group
*/
done = _B_TRUE;
partial = _B_FALSE;
for (from = pi->pi_group->pg_phyint; from != NULL;
from = from->pi_pgnext) {
/* Exclude ourself as a failback src */
if (from == pi)
continue;
/*
* If the 'from' phyint has IPv4 plumbed, the 'to'
* phyint must also have IPv4 plumbed. Similar check
* for IPv6. IP makes the same check. Otherwise the
* failback will fail.
*/
if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
(from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
partial = _B_TRUE;
continue;
}
if (!check_only) {
pi->pi_empty = 0; /* Per state diagram */
attempted_failback = _B_TRUE;
if (failback(from, pi) != 0) {
done = _B_FALSE;
break;
}
}
}
if (check_only) {
return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
}
/*
* We are done. No more phyint from which we can src the failback
*/
if (done) {
if (!partial)
pi->pi_full = 1; /* Per state diagram */
/*
* Don't print out a message unless there is a
* transition from FAILED to RUNNING. For eg.
* we don't want to print out this message if a
* phyint is leaving the group, or at startup
*/
if (attempted_failback && (pi->pi_flags &
(IFF_FAILED | IFF_OFFLINE))) {
logerr("Successfully failed back to NIC %s\n",
pi->pi_name);
}
return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
}
return (IPMP_FAILURE);
}
/*
* This function is similar to do_failback() above, but respects the
* failback_enabled flag for phyints in named groups.
*/
int
try_failback(struct phyint *pi, boolean_t check_only)
{
if (debug & D_FAILOVER)
logdebug("try_failback(%s)\n", pi->pi_name);
if (pi->pi_group != phyint_anongroup && !failback_enabled)
return (IPMP_EFBDISABLED);
return (do_failback(pi, check_only));
}
/*
* Failback everything from phyint 'from' that has the same ifindex
* as phyint to's ifindex.
*/
static int
failback(struct phyint *from, struct phyint *to)
{
struct lifreq lifr;
int ret;
if (debug & D_FAILOVER)
logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
lifr.lifr_addr.ss_family = AF_UNSPEC;
(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
lifr.lifr_movetoindex = to->pi_ifindex;
ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
if (ret < 0) {
global_errno = errno;
logperror("failback: ioctl (failback)");
}
/*
* Set full_scan_required to true. This will make us read
* the state from the kernel in initifs() and update our tables,
* to reflect the current state after the failback. If the
* failback has failed it will then reissue the failback.
*/
full_scan_required = _B_TRUE;
return (ret);
}
/*
* Select a target phyint for failing over from 'pi'.
* In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
* target phyint is chosen as follows,
* 1. Pick any inactive standby interface.
* 2. If no inactive standby is available, select any phyint in the
* same group that has the least number of logints, (excluding
* IFF_NOFAILOVER and !IFF_UP logints)
* If we are failing over from a standby, failover_type is
* FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
* If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
* and we won't return NULL, as long as there is at least 1 other phyint
* in the group.
*/
static struct phyint *
get_failover_dst(struct phyint *pi, int failover_type)
{
struct phyint *maybe = NULL;
struct phyint *pi2;
struct phyint *last_choice = NULL;
if (pi->pi_group == phyint_anongroup)
return (NULL);
/*
* Loop thru the phyints in the group, and pick the preferred
* phyint for the target.
*/
for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
/* Exclude ourself and offlined interfaces */
if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
continue;
/*
* The chosen target phyint must have IPv4 instance
* plumbed, if the src phyint has IPv4 plumbed. Similarly
* for IPv6.
*/
if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
(pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
continue;
/* The chosen target must be PI_RUNNING. */
if (pi2->pi_state != PI_RUNNING) {
last_choice = pi2;
continue;
}
if ((pi2->pi_flags & IFF_INACTIVE) &&
(failover_type != FAILOVER_TO_NONSTANDBY)) {
return (pi2);
} else {
if (maybe == NULL)
maybe = pi2;
else if (logint_upcount(pi2) < logint_upcount(maybe))
maybe = pi2;
}
}
if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
return (last_choice);
else
return (maybe);
}
/*
* Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
*/
boolean_t
change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
{
int ifsock;
struct lifreq lifr;
if (debug & D_FAILOVER) {
logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
pi->pi_name, flags, (int)setfl);
}
if (pi->pi_v4 != NULL) {
ifsock = ifsock_v4;
} else {
ifsock = ifsock_v6;
}
/*
* Get the current flags from the kernel, and set/clear the
* desired phyint flags. Since we set only phyint flags, we can
* do it on either IPv4 or IPv6 instance.
*/
(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
if (errno != ENXIO)
logperror("change_lif_flags: ioctl (get flags)");
return (_B_FALSE);
}
if (setfl)
lifr.lifr_flags |= flags;
else
lifr.lifr_flags &= ~flags;
if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
if (errno != ENXIO)
logperror("change_lif_flags: ioctl (set flags)");
return (_B_FALSE);
}
/*
* Keep pi_flags in synch. with actual flags. Assumes flags are
* phyint flags.
*/
if (setfl)
pi->pi_flags |= flags;
else
pi->pi_flags &= ~flags;
if (pi->pi_v4)
pi->pi_v4->pii_flags = pi->pi_flags;
if (pi->pi_v6)
pi->pi_v6->pii_flags = pi->pi_flags;
return (_B_TRUE);
}
/*
* icmp cksum computation for IPv4.
*/
static int
in_cksum(ushort_t *addr, int len)
{
register int nleft = len;
register ushort_t *w = addr;
register ushort_t answer;
ushort_t odd_byte = 0;
register int sum = 0;
/*
* Our algorithm is simple, using a 32 bit accumulator (sum),
* we add sequential 16 bit words to it, and at the end, fold
* back all the carry bits from the top 16 bits into the lower
* 16 bits.
*/
while (nleft > 1) {
sum += *w++;
nleft -= 2;
}
/* mop up an odd byte, if necessary */
if (nleft == 1) {
*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
sum += odd_byte;
}
/*
* add back carry outs from top 16 bits to low 16 bits
*/
sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */
sum += (sum >> 16); /* add carry */
answer = ~sum; /* truncate to 16 bits */
return (answer);
}
static void
reset_snxt_basetimes(void)
{
struct phyint_instance *pii;
for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
}
}
/*
* Is the address one of our own addresses? Unfortunately,
* we cannot check our phyint tables to determine if the address
* is our own. This is because, we don't track interfaces that
* are not part of any group. We have to either use a 'bind' or
* get the complete list of all interfaces using SIOCGLIFCONF,
* to do this check. We choose to use 'bind'. We could use
* SIOCTMYADDR, but bind is preferred, since it is stronger.
* SIOCTMYADDR excludes down interfaces, while bind includes even
* down interfaces.
*/
boolean_t
own_address(int af, struct in6_addr addr)
{
int sock;
boolean_t ours = _B_TRUE;
sock = socket(AF_INET6, SOCK_DGRAM, 0);
if (sock == -1) {
logperror("own_address: socket");
/*
* If the socket call fails, err on the side of caution,
* and return true.
*/
} else {
struct sockaddr_in6 sin6;
(void) memset(&sin6, 0, sizeof (struct sockaddr_in6));
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = addr;
/*
* If the bind succeeds, then this address is one of our
* addresses.
* If bind returns error EADDRNOTAVAIL, the address is
* not one of ours.
* If bind returns an error other than EADDRNOTAVAIL, err
* on the side of caution and report the address as one of
* our own.
*/
if (bind(sock, (struct sockaddr *)&sin6,
sizeof (struct sockaddr_in6)) == -1) {
if (errno == EADDRNOTAVAIL)
ours = _B_FALSE;
else
logperror("own_address: bind");
}
(void) close(sock);
}
if (debug & D_TARGET) {
char abuf[INET6_ADDRSTRLEN];
logdebug("own_address: addr %s is %s ours\n",
pr_addr(af, addr, abuf, sizeof (abuf)),
ours ? "one of" : "not");
}
return (ours);
}