/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 1987 Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
#include "mpd_defs.h"
#include "mpd_tables.h"
/*
* Probe types for probe(). The chosen value is placed, in network byte
* order, in the pr_icmp_mtype field of each probe packet; in_data() and
* in6_data() compare that field in a reply against these values to pick
* the handler for the reply.
*/
#define PROBE_UNI 0x1234 /* Unicast probe packet */
#define PROBE_MULTI 0x5678 /* Multicast probe packet */
#define PROBE_RTT 0x9abc /* RTT only probe packet */
#define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */
/*
* Format of probe / probe response packets. This is an ICMP Echo request
* or ICMP Echo reply. Packet format is same for both IPv4 and IPv6.
*
* probe() stores the multi-byte fields in network byte order: the sequence
* number via htons(), the timestamp via htonll() and the message type via
* htonl(). The id is copied from pii_icmpid, which is kept in network byte
* order. probe() passes sizeof (struct pr_icmp) bytes of this structure to
* sendto(), so any padding the compiler inserts is part of the packet.
*/
struct pr_icmp
{
uint8_t pr_icmp_type; /* type field */
uint8_t pr_icmp_code; /* code field */
uint16_t pr_icmp_cksum; /* checksum field */
uint16_t pr_icmp_id; /* Identification */
uint16_t pr_icmp_seq; /* sequence number */
uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */
uint32_t pr_icmp_mtype; /* Message type (one of the PROBE_* values) */
};
/* IPv6 address ff02::1; destination used by probe() for PROBE_MULTI */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x1 } };
/* IPv4 address 224.0.0.1; destination used by probe() for PROBE_MULTI */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
/*
* Set to gethrtime() whenever incoming_echo_reply() raises a group's
* failure detection time; consulted before lowering it again.
*/
static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */
/*
* Prototypes for the helpers with internal linkage (static) used in this
* file.
*/
static void *find_ancillary(struct msghdr *msg, int cmsg_level,
int cmsg_type);
static void pi_set_crtt(struct target *tg, int64_t m,
boolean_t is_probe_uni);
static void incoming_echo_reply(struct phyint_instance *pii,
struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
static void incoming_rtt_reply(struct phyint_instance *pii,
struct pr_icmp *reply, struct in6_addr fromaddr);
static void incoming_mcast_reply(struct phyint_instance *pii,
struct pr_icmp *reply, struct in6_addr fromaddr);
static boolean_t check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t check_exception_target(struct phyint_instance *pii,
struct target *target);
static void probe_fail_info(struct phyint_instance *pii,
struct target *cur_tg, struct probe_fail_count *pfinfo);
static void probe_success_info(struct phyint_instance *pii,
struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t phyint_repaired(struct phyint *pi);
static boolean_t highest_ack_tg(uint16_t seq, struct target *tg);
static int in_cksum(ushort_t *addr, int len);
static void reset_snxt_basetimes(void);
static int ns2ms(int64_t ns);
static int64_t tv2ns(struct timeval *);
/*
* CRTT - Conservative Round Trip Time Estimate
* Probe success - A matching probe reply received before CRTT ms has elapsed
* after sending the probe.
* Probe failure - No probe reply received and more than CRTT ms has elapsed
* after sending the probe.
*
* TLS - Time last success. Most recent probe ack received at this time.
* TFF - Time first fail. The time of the earliest probe failure in
* a consecutive series of probe failures.
* NUM_PROBE_REPAIRS - Number of consecutive successful probes required
* before declaring phyint repair.
* NUM_PROBE_FAILS - Number of consecutive probe failures required to
* declare a phyint failure.
*
* Phyint state diagram
*
* The state of a phyint that is capable of being probed, is completely
* specified by the 3-tuple <pi_state, pg_state, I>.
*
* A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
* IFF_OFFLINE is set. If the phyint is also configured with a test address
* (the common case) and probe targets, then a phyint must also successfully
* be able to send and receive probes in order to remain in the PI_RUNNING
* state (otherwise, it transitions to PI_FAILED).
*
* Further, if a PI_RUNNING phyint is configured with a test address but is
* unable to find any probe targets, it will transition to the PI_NOTARGETS
* state, which indicates that the link is apparently functional but that
* in.mpathd is unable to send probes to verify functionality (in this case,
* in.mpathd makes the optimistic assumption that the interface is working
* correctly and thus does not mark the interface FAILED, but reports it as
* IPMP_IF_UNKNOWN through the async events and query interfaces).
*
* At any point, a phyint may be administratively marked offline via if_mpadm.
* In this case, the interface always transitions to PI_OFFLINE, regardless
* of its previous state. When the interface is later brought back online,
* in.mpathd acts as if the interface is new (and thus it transitions to
* PI_RUNNING or PI_FAILED based on the status of the link and the result of
* its probes, if probes are sent).
*
* pi_state - PI_RUNNING or PI_FAILED
* PI_RUNNING: The failure detection logic says the phyint is good.
* PI_FAILED: The failure detection logic says the phyint has failed.
*
* pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
* PG_OK: All interfaces in the group are OK.
* PG_DEGRADED: Some interfaces in the group are unusable.
* PG_FAILED: All interfaces in the group are unusable.
*
* On group failure with router targets, we assume that the current list
* of targets obtained from the routing table is still valid, so the
* phyint state is PI_FAILED. With host targets, we delete the list of
* targets and multicast to the all-hosts address to reconstruct the
* target list, so the phyints are in the PI_NOTARGETS state.
*
* I - value of (pi_flags & IFF_INACTIVE)
* IFF_INACTIVE: This phyint will not send or receive packets.
* Usually, inactive is tied to standby interfaces that are not yet
* needed (e.g., no non-standby interfaces in the group have failed).
* When failback has been disabled (FAILBACK=no configured), phyint can
* also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
* subsequently recovers after a failure.
*
* Not all 9 possible combinations of the above 3-tuple are possible.
*
* I is tracked by IP. pi_state is tracked by mpathd.
*
* pi_state state machine
* ---------------------------------------------------------------------------
* Event State New State
* Action:
* ---------------------------------------------------------------------------
* IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
*
* IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
* detection : set IFF_FAILED on this phyint
*
* IP interface repair (PI_FAILED, I == 0, FAILBACK=yes)
* detection -> (PI_RUNNING, I == 0)
* : clear IFF_FAILED on this phyint
*
* IP interface repair (PI_FAILED, I == 0, FAILBACK=no)
* detection -> (PI_RUNNING, I == 1)
* : clear IFF_FAILED on this phyint
* : if failback is disabled set I == 1
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_FAILED
* (Router targets) : set IFF_FAILED
*
* Group failure (perform on all phyints in the group)
* detection PI_RUNNING PI_NOTARGETS
* (Host targets) : set IFF_FAILED
* : delete the target list on all phyints
* ---------------------------------------------------------------------------
*/
/*
* Object of type struct probes_missed with external linkage; the type and
* its users are declared outside this part of the file.
* NOTE(review): presumably accounts for probes that could not be sent on
* schedule -- confirm against the users of this variable.
*/
struct probes_missed probes_missed;
/*
 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header
 * will be added on by the kernel. The id field identifies this phyint,
 * and the sequence number is an increasing (modulo 2^16) integer. The data
 * portion holds the time value when the packet is sent. On echo this is
 * extracted to compute the round-trip time. Three different types of
 * probe packets are used.
 *
 * PROBE_UNI: This type is used to do failure detection / failure recovery
 *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
 *	not less than the current CRTT. pii_probes[] stores data
 *	about these probes. These packets consume sequence number space.
 *
 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
 *	are not used. Under heavy network load, the rtt may go up very high,
 *	due to a spike, or may appear to go high, due to extreme scheduling
 *	delays. Once the network stress is removed, mpathd takes long time to
 *	recover, because the probe_interval is already high, and it takes
 *	a long time to send out sufficient number of probes to bring down the
 *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
 *	user_probe_interval ms. and will cause only rtt updates. These packets
 *	do not consume sequence number space nor is information about these
 *	packets stored in the pii_probes[]
 *
 * PROBE_MULTI: This type is only used to construct a list of targets, when
 *	no targets are known. The packet is multicast to the all hosts addr.
 *
 * Arguments:
 *	pii		phyint instance to probe on; its probe socket must
 *			be open, and for PROBE_UNI / PROBE_RTT both
 *			pii_target_next and pii_rtt_target_next must be set.
 *	probe_type	PROBE_UNI, PROBE_MULTI or PROBE_RTT.
 *	start_hrtime	time stamp placed in the packet; for PROBE_UNI it is
 *			also recorded as pr_hrtime_start of the probe slot.
 *
 * A send failure other than EWOULDBLOCK is logged and leaves the probe
 * tables untouched.
 */
static void
probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
{
	hrtime_t sent_hrtime;
	struct timeval sent_tv;
	struct pr_icmp probe_pkt;	/* Probe packet */
	struct sockaddr_storage targ;	/* target address */
	uint_t targaddrlen;		/* target address length */
	int pr_ndx;			/* probe index in pii->pii_probes[] */
	boolean_t sent = _B_FALSE;
	int rval;

	if (debug & D_TARGET) {
		logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
		    pii->pii_name, probe_type, start_hrtime);
	}

	assert(pii->pii_probe_sock != -1);
	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
	    probe_type == PROBE_RTT);

	/*
	 * The whole structure, including any padding the compiler inserted
	 * between or after the members, is checksummed and handed to
	 * sendto(). Clear it first so that no uninitialized stack bytes
	 * end up on the wire.
	 */
	bzero(&probe_pkt, sizeof (probe_pkt));

	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
	probe_pkt.pr_icmp_code = 0;
	probe_pkt.pr_icmp_cksum = 0;
	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);

	/*
	 * Since there is no need to do arithmetic on the icmpid,
	 * (only equality check is done) pii_icmpid is stored in
	 * network byte order at initialization itself.
	 */
	probe_pkt.pr_icmp_id = pii->pii_icmpid;
	probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
	probe_pkt.pr_icmp_mtype = htonl(probe_type);

	/*
	 * If probe_type is PROBE_MULTI, this packet will be multicast to
	 * the all hosts address. Otherwise it is unicast to the next target.
	 */
	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
	    pii->pii_rtt_target_next != NULL));

	bzero(&targ, sizeof (targ));
	targ.ss_family = pii->pii_af;

	if (pii->pii_af == AF_INET6) {
		struct in6_addr *addr6;

		addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
		targaddrlen = sizeof (struct sockaddr_in6);
		if (probe_type == PROBE_MULTI) {
			*addr6 = all_nodes_mcast_v6;
		} else if (probe_type == PROBE_UNI) {
			*addr6 = pii->pii_target_next->tg_address;
		} else { /* type is PROBE_RTT */
			*addr6 = pii->pii_rtt_target_next->tg_address;
		}
		/* No checksum is computed here for the IPv6 case. */
	} else {
		struct in_addr *addr4;

		addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
		targaddrlen = sizeof (struct sockaddr_in);
		if (probe_type == PROBE_MULTI) {
			*addr4 = all_nodes_mcast_v4;
		} else if (probe_type == PROBE_UNI) {
			IN6_V4MAPPED_TO_INADDR(
			    &pii->pii_target_next->tg_address, addr4);
		} else { /* type is PROBE_RTT */
			IN6_V4MAPPED_TO_INADDR(
			    &pii->pii_rtt_target_next->tg_address, addr4);
		}

		/*
		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
		 */
		probe_pkt.pr_icmp_cksum =
		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
	}

	/*
	 * Use the current time as the time we sent. Not atomic, but the best
	 * we can do from here.
	 */
	sent_hrtime = gethrtime();
	(void) gettimeofday(&sent_tv, NULL);
	rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
	    (struct sockaddr *)&targ, targaddrlen);

	/*
	 * If the send would block, this may either be transient or a hang in a
	 * lower layer. We pretend the probe was actually sent, the daemon will
	 * not see a reply to the probe and will fail the interface if normal
	 * failure detection criteria are met.
	 */
	if (rval == sizeof (probe_pkt) ||
	    (rval == -1 && errno == EWOULDBLOCK)) {
		sent = _B_TRUE;
	} else {
		logperror_pii(pii, "probe: probe sendto");
	}

	/*
	 * If this is a PROBE_UNI probe packet being unicast to a target, then
	 * update our tables. We will need this info in processing the probe
	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
	 * the purpose of failure or recovery detection. PROBE_MULTI packets
	 * are only used to construct a list of targets. PROBE_RTT packets are
	 * used only for updating the rtt and not for failure detection.
	 */
	if (probe_type == PROBE_UNI && sent) {
		pr_ndx = pii->pii_probe_next;
		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);

		/* Collect statistics, before we reuse the last slot. */
		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
			pii->pii_cum_stats.lost++;
		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
			pii->pii_cum_stats.acked++;
		pii->pii_cum_stats.sent++;

		pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
		pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
		pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
		pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
		probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);

		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
		pii->pii_target_next = target_next(pii->pii_target_next);
		assert(pii->pii_target_next != NULL);

		/*
		 * If we have a single variable to denote the next target to
		 * probe for both rtt probes and failure detection probes, we
		 * could end up with a situation where the failure detection
		 * probe targets become disjoint from the rtt probe targets.
		 * Eg. if 2 targets and the actual fdt is double the user
		 * specified fdt. So we have 2 variables. In this scheme
		 * we also reset pii_rtt_target_next for every fdt probe,
		 * though that may not be necessary.
		 */
		pii->pii_rtt_target_next = pii->pii_target_next;
		pii->pii_snxt++;
	} else if (probe_type == PROBE_RTT) {
		pii->pii_rtt_target_next =
		    target_next(pii->pii_rtt_target_next);
		assert(pii->pii_rtt_target_next != NULL);
	}
}
/*
* Incoming IPv4 data from wire, is received here. Called from main.
*/
void
in_data(struct phyint_instance *pii)
{
struct sockaddr_in from;
struct in6_addr fromaddr;
static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
struct ip *ip;
int iphlen;
int len;
char abuf[INET_ADDRSTRLEN];
struct msghdr msg;
struct iovec iov;
struct pr_icmp *reply;
struct timeval *recv_tvp;
if (debug & D_PROBE) {
logdebug("in_data(%s %s)\n",
AF_STR(pii->pii_af), pii->pii_name);
}
iov.iov_base = (char *)in_packet;
iov.iov_len = sizeof (in_packet);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_name = (struct sockaddr *)&from;
msg.msg_namelen = sizeof (from);
msg.msg_control = ancillary_data;
msg.msg_controllen = sizeof (ancillary_data);
/*
* Poll has already told us that a message is waiting,
* on this socket. Read it now. We should not block.
*/
if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
logperror_pii(pii, "in_data: recvmsg");
return;
}
/*
* If the datalink has indicated the link is down, don't go
* any further.
*/
if (LINK_DOWN(pii->pii_phyint))
return;
/* Get the printable address for error reporting */
(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
/* Ignore packets > 64k or control buffers that don't fit */
if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
if (debug & D_PKTBAD) {
logdebug("Truncated message: msg_flags 0x%x from %s\n",
msg.msg_flags, abuf);
}
return;
}
/* Make sure packet contains at least minimum ICMP header */
ip = (struct ip *)in_packet;
iphlen = ip->ip_hl << 2;
if (len < iphlen + ICMP_MINLEN) {
if (debug & D_PKTBAD) {
logdebug("in_data: packet too short (%d bytes)"
" from %s\n", len, abuf);
}
return;
}
/*
* Subtract the IP hdr length, 'len' will be length of the probe
* reply, starting from the icmp hdr.
*/
len -= iphlen;
/* LINTED */
reply = (struct pr_icmp *)((char *)in_packet + iphlen);
/* Probe replies are icmp echo replies. Ignore anything else */
if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
return;
/*
* The icmp id should match what we sent, which is stored
* in pi_icmpid. The icmp code for reply must be 0.
* The reply content must be a struct pr_icmp
*/
if (reply->pr_icmp_id != pii->pii_icmpid) {
/* Not in response to our probe */
return;
}
if (reply->pr_icmp_code != 0) {
logtrace("probe reply code %d from %s on %s\n",
reply->pr_icmp_code, abuf, pii->pii_name);
return;
}
if (len < sizeof (struct pr_icmp)) {
logtrace("probe reply too short: %d bytes from %s on %s\n",
len, abuf, pii->pii_name);
return;
}
recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
if (recv_tvp == NULL) {
logtrace("message without timestamp from %s on %s\n",
abuf, pii->pii_name);
return;
}
IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
/* Unicast probe reply */
incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
/* Multicast reply */
incoming_mcast_reply(pii, reply, fromaddr);
} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
incoming_rtt_reply(pii, reply, fromaddr);
} else {
/* Probably not in response to our probe */
logtrace("probe reply type: %d from %s on %s\n",
reply->pr_icmp_mtype, abuf, pii->pii_name);
return;
}
}
/*
 * Incoming IPv6 data from wire is received here. Called from main.
 *
 * Reads one message from pii's probe socket and validates it: at least a
 * minimum ICMP header, not truncated, an ICMPv6 echo reply carrying our
 * icmp id, a link-local source, no routing header, code 0, at least
 * sizeof (struct pr_icmp) bytes, and a receive time stamp in the ancillary
 * data. A valid reply is handed to the handler that matches the probe type
 * stored in the packet; anything else is dropped.
 */
void
in6_data(struct phyint_instance *pii)
{
	struct sockaddr_in6 from;
	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
	int len;
	char abuf[INET6_ADDRSTRLEN];
	struct msghdr msg;
	struct iovec iov;
	void *opt;
	struct pr_icmp *reply;
	struct timeval *recv_tvp;

	if (debug & D_PROBE) {
		logdebug("in6_data(%s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name);
	}

	iov.iov_base = (char *)in_packet;
	iov.iov_len = sizeof (in_packet);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_name = (struct sockaddr *)&from;
	msg.msg_namelen = sizeof (from);
	msg.msg_control = ancillary_data;
	msg.msg_controllen = sizeof (ancillary_data);

	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
		logperror_pii(pii, "in6_data: recvmsg");
		return;
	}

	/*
	 * If the datalink has indicated that the link is down, don't go
	 * any further.
	 */
	if (LINK_DOWN(pii->pii_phyint))
		return;

	/* Get the printable address for error reporting */
	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));

	/*
	 * Make sure packet contains at least minimum ICMP header. The
	 * message reports the short length (as in_data() does) rather
	 * than claiming the message was truncated.
	 */
	if (len < ICMP_MINLEN) {
		if (debug & D_PKTBAD) {
			logdebug("in6_data: packet too short (%d bytes)"
			    " from %s\n", len, abuf);
		}
		return;
	}

	/* Ignore packets > 64k or control buffers that don't fit */
	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
		if (debug & D_PKTBAD) {
			logdebug("Truncated message: msg_flags 0x%x from %s\n",
			    msg.msg_flags, abuf);
		}
		return;
	}

	/* No IP header to skip here: the data starts at the ICMPv6 header */
	reply = (struct pr_icmp *)in_packet;
	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
		return;

	if (reply->pr_icmp_id != pii->pii_icmpid) {
		/* Not in response to our probe */
		return;
	}

	/*
	 * The kernel has already verified the ICMP checksum.
	 */
	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
		logtrace("ICMPv6 echo reply source address not linklocal from "
		    "%s on %s\n", abuf, pii->pii_name);
		return;
	}

	opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
	if (opt != NULL) {
		/* Can't allow routing headers in probe replies */
		logtrace("message with routing header from %s on %s\n",
		    abuf, pii->pii_name);
		return;
	}

	if (reply->pr_icmp_code != 0) {
		logtrace("probe reply code: %d from %s on %s\n",
		    reply->pr_icmp_code, abuf, pii->pii_name);
		return;
	}

	/* 'len' is known to be non-negative here */
	if ((size_t)len < sizeof (struct pr_icmp)) {
		logtrace("probe reply too short: %d bytes from %s on %s\n",
		    len, abuf, pii->pii_name);
		return;
	}

	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
	if (recv_tvp == NULL) {
		logtrace("message without timestamp from %s on %s\n",
		    abuf, pii->pii_name);
		return;
	}

	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
		incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
		incoming_mcast_reply(pii, reply, from.sin6_addr);
	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
		incoming_rtt_reply(pii, reply, from.sin6_addr);
	} else {
		/*
		 * Probably not in response to our probe. Report the type
		 * in host byte order so it can be compared with PROBE_*.
		 */
		logtrace("probe reply type: %u from %s on %s\n",
		    ntohl(reply->pr_icmp_mtype), abuf, pii->pii_name);
	}
}
/*
* Process the incoming rtt reply, in response to our rtt probe.
* Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
* have any stored information about the probe we sent. So we don't log
* any errors if we receive bad replies.
*/
static void
incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
struct in6_addr fromaddr)
{
int64_t m; /* rtt measurement in ns */
char abuf[INET6_ADDRSTRLEN];
struct target *target;
struct phyint_group *pg;
/* Get the printable address for error reporting */
(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
if (debug & D_PROBE) {
logdebug("incoming_rtt_reply: %s %s %s\n",
AF_STR(pii->pii_af), pii->pii_name, abuf);
}
/* Do we know this target ? */
target = target_lookup(pii, fromaddr);
if (target == NULL)
return;
m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
/* Invalid rtt. It has wrapped around */
if (m < 0)
return;
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
* for ARP/NDP resolution on a failed interface.
*/
pg = pii->pii_phyint->pi_group;
if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
return;
/*
* Update rtt only if the new rtt is lower than the current rtt.
* (specified by the 3rd parameter to pi_set_crtt).
* If a spike has caused the current probe_interval to be >
* user_probe_interval, then this mechanism is used to bring down
* the rtt rapidly once the network stress is removed.
* If the new rtt is higher than the current rtt, we don't want to
* update the rtt. We are having more than 1 outstanding probe and
* the increase in rtt we are seeing is being unnecessarily weighted
* many times. The regular rtt update will be handled by
* incoming_echo_reply() and will take care of any rtt increase.
*/
pi_set_crtt(target, m, _B_FALSE);
if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
(user_failure_detection_time < pg->pg_fdt) &&
(last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
/*
* If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
user_failure_detection_time);
pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
if (pii->pii_phyint->pi_group != phyint_anongroup) {
logerr("Improved failure detection time %d ms "
"on (%s %s) for group \"%s\"\n",
pg->pg_fdt, AF_STR(pii->pii_af),
pii->pii_name,
pii->pii_phyint->pi_group->pg_name);
}
if (user_failure_detection_time == pg->pg_fdt) {
/* Avoid any truncation or rounding errors */
pg->pg_probeint = user_probe_interval;
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
reset_snxt_basetimes();
}
}
}
}
/*
* Process the incoming echo reply, in response to our unicast probe.
* Common for both IPv4 and IPv6
*
* pii - phyint instance on whose probe socket the reply arrived
* reply - the received probe packet; multi-byte fields are still in
* network byte order
* fromaddr - source address of the reply (in_data() passes IPv4 sources
* in v4-mapped form)
* recv_tvp - receive time stamp the caller took from the SCM_TIMESTAMP
* ancillary data
*
* The reply is matched against the pii_probes[] slot that belongs to its
* sequence number. Out of window, wrong-source, duplicate and otherwise
* unmatched replies are counted in pii_cum_stats.unknown and dropped.
* For an accepted reply the target's crtt may be updated, which in turn
* may retire an exceptionally slow target or raise or lower the group's
* failure detection time. In all accepted cases the probe slot is marked
* PR_ACKED and pii_rack is brought up to date.
*/
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
struct in6_addr fromaddr, struct timeval *recv_tvp)
{
int64_t m; /* rtt measurement in ns */
hrtime_t cur_hrtime; /* in ns from some arbitrary point */
char abuf[INET6_ADDRSTRLEN];
int pr_ndx;
struct target *target;
boolean_t exception;
uint64_t pr_icmp_timestamp;
uint16_t pr_icmp_seq;
struct probe_stats *pr_statp;
struct phyint_group *pg = pii->pii_phyint->pi_group;
/* Get the printable address for error reporting */
(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
if (debug & D_PROBE) {
logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
AF_STR(pii->pii_af), pii->pii_name, abuf,
ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
}
/* Convert the fields we do arithmetic on to host byte order */
pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
pr_icmp_seq = ntohs(reply->pr_icmp_seq);
/* Reject out of window probe replies */
if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
logtrace("out of window probe seq %u snxt %u on %s from %s\n",
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
pii->pii_cum_stats.unknown++;
return;
}
cur_hrtime = gethrtime();
m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
if (m < 0) {
/*
* This is a ridiculously high value of rtt. rtt has wrapped
* around. Log a message, and ignore the rtt. (Processing of
* the reply continues; the rtt itself is skipped further down.)
*/
logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
"reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
}
/*
* Get the probe index pr_ndx corresponding to the received icmp seq.
* number in our pii->pii_probes[] array. The icmp sequence number
* pii_snxt corresponds to the probe index pii->pii_probe_next
*/
pr_ndx = MOD_SUB(pii->pii_probe_next,
(uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
target = pii->pii_probes[pr_ndx].pr_target;
/*
* Perform sanity checks, whether this probe reply that we
* have received is genuine
*/
if (target != NULL) {
/*
* Compare the src. addr of the received ICMP or ICMPv6
* probe reply with the target address in our tables.
*/
if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
/*
* We don't have any record of having sent a probe to
* this target. This is a fake probe reply. Log an error
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
pii->pii_probes[pr_ndx].pr_status,
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
pii->pii_cum_stats.unknown++;
return;
} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
/*
* The address matches, but our tables indicate that
* this probe reply has been acked already. So this
* is a duplicate probe reply. Log an error
*/
logtrace("probe status %d Duplicate probe reply seq %u "
"snxt %u on %s from %s\n",
pii->pii_probes[pr_ndx].pr_status,
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
pii->pii_cum_stats.unknown++;
return;
}
} else {
/*
* Target must not be NULL in the PR_UNACKED state
*/
assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
/*
* The probe stats slot is unused. So we didn't
* send out any probe to this target. This is a fake.
* Log an error.
*/
logtrace("probe status %d Fake probe reply seq %u "
"snxt %u on %s from %s\n",
pii->pii_probes[pr_ndx].pr_status,
pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
}
/* No target recorded for this slot: count the reply and drop it */
pii->pii_cum_stats.unknown++;
return;
}
/*
* If the rtt does not appear to be right, don't update the
* rtt stats. This can happen if the system dropped into the
* debugger, or the system was hung or too busy for a
* substantial time that we didn't get a chance to run.
*/
if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
/*
* If the probe corresponding to this received response
* was truly sent 'm' ns. ago, then this response must
* have been rejected by the sequence number checks. The
* fact that it has passed the sequence number checks
* means that the measured rtt is wrong. We were probably
* scheduled long after the packet was received.
*/
goto out;
}
/*
* Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
* The initial few responses after the interface is repaired may
* contain high rtt's because they could have been queued up waiting
* for ARP/NDP resolution on a failed interface.
*/
if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
goto out;
/*
* Don't update the Conservative Round Trip Time estimate for this
* (phyint, target) pair if this is not the highest ack seq seen
* thus far on this target.
*/
if (!highest_ack_tg(pr_icmp_seq, target))
goto out;
/*
* Always update the rtt. This is a failure detection probe
* and we want to measure both increase / decrease in rtt.
*/
pi_set_crtt(target, m, _B_TRUE);
/*
* If the crtt exceeds the average time between probes,
* investigate if this slow target is an exception. If so we
* can avoid this target and still meet the failure detection
* time. Otherwise we can't meet the failure detection time.
*/
if (target->tg_crtt > pg->pg_probeint) {
exception = check_exception_target(pii, target);
if (exception) {
/*
* This target is exceptionally slow. Don't use it
* for future probes. check_exception_target() has
* made sure that we have at least MIN_PROBE_TARGETS
* other active targets
*/
if (pii->pii_targets_are_routers) {
/*
* This is a slow router, mark it as slow
* and don't use it for further probes. We
* don't delete it, since it will be populated
* again when we do a router scan. Hence we
* need to maintain extra state (unlike the
* host case below). Mark it as TG_SLOW.
*/
if (target->tg_status == TG_ACTIVE)
pii->pii_ntargets--;
target->tg_status = TG_SLOW;
target->tg_latime = gethrtime();
target->tg_rtt_sa = -1;
target->tg_crtt = 0;
target->tg_rtt_sd = 0;
if (pii->pii_target_next == target) {
pii->pii_target_next =
target_next(target);
}
} else {
/*
* the slow target is not a router, we can
* just delete it. Send an icmp multicast and
* pick the fastest responder that is not
* already an active target. target_delete()
* adjusts pii->pii_target_next
*/
target_delete(target);
probe(pii, PROBE_MULTI, cur_hrtime);
}
} else {
/*
* We can't meet the failure detection time.
* Log a message, and update the detection time to
* whatever we can achieve.
*/
pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
last_fdt_bumpup_time = gethrtime();
if (pg != phyint_anongroup) {
logtrace("Cannot meet requested failure"
" detection time of %d ms on (%s %s) new"
" failure detection time for group \"%s\""
" is %d ms\n", user_failure_detection_time,
AF_STR(pii->pii_af), pii->pii_name,
pg->pg_name, pg->pg_fdt);
}
}
} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
(user_failure_detection_time < pg->pg_fdt) &&
(last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
/*
* If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
* investigate if we can improve the failure detection time to
* meet whatever the user specified.
*/
if (check_pg_crtt_improved(pg)) {
pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
user_failure_detection_time);
pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
if (pg != phyint_anongroup) {
logtrace("Improved failure detection time %d ms"
" on (%s %s) for group \"%s\"\n",
pg->pg_fdt, AF_STR(pii->pii_af),
pii->pii_name, pg->pg_name);
}
if (user_failure_detection_time == pg->pg_fdt) {
/* Avoid any truncation or rounding errors */
pg->pg_probeint = user_probe_interval;
/*
* No more rtt probes will be sent. The actual
* fdt has dropped to the user specified value.
* pii_fd_snxt_basetime and pii_snxt_basetime
* will be in sync henceforth.
*/
reset_snxt_basetimes();
}
}
}
/*
* Common exit for every accepted reply, whether or not its rtt sample
* was used: record when the ack was processed and received, and mark
* the probe slot as acked. The receive time is expressed on the hrtime
* scale by adding the wall-clock delta between the send and receive
* time stamps to the hrtime recorded at send time.
*/
out:
pr_statp = &pii->pii_probes[pr_ndx];
pr_statp->pr_hrtime_ackproc = cur_hrtime;
pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
(tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
probe_chstate(pr_statp, pii, PR_ACKED);
/*
* Update pii->pii_rack, i.e. the sequence number of the last received
* probe response, based on the echo reply we have received now, if
* either of the following conditions are satisfied.
* a. pii_rack is outside the current receive window of
* [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
* This means we have not received probe responses for a
* long time, and the sequence number has wrapped around.
* b. pii_rack is within the current receive window and this echo
* reply corresponds to the highest sequence number we have seen
* so far.
*/
if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
pii->pii_rack = pr_icmp_seq;
}
}
/*
 * Decide whether a reply carrying sequence number 'seq' is the newest
 * acknowledgement seen so far for target 'tg'.
 *
 * The probe stats array of tg's phyint instance is walked backwards,
 * starting at the most recently sent probe. The slot at pii_probe_next
 * itself is where the walk stops, so it is not examined. Returns _B_FALSE
 * as soon as an already acked probe to tg with a sequence number greater
 * than 'seq' is found, and _B_TRUE if there is none.
 */
static boolean_t
highest_ack_tg(uint16_t seq, struct target *tg)
{
	struct phyint_instance *inst = tg->tg_phyint_inst;
	/* Slot and sequence number of the most recent probe sent so far */
	int slot = PROBE_INDEX_PREV(inst->pii_probe_next);
	uint16_t slot_seq = (uint16_t)(inst->pii_snxt - 1);

	while (slot != inst->pii_probe_next) {
		if (inst->pii_probes[slot].pr_status == PR_ACKED &&
		    inst->pii_probes[slot].pr_target == tg &&
		    SEQ_GT(slot_seq, seq)) {
			/* A later probe to tg has been acked already */
			return (_B_FALSE);
		}
		slot = PROBE_INDEX_PREV(slot);
		slot_seq--;
	}

	return (_B_TRUE);
}
/*
 * Report whether the crtt of group `pg' has improved by a factor of
 * LOWER_FDT_TRIGGER.  Smaller improvements are deliberately ignored so that
 * the failure detection time does not flap when the crtt changes slightly.
 *
 * Returns _B_TRUE only if check_pii_crtt_improved() holds for both the IPv4
 * and the IPv6 instance of every phyint in the group.
 */
static boolean_t
check_pg_crtt_improved(struct phyint_group *pg)
{
	struct phyint *member;

	if (debug & D_PROBE)
		logdebug("check_pg_crtt_improved()\n");

	member = pg->pg_phyint;
	while (member != NULL) {
		/* One instance that has not improved vetoes the whole group. */
		if (!check_pii_crtt_improved(member->pi_v4))
			return (_B_FALSE);
		if (!check_pii_crtt_improved(member->pi_v6))
			return (_B_FALSE);
		member = member->pi_pgnext;
	}
	return (_B_TRUE);
}
/*
 * Report whether the crtt has improved substantially on this
 * phyint_instance, meaning that no active target has a crtt above the
 * group's probe interval divided by LOWER_FDT_TRIGGER.
 *
 * Returns _B_TRUE as well when there is no crtt information to consult:
 * `pii' is NULL, the instance is not probe capable, or its phyint is in
 * the PI_FAILED state.
 */
boolean_t
check_pii_crtt_improved(struct phyint_instance *pii)
{
	struct target *cur;

	if (pii == NULL)
		return (_B_TRUE);
	if (!PROBE_CAPABLE(pii))
		return (_B_TRUE);
	if (pii->pii_phyint->pi_state == PI_FAILED)
		return (_B_TRUE);

	for (cur = pii->pii_targets; cur != NULL; cur = cur->tg_next) {
		/* Only active targets take part in the comparison. */
		if (cur->tg_status == TG_ACTIVE &&
		    cur->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
		    LOWER_FDT_TRIGGER)) {
			return (_B_FALSE);
		}
	}
	return (_B_TRUE);
}
/*
 * `target' responds very slowly to probes: its crtt exceeds the probe
 * interval of its group.  Compare it against the other targets of `pii' to
 * decide whether it is an exception.  Returns _B_TRUE if only this target
 * is slow, _B_FALSE otherwise.
 */
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
	struct target *other;
	char addrbuf[INET6_ADDRSTRLEN];

	if (debug & D_PROBE) {
		logdebug("check_exception_target(%s %s target %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, target->tg_address,
		    addrbuf, sizeof (addrbuf)));
	}

	/*
	 * A sound judgement needs at least MIN_PROBE_TARGETS + 1 targets.
	 * With fewer than that, never single this one out.
	 */
	if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
		return (_B_FALSE);

	/*
	 * The target counts as an exception only if every other active
	 * target has a crtt of at most
	 * (the group's probe interval) / EXCEPTION_FACTOR.
	 */
	for (other = pii->pii_targets; other != NULL; other = other->tg_next) {
		if (other == target || other->tg_status != TG_ACTIVE)
			continue;
		if (other->tg_crtt >
		    pii->pii_phyint->pi_group->pg_probeint /
		    EXCEPTION_FACTOR) {
			return (_B_FALSE);
		}
	}
	return (_B_TRUE);
}
/*
 * Update the target list.  A reply to the icmp all hosts multicast has
 * given us a host, `fromaddr', to which we could send probes.  The address
 * is discarded if we already have sufficient targets, if it is unspecified,
 * or if it is one of our own addresses.  `reply' is not examined.
 */
static void
incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr)
/* ARGSUSED */
{
	char addrstr[INET6_ADDRSTRLEN];
	struct phyint *pi;
	int af;

	if (debug & D_PROBE) {
		logdebug("incoming_mcast_reply(%s %s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, fromaddr, addrstr, sizeof (addrstr)));
	}

	/*
	 * Host targets are only a fallback mechanism.  Once a router has
	 * been found, or MAX_PROBE_TARGETS targets are already known, no
	 * further host target is added.
	 */
	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
	if (pii->pii_targets != NULL &&
	    (pii->pii_targets_are_routers ||
	    (pii->pii_ntargets == MAX_PROBE_TARGETS))) {
		return;
	}

	/* Guard against a response from 0.0.0.0 or ::, leaving a trace. */
	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
		logtrace("probe response from %s on %s\n",
		    pr_addr(pii->pii_af, fromaddr, addrstr, sizeof (addrstr)),
		    pii->pii_name);
		return;
	}

	/* One of our own addresses is never a valid probe target. */
	if (own_address(fromaddr))
		return;

	/*
	 * In the anonymous group the address is added to this instance
	 * only, since the other phyints there may not be on the same
	 * subnet.  In a named group it is added to the same-family instance
	 * of every member.
	 */
	af = pii->pii_af;
	pi = pii->pii_phyint;
	if (pi->pi_group == phyint_anongroup) {
		target_add(pii, fromaddr, _B_FALSE);
		return;
	}
	for (pi = pi->pi_group->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
		target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
}
/*
 * Compute CRTT given an existing scaled average, scaled deviation estimate
 * and a new rtt time.  The formula is from Jacobson and Karels'
 * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
 * are the same as those in Appendix A.2 of that paper.
 *
 * m = new measurement
 * sa = scaled RTT average (8 * average estimates)
 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
 * crtt = Conservative round trip time.  Used to determine whether probe
 *	has timed out.
 *
 * On entry *sap is either -1, meaning that no measurement has been folded
 * in yet, or a non-negative scaled average; *svp must be non-negative.
 * The new scaled average and deviation are passed back via sap and svp,
 * and the new crtt (in the same unit as m) is returned.
 */
static int64_t
compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
{
	int64_t sa = *sap;
	int64_t sv = *svp;
	int64_t crtt;
	int64_t saved_m = m;	/* m is clobbered below; keep it for tracing */

	assert(*sap >= -1);
	assert(*svp >= 0);

	if (sa != -1) {
		/*
		 * Update average estimator:
		 *	new rtt = old rtt + 1/8 Error
		 *		where Error = m - old rtt
		 *	i.e. 8 * new rtt = 8 * old rtt + Error
		 *	i.e. new sa = old sa + Error
		 */
		m -= sa >> 3;	/* m is now Error in estimate. */
		if ((sa += m) < 0) {
			/* Don't allow the smoothed average to be negative. */
			sa = 0;
		}

		/*
		 * Update deviation estimator:
		 *	new mdev = old mdev + 1/4 (abs(Error) - old mdev)
		 *	i.e. 4 * new mdev = 4 * old mdev +
		 *		(abs(Error) - old mdev)
		 *	i.e. new sv = old sv + (abs(Error) - old mdev)
		 */
		if (m < 0)
			m = -m;
		m -= sv >> 2;
		sv += m;
	} else {
		/* Initialization. This is the first response received. */
		sa = (m << 3);
		sv = (m << 1);
	}

	/*
	 * CRTT = average estimates + 4 * deviation estimates
	 *	= sa / 8 + sv
	 */
	crtt = (sa >> 3) + sv;

	/*
	 * This is a trace that only appears when probe debugging is
	 * enabled, so it goes through logdebug() like the other traces
	 * gated by the debug flags, rather than through logerr().
	 */
	if (debug & D_PROBE) {
		logdebug("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
		    "crtt = %lld\n", saved_m, sa, sv, crtt);
	}

	*sap = sa;
	*svp = sv;

	return (crtt);
}
/*
 * Fold the new round trip time `m' into the crtt estimate of target `tg'.
 * `is_probe_uni' tells whether the measurement comes from a normal
 * (failure detection) probe rather than from an RTT-only probe.
 *
 * A trial crtt is first computed on copies of the target's estimators.  If
 * a normal probe would push the crtt above the group's probe interval, the
 * measurement is treated as suspect and merely queued in tg_deferred[].
 * The queue is integrated into the real estimators only once
 * MAXDEFERREDRTT suspect measurements were already queued.
 */
static void
pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
{
	struct phyint_instance *pii = tg->tg_phyint_inst;
	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
	int64_t trial_sa = tg->tg_rtt_sa;
	int64_t trial_sd = tg->tg_rtt_sd;
	int trial_crtt;
	int n;

	if (debug & D_PROBE)
		logdebug("pi_set_crtt: target - m %lld\n", m);

	/* store the round trip time, in case we need to defer computation */
	tg->tg_deferred[tg->tg_num_deferred] = m;

	trial_crtt = ns2ms(compute_crtt(&trial_sa, &trial_sd, m));

	if (trial_crtt <= probe_interval || !is_probe_uni) {
		/*
		 * Not suspect.  A normal probe always updates the CRTT
		 * data; an RTT probe does so only when it lowers the CRTT.
		 * A normal probe also discards any deferred measurements,
		 * since probes are again answered within our estimates.
		 */
		if (is_probe_uni || trial_crtt < tg->tg_crtt) {
			tg->tg_rtt_sa = trial_sa;
			tg->tg_rtt_sd = trial_sd;
			tg->tg_crtt = trial_crtt;
			if (is_probe_uni)
				tg->tg_num_deferred = 0;
		}
		return;
	}

	/*
	 * This probe's round trip time would singlehandedly cause an
	 * increase in the group's probe interval, so consider it suspect.
	 */
	if (debug & D_PROBE) {
		logdebug("Received a suspect probe on %s, new_crtt ="
		    " %d, probe_interval = %d, num_deferred = %d\n",
		    pii->pii_probe_logint->li_name, trial_crtt,
		    probe_interval, tg->tg_num_deferred);
	}

	if (tg->tg_num_deferred != MAXDEFERREDRTT) {
		/* Still room to defer: just remember this measurement. */
		tg->tg_num_deferred++;
		return;
	}

	/*
	 * We have deferred as many rtts as we plan on deferring, so assume
	 * the link really did slow down and process all queued rtts,
	 * including the one stored above.
	 */
	if (debug & D_PROBE) {
		logdebug("Received MAXDEFERREDRTT probes which "
		    "would cause an increased probe_interval. "
		    "Integrating queued rtt data points.\n");
	}
	for (n = 0; n <= tg->tg_num_deferred; n++) {
		tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
		    &tg->tg_rtt_sd, tg->tg_deferred[n]));
	}
	tg->tg_num_deferred = 0;
}
/*
 * Search the ancillary data of `msg' for the first control message whose
 * level and type equal `cmsg_level' and `cmsg_type'.  Returns a pointer to
 * the data of that control message, or NULL if there is none.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == cmsg_level &&
		    hdr->cmsg_type == cmsg_type) {
			return (CMSG_DATA(hdr));
		}
		hdr = CMSG_NXTHDR(msg, hdr);
	}
	return (NULL);
}
/*
 * Try to activate another INACTIVE interface in the same group as `pi'.
 * A functioning interface that is both STANDBY and INACTIVE is preferred
 * to one that is just INACTIVE.  Nothing is done for the anonymous group.
 */
void
phyint_activate_another(struct phyint *pi)
{
	struct phyint *cand;
	struct phyint *chosen = NULL;

	if (pi->pi_group == phyint_anongroup)
		return;

	for (cand = pi->pi_group->pg_phyint; cand != NULL;
	    cand = cand->pi_pgnext) {
		if (cand != pi && phyint_is_functioning(cand) &&
		    (cand->pi_flags & IFF_INACTIVE)) {
			chosen = cand;
			/* A STANDBY candidate ends the search at once. */
			if (cand->pi_flags & IFF_STANDBY)
				break;
		}
	}

	if (chosen == NULL)
		return;
	(void) change_pif_flags(chosen, 0, IFF_INACTIVE);
}
/*
 * Transition a phyint to PI_RUNNING.  The caller must ensure that the
 * transition is appropriate.  Clears IFF_OFFLINE (if the phyint was
 * PI_OFFLINE) or IFF_FAILED (otherwise).  Also adjusts IFF_INACTIVE on this
 * or another interface as described below, and finally refreshes the state
 * of the phyint's group to account for the change.
 */
void
phyint_transition_to_running(struct phyint *pi)
{
	struct phyint *member;
	struct phyint *active_standby = NULL;
	uint_t active_cnt = 0, nonstandby_cnt = 0;
	boolean_t onlining = (pi->pi_state == PI_OFFLINE);
	boolean_t initial = (pi->pi_state == PI_INIT);
	uint64_t set = 0;
	uint64_t clear;

	/*
	 * The interface is running again, but should it or another interface
	 * in the group end up INACTIVE?  There are three cases:
	 *
	 * 1. If it's a STANDBY interface, it should end up INACTIVE if
	 *    the group is operating at capacity (i.e., there are at least as
	 *    many active interfaces as non-STANDBY interfaces in the group).
	 *    No other interfaces should be changed.
	 *
	 * 2. If it's a non-STANDBY interface and we're onlining it or
	 *    FAILBACK is enabled, then it should *not* end up INACTIVE.
	 *    Further, if the group is above capacity as a result of this
	 *    interface, then an active STANDBY interface in the group should
	 *    end up INACTIVE.
	 *
	 * 3. If it's a non-STANDBY interface, we're repairing it, and
	 *    FAILBACK is disabled, then it should end up INACTIVE *unless*
	 *    the group was failed (in which case we have no choice but to
	 *    use it).  No other interfaces should be changed.
	 */

	/*
	 * Take a census of the group.  For the anonymous group both
	 * counters stay at zero.
	 */
	if (pi->pi_group != phyint_anongroup) {
		for (member = pi->pi_group->pg_phyint; member != NULL;
		    member = member->pi_pgnext) {
			if (!(member->pi_flags & IFF_STANDBY))
				nonstandby_cnt++;

			if (!phyint_is_functioning(member) ||
			    (member->pi_flags & IFF_INACTIVE))
				continue;

			active_cnt++;
			if (member->pi_flags & IFF_STANDBY)
				active_standby = member;
		}
	}

	if (onlining)
		clear = IFF_OFFLINE;
	else
		clear = IFF_FAILED;

	if (pi->pi_flags & IFF_STANDBY) {			/* case 1 */
		if (active_cnt >= nonstandby_cnt)
			set |= IFF_INACTIVE;
		else
			clear |= IFF_INACTIVE;
	} else if (onlining || failback_enabled) {		/* case 2 */
		if (active_cnt >= nonstandby_cnt && active_standby != NULL) {
			(void) change_pif_flags(active_standby, IFF_INACTIVE,
			    0);
		}
	} else if (!initial && !GROUP_FAILED(pi->pi_group)) {	/* case 3 */
		set |= IFF_INACTIVE;
	}
	(void) change_pif_flags(pi, set, clear);

	phyint_chstate(pi, PI_RUNNING);

	/* Update the group state to account for the change. */
	phyint_group_refresh_state(pi->pi_group);
}
/*
 * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
 * towards having at least one active interface and as many active
 * interfaces as non-standby interfaces.
 */
void
phyint_standby_refresh_inactive(struct phyint *pi)
{
	struct phyint *member;
	uint_t active_cnt = 0, nonstandby_cnt = 0;

	/*
	 * All phyints in the anonymous group are effectively in their own
	 * group and thus active regardless of whether they're marked standby.
	 */
	if (pi->pi_group == phyint_anongroup) {
		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
		return;
	}

	/* A phyint that isn't functioning can't be considered. */
	if (!phyint_is_functioning(pi))
		return;

	member = pi->pi_group->pg_phyint;
	while (member != NULL) {
		if (!(member->pi_flags & IFF_STANDBY))
			nonstandby_cnt++;
		if (phyint_is_functioning(member) &&
		    !(member->pi_flags & IFF_INACTIVE))
			active_cnt++;
		member = member->pi_pgnext;
	}

	/*
	 * Above capacity: deactivate `pi'.  Below capacity, or nothing
	 * active at all: activate it.  Exactly at capacity with at least
	 * one active interface: leave the flag alone.
	 */
	if (active_cnt > nonstandby_cnt)
		(void) change_pif_flags(pi, IFF_INACTIVE, 0);
	else if (active_cnt == 0 || active_cnt < nonstandby_cnt)
		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
}
/*
 * See if a previously failed interface has started working again and, if
 * phyint_repaired() says so, log the repair and act on it.
 */
void
phyint_check_for_repair(struct phyint *pi)
{
	if (!phyint_repaired(pi))
		return;

	if (pi->pi_group != phyint_anongroup) {
		logerr("IP interface repair detected on %s of group %s\n",
		    pi->pi_name, pi->pi_group->pg_name);
	} else {
		logerr("IP interface repair detected on %s\n", pi->pi_name);
	}

	if (pi->pi_state != PI_OFFLINE) {
		phyint_transition_to_running(pi); /* calls phyint_chstate() */
		return;
	}

	/*
	 * The interface is PI_OFFLINE, so it can't be made PI_RUNNING yet.
	 * Just clear IFF_FAILED and defer phyint_transition_to_running()
	 * until it is brought back online.
	 */
	(void) change_pif_flags(pi, 0, IFF_FAILED);
}
/*
* See if an interface has failed, or if the whole group of interfaces has
* failed.
*/
static void
phyint_inst_check_for_failure(struct phyint_instance *pii)
{
struct phyint *pi = pii->pii_phyint;
struct phyint *pi2;
boolean_t was_active;
switch (failure_state(pii)) {
case PHYINT_FAILURE:
was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
if (pi->pi_group == phyint_anongroup) {
logerr("IP interface failure detected on %s\n",
pii->pii_name);
} else {
logerr("IP interface failure detected on %s of group"
" %s\n", pii->pii_name, pi->pi_group->pg_name);
}
/*
* If the failed interface was active, activate another
* INACTIVE interface in the group if possible.
*/
if (was_active)
phyint_activate_another(pi);
/*
* If the interface is offline, the state change will be
* noted when it comes back online.
*/
if (pi->pi_state != PI_OFFLINE) {
phyint_chstate(pi, PI_FAILED);
reset_crtt_all(pi);
}
break;
case GROUP_FAILURE:
pi2 = pi->pi_group->pg_phyint;
for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
(void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
if (pi2->pi_state == PI_OFFLINE) /* see comment above */
continue;
reset_crtt_all(pi2);
/*
* In the case of host targets, we would have flushed
* the targets, and gone to PI_NOTARGETS state.
*/
if (pi2->pi_state == PI_RUNNING)
phyint_chstate(pi2, PI_FAILED);
}
break;
default:
break;
}
}
/*
 * Determines if any timeout event has occurred and returns the number of
 * milliseconds until the next timeout event for the phyint.  Returns
 * TIMER_INFINITY for "never".
 *
 * Two representations of "now" are used: `cur_hrtime' is the gethrtime()
 * sample and `cur_time' is that sample converted with ns2ms().  The
 * millisecond value is compared against the pii_snxt_* schedule fields,
 * whereas probe() is always handed the hrtime_t value.
 */
uint_t
phyint_inst_timer(struct phyint_instance *pii)
{
	int pr_ndx;
	uint_t timeout;
	struct target *cur_tg;
	struct probe_stats *pr_statp;
	struct phyint_instance *pii_other;
	struct phyint *pi;
	int valid_unack_count;
	int i;
	int interval;
	uint_t check_time;
	uint_t cur_time;
	hrtime_t cur_hrtime;
	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;

	cur_hrtime = gethrtime();
	cur_time = ns2ms(cur_hrtime);

	if (debug & D_TIMER) {
		logdebug("phyint_inst_timer(%s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name);
	}

	pii_other = phyint_inst_other(pii);
	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
		/*
		 * Check to see if we're here due to link up/down flapping; If
		 * enough time has passed, then try to bring the interface
		 * back up; otherwise, schedule a timer to bring it back up
		 * when enough time *has* elapsed.
		 */
		pi = pii->pii_phyint;
		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
			if (check_time > cur_time)
				return (check_time - cur_time);

			phyint_check_for_repair(pi);
		}
	}

	/*
	 * If probing is not enabled on this phyint instance, don't proceed.
	 */
	if (!PROBE_ENABLED(pii))
		return (TIMER_INFINITY);

	/*
	 * If the timer has fired too soon, probably triggered
	 * by some other phyint instance, return the remaining
	 * time
	 */
	if (TIME_LT(cur_time, pii->pii_snxt_time))
		return (pii->pii_snxt_time - cur_time);

	/*
	 * If the link is down, don't send any probes for now.
	 */
	if (LINK_DOWN(pii->pii_phyint))
		return (TIMER_INFINITY);

	/*
	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
	 * Base probe time is strictly periodic.
	 */
	interval = GET_RANDOM(
	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;

	/*
	 * Check if the current time > next time to probe. If so, we missed
	 * sending 1 or more probes, probably due to heavy system load. At least
	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
	 * were scheduled. Make adjustments to the times, in multiples of
	 * user_probe_interval.
	 */
	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
		int n;

		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
		pii->pii_snxt_time += (n + 1) * user_probe_interval;
		pii->pii_snxt_basetime += (n + 1) * user_probe_interval;
		logtrace("missed sending %d probes cur_time %u snxt_time %u"
		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
		    pii->pii_snxt_basetime);

		/* Collect statistics about missed probes */
		probes_missed.pm_nprobes += n + 1;
		probes_missed.pm_ntimes++;
	}
	pii->pii_snxt_basetime += user_probe_interval;
	interval = pii->pii_snxt_time - cur_time;
	if (debug & D_TARGET) {
		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
		    " interval %u\n", cur_time, pii->pii_snxt_time,
		    pii->pii_snxt_basetime, interval);
	}

	/*
	 * If no targets are known, we need to send an ICMP multicast. The
	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
	 * to see if we found a target.
	 */
	if (pii->pii_target_next == NULL) {
		assert(pii->pii_ntargets == 0);
		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
		/*
		 * Like every other probe() call, this one must be given the
		 * hrtime_t sample, not the millisecond value in cur_time.
		 */
		probe(pii, PROBE_MULTI, cur_hrtime);
		return (interval);
	}

	if ((user_probe_interval != probe_interval) &&
	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
		/*
		 * the failure detection (fd) probe timer has not yet fired.
		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
		 */
		probe(pii, PROBE_RTT, cur_hrtime);
		return (interval);
	}

	/*
	 * the fd probe timer has fired. Need to do all failure
	 * detection / recovery calculations, and then send an fd probe
	 * of type PROBE_UNI.
	 */
	if (user_probe_interval == probe_interval) {
		/*
		 * We could have missed some probes, and then adjusted
		 * pii_snxt_basetime above. Otherwise we could have
		 * blindly added probe_interval to pii_fd_snxt_basetime.
		 */
		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
	} else {
		pii->pii_fd_snxt_basetime += probe_interval;
		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
			int n;

			n = (cur_time - pii->pii_fd_snxt_basetime) /
			    probe_interval;
			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
		}
	}

	/*
	 * We can have at most, the latest 2 probes that we sent, in
	 * the PR_UNACKED state. All previous probes sent, are either
	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
	 * timed out if the probe's time_start + the CRTT < currenttime.
	 * For each of the last 2 probes, examine whether it has timed
	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
	 */
	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
	valid_unack_count = 0;

	for (i = 0; i < 2; i++) {
		pr_statp = &pii->pii_probes[pr_ndx];
		cur_tg = pii->pii_probes[pr_ndx].pr_target;
		switch (pr_statp->pr_status) {
		case PR_ACKED:
			/*
			 * We received back an ACK, so the switch clearly
			 * is not dropping our traffic, and thus we can
			 * enable failure detection immediately.
			 */
			if (pii->pii_fd_hrtime > gethrtime()) {
				if (debug & D_PROBE) {
					logdebug("successful probe on %s; "
					    "ending quiet period\n",
					    pii->pii_phyint->pi_name);
				}
				pii->pii_fd_hrtime = gethrtime();
			}
			break;

		case PR_UNACKED:
			assert(cur_tg != NULL);
			/*
			 * The crtt could be zero for some reason,
			 * Eg. the phyint could be failed. If the crtt is
			 * not available use group's probe interval,
			 * which is a worst case estimate.
			 */
			timeout = ns2ms(pr_statp->pr_hrtime_start);
			if (cur_tg->tg_crtt != 0) {
				timeout += cur_tg->tg_crtt;
			} else {
				timeout += probe_interval;
			}
			if (TIME_LT(timeout, cur_time)) {
				pr_statp->pr_time_lost = timeout;
				probe_chstate(pr_statp, pii, PR_LOST);
			} else if (i == 1) {
				/*
				 * We are forced to consider this probe
				 * lost, as we can have at most 2 unack.
				 * probes any time, and we will be sending a
				 * probe at the end of this function.
				 * Normally, we should not be here, but
				 * this can happen if an incoming response
				 * that was considered lost has increased
				 * the crtt for this target, and also bumped
				 * up the FDT. Note that we never cancel or
				 * increase the current pii_time_left, so
				 * when the timer fires, we find 2 valid
				 * unacked probes, and they are yet to timeout
				 */
				pr_statp->pr_time_lost = cur_time;
				probe_chstate(pr_statp, pii, PR_LOST);
			} else {
				/*
				 * Only the most recent probe can enter
				 * this 'else' arm. The second most recent
				 * probe must take either of the above arms,
				 * if it is unacked.
				 */
				valid_unack_count++;
			}
			break;
		}
		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
	}

	/*
	 * We send out 1 probe randomly in the interval between one half
	 * and one probe interval for the group. Given that the CRTT is always
	 * less than the group's probe interval, we can have at most 1
	 * unacknowledged probe now.  All previous probes are either lost or
	 * acked.
	 */
	assert(valid_unack_count == 0 || valid_unack_count == 1);

	/*
	 * The timer has fired. Take appropriate action depending
	 * on the current state of the phyint.
	 *
	 * PI_RUNNING state 	- Failure detection
	 * PI_FAILED state 	- Repair detection
	 */
	switch (pii->pii_phyint->pi_state) {
	case PI_FAILED:
		/*
		 * If the most recent probe (excluding unacked probes that
		 * are yet to time out) has been acked, check whether the
		 * phyint is now repaired.
		 */
		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
			phyint_check_for_repair(pii->pii_phyint);
		}
		break;

	case PI_RUNNING:
		/*
		 * It's possible our probes have been lost because of a
		 * spanning-tree mandated quiet period on the switch.  If so,
		 * ignore the lost probes.
		 */
		if (pii->pii_fd_hrtime - cur_hrtime > 0)
			break;

		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
			/*
			 * We have 1 or more failed probes (excluding unacked
			 * probes that are yet to time out). Determine if the
			 * phyint has failed.
			 */
			phyint_inst_check_for_failure(pii);
		}
		break;

	default:
		logerr("phyint_inst_timer: invalid state %d\n",
		    pii->pii_phyint->pi_state);
		abort();
	}

	/*
	 * Start the next probe. probe() will also set pii->pii_probe_time_left
	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
	 * was called, the target list may be empty.
	 */
	if (pii->pii_target_next != NULL) {
		probe(pii, PROBE_UNI, cur_hrtime);
		/*
		 * If we have just the one probe target, and we're not using
		 * router targets, try to find another as we presently have
		 * no resilience.
		 */
		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
			probe(pii, PROBE_MULTI, cur_hrtime);
	} else {
		probe(pii, PROBE_MULTI, cur_hrtime);
	}
	return (interval);
}
/*
 * Start the probe timer for an interface instance.
 */
void
start_timer(struct phyint_instance *pii)
{
	uint32_t offset;

	/*
	 * Spread the base probe times (pii_snxt_basetime) of the instances
	 * over (curtime..curtime + the group's probe interval) by picking a
	 * random offset within the group's probe interval.  For the first
	 * probe after the timer is started, pii_snxt_time,
	 * pii_fd_snxt_basetime and pii_snxt_basetime all coincide; later
	 * on, phyint_inst_timer() adds randomness to pii_snxt_time.
	 */
	offset = GET_RANDOM(0,
	    (int)pii->pii_phyint->pi_group->pg_probeint);

	pii->pii_snxt_basetime = getcurrenttime() + offset;
	pii->pii_snxt_time = pii->pii_snxt_basetime;
	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
	timer_schedule(offset);
}
/*
 * Restart the probe timer on an interface instance.
 */
static void
restart_timer(struct phyint_instance *pii)
{
	/*
	 * If the timer was never started in the first place
	 * (pii->pii_basetime_inited not set), it won't have gone off yet
	 * and there is nothing to restart.
	 */
	if (pii->pii_basetime_inited == 0)
		return;

	if (debug & D_LINKNOTE) {
		logdebug("restart timer: restarting timer on %s, "
		    "address family %s\n", pii->pii_phyint->pi_name,
		    AF_STR(pii->pii_af));
	}
	start_timer(pii);
}
/*
 * React to the link on `pi' having just gone down: log it, throw away the
 * probe statistics and, where the phyint's state calls for it, run the
 * failure check.
 */
static void
process_link_state_down(struct phyint *pi)
{
	logerr("The link has gone down on %s\n", pi->pi_name);

	/*
	 * Clear the probe statistics arrays, we don't want the repair
	 * detection logic relying on probes that were successful prior
	 * to the link going down.
	 */
	if (PROBE_CAPABLE(pi->pi_v4))
		clear_pii_probe_stats(pi->pi_v4);
	if (PROBE_CAPABLE(pi->pi_v6))
		clear_pii_probe_stats(pi->pi_v6);

	/*
	 * The failure check below runs for a PI_RUNNING phyint, and for any
	 * phyint that is not PI_FAILED while its group is not failed.  In
	 * every other case there is nothing more to do.
	 */
	if (pi->pi_state != PI_RUNNING &&
	    (pi->pi_state == PI_FAILED || GROUP_FAILED(pi->pi_group)))
		return;

	/*
	 * Check for interface failure.  Although we know the interface
	 * has failed, we don't know if all the other interfaces in the
	 * group have failed as well.
	 */
	if (debug & D_LINKNOTE) {
		logdebug("process_link_state_down:"
		    " checking for failure on %s\n", pi->pi_name);
	}

	/* One instance suffices; IPv4 is used when it exists. */
	if (pi->pi_v4 != NULL) {
		phyint_inst_check_for_failure(pi->pi_v4);
		return;
	}
	if (pi->pi_v6 != NULL)
		phyint_inst_check_for_failure(pi->pi_v6);
}
/*
 * React to the link on `pi' having just come back up: log it, restart the
 * probe timers, check for repair, and record the time of this link-up
 * event in the circular pi_whenup[] array.
 */
static void
process_link_state_up(struct phyint *pi)
{
	logerr("The link has come up on %s\n", pi->pi_name);

	/*
	 * We stopped any running timers on each instance when the link
	 * went down, so restart them.
	 */
	if (pi->pi_v4 != NULL)
		restart_timer(pi->pi_v4);
	if (pi->pi_v6 != NULL)
		restart_timer(pi->pi_v6);

	phyint_check_for_repair(pi);

	/* pi_whendx wraps back to 0 after LINK_UP_PERMIN entries. */
	pi->pi_whenup[pi->pi_whendx] = getcurrenttime();
	pi->pi_whendx++;
	if (pi->pi_whendx == LINK_UP_PERMIN)
		pi->pi_whendx = 0;
}
/*
 * Process any changes in link state passed up from the interfaces.
 */
void
process_link_state_changes(void)
{
	struct phyint *pi;

	/* Look for interfaces where the link state has just changed */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		boolean_t was_up = LINK_UP(pi);

		/*
		 * Except when the "phyint" structure is created, this is
		 * the only place the link state is updated.  That is what
		 * lets this routine detect a change in link state, rather
		 * than just the current state.
		 */
		UPDATE_LINK_STATE(pi);

		if (LINK_DOWN(pi)) {
			/* Down now; act only if it was up before. */
			if (was_up)
				process_link_state_down(pi);
			continue;
		}

		/* Up now; act only if it was down before. */
		if (!was_up)
			process_link_state_up(pi);
	}
}
/*
 * Reset the round trip time estimators of every target of both the IPv4
 * and the IPv6 instance of `pi': tg_crtt and tg_rtt_sd are set to 0 and
 * tg_rtt_sa to -1.  An instance that does not exist is skipped.
 */
void
reset_crtt_all(struct phyint *pi)
{
	struct phyint_instance *insts[2];
	struct target *tg;
	int i;

	insts[0] = pi->pi_v4;
	insts[1] = pi->pi_v6;

	for (i = 0; i < 2; i++) {
		if (insts[i] == NULL)
			continue;
		for (tg = insts[i]->pii_targets; tg != NULL;
		    tg = tg->tg_next) {
			tg->tg_crtt = 0;
			tg->tg_rtt_sa = -1;
			tg->tg_rtt_sd = 0;
		}
	}
}
/*
 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
 * probes on both instances IPv4 and IPv6.
 *
 * Returns PHYINT_FAILURE if so, and stores the time of the first probe
 * failure in "tff".  Returns PHYINT_OK otherwise, leaving "tff" untouched.
 * As a side effect, a target that alone has stopped responding is dropped
 * from use.
 */
static int
phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
{
	struct probe_fail_count pfinfo;
	struct phyint_instance *pii_other;
	struct target *cur_tg;
	uint_t first_fail;
	int last_ndx;

	/*
	 * Get the number of consecutive failed probes on
	 * this phyint across all targets.  Also get the number
	 * of consecutive failed probes on the target of the most
	 * recently sent probe only.
	 */
	last_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
	cur_tg = pii->pii_probes[last_ndx].pr_target;
	probe_fail_info(pii, cur_tg, &pfinfo);

	/* Get the time of first failure, for later use */
	first_fail = pfinfo.pf_tff;

	if (pfinfo.pf_nfail < NUM_PROBE_FAILS) {
		/*
		 * The phyint as a whole is still getting responses.  If
		 * the current target has nevertheless not responded to
		 * the last NUM_PROBE_FAILS probes, stop using it.  Dead
		 * gateway detection will eventually remove this target
		 * (if router) from the routing tables.  If that does not
		 * occur, we may end up adding this to our list again.
		 */
		if (pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
			if (!pii->pii_targets_are_routers) {
				target_delete(cur_tg);
				probe(pii, PROBE_MULTI, gethrtime());
			} else {
				if (cur_tg->tg_status == TG_ACTIVE)
					pii->pii_ntargets--;
				cur_tg->tg_status = TG_DEAD;
				cur_tg->tg_crtt = 0;
				cur_tg->tg_rtt_sa = -1;
				cur_tg->tg_rtt_sd = 0;
				if (pii->pii_target_next == cur_tg) {
					pii->pii_target_next =
					    target_next(cur_tg);
				}
			}
		}
		return (PHYINT_OK);
	}

	/*
	 * This instance has lost NUM_PROBE_FAILS or more consecutive
	 * probes.  Failure is only declared if the other protocol
	 * instance, when it is probe capable, has done so as well.
	 */
	pii_other = phyint_inst_other(pii);
	if (PROBE_CAPABLE(pii_other)) {
		probe_fail_info(pii_other, NULL, &pfinfo);
		if (pfinfo.pf_nfail < NUM_PROBE_FAILS) {
			/* The other instance has < NUM_PROBE_FAILS failures. */
			return (PHYINT_OK);
		}

		/*
		 * We have NUM_PROBE_FAILS or more failures
		 * on both IPv4 and IPv6.  Get the earliest
		 * time when failure was detected on this
		 * phyint across IPv4 and IPv6.
		 */
		if (TIME_LT(pfinfo.pf_tff, first_fail))
			first_fail = pfinfo.pf_tff;
	}

	*tff = first_fail;
	return (PHYINT_FAILURE);
}
/*
* Check if the link has gone down on this phyint, or it has failed the
* last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
* Also look at other phyints of this group, for group failures.
*/
int
failure_state(struct phyint_instance *pii)
{
struct probe_success_count psinfo;
uint_t pi2_tls; /* time last success */
uint_t pi_tff; /* time first fail */
struct phyint *pi2;
struct phyint *pi;
struct phyint_instance *pii2;
struct phyint_group *pg;
int retval;
if (debug & D_FAILREP)
logdebug("phyint_failed(%s)\n", pii->pii_name);
pi = pii->pii_phyint;
pg = pi->pi_group;
if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
PHYINT_OK)
return (PHYINT_OK);
/*
* At this point, the link is down, or the phyint is suspect, as it
* has lost NUM_PROBE_FAILS or more probes. If the phyint does not
* belong to any group, this is a PHYINT_FAILURE. Otherwise, continue
* on to determine whether this should be considered a PHYINT_FAILURE
* or GROUP_FAILURE.
*/
if (pg == phyint_anongroup)
return (PHYINT_FAILURE);
/*
* Need to compare against other phyints of the same group
* to exclude group failures. If the failure was detected via
* probing, then if the time of last success (tls) of any
* phyint is more recent than the time of first fail (tff) of the
* phyint in question, and the link is up on the phyint,
* then it is a phyint failure. Otherwise it is a group failure.
* If failure was detected via a link down notification sent from
* the driver to IP, we see if any phyints in the group are still
* running and haven't received a link down notification. We
* will usually be processing the link down notification shortly
* after it was received, so there is no point looking at the tls
* of other phyints.
*/
retval = GROUP_FAILURE;
for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
/* Exclude ourself from comparison */
if (pi2 == pi)
continue;
if (LINK_DOWN(pi)) {
/*
* We use FLAGS_TO_LINK_STATE() to test the flags
* directly, rather then LINK_UP() or LINK_DOWN(), as
* we may not have got round to processing the link
* state for the other phyints in the group yet.
*
* The check for PI_RUNNING and group failure handles
* the case when the group begins to recover.
* PI_RUNNING will be set, and group failure cleared
* only after receipt of NUM_PROBE_REPAIRS, by which
* time the other phyints should have received at
* least 1 packet, and so will not have NUM_PROBE_FAILS.
*/
if ((pi2->pi_state == PI_RUNNING) &&
!GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
retval = PHYINT_FAILURE;
break;
}
continue;
}
if (LINK_DOWN(pi2))
continue;
/*
* If there's no probe-based failure detection on this
* interface, and its link is still up, then it's still
* working and thus the group has not failed.
*/
if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
retval = PHYINT_FAILURE;
break;
}
/*
* Need to compare against both IPv4 and IPv6 instances.
*/
pii2 = pi2->pi_v4;
if (pii2 != NULL) {
probe_success_info(pii2, NULL, &psinfo);
if (psinfo.ps_tls_valid) {
pi2_tls = psinfo.ps_tls;
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
if (TIME_GT(pi2_tls, pi_tff) &&
(pi2->pi_state == PI_RUNNING) &&
!GROUP_FAILED(pg) &&
FLAGS_TO_LINK_STATE(pi2)) {
retval = PHYINT_FAILURE;
break;
}
}
}
pii2 = pi2->pi_v6;
if (pii2 != NULL) {
probe_success_info(pii2, NULL, &psinfo);
if (psinfo.ps_tls_valid) {
pi2_tls = psinfo.ps_tls;
/*
* See comment above regarding check
* for PI_RUNNING and group failure.
*/
if (TIME_GT(pi2_tls, pi_tff) &&
(pi2->pi_state == PI_RUNNING) &&
!GROUP_FAILED(pg) &&
FLAGS_TO_LINK_STATE(pi2)) {
retval = PHYINT_FAILURE;
break;
}
}
}
}
/*
* Update the group state to account for the changes.
*/
phyint_group_refresh_state(pg);
return (retval);
}
/*
 * Fill in `psinfo' with the run of consecutive probe successes on `pii',
 * scanning backwards from the most recently sent probe.
 *
 * ps_nsucc counts acknowledged probes seen before the first failure on the
 * instance; ps_nsucc_tg does the same for probes sent to `cur_tg' (ignored
 * when `cur_tg' is NULL). ps_tls/ps_tls_valid record, in milliseconds, when
 * the most recent acknowledged probe was processed. Only the two most
 * recent probes may still be unacknowledged; any such probe whose deadline
 * has already passed is moved to the PR_LOST state as a side effect.
 */
static void
probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
    struct probe_success_count *psinfo)
{
	struct probe_stats *slot;
	struct target *tg;
	boolean_t seen_failure = _B_FALSE;	/* any probe lost so far */
	boolean_t seen_tg_failure = _B_FALSE;	/* a cur_tg probe lost */
	uint_t most_recent;
	uint_t second_most_recent;
	uint_t deadline;
	uint_t now;
	uint_t i;

	if (debug & D_FAILREP)
		logdebug("probe_success_info(%s)\n", pii->pii_name);

	bzero(psinfo, sizeof (*psinfo));
	now = getcurrenttime();

	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
	second_most_recent = PROBE_INDEX_PREV(most_recent);

	/*
	 * Walk the probe ring from newest to oldest. Successes are counted
	 * until the first failure is met; after that the counters are
	 * latched but the walk continues so the time of last success can
	 * still be found.
	 */
	for (i = most_recent; i != pii->pii_probe_next;
	    i = PROBE_INDEX_PREV(i)) {
		slot = &pii->pii_probes[i];

		if (slot->pr_status == PR_UNACKED) {
			/* Only the newest two probes may be outstanding. */
			assert(i == most_recent || i == second_most_recent);

			tg = slot->pr_target;
			assert(tg != NULL);

			/*
			 * The probe is due crtt after it was sent. The crtt
			 * may be zero (e.g. the phyint is failed); in that
			 * case fall back to the group's probe interval as a
			 * worst case estimate.
			 */
			deadline = ns2ms(slot->pr_hrtime_start);
			if (tg->tg_crtt == 0) {
				deadline +=
				    pii->pii_phyint->pi_group->pg_probeint;
			} else {
				deadline += tg->tg_crtt;
			}

			if (TIME_LT(deadline, now)) {
				/* Overdue: this probe counts as a failure. */
				slot->pr_time_lost = deadline;
				probe_chstate(slot, pii, PR_LOST);
				seen_failure = _B_TRUE;
				if (cur_tg != NULL && tg == cur_tg)
					seen_tg_failure = _B_TRUE;
			}
		} else if (slot->pr_status == PR_ACKED) {
			/* Count it unless a failure already ended the run. */
			if (!seen_failure)
				psinfo->ps_nsucc++;

			if (!seen_tg_failure && cur_tg != NULL &&
			    slot->pr_target == cur_tg) {
				psinfo->ps_nsucc_tg++;
			}

			/* The first success met is the most recent one. */
			if (!psinfo->ps_tls_valid) {
				psinfo->ps_tls =
				    ns2ms(slot->pr_hrtime_ackproc);
				psinfo->ps_tls_valid = _B_TRUE;
			}
		} else if (slot->pr_status == PR_LOST) {
			/* A failure ends the run of successes. */
			seen_failure = _B_TRUE;
			if (cur_tg != NULL && slot->pr_target == cur_tg)
				seen_tg_failure = _B_TRUE;
		} else {
			/* Any other status ends the scan. */
			return;
		}
	}
}
/*
 * Fill in `pfinfo' with the run of consecutive probe failures on `pii',
 * scanning backwards from the most recently sent probe.
 *
 * pf_nfail counts lost probes seen before the first non-failure on the
 * instance and pf_tff is the loss time of the oldest probe in that run;
 * pf_nfail_tg is the matching count for probes sent to `cur_tg' (ignored
 * when `cur_tg' is NULL). Only the two most recent probes may still be
 * unacknowledged; one whose deadline has been reached is moved to the
 * PR_LOST state and counted, while one still within its deadline is skipped.
 */
static void
probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
    struct probe_fail_count *pfinfo)
{
	struct probe_stats *pr_statp;
	struct target *tg;
	boolean_t seen_success = _B_FALSE;	/* run ended for instance */
	boolean_t seen_tg_success = _B_FALSE;	/* run ended for cur_tg */
	uint_t deadline;
	uint_t now;
	int most_recent;
	int second_most_recent;
	int status;
	int i;

	if (debug & D_FAILREP)
		logdebug("probe_fail_info(%s)\n", pii->pii_name);

	bzero(pfinfo, sizeof (*pfinfo));
	now = getcurrenttime();

	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
	second_most_recent = PROBE_INDEX_PREV(most_recent);

	/*
	 * Walk the probe ring from newest to oldest, counting failures
	 * until something other than a failure latches the counters.
	 */
	for (i = most_recent; i != pii->pii_probe_next;
	    i = PROBE_INDEX_PREV(i)) {
		pr_statp = &pii->pii_probes[i];

		assert(PR_STATUS_VALID(pr_statp->pr_status));
		status = pr_statp->pr_status;

		if (status == PR_UNACKED) {
			/* Only the newest two probes may be outstanding. */
			assert(i == most_recent || i == second_most_recent);

			/* An outstanding probe always has a target. */
			tg = pr_statp->pr_target;
			assert(tg != NULL);

			/*
			 * The probe is due crtt after it was sent. The crtt
			 * may be zero (e.g. the phyint is failed); in that
			 * case fall back to the group's probe interval as a
			 * worst case estimate.
			 */
			deadline = ns2ms(pr_statp->pr_hrtime_start);
			if (tg->tg_crtt == 0) {
				deadline +=
				    pii->pii_phyint->pi_group->pg_probeint;
			} else {
				deadline += tg->tg_crtt;
			}

			/* Still within its deadline: neither pass nor fail. */
			if (TIME_GT(deadline, now))
				continue;

			/* Overdue: mark it lost and count it just below. */
			pr_statp->pr_time_lost = deadline;
			probe_chstate(pr_statp, pii, PR_LOST);
			status = PR_LOST;
		}

		if (status == PR_LOST) {
			if (!seen_success) {
				pfinfo->pf_nfail++;
				pfinfo->pf_tff = pr_statp->pr_time_lost;
			}
			if (!seen_tg_success && cur_tg != NULL &&
			    pr_statp->pr_target == cur_tg) {
				pfinfo->pf_nfail_tg++;
			}
			continue;
		}

		/*
		 * A success or an unused slot: the run of failures for the
		 * instance ends here, and for `cur_tg' too if this slot was
		 * aimed at it.
		 */
		seen_success = _B_TRUE;
		if (cur_tg != NULL && pr_statp->pr_target == cur_tg)
			seen_tg_success = _B_TRUE;
	}
}
/*
 * Move probe `pr' on phyint_instance `pii' into `state'. A probe already in
 * that state is left alone; otherwise the new status is stored and
 * probe_state_event() is called for it (its return value is ignored).
 */
void
probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
{
	if (pr->pr_status != state) {
		pr->pr_status = state;
		(void) probe_state_event(pr, pii);
	}
}
/*
* Check if the phyint has been repaired. If no test address has been
* configured, then consider the interface repaired if the link is up (unless
* the link is flapping; see below). Otherwise, look for proof of probes
* being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
* either IPv4 or IPv6 instance, the phyint can be considered repaired.
*/
static boolean_t
phyint_repaired(struct phyint *pi)
{
struct probe_success_count psinfo;
struct phyint_instance *pii;
struct target *cur_tg;
int pr_ndx;
uint_t cur_time;
if (debug & D_FAILREP)
logdebug("phyint_repaired(%s)\n", pi->pi_name);
if (LINK_DOWN(pi))
return (_B_FALSE);
/*
* If we don't have any test addresses and the link is up, then
* consider the interface repaired, unless we've received more than
* LINK_UP_PERMIN link up notifications in the last minute, in
* which case we keep the link down until we drop back below
* the threshold.
*/
if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
cur_time = getcurrenttime();
if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
(cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
pi->pi_lfmsg_printed = 0;
return (_B_TRUE);
}
if (!pi->pi_lfmsg_printed) {
logerr("The link has come up on %s more than %d times "
"in the last minute; disabling repair until it "
"stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
pi->pi_lfmsg_printed = 1;
}
return (_B_FALSE);
}
pii = pi->pi_v4;
if (PROBE_CAPABLE(pii)) {
pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
cur_tg = pii->pii_probes[pr_ndx].pr_target;
probe_success_info(pii, cur_tg, &psinfo);
if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
return (_B_TRUE);
}
pii = pi->pi_v6;
if (PROBE_CAPABLE(pii)) {
pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
cur_tg = pii->pii_probes[pr_ndx].pr_target;
probe_success_info(pii, cur_tg, &psinfo);
if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
return (_B_TRUE);
}
return (_B_FALSE);
}
/*
 * Set the flags in `set' and clear the flags in `clear' on phyint `pi' by
 * reading the current flags with SIOCGLIFFLAGS and, if anything changes,
 * writing them back with SIOCSLIFFLAGS.
 *
 * Returns _B_TRUE if the kernel flags already had, or now have, the
 * requested value, and _B_FALSE if either ioctl failed. Failures other than
 * ENXIO are logged. On a successful write the cached flags in `pi' and in
 * both of its instances are updated to match.
 */
boolean_t
change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
{
	struct lifreq req;
	uint64_t before;
	int sock;

	if (debug & D_FAILREP) {
		logdebug("change_pif_flags(%s): set %llx clear %llx\n",
		    pi->pi_name, set, clear);
	}

	/*
	 * Only phyint flags are touched, so either protocol instance will
	 * do; use the IPv4 socket when an IPv4 instance exists.
	 */
	sock = (pi->pi_v4 != NULL) ? ifsock_v4 : ifsock_v6;

	(void) strlcpy(req.lifr_name, pi->pi_name, sizeof (req.lifr_name));
	if (ioctl(sock, SIOCGLIFFLAGS, (char *)&req) < 0) {
		if (errno != ENXIO)
			logperror("change_pif_flags: ioctl (get flags)");
		return (_B_FALSE);
	}

	before = req.lifr_flags;
	req.lifr_flags = (before | set) & ~clear;

	/* Nothing would change, so skip the second ioctl. */
	if (req.lifr_flags == before)
		return (_B_TRUE);

	if (ioctl(sock, SIOCSLIFFLAGS, (char *)&req) < 0) {
		if (errno != ENXIO)
			logperror("change_pif_flags: ioctl (set flags)");
		return (_B_FALSE);
	}

	/*
	 * Mirror the change in our cached copies. This assumes `set' and
	 * `clear' contain phyint flags only.
	 */
	pi->pi_flags = (pi->pi_flags | set) & ~clear;
	if (pi->pi_v4 != NULL)
		pi->pi_v4->pii_flags = pi->pi_flags;
	if (pi->pi_v6 != NULL)
		pi->pi_v6->pii_flags = pi->pi_flags;

	return (_B_TRUE);
}
/*
 * ICMP checksum computation for IPv4.
 *
 * Sums the `len' bytes at `addr' as 16-bit words, folds the carries back
 * into the low 16 bits and returns the one's complement of the result as a
 * 16-bit value. A trailing odd byte is added as the first byte of an
 * otherwise zero 16-bit word.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	ushort_t *wordp = addr;
	ushort_t tail = 0;
	ushort_t folded;
	int remaining;
	int sum = 0;

	/* Accumulate every complete 16-bit word. */
	for (remaining = len; remaining > 1; remaining -= 2)
		sum += *wordp++;

	/* Pick up the last byte when the length is odd. */
	if (remaining == 1) {
		*(uchar_t *)&tail = *(uchar_t *)wordp;
		sum += tail;
	}

	/* Fold the upper 16 bits into the lower 16, twice for the carry. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum += (sum >> 16);

	/* Storing into a 16-bit variable truncates the complement. */
	folded = ~sum;
	return (folded);
}
/*
 * For every phyint instance on the global `phyint_instances' list, copy
 * pii_snxt_basetime into pii_fd_snxt_basetime.
 */
static void
reset_snxt_basetimes(void)
{
	struct phyint_instance *inst = phyint_instances;

	while (inst != NULL) {
		inst->pii_fd_snxt_basetime = inst->pii_snxt_basetime;
		inst = inst->pii_next;
	}
}
/*
 * Report whether `addr' is one of this system's own addresses.
 *
 * The phyint tables cannot answer this, because interfaces that are not
 * part of any group are not tracked there. The alternatives are a trial
 * bind(), SIOCTMYADDR, or the complete interface list from SIOCGLIFCONF.
 * bind() fails for a local zone address and SIOCTMYADDR does not treat one
 * as our own either, so with those a local zone address could be chosen as
 * a probe target; if such a target is up, failure of the interface cannot
 * be detected. The local addresses are therefore collected with
 * SIOCGLIFCONF into `localaddrs', and that list is what is searched here.
 *
 * A v4-mapped `addr' is compared as an AF_INET address, anything else as
 * AF_INET6.
 */
boolean_t
own_address(struct in6_addr addr)
{
	struct sockaddr_storage wanted;
	addrlist_t *entry;
	int family;

	if (IN6_IS_ADDR_V4MAPPED(&addr))
		family = AF_INET;
	else
		family = AF_INET6;

	addr2storage(family, &addr, &wanted);

	entry = localaddrs;
	while (entry != NULL) {
		if (sockaddrcmp(&wanted, &entry->al_addr))
			return (_B_TRUE);
		entry = entry->al_next;
	}

	return (_B_FALSE);
}
/*
 * Convert a nanosecond count to milliseconds with NSEC2MSEC(), narrowing
 * the result to an int.
 */
static int
ns2ms(int64_t ns)
{
	int ms = NSEC2MSEC(ns);

	return (ms);
}
/*
 * Convert the seconds/microseconds pair in `tvp' to a single nanosecond
 * count.
 */
static int64_t
tv2ns(struct timeval *tvp)
{
	int64_t total_ns = NANOSEC * tvp->tv_sec + 1000 * tvp->tv_usec;

	return (total_ns);
}