mpd_main.c revision c61f3fa8aac69b7fcecb24979ae8cc3b399cddfd
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/time.h>
#include <unistd.h>
#include "mpd_defs.h"
#include "mpd_tables.h"
int debug = 0; /* Debug flag; tested as a bit mask (e.g. D_LINKNOTE) */
static int pollfd_num = 0; /* Num. of poll descriptors */
/* All times below in ms */
int user_failure_detection_time; /* user specified failure detection */
/* time (fdt), in ms */
int user_probe_interval; /* derived from user specified fdt, in ms */
/*
* Structure to store mib2 information returned by the kernel.
* This is used to process routing table information.
*/
/* One node of a singly linked list of mib2 items. */
struct mib_item_s {
	struct mib_item_s	*mi_next;	/* following node in the list */
	void			*mi_valp;	/* untyped pointer to the value */
};
typedef struct mib_item_s mib_item_t;
static int rtsock_v4; /* AF_INET routing socket */
static int rtsock_v6; /* AF_INET6 routing socket */
/* Presumably one listen socket per address family; see setup_listener(af) */
static int lsock_v4; /* Listen socket to detect mpathd */
static int lsock_v6; /* Listen socket to detect mpathd */
static char **argv0; /* Saved for re-exec on SIGHUP */
static int ipRouteEntrySize; /* Size of IPv4 route entry */
static int ipv6RouteEntrySize; /* Size of IPv6 route entry */
/*
 * Forward declarations of file-local (static) functions.  Every declaration
 * is a full prototype so that calls are type-checked; in particular
 * init_router_targets() takes no arguments and is declared with (void),
 * matching its definition.
 */
static void initlog(void);
static void run_timeouts(void);
static void initifs(void);
static void select_test_ifs(void);
static void init_router_targets(void);
static void cleanup(void);
static int setup_listener(int af);
static void check_config(void);
static void check_testconfig(void);
static void check_addr_unique(struct phyint_instance *,
    struct sockaddr_storage *);
static void init_host_targets(void);
static int closefunc(void *, int);
/*
* Return the current time in milliseconds (from an arbitrary reference)
* truncated to fit into an int. Truncation is ok since we are interested
* only in differences and not the absolute values.
*/
getcurrenttime(void)
{
/*
* NOTE(review): as shown here the definition has no return type, and
* `cur_time' is returned without a visible declaration or assignment.
* The statement that derives it from the clock appears to be missing;
* confirm against the complete file.
*/
/*
* Use of a non-user-adjustable source of time is
* required. However millisecond precision is sufficient.
* divide by 10^6
*/
return (cur_time);
}
getcurrentsec(void)
{
/*
* NOTE(review): no return type and no body statements are visible for
* this function; what it returns cannot be determined from here.
*/
}
/*
* Add fd to the set being polled. Returns 0 if ok; -1 if failed.
*/
int
{
/*
* NOTE(review): the line carrying the function name and parameters is not
* visible (the log message below suggests poll_add). The conditions inside
* both loops, the `retry' label targeted by the goto, the reallocation
* itself and the assignment of new_num are also not visible, and the
* braces do not balance. Confirm against the complete file.
*/
int i;
int new_num;
/* Check if already present */
for (i = 0; i < pollfd_num; i++) {
return (0);
}
/* Check for empty spot already present */
for (i = 0; i < pollfd_num; i++) {
return (0);
}
}
/* Allocate space for 32 more fds and initialize to -1 */
logperror("poll_add: realloc");
return (-1);
}
for (i = pollfd_num; i < new_num; i++) {
}
/* Start over now that there is room */
goto retry;
}
/*
* Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
*/
int
poll_remove(int fd)
{
/*
* NOTE(review): the test that matches `fd' against a polled descriptor
* is not visible; as written the loop returns 0 on its first iteration
* and the braces do not balance. Confirm against the complete file.
*/
int i;
/* Check if already present */
for (i = 0; i < pollfd_num; i++) {
return (0);
}
}
/* Falling out of the loop is reported as failure */
return (-1);
}
/*
* Extract information about the phyint instance. If the phyint instance still
* exists in the kernel then set pii_in_use, else clear it. check_if_removed()
* will use it to detect phyint instances that don't exist any longer and
* remove them, from our database of phyint instances.
* Return value:
* returns true if the phyint instance exists in the kernel,
* returns false otherwise
*/
static boolean_t
{
/*
* NOTE(review): the line with the function name and parameters is not
* visible, nor are the lookup that sets `pii', the call that sets `err',
* the bodies of most cases, or the condition guarding the final return.
* The braces do not balance. Confirm against the complete file.
*/
int err;
struct phyint_instance *pii;
struct phyint_instance *pii_other;
/*
* Phyint instance does not exist in our tables,
* create new phyint instance
*/
} else {
/* Phyint exists in our tables */
switch (err) {
case PI_IOCTL_ERROR:
/* Some ioctl error. don't change anything */
break;
case PI_GROUP_CHANGED:
case PI_IFINDEX_CHANGED:
/*
* Interface index or group membership has changed.
* Delete the old state and recreate based on the new
* state (it may no longer be in a group).
*/
break;
case PI_DELETED:
/* Phyint instance has disappeared from kernel */
pii->pii_in_use = 0;
break;
case PI_OK:
/* Phyint instance exists and is fine */
break;
default:
/* Unknown status */
break;
}
}
else
return (_B_FALSE);
}
/*
* Scan all interfaces to detect changes as well as new and deleted interfaces
*/
static void
initifs()
{
/*
* NOTE(review): most executable statements of this function are not
* visible here: the loops over phyint instances, logints and groups, the
* SIOCGLIFNUM/SIOCGLIFCONF ioctls, the `again' label targeted by the
* goto, and the declarations of `li' and of the lifreq buffers. The
* braces do not balance. The comments below describe the intended
* sequence; confirm every step against the complete file.
*/
int i, nlifr;
int af;
char *cp;
char *buf;
int sockfd;
struct phyint_instance *pii;
struct phyint_instance *next_pii;
logdebug("initifs: Scanning interfaces\n");
/*
* Free the existing local address list; we'll build a new list below.
*/
/*
* Mark the interfaces so that we can find phyints and logints
* which have disappeared from the kernel. pii_process() and
* logint_init_from_k() will set {pii,li}_in_use when they find
* the interface in the kernel. Also, clear dupaddr bit on probe
* logint. check_addr_unique() will set the dupaddr bit on the
* probe logint, if the testaddress is not unique.
*/
pii->pii_in_use = 0;
li->li_dupaddr = 0;
}
}
/*
* As above, mark groups so that we can detect IPMP interfaces which
* have been removed from the kernel. Also, delete the group address
* list since we'll iteratively recreate it below.
*/
}
logperror("initifs: ioctl (get interface count)");
return;
}
/*
* Pad the interface count to detect when additional interfaces have
* been configured between SIOCGLIFNUM and SIOCGLIFCONF.
*/
logperror("initifs: calloc");
return;
}
logperror("initifs: ioctl (get interface configuration)");
return;
}
/*
* If every lifr_req slot is taken, then additional interfaces must
* have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
* Recalculate to make sure we didn't miss any interfaces.
*/
goto again;
}
/*
* Walk through the lifreqs returned by SIOCGLIFCONF, and refresh the
* global list of addresses, phyint groups, phyints, and logints.
*/
logperror("initifs: ioctl (SIOCGLIFFLAGS)");
continue;
}
/*
* If the address is IFF_UP, add it to the local address list.
* (We ignore addresses that aren't IFF_UP since another node
* might legitimately have that address IFF_UP.)
*/
}
/*
* If this address is on an IPMP meta-interface, update our
* phyint_group information (either by recording that group
* still exists or creating a new group), and track what
* group the address is part of.
*/
logperror("initifs: ioctl "
"(SIOCGLIFGROUPNAME)");
continue;
}
logerr("initifs: cannot create group "
continue;
}
}
/*
* Add this to the group's list of data addresses.
*/
logerr("initifs: insufficient memory to track "
"data address information for %s\n",
}
continue;
}
/*
* This isn't an address on an IPMP meta-interface, so it's
* either on an underlying interface or not related to any
* group. Update our phyint and logint information (via
* pii_process() and logint_init_from_k()) -- but first,
* convert the logint name to a phyint name so we can call
* pii_process().
*/
*cp = '\0';
/* The phyint is fine. So process the logint */
}
}
/*
* Scan for groups, phyints and logints that have disappeared from the
* kernel, and delete them.
*/
}
continue;
}
/*
* Refresh the group's state. This is necessary since the
* group's state is defined by the set of usable interfaces in
* the group, and an interface is considered unusable if all
* the RTM_DELADDR/RTM_NEWADDR brings us through here.
*/
}
/*
* Select a test address for sending probes on each phyint instance
*/
/*
*/
}
/*
* Check that a given test address is unique across all of the interfaces in a
* group. (e.g., IPv6 link-locals may not be inherently unique, and binding
* to such an (IFF_NOFAILOVER) address can produce unexpected results.)
* Any issues will be reported by check_testconfig().
*/
static void
{
/*
* NOTE(review): the line with the function name and parameters is not
* visible (the forward declaration of check_addr_unique() matches the
* description above). The address-family branch, the assignment of `pg',
* the loop over the group's phyint instances and the statement that sets
* the dupaddr bit are also not visible; the braces do not balance.
*/
struct phyint_group *pg;
struct phyint_instance *pii;
struct sockaddr_in *sin;
} else {
}
/*
* For anonymous groups, every interface is assumed to be on its own
* link, so there is no chance of overlapping addresses.
*/
if (pg == phyint_anongroup)
return;
/*
* Walk the list of phyint instances in the group and check for test
* addresses matching ours. Of course, we skip ourself.
*/
continue;
/*
* If this test address is not unique, set the dupaddr bit.
*/
}
}
/*
* Stop probing an interface. Called when an interface is offlined.
* The probe socket is closed on each interface instance, and the
* interface state set to PI_OFFLINE.
*/
void
{
/*
* NOTE(review): the function name, its parameters and every statement
* other than the declaration below are not visible, and the braces do
* not balance. Confirm against the complete file.
*/
struct phyint_instance *pii;
}
}
}
/*
* Rate the provided test flags. By definition, IFF_NOFAILOVER must be set.
* IFF_UP must also be set so that the associated address can be used as a
* source address. Further, we must be able to exchange packets with local
* destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical
* reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
*/
static int
{
/*
* NOTE(review): the function name, its parameter and the conditions that
* select between the returns below are not visible. As written, only the
* first return is reachable. Confirm against the complete file.
*/
return (BAD_TESTFLAGS);
return (BAD_TESTFLAGS);
return (BEST_TESTFLAGS);
return (BEST_TESTFLAGS);
return (OK_TESTFLAGS);
}
/*
* Attempt to select a test address for each phyint instance.
* Call phyint_inst_sockinit() to complete the initializations.
*/
static void
select_test_ifs(void)
{
/*
* NOTE(review): the loops over phyint instances and logints, most of the
* conditions guarding the `continue' statements, the call that assigns
* `rating', and the declarations of `li' and `target_scan_reqd' are not
* visible here; the braces do not balance. The comments describe the
* intended flow; confirm each step against the complete file.
*/
struct phyint_instance *pii;
struct phyint_instance *next_pii;
struct logint *probe_logint;
int rating;
logdebug("select_test_ifs\n");
/*
* For each phyint instance, do the test address selection
*/
probe_logint = NULL;
/*
* An interface that is offline should not be probed.
* IFF_OFFLINE interfaces should always be PI_OFFLINE
* unless some other entity has set the offline flag.
*/
logerr("shouldn't be probing offline"
" interface %s (state is: %u)."
" Stopping probes.\n",
}
continue;
} else {
/*
* If something cleared IFF_OFFLINE (e.g., by accident
* because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
* inherently racy), the phyint may still be offline.
* Just ignore it.
*/
continue;
}
/*
* We've already got a test address; only proceed
* if it's suboptimal.
*/
continue;
}
/*
* Walk the logints of this phyint instance, and select
* the best available test address
*/
/*
* Skip 0.0.0.0 addresses, as those are never
* actually usable.
*/
continue;
/*
* Skip any IPv6 logints that are not link-local,
* since we should always have a link-local address
* anyway and in6_data() expects link-local replies.
*/
continue;
/*
* Rate the testflags. If we've found an optimal
* match, then break out; otherwise, record the most
* recent OK one.
*/
if (rating == BAD_TESTFLAGS)
continue;
probe_logint = li;
if (rating == BEST_TESTFLAGS)
break;
}
/*
* If the probe logint has changed, ditch the old one.
*/
}
if (probe_logint == NULL) {
/*
* We don't have a test address; zero out the probe
* stats array since it is no longer relevant.
* Optimize by checking if it is already zeroed out.
*/
int pr_ndx;
}
continue;
/*
* If we didn't find any new test addr, go to the
* next phyint.
*/
continue;
}
/*
* The phyint is either being assigned a new testaddr
* or is being assigned a testaddr for the 1st time.
* Need to initialize the phyint socket
*/
if (!phyint_inst_sockinit(pii)) {
logdebug("select_test_ifs: "
"phyint_sockinit failed\n");
}
continue;
}
/*
* This phyint instance is now enabled for probes; this
* impacts our state machine in two ways:
*
* 1. If we're probe *capable* as well (i.e., we have
* probe targets) and the interface is in PI_NOTARGETS,
* then transition to PI_RUNNING.
*
* 2. If we're not probe capable, and the other phyint
* instance is also not probe capable, and we were in
* PI_RUNNING, then transition to PI_NOTARGETS.
*
* Also see the state diagram in mpd_probe.c.
*/
if (PROBE_CAPABLE(pii)) {
}
/*
* If no targets are currently known for this phyint
* we need to call init_router_targets. Since
* init_router_targets() initializes the list of targets
* for all phyints it is done below the loop.
*/
/*
* Start the probe timer for this instance.
*/
}
}
/*
* Scan the interface list for any interfaces that are PI_FAILED or
* PI_NOTARGETS but no longer enabled to send probes, and call
* phyint_check_for_repair() to see if the link state indicates that
* the interface should be repaired. Also see the state diagram in
* mpd_probe.c.
*/
}
}
/*
* Try to populate the target list. init_router_targets populates
* the target list from the routing table. If our target list is
* still empty, init_host_targets adds host targets based on the
* host target list of other phyints in the group.
*/
if (target_scan_reqd) {
}
}
/*
* Note that this function only logs pre-existing conditions (e.g., that
* probe-based failure detection is disabled).
*/
static void
check_testconfig(void)
{
/*
* NOTE(review): the loop over phyints, the declaration of `pi', several
* conditions and the beginnings/ends of the log calls are not visible;
* the braces and parentheses do not balance. What can be read off the
* remaining lines: the pi_taddrmsg_printed and pi_duptaddrmsg_printed
* flags are consulted before logging and are cleared in one branch.
* Confirm the rest against the complete file.
*/
char abuf[INET6_ADDRSTRLEN];
int pri;
continue;
if (pi->pi_taddrmsg_printed ||
if (pi->pi_duptaddrmsg_printed)
else
"interface %s; enabling probe-based "
pi->pi_taddrmsg_printed = 0;
pi->pi_duptaddrmsg_printed = 0;
}
continue;
}
/* The duplicate-address message has already been logged once */
if (pi->pi_duptaddrmsg_printed)
continue;
logerr("Test address %s is not unique in group; "
"disabling probe-based failure detection on %s\n",
continue;
}
continue;
if (!pi->pi_taddrmsg_printed) {
logtrace("No test address configured on interface %s; "
"disabling probe-based failure detection on it\n",
}
}
}
/*
* Check phyint group configuration, to detect any inconsistencies,
* and log an error message. This is called from run_timeouts() every
* 20 secs, but the error message is displayed only once. If the
* inconsistency is resolved by the admin, a recovery message is displayed
* once.
*/
static void
check_config(void)
{
/*
* NOTE(review): the loops over groups and phyints, the declarations of
* `pi', `v4_in_group' and `v6_in_group', the first-pass statements and
* parts of the second-pass conditions and log calls are not visible; the
* braces do not balance. Confirm against the complete file.
*/
struct phyint_group *pg;
/*
* All phyints of a group must be homogeneous to ensure that they can
* take over for one another. If any phyint in a group has IPv4
* plumbed, check that all phyints have IPv4 plumbed. Do a similar
* check for IPv6.
*/
if (pg == phyint_anongroup)
continue;
/*
* 1st pass. Determine if at least 1 phyint in the group
* has IPv4 plumbed and if so set v4_in_group to true.
* Repeat similarly for IPv6.
*/
}
/*
* 2nd pass. If v4_in_group is true, check that phyint
* has IPv4 plumbed. Repeat similarly for IPv6. Print
* out a message the 1st time only.
*/
continue;
if (!pi->pi_cfgmsg_printed) {
logerr("IP interface %s in group %s is"
" not plumbed for IPv4, affecting"
" IPv4 connectivity\n",
}
} else if (v6_in_group == _B_TRUE &&
if (!pi->pi_cfgmsg_printed) {
logerr("IP interface %s in group %s is"
" not plumbed for IPv6, affecting"
" IPv6 connectivity\n",
}
} else {
/*
* The phyint matches the group configuration,
* if we have reached this point. If it was
* improperly configured earlier, log an
* error recovery message
*/
if (pi->pi_cfgmsg_printed) {
logerr("IP interface %s is now"
" consistent with group %s "
" and connectivity is restored\n",
pi->pi_cfgmsg_printed = 0;
}
}
}
}
}
/*
* Timer mechanism using relative time (in milliseconds) from the
* previous timer event. Timers exceeding TIMER_INFINITY milliseconds
* will fire after TIMER_INFINITY milliseconds.
* Unsigned arithmetic note: We assume a 32-bit circular sequence space for
* time values. Hence 2 consecutive timer events cannot be spaced farther
* than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
* that can be passed for the delay parameter of timer_schedule()
*/
static void
timer_init(void)
{
/*
* Start the timer machinery by running one timeout pass.
*
* The call to run_timeouts() will get the timer started
* Since there are no phyints at this point, the timer will
* be set for IF_SCAN_INTERVAL ms.
*/
run_timeouts();
}
/*
* Make sure the next SIGALRM occurs delay milliseconds from the current
* time if not earlier. We are interested only in time differences.
*/
void
{
/*
* NOTE(review): the line with the function name and the `delay'
* parameter is not visible (the log messages suggest timer_schedule),
* nor are the declaration of `now', the comparison against the currently
* scheduled alarm, or the setitimer() call whose failure is reported
* below. The braces do not balance. Confirm against the complete file.
*/
now = getcurrenttime();
if (delay == 0) {
/* Minimum allowed delay */
delay = 1;
}
/* Will this timer occur before the currently scheduled SIGALRM? */
logdebug("timer_schedule(%u) - no action: "
}
return;
}
logdebug("timer_schedule(%u): sec %ld usec %ld\n",
}
logperror("timer_schedule: setitimer");
/* A failure to arm the timer is treated as fatal */
exit(2);
}
}
/*
 * Cancel the pending interval timer so that no further SIGALRM is
 * generated by it.  A failure to disarm the timer is logged but is not
 * treated as fatal.
 */
static void
timer_cancel(void)
{
	/* A zero it_value disarms the timer; it_interval is zeroed too. */
	struct itimerval disarm = { { 0, 0 }, { 0, 0 } };

	logdebug("timer_cancel()\n");

	/* Only report an error when setitimer() actually fails. */
	if (setitimer(ITIMER_REAL, &disarm, NULL) < 0)
		logperror("timer_cancel: setitimer");
}
/*
* Timer has fired. Determine when the next timer event will occur by asking
* all the timer routines. Should not be called from a timer routine.
*/
static void
run_timeouts(void)
{
/*
* NOTE(review): the recursion assertion on `timeout_running', the
* declaration and computation of `next', the loop over phyint instances,
* the conditions guarding initifs()/check_config(), and the final
* rescheduling statement are not visible; the braces do not balance.
* Confirm against the complete file.
*/
struct phyint_instance *pii;
struct phyint_instance *next_pii;
static boolean_t timeout_running;
/* assert that recursive timeouts don't happen. */
logdebug("run_timeouts()\n");
initifs();
check_config();
}
logdebug("run_timeouts(%s %s): next scheduled for"
" this phyint inst %u, next scheduled global"
" %u ms\n",
}
}
/*
* Make sure initifs() is called at least once every
* IF_SCAN_INTERVAL, to make sure that we are in sync
* with the kernel, in case we have missed any routing
* socket messages.
*/
if (next > IF_SCAN_INTERVAL)
}
/* Write end of the signal event pipe; -1 until the pipe has been created. */
static int eventpipe_write = -1;

/*
 * Ensure that signals are processed synchronously with the rest of
 * the code by just writing a one character signal number on the pipe.
 * The poll loop will pick this up and process the signal event.
 *
 * signo is the number of the signal being delivered; it is narrowed to a
 * single byte before being written.
 */
static void
sig_handler(int signo)
{
	unsigned char sigbyte = (unsigned char)signo;

	/*
	 * Don't write to pipe if cleanup has already begun. cleanup()
	 * might have closed the pipe already
	 */
	if (cleanup_started)
		return;

	if (eventpipe_write == -1) {
		logerr("sig_handler: no pipe found\n");
		return;
	}

	/* Hand the signal to the poll loop; complain only if write fails. */
	if (write(eventpipe_write, &sigbyte, sizeof (sigbyte)) < 0)
		logperror("sig_handler: write");
}
extern struct probes_missed probes_missed;
/*
* Pick up a signal "byte" from the pipe and process it.
*/
static void
{
/*
* NOTE(review): the line with the function name and parameter is not
* visible (the log messages suggest in_signal), nor are the read() whose
* result the first switch examines, the declarations of `buf', `now',
* `acked', `lost' and `unacked', the loops that gather the probe
* statistics, or the re-exec step of the SIGHUP case. The braces do not
* balance. Confirm against the complete file.
*/
struct phyint_instance *pii;
int pr_ndx;
case -1:
logperror("in_signal: read");
exit(1);
/* NOTREACHED */
case 1:
break;
case 0:
logerr("in_signal: read end of file\n");
exit(1);
/* NOTREACHED */
default:
logerr("in_signal: read > 1\n");
exit(1);
}
/* Dispatch on the signal number that was read from the pipe */
switch (buf) {
case SIGALRM:
logdebug("in_signal(SIGALRM) delta %u\n",
now - timer_next);
}
run_timeouts();
break;
case SIGUSR1:
logdebug("Printing configuration:\n");
/* Print out the internal tables */
/*
* Print out the accumulated statistics about missed
* probes (happens due to scheduling delay).
*/
logerr("Missed sending total of %d probes spread over"
/*
* Print out the accumulated statistics about probes
* that were sent.
*/
unacked = 0;
case PR_ACKED:
acked++;
break;
case PR_LOST:
lost++;
break;
case PR_UNACKED:
unacked++;
break;
}
}
logerr("\nProbe stats on (%s %s)\n"
"Number of probes sent %lld\n"
"Number of probe acks received %lld\n"
"Number of valid unacknowledged probes %lld\n"
"Number of ambiguous probe acks received %lld\n",
}
break;
case SIGHUP:
logerr("SIGHUP: restart and reread config file\n");
/*
* Cancel the interval timer. Needed since setitimer() uses
* alarm() and the time left is inherited across exec(), and
* thus the SIGALRM may be delivered before a handler has been
* setup, causing in.mpathd to erroneously exit.
*/
timer_cancel();
cleanup();
_exit(0177);
/* NOTREACHED */
case SIGINT:
case SIGTERM:
case SIGQUIT:
cleanup();
exit(0);
/* NOTREACHED */
default:
}
}
static void
cleanup(void)
{
/*
* Closes the standard descriptors and both ends of the event pipe.
*
* NOTE(review): the statement that records that cleanup has begun
* (sig_handler() tests `cleanup_started'), any loop using the two
* phyint instance pointers, and the declaration of `eventpipe_read'
* are not visible; the braces do not balance. Confirm against the
* complete file.
*/
struct phyint_instance *pii;
struct phyint_instance *next_pii;
/*
* Make sure that we don't write to eventpipe in
* sig_handler() if any signal notably SIGALRM,
* occurs after we close the eventpipe descriptor below
*/
}
(void) close(0);
(void) close(1);
(void) close(2);
(void) close(eventpipe_read);
(void) close(eventpipe_write);
}
/*
* Create pipe for signal delivery and set up signal handlers.
*/
static void
setup_eventpipe(void)
{
/*
* NOTE(review): the pipe() call whose failure is reported below, the
* assignment of the write end, the step whose failure leads to the second
* exit(1), and the installation of the signal handlers are not visible;
* the braces do not balance. Confirm against the complete file.
*/
int fds[2];
logperror("setup_eventpipe: pipe");
exit(1);
}
/* fds[0] is kept as the read end of the event pipe */
eventpipe_read = fds[0];
exit(1);
}
}
/*
* Create a routing socket for receiving RTM_IFINFO messages.
*/
static int
setup_rtsock(int af)
{
/*
* Returns the new socket descriptor; every failure path shown here
* exits the process instead of returning.
*
* NOTE(review): the socket(), setsockopt() and fcntl() calls whose
* failures are reported below are not visible, so how `af', `flags' and
* `aware' are used cannot be confirmed from here; the braces do not
* balance.
*/
int s;
int flags;
int aware = RTAW_UNDER_IPMP;
if (s == -1) {
logperror("setup_rtsock: socket PF_ROUTE");
exit(1);
}
logperror("setup_rtsock: setsockopt RT_AWARE");
(void) close(s);
exit(1);
}
logperror("setup_rtsock: fcntl F_GETFL");
(void) close(s);
exit(1);
}
logperror("setup_rtsock: fcntl F_SETFL");
(void) close(s);
exit(1);
}
/* Register the socket with the poll set */
if (poll_add(s) == -1) {
(void) close(s);
exit(1);
}
return (s);
}
/*
* Process an RTM_IFINFO message received on a routing socket.
* The return value indicates whether a full interface scan is required.
* If just the state of the IFF_RUNNING interface flag has changed, a
* full interface scan isn't required.
*/
static boolean_t
{
/*
* NOTE(review): the line with the function name and parameters is not
* visible (the log messages suggest process_rtm_ifinfo), nor are the
* computation of `sdl', the name-length check, the phyint and instance
* lookups, or the flag comparisons that decide the return value. The
* braces do not balance. Every failure path that is shown returns
* _B_TRUE, i.e. asks for a full scan. Confirm against the complete file.
*/
struct sockaddr_dl *sdl;
struct phyint_instance *pii;
/*
* The sockaddr_dl structure is directly after the if_msghdr_t
* structure. However, at the time of writing, the size of the
* if_msghdr_t structure is different on 32 and 64 bit kernels, due
* to the presence of a timeval structure, which contains longs,
* in the if_data structure. Anyway, we know where the message ends,
* so we work backwards to get the start of the sockaddr_dl structure.
*/
/*LINTED*/
sizeof (struct sockaddr_dl));
/*
* The interface name is in sdl_data.
* RTM_IFINFO messages are only generated for logical interface
* zero, so there is no colon and logical interface number to
* strip from the name. The name is not null terminated, but
* there should be enough space in sdl_data to add the null.
*/
if (debug & D_LINKNOTE)
logdebug("process_rtm_ifinfo: phyint name too long\n");
return (_B_TRUE);
}
if (debug & D_LINKNOTE)
logdebug("process_rtm_ifinfo: phyint lookup failed"
return (_B_TRUE);
}
/*
* We want to try and avoid doing a full interface scan for
* link state notifications from the datalink layer, as indicated
* by the state of the IFF_RUNNING flag. If just the
* IFF_RUNNING flag has changed state, the link state changes
* are processed without a full scan.
* If there is both an IPv4 and IPv6 instance associated with
* the physical interface, we will get an RTM_IFINFO message
* for each instance. If we just maintained a single copy of
* the physical interface flags, it would appear that no flags
* had changed when the second message is processed, leading us
* to believe that the message wasn't generated by a flags change,
* and that a full interface scan is required.
* To get around this problem, two additional copies of the flags
* are kept, one copy for each instance. These are only used in
* this routine. At any one time, all three copies of the flags
* should be identical except for the IFF_RUNNING flag. The
* copy of the flags in the "phyint" structure is always up to
* date.
*/
if (debug & D_LINKNOTE)
logdebug("process_rtm_ifinfo: no instance of address "
return (_B_TRUE);
}
if (debug & D_LINKNOTE) {
logdebug("process_rtm_ifinfo: %s address family: %s, "
}
/*
* If IFF_STANDBY has changed, indicate that the interface has changed
* types.
*/
/* Has just the IFF_RUNNING flag changed state ? */
struct phyint_instance *pii_other;
/*
* It wasn't just a link state change. Update
* the other instance's copy of the flags.
*/
return (_B_TRUE);
}
return (_B_FALSE);
}
/*
* Retrieve as many routing socket messages as possible, and try to
* empty the routing sockets. Initiate full scan of targets or interfaces
* as needed.
* We listen on separate IPv4 and IPv6 sockets so that we can accurately
* detect changes in certain flags (see "process_rtm_ifinfo()" above).
*/
static void
{
/*
* NOTE(review): the line with the function name and parameters is not
* visible (the log messages suggest process_rtsock), nor are the read()
* that sets `nbytes', the version check, the switch header, the
* statements inside each case, the declarations of `need_if_scan',
* `need_rt_scan' and `rtm_ifinfo_seen', or the statement controlled by
* the final `if'. The braces do not balance. Confirm against the
* complete file.
*/
int nbytes;
int type;
/* Read as many messages as possible and try to empty the sockets */
for (;;) {
if (nbytes <= 0) {
/* No more messages */
break;
}
logerr("process_rtsock: version %d "
break;
}
logdebug("process_rtsock: message %d\n",
}
case RTM_NEWADDR:
case RTM_DELADDR:
/*
* Some logical interface has changed,
* have to scan everything to determine
* what actually changed.
*/
break;
case RTM_IFINFO:
break;
case RTM_ADD:
case RTM_DELETE:
case RTM_CHANGE:
case RTM_OLDADD:
case RTM_OLDDEL:
break;
default:
/* Not interesting */
break;
}
}
break;
}
/* A full interface rescan takes precedence over link-state handling */
if (need_if_scan) {
logdebug("process_rtsock: synchronizing with kernel\n");
initifs();
} else if (rtm_ifinfo_seen) {
if (debug & D_LINKNOTE)
logdebug("process_rtsock: "
}
if (need_rt_scan)
}
/*
* Look if the phyint instance or one of its logints have been removed from
* the kernel and take appropriate action.
* Uses {pii,li}_in_use.
*/
static void
{
/*
* NOTE(review): the line with the function name and the `pii' parameter
* is not visible, nor are the deletion of the phyint instance, the loop
* over its logints, or the rest of the logtrace() call. The braces do
* not balance. Confirm against the complete file.
*/
/* Detect phyints that have been removed from the kernel. */
if (!pii->pii_in_use) {
logtrace("%s %s has been removed from kernel\n",
} else {
/* Detect logints that have been removed. */
}
}
}
}
/*
* Parse the supplied mib2 information to extract the routing information
* table. Process the routing table to get the list of known onlink routers
* and update our database. These onlink routers will serve as probe
* targets.
*/
static void
{
/*
* NOTE(review): the function name, its parameters, the loop that the
* `continue' belongs to and all processing statements are not visible;
* the braces do not balance. Confirm against the complete file.
*/
continue;
}
}
}
/*
* Convert octet `octp' to a phyint name and store in `ifname'
*/
static void
{
/*
* NOTE(review): the function name, its parameters and the statements
* that copy the name and position `cp' are not visible. As written, `cp'
* is dereferenced without a visible assignment. Confirm against the
* complete file.
*/
char *cp;
/* Terminate the name at the position `cp' points to */
*cp = '\0';
}
/*
* Examine the IPv4 routing table `buf' for possible targets. For each
* possible target, if it's on the same subnet as an interface route, pass
* it to router_add_common() for further consideration.
*/
static void
{
/*
* NOTE(review): the line with the function name and parameters
* (including `len') is not visible, nor are both loops over the routing
* table, the conditions guarding each `continue', the assignment of
* `nexthop_v4', or the call that adds the target. The braces do not
* balance. Confirm against the complete file.
*/
struct in_addr nexthop_v4;
/* An empty table has nothing to offer */
if (len == 0)
return;
/*
* Scan the routing table entries for any IRE_OFFSUBNET entries, and
* cross-reference them with the interface routes to determine if
* they're possible probe targets.
*/
continue;
/* Get the nexthop address. */
/*
* Rescan the routing table looking for interface routes that
* are on the same subnet, and try to add them. If they're
* not relevant (e.g., the interface route isn't part of an
* IPMP group, router_add_common() will discard).
*/
continue;
continue;
}
}
}
void
{
/*
* NOTE(review): the line with the function name and parameters
* (including `nexthop') is not visible; comments elsewhere in this file
* refer to router_add_common(). The phyint instance lookup, the group
* test and both target-adding branches are not visible either.
* Confirm against the complete file.
*/
struct phyint_instance *pii;
/*
* Retrieve the phyint instance; bail if it's not known to us yet.
*/
return;
/*
* Don't use our own addresses as targets.
*/
if (own_address(nexthop))
return;
/*
* If the phyint is part of a named group, then add the address to all
* members of the group; note that this is suboptimal in the IPv4 case
* as it has already been added to all matching interfaces in
* ire_process_v4(). Otherwise, add the address only to the phyint
* itself, since other phyints in the anongroup may not be on the same
* subnet.
*/
} else {
}
}
/*
* Examine the IPv6 routing table `buf' for possible link-local targets, and
* pass any contenders to router_add_common() for further consideration.
*/
static void
{
/*
* NOTE(review): the line with the function name and parameters
* (including `len') is not visible, nor are both loops over the routing
* table, the conditions guarding each `continue', the assignment of
* `nexthop_v6', the group-name lookup that fills `grname', or the call
* that adds the target. The braces do not balance. Confirm against the
* complete file.
*/
char grname[LIFGRNAMSIZ];
struct in6_addr nexthop_v6;
/* An empty table has nothing to offer */
if (len == 0)
return;
/*
* Scan the routing table entries for any IRE_OFFSUBNET entries, and
* cross-reference them with the interface routes to determine if
* they're possible probe targets.
*/
continue;
/* Get the nexthop address. */
/*
* The interface name should always exist for link-locals;
* we use it to map this entry to an IPMP group name.
*/
continue;
continue;
}
/*
* Rescan the list of routes for interface routes, and add the
* above target to any interfaces in the same IPMP group.
*/
continue;
}
}
}
}
}
/*
* Build a list of target routers, by scanning the routing tables.
* It is assumed that interface routes exist, to reach the routers.
*/
static void
init_router_targets(void)
{
/*
* NOTE(review): both loops over phyint instances, the statements that
* clear and later test tg_in_use, the routing-table retrieval whose
* failure leads to exit(1), and the declaration of `force_mcast' are not
* visible; the braces do not balance. Confirm against the complete file.
*/
struct phyint_instance *pii;
/* Nothing is scanned when force_mcast is set */
if (force_mcast)
return;
/*
* Set tg_in_use to false only for router targets.
*/
if (!pii->pii_targets_are_routers)
continue;
}
exit(1);
if (!pii->pii_targets_are_routers)
continue;
/*
* If the group has failed, it's likely the route was
* removed by an application affected by that failure.
* In that case, we keep the target so that we can
* reliably repair, at which point we'll refresh the
* target list again.
*/
}
}
}
/*
* Attempt to assign host targets to any interfaces that do not currently
* have probe targets by sharing targets with other interfaces in the group.
*/
static void
init_host_targets(void)
{
/*
* NOTE(review): only the declarations are visible; the loop over phyint
* instances and the call that shares targets are missing, and the braces
* do not balance. Confirm against the complete file.
*/
struct phyint_instance *pii;
struct phyint_group *pg;
}
}
/*
* Duplicate host targets from other phyints of the group to
* the phyint instance 'desired_pii'.
*/
static void
{
/*
* NOTE(review): the line with the function name and the `desired_pii'
* parameter is not visible, nor are the loop over the group's phyints,
* the condition guarding the `continue', or the statements that copy the
* targets. The braces do not balance. Confirm against the complete file.
*/
int af;
struct phyint_instance *pii;
/*
* For every phyint in the same group as desired_pii, check if
* it has any host targets. If so add them to desired_pii.
*/
/*
* We know that we don't have targets on this phyint instance
* since we have been called. But we still check for
* pii_targets_are_routers because another phyint instance
* could have router targets, since IFF_NOFAILOVER addresses
* on different phyint instances may belong to different
* subnets.
*/
continue;
}
}
}
static void
{
/*
* NOTE(review): neither the name, the parameters nor any statement of
* this function is visible; its purpose cannot be determined from here.
*/
}
#define MPATHD_DEFAULT_FILE "/etc/default/mpathd"
static char *
getdefault(char *name)
{
/*
* Looks up `name' in MPATHD_DEFAULT_FILE and returns `value'.
*
* NOTE(review): the declaration of `value', the statements that set the
* case-insensitive flags, build the "name=" key, read the entry and
* close the file are not visible. Whether the returned string is owned
* by the caller cannot be determined from here.
*/
if (defopen(MPATHD_DEFAULT_FILE) == 0) {
char *cp;
int flags;
/*
* ignore case
*/
/* Add "=" to the name */
/* close */
}
return (value);
}
/*
* Command line options below
*/
int
{
/*
* NOTE(review): the line with the function name and parameters is not
* visible; the log messages and the getopt-style switch suggest this is
* main(). Also not visible: the getdefault() calls that set `value', the
* assignments of the defaults, the option-parsing loop header, the body
* of most option cases, the listener and routing socket setup calls, the
* poll() call and the per-descriptor dispatch inside the main loop, and
* the declarations of `foreground', `ifsock_v4' and `ifsock_v6'. The
* braces and parentheses do not balance. The comments below describe the
* intended start-up sequence; confirm each step against the complete
* file.
*/
int i;
int c;
struct phyint_instance *pii;
char *value;
/*
* NOTE: The messages output by in.mpathd are not suitable for
* translation, so we do not call textdomain().
*/
/*
* Get the user specified value of 'failure detection time'
*/
if (user_failure_detection_time <= 0) {
logerr("Invalid failure detection time %s, assuming "
"default of %d ms\n", value,
} else if (user_failure_detection_time <
logerr("Too small failure detection time of %s, "
"assuming minimum of %d ms\n", value,
}
} else {
/* User has not specified the parameter, Use default value */
}
/*
* This gives the frequency at which probes will be sent.
* When fdt ms elapses, we should be able to determine
* whether 5 consecutive probes have failed or not.
* 1 probe will be sent in every user_probe_interval ms,
* randomly anytime in the (0.5 - 1.0) 2nd half of every
* user_probe_interval. Thus when we send out probe 'n' we
* can be sure that probe 'n - 2' is lost, if we have not
* got the ack. (since the probe interval is > crtt). But
* probe 'n - 1' may be a valid unacked probe, since the
* time between 2 successive probes could be as small as
* 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2
*/
(NUM_PROBE_FAILS + 2);
/*
* Get the user specified value of failback_enabled from
*/
else
} else {
}
/*
* Get the user specified value of track_all_phyints from
* TRACK_INTERFACES_ONLY_WITH_GROUPS.
*/
else
logerr("Invalid value for "
"TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
} else {
}
switch (c) {
case 'a':
break;
case 'm':
break;
case 'd':
break;
case 'D':
if (i == 0) {
optarg);
exit(1);
}
/* Accumulate the requested debug bits */
debug |= i;
break;
case 'l':
/*
* Turn off link state notification handling.
* Undocumented command line flag, for debugging
* purposes.
*/
break;
default:
exit(1);
}
}
/*
* The sockets for the loopback command interface should be listening
* before we fork and exit in daemonize(). This way, whoever started us
* can use the loopback interface as soon as they get a zero exit
* status.
*/
logerr("main: setup_listener failed for both IPv4 and IPv6\n");
exit(1);
}
if (!foreground) {
if (!daemonize()) {
logerr("cannot daemonize\n");
}
initlog();
}
/*
* Initializations:
* 1. Create ifsock* sockets. These are used for performing SIOC*
* ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
* 3. Create the routing sockets, used for listening
* to routing / interface changes.
* 4. phyint_init() - Initialize physical interface state
* (in mpd_tables.c). Must be done before creating interfaces,
* which timer_init() does indirectly.
* 5. Query kernel for route entry sizes (v4 and v6).
* 6. timer_init() - Initialize timer related stuff
* 7. initifs() - Initialize our database of all known interfaces
* 8. init_router_targets() - Initialize our database of all known
* router targets.
*/
if (ifsock_v4 < 0) {
logperror("main: IPv4 socket open");
exit(1);
}
if (ifsock_v6 < 0) {
logperror("main: IPv6 socket open");
exit(1);
}
if (phyint_init() == -1) {
logerr("cannot initialize physical interface structures");
exit(1);
}
exit(1);
timer_init();
initifs();
/*
* If we're operating in "adopt" mode and no interfaces need to be
* tracked, shut down (ifconfig(1M) will restart us on demand if
* interfaces are subsequently put into multipathing groups).
*/
exit(0);
/*
* Main body. Keep listening for activity on any of the sockets
* that we are monitoring and take appropriate action as necessary.
* signals are also handled synchronously.
*/
for (;;) {
continue;
logperror("main: poll");
exit(1);
}
for (i = 0; i < pollfd_num; i++) {
continue;
break;
}
break;
}
else
break;
}
}
break;
}
}
}
}
/* NOTREACHED */
return (EXIT_SUCCESS);
}
static int
setup_listener(int af)
{
/*
* Returns the listening socket descriptor, or -1 when the step that sets
* `ret' fails with an errno other than EADDRINUSE. The other failure
* paths shown here exit the process.
*
* NOTE(review): the socket(), setsockopt(), bind(), listen() and later
* calls whose failures are handled below are not visible, nor is the
* address-family test that selects between the two `len' assignments or
* the initialisation of `laddr'. The braces and parentheses do not
* balance. Confirm against the complete file.
*/
int sock;
int on;
int len;
int ret;
struct sockaddr_storage laddr;
struct sockaddr_in *sin;
struct sockaddr_in6 *sin6;
if (sock < 0) {
logperror("setup_listener: socket");
exit(1);
}
on = 1;
sizeof (on)) < 0) {
logperror("setup_listener: setsockopt (SO_REUSEADDR)");
exit(1);
}
len = sizeof (struct sockaddr_in);
} else {
len = sizeof (struct sockaddr_in6);
}
if (ret < 0) {
if (errno == EADDRINUSE) {
/*
* Another instance of mpathd may be already active.
*/
logerr("main: is another instance of in.mpathd "
"already active?\n");
exit(1);
} else {
return (-1);
}
}
logperror("main: listen");
exit(1);
}
exit(1);
}
return (sock);
}
/*
* Table of commands and their expected size; used by loopback_cmd().
*
* Each entry pairs a command's name, as a string, with the size of the
* structure that carries a request for that command.
*
* NOTE(review): the entries are presumably looked up by command number, in
* which case their order must match the MI_* command values -- confirm
* before reordering or inserting entries.
*/
static struct {
const char *name; /* command name as a string */
unsigned int size; /* expected size of the request, in bytes */
} commands[] = {
{ "MI_PING", sizeof (uint32_t) },
{ "MI_OFFLINE", sizeof (mi_offline_t) },
{ "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) },
{ "MI_QUERY", sizeof (mi_query_t) }
};
/*
* Commands received over the loopback interface come here (via libipmp).
*
* Visible flow: obtain the connection descriptor `newfd', validate the peer
* according to its address family, receive the request into `mpi', and
* reject requests that are malformed, too short for the named command, or
* privileged but sent from an unprivileged source. The outcome of handling
* an accepted request is held in `retval' and compared against
* IPMP_SUCCESS. Every rejection path returns without handling the command.
*/
static void
{
int newfd;
struct sockaddr_storage peer;
struct sockaddr_in *peer_sin;
struct sockaddr_in6 *peer_sin6;
union mi_commands mpi;
char abuf[INET6_ADDRSTRLEN];
int retval;
if (newfd < 0) {
logperror("loopback_cmd: accept");
return;
}
switch (family) {
case AF_INET:
/*
* Validate the address and port to make sure that
* non-privileged processes don't connect and start
* talking to us.
*/
if (peerlen != sizeof (struct sockaddr_in)) {
return;
}
logerr("Attempt to connect from addr %s port %d\n",
return;
}
break;
case AF_INET6:
if (peerlen != sizeof (struct sockaddr_in6)) {
return;
}
/*
* Validate the address and port to make sure that
* non-privileged processes don't connect and start
* talking to us.
*/
sizeof (abuf));
logerr("Attempt to connect from addr %s port %d\n",
return;
}
/*
* NOTE(review): unlike the AF_INET case, no `break' is visible before
* `default:', so a validated AF_INET6 peer appears to fall through into
* the default case and return. Confirm whether a `break' is missing.
*/
default:
return;
}
/*
* The size of the 'mpi' buffer corresponds to the maximum size of
* all supported commands.
*/
/*
* In theory, we can receive any sized message for a stream socket,
* but we don't expect that to happen for a small message over a
* loopback connection.
*/
logerr("loopback_cmd: bad command format or read returns "
"partial data %d\n", len);
return;
}
return;
}
/*
* Only MI_PING and MI_QUERY can come from unprivileged sources.
*/
logerr("Unprivileged request from %s for privileged "
return;
}
logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
return;
}
if (retval != IPMP_SUCCESS) {
}
}
/*
* Process the commands received via libipmp.
*/
static unsigned int
{
struct mi_offline *mio;
struct mi_undo_offline *miu;
unsigned int retval;
switch (mpi->mi_command) {
case MI_PING:
case MI_OFFLINE:
if (retval == IPMP_FAILURE)
case MI_UNDO_OFFLINE:
if (retval == IPMP_FAILURE)
case MI_QUERY:
default:
break;
}
}
/*
* Process the query request pointed to by `miq' and send a reply on file
* descriptor `fd'. Returns an IPMP error code.
*/
static unsigned int
{
unsigned int retval;
switch (miq->miq_inforeq) {
case IPMP_ADDRINFO:
&adinfop);
if (retval != IPMP_SUCCESS)
if (retval == IPMP_SUCCESS)
return (retval);
case IPMP_GROUPLIST:
if (retval != IPMP_SUCCESS)
if (retval == IPMP_SUCCESS)
return (retval);
case IPMP_GROUPINFO:
if (retval != IPMP_SUCCESS)
if (retval == IPMP_SUCCESS)
return (retval);
case IPMP_IFINFO:
if (retval != IPMP_SUCCESS)
if (retval == IPMP_SUCCESS)
return (retval);
case IPMP_SNAP:
/*
* Before taking the snapshot, sync with the kernel.
*/
initifs();
if (retval != IPMP_SUCCESS)
if (retval != IPMP_SUCCESS)
goto out;
if (retval != IPMP_SUCCESS)
goto out;
if (retval != IPMP_SUCCESS)
goto out;
if (retval != IPMP_SUCCESS)
goto out;
}
if (retval != IPMP_SUCCESS)
goto out;
}
if (retval != IPMP_SUCCESS)
goto out;
}
out:
return (retval);
default:
break;
}
}
/*
* Send the group information pointed to by `grinfop' on file descriptor `fd'.
* Returns an IPMP error code.
*/
static unsigned int
{
unsigned int retval;
if (retval != IPMP_SUCCESS)
return (retval);
if (retval != IPMP_SUCCESS)
return (retval);
}
/*
* Send the interface information pointed to by `ifinfop' on file descriptor
* `fd'. Returns an IPMP error code.
*/
static unsigned int
{
unsigned int retval;
if (retval != IPMP_SUCCESS)
return (retval);
if (retval != IPMP_SUCCESS)
return (retval);
}
/*
* Send the address information pointed to by `adinfop' on file descriptor
* `fd'. Returns an IPMP error code.
*/
static unsigned int
{
}
/*
* Send the group list pointed to by `grlistp' on file descriptor `fd'.
* Returns an IPMP error code.
*/
static unsigned int
{
}
/*
* Initialize an mi_result_t structure using `error' and `syserror' and
* send it on file descriptor `fd'. Returns an IPMP error code.
*/
static unsigned int
{
if (error == IPMP_FAILURE)
else
me.me_sys_error = 0;
}
/*
 * Daemonize the process.
 *
 * Forks twice so that the process that carries on as the daemon is neither
 * a session leader nor attached to a controlling terminal.  Only that final
 * (grandchild) process returns _B_TRUE.  The original process and the
 * intermediate child both terminate with a zero exit status, which is what
 * tells whoever started the daemon that it is up.
 *
 * Returns _B_FALSE if either fork() or setsid() fails.  Note that a failure
 * after the first fork() is reported in the child; the original process has
 * already exited by then.
 */
static boolean_t
daemonize(void)
{
	switch (fork()) {
	case -1:
		return (_B_FALSE);

	case 0:
		/*
		 * Lose our controlling terminal, and become both a session
		 * leader and a process group leader.
		 */
		if (setsid() == -1)
			return (_B_FALSE);

		/*
		 * Under POSIX, a session leader can accidentally (through
		 * open(2)) acquire a controlling terminal if it does not
		 * have one.  Just to be safe, fork() again so we are not a
		 * session leader.
		 */
		switch (fork()) {
		case -1:
			return (_B_FALSE);

		case 0:
			(void) chdir("/");
			(void) umask(022);
			break;

		default:
			/*
			 * Intermediate child (the session leader): its only
			 * job was to call setsid() and fork again.  Use
			 * _exit() so stdio buffers and atexit handlers
			 * shared with the daemon are not run twice.
			 */
			_exit(EXIT_SUCCESS);
		}
		break;

	default:
		/*
		 * Original process: exit successfully instead of returning,
		 * otherwise it would keep running as a second copy of the
		 * daemon.
		 */
		_exit(EXIT_SUCCESS);
	}

	return (_B_TRUE);
}
/*
* The parent has created some fds before forking on purpose, keep them open.
*/
static int
/* ARGSUSED */
{
return (0);
}
/* LOGGER */
#include <syslog.h>
/*
* Logging routines. All routines log to syslog, unless the daemon is
* running in the foreground, in which case the logging goes to stderr.
*
* The following routines are available:
*
* logdebug(): A printf-like function for outputting debug messages
* (messages at LOG_DEBUG) that are only of use to developers.
*
* logtrace(): A printf-like function for outputting tracing messages
* (messages at LOG_INFO) from the daemon. This is typically used
* to log the receipt of interesting network-related conditions.
*
* logerr(): A printf-like function for outputting error messages
* (messages at LOG_ERR) from the daemon.
*
* logperror*(): A set of functions used to output error messages
* (messages at LOG_ERR); these automatically append strerror(errno)
* and a newline to the message passed to them.
*
* NOTE: since the logging functions write to syslog, the messages passed
* to them are not eligible for localization. Thus, gettext() must
* *not* be used.
*/
/*
 * Logging state: zero until initlog() has been called.  The logging
 * routines test this value to choose which of their two output paths to
 * use.
 */
static int logging;

/*
 * Mark logging as initialized by bumping the `logging' counter.
 */
static void
initlog(void)
{
	logging += 1;
}
/* PRINTFLIKE2 */
void
{
if (logging)
else
}
/* PRINTFLIKE1 */
void
{
if (logging)
else
}
void
{
if (logging) {
} else {
}
}
void
{
if (logging) {
} else {
}
}
void
{
if (polled)
pii->pii_basetime_inited = 0;
}
struct sockaddr_storage *ssp)
{
return (_B_FALSE);
return (_B_TRUE);
}
void
{
}
}
/*
* Send down a T_OPTMGMT_REQ to ip asking for all data in the various
* tables defined by mib2.h. Pass the table information returned to the
* supplied function.
*/
static int
{
int flags;
int rval;
int status = -1;
if (mibfd == -1) {
logperror("mibwalk(): ip open");
return (status);
}
}
/*
* Note: we use the special level value below so that IP will return
* us information concerning IRE_MARK_TESTHIDDEN routes.
*/
logperror("mibwalk(): putmsg(ctl)");
return (status);
}
/*
* The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
* each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains
* a control and data part. The control part contains a struct
* T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
* the level, name and length of the data in the data part. The
* data part contains the actual table data. The last message
* is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
* single option with zero optlen.
*/
for (;;) {
continue;
logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n",
goto error;
}
goto error;
}
case T_ERROR_ACK:
logerr("mibwalk(): T_ERROR_ACK ctlbuf "
goto error;
}
logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n"
goto error;
case T_OPTMGMT_ACK:
sizeof (struct opthdr))) {
logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too "
goto error;
}
logerr("mibwalk(): MGMT_flags != T_SUCCESS: "
goto error;
}
break;
default:
goto error;
}
/* The following assert also implies MGMT_flags == T_SUCCESS */
/*
* We have reached the end of this T_OPTMGMT_ACK
* message. If this is the last message (i.e., EOD),
* break, else process the next T_OPTMGMT_ACK msg.
*/
if (rval == 0) {
/* This is the EOD message. */
break;
}
/* Not EOD but no data to retrieve */
continue;
}
/*
* We should only be here if MOREDATA was set.
* Allocate an empty mib_item_t and link into the list
* of MIB items.
*/
logperror("mibwalk(): malloc() failed.");
goto error;
}
else
logperror("mibwalk(): malloc() failed.");
goto error;
}
/* Retrieve the actual MIB data */
for (;;) {
flags = 0;
&flags)) != 0) {
continue;
/*
* We shouldn't get MOREDATA here so treat that
* as an error.
*/
logperror("mibwalk(): getmsg(data)");
goto error;
}
break;
}
}
status = 0;
/* Pass the accumulated MIB data to the supplied function pointer */
}
return (status);
}
/*
* Parse the supplied mib2 information to get the size of routing table
* entries. This is needed when running in a branded zone where the
* Solaris application environment and the Solaris kernel may not be the
* same release version.
*/
static void
{
continue;
}
}
}