pxping.c revision 10ccdc440e8a56cea9ecb0ae75e3fe7a74fb1c7a
/* -*- indent-tabs-mode: nil; -*- */
#include "winutils.h"
#include "proxy.h"
#include "proxy_pollmgr.h"
#include "pxremap.h"
#ifndef RT_OS_WINDOWS
#ifdef RT_OS_DARWIN
# define __APPLE_USE_RFC_3542
#endif
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "winpoll.h"
#endif
#include "lwip/inet_chksum.h"
#if 1 /* XXX: force debug for now */
#endif
#if defined(RT_OS_LINUX) && !defined(__USE_GNU)
/*
* XXX: This is gross. in6_pktinfo is now hidden behind _GNU_SOURCE
*
* But in older glibc versions, e.g. RHEL5, it is not! I don't want
* to deal with _GNU_SOURCE now, so as a kludge check for glibc
* version. It seems the __USE_GNU guard was introduced in 2.8.
*/
/*
 * Fallback definition of the IPV6_PKTINFO ancillary-data payload for
 * builds where the system headers hide it.  The kernel fills this
 * structure in, so the layout must match RFC 3542 section 6.1 exactly:
 * the address comes first, followed by the interface index.
 */
struct in6_pktinfo {
    struct in6_addr ipi6_addr;  /* src/dst IPv6 address of the packet */
    unsigned int ipi6_ifindex;  /* send/recv interface index */
};
#endif /* __GLIBC_PREREQ */
#endif /* RT_OS_LINUX && !__USE_GNU */
/* forward */
struct ping_pcb;
/**
* Global state for ping proxy collected in one entity to minimize
* globals. There's only one instance of this structure.
*
* Raw ICMP sockets are promiscuous, so it doesn't make sense to have
* multiple. If this code ever needs to support multiple netifs, the
* netif member should be exiled into "pcb".
*/
struct pxping {
/*
 * Darwin and Solaris builds define DF_WITH_IP_HDRINCL and keep
 * "hdrincl"; every other host keeps "df" instead.
 */
#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS)
# define DF_WITH_IP_HDRINCL
/* presumably the IP_HDRINCL state of the IPv4 socket - TODO confirm */
int hdrincl;
#else
/* presumably the DF setting last applied via setsockopt(2) - TODO confirm */
int df;
#endif
/*
 * presumably the IP_TTL and IP_TOS values last set on the IPv4
 * socket, kept so that unchanged values need no new setsockopt(2)
 * call - TODO confirm
 */
int ttl;
int tos;
/* NOTE(review): this Windows-only section is empty in this revision. */
#ifdef RT_OS_WINDOWS
#endif
/* presumably the IPV6_HOPLIMIT analogue of "ttl" - TODO confirm */
int hopl;
/*
 * Poll manager handlers; judging by the names, one for the IPv4
 * socket and one for the IPv6 socket - TODO confirm.
 */
struct pollmgr_handler pmhdl4;
struct pollmgr_handler pmhdl6;
/**
* Protect lwIP and pmgr accesses to the list of pcbs.
*
* NOTE(review): no lock member follows this comment in this
* revision - confirm it was not dropped.
*/
/*
* We need to find pcbs both from the guest side and from the host
* side. If we need to support industrial grade ping throughput,
* we will need two pcb hashes. For now, a short linked list
* should be enough. Cf. pxping_pcb_for_request() and
* pxping_pcb_for_reply().
*
* NOTE(review): the list head itself is not declared in this
* revision; nor is "timeout_slot", which pxping_timer() assigns.
*/
/* presumably the upper bound on simultaneously tracked pcbs - TODO confirm */
#define PXPING_MAX_PCBS 8
/* presumably the number of timeout slots; units not visible here - TODO confirm */
#define TIMEOUT 5
/* cleared by pxping_timer() when the timer callback runs */
int timer_active;
};
/**
* Quasi PCB for ping.
*/
struct ping_pcb {
/**
* Desired slot in pxping::timeout_list. See pxping_timer().
*
* NOTE(review): no member follows this comment in this revision,
* yet other code reads pcb->timeout_slot - confirm the field was
* not dropped.
*/
/**
* Chaining for pxping::timeout_list
*
* presumably "pprev_timeout" holds the address of the pointer that
* points at this pcb (the previous element's next_timeout or the
* list head), so the pcb can be unlinked without walking the list
* - TODO confirm
*/
struct ping_pcb **pprev_timeout;
struct ping_pcb *next_timeout;
/**
* Chaining for pxping::pcbs
*
* NOTE(review): no chaining member follows this comment in this
* revision.
*/
/*
 * Socket address of the ping peer, stored as either an IPv4 or an
 * IPv6 sockaddr.  Which arm is valid is not recorded in the union
 * itself; presumably a separate is_ipv6 flag selects it - TODO
 * confirm.
 */
union {
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
} peer;
};
/**
* lwIP thread callback message for IPv4 ping.
*
* We pass raw IP datagram for ip_output_if() so we only need pbuf and
* netif (from pxping).
*/
struct ping_msg {
/** pbuf holding the raw IP datagram that is handed to ip_output_if() */
struct pbuf *p;
};
/**
* lwIP thread callback message for IPv6 ping.
*
* We cannot obtain raw IPv6 datagram from host without extra trouble,
* so we pass ICMPv6 payload in pbuf and also other parameters to
* ip6_output_if().
*/
struct ping6_msg {
/**
 * pbuf holding the ICMPv6 payload only (no IPv6 header).
 *
 * NOTE(review): the other ip6_output_if() parameters this message
 * is documented to carry are not declared in this revision.
 */
struct pbuf *p;
};
#ifdef RT_OS_WINDOWS
#endif
static void pxping_timer(void *arg);
int is_ipv6,
static void pxping_pcb_forward_inbound(void *arg);
static void pxping_pcb_forward_inbound6(void *arg);
/*
* NB: This behaviour is not documented anywhere except in the lwIP
* source code itself.
*
* If ip_output_if() is passed dest == NULL then it treats p as
* complete IP packet with payload pointing to the IP header. It does
* not build IP header, ignores all header-related arguments, fetches
* real destination from the header in the pbuf and outputs pbuf to
* the specified netif.
*/
#define ip_raw_output_if(p, netif) \
{
const int on = 1;
int status;
return ERR_VAL;
}
#ifdef DF_WITH_IP_HDRINCL
#else
#endif
#ifdef RT_OS_LINUX
{
const int dont = IP_PMTUDISC_DONT;
if (status != 0) {
perror("IP_MTU_DISCOVER");
}
}
#endif /* RT_OS_LINUX */
}
#ifdef RT_OS_WINDOWS
/* we need recvmsg */
if (status == SOCKET_ERROR) {
/* close(sock6); */
}
}
#endif
#if !defined(IPV6_RECVPKTINFO)
#define IPV6_RECVPKTINFO (IPV6_PKTINFO)
#endif
if (status < 0) {
perror("IPV6_RECVPKTINFO");
/* XXX: for now this is fatal */
}
#if !defined(IPV6_RECVHOPLIMIT)
#define IPV6_RECVHOPLIMIT (IPV6_HOPLIMIT)
#endif
if (status < 0) {
perror("IPV6_RECVHOPLIMIT");
}
#ifdef IPV6_RECVTCLASS /* new in RFC 3542, there's no RFC 2292 counterpart */
/* TODO: IPV6_RECVTCLASS */
#endif
}
return ERR_OK;
}
#ifdef RT_OS_WINDOWS
static int
{
int status;
&WSARecvMsgGUID, sizeof(WSARecvMsgGUID),
&nread,
return status;
}
#endif /* RT_OS_WINDOWS */
static u32_t
{
return sum;
}
static u32_t
{
return sum;
}
static u32_t
{
return sum;
}
static u32_t
{
return sum;
}
static u32_t
{
return sum;
}
static u32_t
{
return sum;
}
/**
* ICMP Echo Request in pbuf "p" is to be proxied.
*/
static void
{
#ifdef DF_WITH_IP_HDRINCL
#endif
struct icmp_echo_hdr icmph_orig;
struct icmp_echo_hdr *icmph;
int status;
pbuf_free(p);
return;
}
pbuf_free(p);
return;
}
printf(" seq %d len %u ttl %d\n",
}
pbuf_free(p);
return;
}
--ttl;
}
/*
* OS X doesn't provide a socket option to control fragmentation.
* Solaris doesn't provide IP_DONTFRAG on all releases we support.
* In this case we have to use IP_HDRINCL. We don't want to use
* it always since it doesn't handle fragmentation (but that's ok
* for DF) and Windows doesn't do automatic source address
* selection with IP_HDRINCL.
*/
#ifdef DF_WITH_IP_HDRINCL
}
else {
perror("IP_HDRINCL");
}
}
if (RT_UNLIKELY(status != 0)) {
pbuf_free(p);
return;
}
/* we will overwrite IP header, save original for ICMP errors */
}
else {
/* let the kernel select suitable source address */
}
#ifdef RT_OS_DARWIN
/* wants ip_offset and ip_len fields in host order */
/* wants checksum of everything (sic!), in host order */
sum = inet_chksum_pbuf(p);
#else /* !RT_OS_DARWIN */
#endif
}
else /* !pxping->hdrincl */
#endif /* DF_WITH_IP_HDRINCL */
{
#if !defined(DF_WITH_IP_HDRINCL)
/* control DF flag via setsockopt(2) */
#define USE_DF_OPTION(_Optname) \
#if defined(RT_OS_LINUX)
#elif defined(RT_OS_WINDOWS)
#endif
}
else {
}
}
#endif /* !DF_WITH_IP_HDRINCL */
}
else {
perror("IP_TTL");
}
}
}
else {
perror("IP_TOS");
}
}
}
/* rewrite ICMP echo header */
if (status != 0) {
#ifdef DF_WITH_IP_HDRINCL
/* restore original IP header */
}
else
#endif
{
if (RT_UNLIKELY(status != 0)) {
pbuf_free(p);
return;
}
}
/* restore original ICMP header */
/*
* Some ICMP errors may be generated by the kernel and we read
* them from the socket and forward them normally, hence the
* ifdefs below.
*/
switch (error) {
#if !( defined(RT_OS_SOLARIS) \
|| (defined(RT_OS_LINUX) && !defined(DF_WITH_IP_HDRINCL)) \
)
case EMSGSIZE:
break;
#endif
case ENETDOWN:
case ENETUNREACH:
break;
case EHOSTDOWN:
case EHOSTUNREACH:
break;
}
}
pbuf_free(p);
}
/**
* ICMPv6 Echo Request in pbuf "p" is to be proxied.
*/
static void
{
struct icmp6_echo_hdr *icmph;
int hopl;
int status;
id);
pbuf_free(p);
return;
}
printf(" seq %d len %u hopl %d\n",
IP6H_HOPLIM(iph));
if (hopl == 1) {
}
pbuf_free(p);
return;
}
--hopl;
}
/*
* Rewrite ICMPv6 echo header. We don't need to recompute the
* checksum ourselves: unlike ICMPv4, the ICMPv6 checksum covers a
* pseudo-header, so the OS computes it for us on send(), once it
* has selected the source address.
*/
/* TODO: use control messages to save a syscall? */
if (status == 0) {
}
else {
perror("IPV6_HOPLIMIT");
}
}
pbuf_free(p);
}
static void
{
char addrbuf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
const char *addrstr;
DPRINTF((" ->"));
}
static struct ping_pcb *
{
return NULL;
}
return NULL;
}
return pcb;
}
static void
{
}
static void
{
}
}
static void
{
}
}
static void
{
}
static void
{
struct ping_pcb **p;
if (*p == pcb) {
break;
}
}
}
static struct ping_pcb *
{
/* on lwip thread, so no concurrent updates */
{
break;
}
}
int mapped;
return NULL;
}
#ifdef RT_OS_WINDOWS
#endif
if (is_ipv6) {
#if HAVE_SA_LEN
#endif
}
else {
#if HAVE_SA_LEN
#endif
}
if (mapped == PXREMAP_FAILED) {
return NULL;
}
else {
}
printf(" - created\n");
}
else {
/* just bump up expiration timeout lazily */
printf(" - slot %d -> %d\n",
(unsigned int)pcb->timeout_slot,
(unsigned int)pxping->timeout_slot);
}
return pcb;
}
/**
* Called on pollmgr thread. Caller must do the locking since caller
* is going to use the returned pcb, which needs to be protected from
* being expired by pxping_timer() on lwip thread.
*/
static struct ping_pcb *
{
/* XXX: allow broadcast pings? */
{
return pcb;
}
}
return NULL;
}
static void
pxping_timer(void *arg)
{
pxping->timer_active = 0;
/*
* New slot points to the list of pcbs to check for expiration.
*/
pxping->timeout_slot = 0;
}
/* protect from pollmgr concurrent reads */
/* expired */
printf("... ");
printf(" - expired\n");
}
else {
/*
* If there was another request, we updated timeout_slot
* but delayed actually moving the pcb until now.
*/
printf("... ");
printf(" - alive slot %d -> %d\n",
(unsigned int)pxping->timeout_slot,
(unsigned int)xpcb->timeout_slot);
}
}
}
static void
{
}
}
static int
{
return POLLIN;
}
int sockerr = -1;
int status;
if (status < 0) {
DPRINTF(("%s: sock %d: SO_ERROR failed with errno %d\n",
}
else {
DPRINTF(("%s: sock %d: errno %d\n",
}
}
return POLLIN;
}
}
else /* fd == pxping->sock6 */ {
}
return POLLIN;
}
/**
* Process incoming ICMP message for the host.
* NB: we will get a lot of spam here and have to sift through it.
*/
static void
{
struct sockaddr_in sin;
struct icmp_echo_hdr *icmph;
/*
* Reads from raw IPv4 sockets deliver complete IP datagrams with
* IP header included.
*/
if (nread < 0) {
return;
}
DPRINTF2(("%s: read %d bytes, IP header truncated\n",
return;
}
/* match version */
return;
}
/* no fragmentation */
return;
}
/* no options */
DPRINTF2(("%s: dropping datagram with options (IP header length %d)\n",
return;
}
return;
}
#if !defined(RT_OS_DARWIN)
/* darwin reports IPH_LEN in host byte order */
#endif
#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS)
/* darwin and solaris change IPH_LEN to payload length only */
#endif
DPRINTF2(("%s: read %d bytes but total length is %d bytes\n",
return;
}
DPRINTF2(("%s: IP length %d bytes, ICMP header truncated\n",
return;
}
}
}
#if 1
else {
}
#endif
}
/**
* Check if this incoming ICMP echo reply is for one of our pings and
* forward it to the guest.
*/
static void
{
struct icmp_echo_hdr *icmph;
int mapped;
{
char addrbuf[sizeof "255.255.255.255"];
const char *addrstr;
DPRINTF(("<--- PING %s id 0x%x seq %d\n",
}
/*
* Is this a reply to one of our pings?
*/
if (mapped == PXREMAP_FAILED) {
return;
}
return;
}
return;
}
/* save info before unlocking since pcb may expire */
/*
* Rewrite headers and forward to guest.
*/
/* rewrite ICMP echo header */
/* rewrite IP header */
if (mapped == PXREMAP_MAPPED) {
}
else {
}
}
/**
* Check if this incoming ICMP error (destination unreachable or time
* exceeded) is about one of our pings and forward it to the guest.
*/
static void
{
int target_mapped, error_mapped;
/*
* Inner IP datagram is not checked by the kernel and may be
* anything, possibly malicious.
*/
DPRINTF2(("%s: original datagram truncated to %d bytes\n",
}
/* IP header of the original message */
/* match version */
return;
}
/* can't match fragments except the first one */
DPRINTF2(("%s: ignoring fragment with offset %d\n",
return;
}
#if 0
/* don't spam with every "destination unreachable" in the system */
#endif
return;
}
DPRINTF2(("%s: original datagram truncated to %d bytes\n",
return;
}
DPRINTF2(("%s: ignoring ICMP error for original ICMP type %d\n",
return;
}
{
char addrbuf[sizeof "255.255.255.255"];
const char *addrstr;
DPRINTF2(("%s: ping %s id 0x%x seq %d",
}
else {
DPRINTF2((" time exceeded\n"));
}
}
/*
* Is the inner (failed) datagram one of our pings?
*/
if (target_mapped == PXREMAP_FAILED) {
return;
}
return;
}
/* save info before unlocking since pcb may expire */
/*
* Rewrite both inner and outer headers and forward to guest.
* Note that the checksum of the outer ICMP error message is
* preserved by the changes we do to inner headers.
*/
if (error_mapped == PXREMAP_FAILED) {
return;
}
return;
}
/* rewrite inner ICMP echo header */
/* rewrite inner IP header */
if (target_mapped == PXREMAP_MAPPED) {
}
/* rewrite outer IP header */
if (error_mapped == PXREMAP_MAPPED) {
}
else {
}
}
/**
* Process incoming ICMPv6 message for the host.
* NB: we will get a lot of spam here and have to sift through it.
*/
static void
{
#ifndef RT_OS_WINDOWS
#else
#endif
struct sockaddr_in6 sin6;
struct icmp6_echo_hdr *icmph;
struct in6_pktinfo *pktinfo;
int status;
char addrbuf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
const char *addrstr;
/*
* Reads from raw IPv6 sockets deliver only the payload. Full
* headers are available via recvmsg(2)/cmsg(3).
*/
#ifndef RT_OS_WINDOWS
if (nread < 0) {
return;
}
#else /* RT_OS_WINDOWS */
if (status == SOCKET_ERROR) {
return;
}
#endif
DPRINTF2(("echo reply %04x %u\n",
}
else { /* XXX */
DPRINTF2(("echo request %04x %u\n",
}
DPRINTF2(("destination unreachable\n"));
}
DPRINTF2(("packet too big\n"));
}
DPRINTF2(("time exceeded\n"));
}
DPRINTF2(("parameter problem\n"));
}
else {
}
return; /* informational message */
}
}
hopl = -1;
tclass = -1;
break;
{
}
{
DPRINTF2(("pktinfo found\n"));
}
}
/*
* ip6_output_if() doesn't do checksum for us so we need to
* manually recompute it - for this we must know the
* destination address of the pseudo-header that we will
* rewrite with guest's address. (TODO: yeah, yeah, we can
* compute it from scratch...)
*/
return;
}
if (hopl < 0) {
}
}
}
}
/**
* Check if this incoming ICMPv6 echo reply is for one of our pings
* and forward it to the guest.
*/
static void
{
struct icmp6_echo_hdr *icmph;
int mapped;
if (mapped == PXREMAP_FAILED) {
return;
}
else if (mapped == PXREMAP_ASIS) {
if (hopl == 1) {
return;
}
--hopl;
}
return;
}
/* save info before unlocking since pcb may expire */
/* rewrite ICMPv6 echo header */
if (mapped) {
}
&target_ip, /* echo reply src */
&guest_ip, /* echo reply dst */
}
/**
* Check if this incoming ICMPv6 error is about one of our pings and
* forward it to the guest.
*/
static void
{
int proto;
struct icmp6_echo_hdr *oicmph;
int target_mapped, error_mapped;
/*
* Inner IP datagram is not checked by the kernel and may be
* anything, possibly malicious.
*/
for (;;) {
DPRINTF2(("truncated datagram inside ICMPv6 error message is too short\n"));
return;
}
return;
}
}
else if (proto == IP6_NEXTH_ICMP6) {
break;
}
else if (proto == IP6_NEXTH_ROUTING
|| proto == IP6_NEXTH_HOPBYHOP
|| proto == IP6_NEXTH_DESTOPTS)
{
}
else {
break;
}
}
return;
}
return;
}
return;
}
if (target_mapped == PXREMAP_FAILED) {
return;
}
return;
}
/* save info before unlocking since pcb may expire */
/*
* Rewrite inner and outer headers and forward to guest. Note
* that IPv6 has no IP header checksum, but uses pseudo-header for
* ICMPv6, so we update both in one go, adjusting ICMPv6 checksum
* as we rewrite IP header.
*/
if (error_mapped == PXREMAP_FAILED) {
return;
}
return;
}
/* rewrite inner ICMPv6 echo header and inner IPv6 header */
if (target_mapped) {
}
/* rewrite outer ICMPv6 error header */
if (error_mapped) {
}
&error_ip, /* error src */
&guest_ip, /* error dst */
}
/**
* Hand off ICMP datagram to the lwip thread where it will be
* forwarded to the guest.
*
* We no longer need ping_pcb. The pcb may get expired on the lwip
* thread, but we have already patched necessary information into the
* datagram.
*/
static void
{
struct pbuf *p;
if (p == NULL) {
DPRINTF(("%s: pbuf_alloc(%d) failed\n",
return;
}
DPRINTF(("%s: pbuf_take(%d) failed\n",
pbuf_free(p);
return;
}
pbuf_free(p);
return;
}
msg->p = p;
}
static void
pxping_pcb_forward_inbound(void *arg)
{
DPRINTF(("%s: ip_output_if: %s\n",
}
}
static void
{
struct pbuf *p;
if (p == NULL) {
DPRINTF(("%s: pbuf_alloc(%d) failed\n",
return;
}
DPRINTF(("%s: pbuf_take(%d) failed\n",
pbuf_free(p);
return;
}
pbuf_free(p);
return;
}
msg->p = p;
}
static void
pxping_pcb_forward_inbound6(void *arg)
{
DPRINTF(("%s: ip6_output_if: %s\n",
}
}