socket.c revision 952db80cc5b668e33a0be435d217f3dcb5a87990
/* $Id$ */
/** @file
* NAT - socket handling.
*/
/*
* Copyright (C) 2006-2012 Oracle Corporation
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*/
/*
* This code is based on:
*
* Copyright (c) 1995 Danny Gasparovski.
*
* Please read the file COPYRIGHT for the
* terms and conditions of the copyright.
*/
#include <slirp.h>
#include "ip_icmp.h"
#include "main.h"
#ifdef __sun__
#endif
#if defined (RT_OS_WINDOWS)
#include <iphlpapi.h>
#include <icmpapi.h>
#endif
/**
* Creates a new socket with socreate() to act as a clone of an existing
* UDP socket.
*
* When fBindSocket is false the clone shares the host descriptor of pSo
* (pNewSocket->s = pSo->s). On the success path the clone counter of pSo
* (so_cCloneCounter) is incremented.
*
* @returns the new socket, or NULL if socreate() fails or the attach-failure
*          branch is taken.
* @param pData            NAT state (not referenced in the lines visible here).
* @param fBindSocket      true: take the "attach" branch; false: reuse the
*                         descriptor of pSo.
* @param pSo              socket to clone; its so_cCloneCounter is bumped.
* @param u32ForeignAddr   only logged in the lines visible here; going by the
*                         name, presumably the foreign address of the clone -
*                         TODO confirm.
*/
struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
{
LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
/* NOTE(review): no declaration of pNewSocket is visible in this function - confirm. */
pNewSocket = socreate();
if (!pNewSocket)
{
LogFunc(("Can't create socket\n"));
LogFlowFunc(("Leave: NULL\n"));
return NULL;
}
if (fBindSocket)
{
/*
* NOTE(review): the condition guarding this failure block is not visible
* here, and pNewSocket is not released before returning - confirm.
*/
{
LogFunc(("Can't attach fresh created socket\n"));
return NULL;
}
}
else
{
/* Unbound clone: share the host descriptor of the original socket. */
pNewSocket->s = pSo->s;
}
/* Record on the original socket that one more clone exists. */
pSo->so_cCloneCounter++;
return pNewSocket;
}
/**
* Going by its name, looks up an existing clone of a UDP socket.
*
* The lines visible here test so_cloneOf of the candidate pSoClone and jump
* to the 'done' label, where pSoClone is returned.
*
* @returns pSoClone as it stands when 'done' is reached.
* @param pData               NAT state (not referenced in the visible lines).
* @param pcSo                presumably the socket whose clone is wanted -
*                            TODO confirm.
* @param u32ForeignAddress   presumably the foreign address the clone has to
*                            match - TODO confirm.
*/
struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
{
/* NOTE(review): neither the declaration of pSoClone nor the loop header is visible here. */
{
if ( pSoClone->so_cloneOf
goto done;
}
done:
return pSoClone;
}
#endif
#ifdef VBOX_WITH_NAT_SEND2HOME
/**
* Sends a buffer to an address taken from pData->pInSockAddrHomeAddress
* through a clone of pSo.
*
* For the entry pData->pInSockAddrHomeAddress[idxAddr] a clone of pSo is
* requested from soCloneUDPSocketWithForegnAddr() and the buffer is handed to
* sendto() on the clone's descriptor. A failed sendto() is only logged.
*
* @returns fSendDone; it is initialised to false and not assigned again in
*          the lines visible here.
* @param pData   NAT state; supplies pInSockAddrHomeAddress.
* @param pSo     socket to clone for sending.
* @param pvBuf   data to send.
* @param cbBuf   number of bytes in pvBuf.
* @param iFlags  flags handed to sendto() unchanged.
*/
DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
{
/* NOTE(review): idxAddr is never assigned in the lines visible here - confirm the loop header. */
int idxAddr;
int ret = 0;
bool fSendDone = false;
LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
{
/*
* NOTE(review): soCloneUDPSocketWithForegnAddr() is defined in this file with
* four parameters (pData, fBindSocket, pSo, u32ForeignAddr); this call passes
* three, and hands over sin_addr rather than a uint32_t - confirm.
*/
struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);
/*
* NOTE(review): the doubled parentheses turn "pNewSocket, false" into a single
* comma expression; presumably AssertReturn(pNewSocket, false) was meant -
* confirm.
*/
AssertReturn((pNewSocket, false));
/* @todo: be more verbose on errors.
* @note: we shouldn't care whether this send fails or not (we're in broadcast).
*/
LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
/* Best effort: a failed send is logged and otherwise ignored. */
if (ret < 0)
LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
}
return fSendDone;
}
#endif /* VBOX_WITH_NAT_SEND2HOME */
#ifdef RT_OS_WINDOWS
#else /* RT_OS_WINDOWS */
#endif /* !RT_OS_WINDOWS */
/*
* Socket layer initialisation hook.
* The body is empty, so calling it currently has no effect.
*/
void
so_init()
{
}
struct socket *
{
{
return so;
}
}
/*
* Create a new socket and initialise its fields.
* The socket is returned unlinked: it is the responsibility of the caller
* to insque() it into the correct linked list.
*
* Returns 'so'; the 'if (so)' guard below shows that it may be NULL, so
* callers have to check the result.
*/
struct socket *
socreate()
{
/* NOTE(review): the declaration/allocation of 'so' is not visible here. */
if (so)
{
/* -1: presumably "no host descriptor assigned yet" - TODO confirm */
so->s = -1;
#if !defined(RT_OS_WINDOWS)
#endif
}
return so;
}
/*
* remque and free a socket, clobber cache
*/
void
{
/*
* We must not remove a socket while the polling routine is working on it;
* instead we mark it for deletion.
*/
if (so->fUnderPolling)
{
return;
}
/**
* Check that we are not freeing a socket that still has a tcpcb
*/
/* udp checks */
if (so == tcp_last_so)
tcp_last_so = &tcb;
else if (so == udp_last_so)
udp_last_so = &udb;
/* libalias notification */
/* check if mbuf haven't been already freed */
{
}
{
NSOCK_DEC();
}
}
/*
* Read from so's socket into sb_snd, updating all relevant sbuf fields
* NOTE: This will only be called if it is select()ed for reading, so
* a read() of 0 (or less) means it's disconnected
*/
#ifndef VBOX_WITH_SLIRP_BSD_SBUF
int
{
/*
* No need to check if there's enough room to read.
* soread wouldn't have been called if there weren't
*/
{
/* Should never succeed, but... */
n = 1;
}
else
{
/* Should never succeed, but... */
if (len)
{
{
{
n = 2;
}
else
{
n = 1;
}
}
else
n = 2;
}
else
{
n = 1;
}
}
#ifdef HAVE_READV
#else
#endif
if (nn <= 0)
{
/*
* Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
* _could_ mean that the connection is closed. But we will receive an
* FD_CLOSE event later if the connection was _really_ closed. With
* www.youtube.com I see this very often. Closing the socket too early
* would be dangerous.
*/
int status;
unsigned long pending = 0;
if (status < 0)
{
return 0;
}
if ( nn < 0
{
return 0;
}
else
{
int fUninitiolizedTemplate = 0;
/* nn == 0 means peer has performed an orderly shutdown */
Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
if (!fUninitiolizedTemplate)
else
return -1;
}
}
if (n == 1)
{
}
else
{
}
);
#ifndef HAVE_READV
/*
* If there was no error, try and read the second time round
* We read again if n = 2 (ie, there's another part of the buffer)
* and we read as much as we could in the first read
* We don't test for <= 0 this time, because there legitimately
* might not be any more data (since the socket is non-blocking),
* a close will be detected on next iteration.
* A return of -1 won't (shouldn't) happen, since it didn't happen above
*/
{
int ret;
if (ret > 0)
if (ret > 0)
{
}
);
}
#endif
/* Update fields */
{
}
return nn;
}
#else /* VBOX_WITH_SLIRP_BSD_SBUF */
int
{
int n;
char *buf;
{
Log(("NAT: can't alloc enough memory\n"));
return -1;
}
if (n <= 0)
{
/*
* Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
* _could_ mean that the connection is closed. But we will receive an
* FD_CLOSE event later if the connection was _really_ closed. With
* www.youtube.com I see this very often. Closing the socket too early
* would be dangerous.
*/
int status;
unsigned long pending = 0;
if (status < 0)
if (n == 0 && (pending != 0))
{
return 0;
}
if ( n < 0
{
return 0;
}
else
{
Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
return -1;
}
}
return n;
}
#endif
/*
* Get urgent data
*
* When the socket is created, we set it SO_OOBINLINE,
* so when OOB data arrives, we soread() it and everything
* in the send buffer is sent as urgent data
*/
void
{
/*
* We take a guess at how much urgent data has arrived.
* In most situations, when urgent data arrives, the next
* read() should get all the urgent data. This guess will
* be wrong however if more data arrives just after the
* urgent data, or the read() doesn't return all the
* urgent data.
*/
{
}
}
#ifndef VBOX_WITH_SLIRP_BSD_SBUF
/*
* Send urgent data
* There's a lot of duplicated code here, but...
*/
int
{
int n, len;
{
/* We can send it directly */
Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
}
else
{
/*
* Since there's no sendv or sendtov like writev,
* we must copy all data to a linear buffer then
* send it all
*/
{
len += n;
}
#ifdef DEBUG
if (n != len)
Log(("Didn't send all data urgently XXXXX\n"));
#endif
Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
}
return n;
}
/*
* Write data from so_rcv to so's socket,
* updating all sbuf fields as necessary
*/
int
{
int n, nn;
{
{
return 0;
}
}
/*
* No need to check if there's something to write,
* sowrite wouldn't have been called otherwise
*/
{
/* Should never succeed, but... */
n = 1;
}
else
{
if (len)
{
n = 2;
}
else
n = 1;
}
if (n == 1)
{
}
else
{
}
});
/* Check if there's urgent data to send, and if so, send it */
#ifdef HAVE_READV
#else
#endif
/* This should never happen, but people tell me it does *shrug* */
if ( nn < 0
{
return 0;
}
{
Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
return -1;
}
#ifndef HAVE_READV
{
int ret;
if (ret > 0)
{
}
});
}
#endif
/* Update sbuf */
{
}
/*
* If in DRAIN mode, and there's no more data, set
* it CANTSENDMORE
*/
return nn;
}
#else /* VBOX_WITH_SLIRP_BSD_SBUF */
static int
{
int n, len;
if (n < 0)
Log(("NAT: Can't sent sbuf via socket.\n"));
if (fUrg)
if (n > 0 && n < len)
{
char *ptr;
char *buff;
{
Log(("NAT: No space to allocate temporal buffer\n"));
return -1;
}
return n;
}
sbuf_clear(sb);
return n;
}
int
{
}
/*
* Write data from so_rcv to so's socket,
* updating all sbuf fields as necessary
*/
int
{
}
#endif
/*
* recvfrom() a UDP socket
*/
void
{
struct sockaddr_in addr;
{
/* This is a "ping" reply */
#ifdef RT_OS_WINDOWS
#else /* RT_OS_WINDOWS */
#endif /* !RT_OS_WINDOWS */
}
else
{
/* A "normal" UDP packet */
struct mbuf *m;
u_long n = 0;
int rc = 0;
static int signalled = 0;
bool fWithTemporalBuffer = false;
/* How much data has been received? */
/*
* 1. calculate how much we can read
* 2. read as much as possible
* 3. attach buffer to allocated header mbuf
*/
if (rc == -1)
{
if ( soIgnorableErrorCode(errno)
return;
else if (signalled == 0)
{
LogRel(("NAT: can't fetch amount of bytes on socket %R[natsock], so message will be truncated.\n", so));
signalled = 1;
}
return;
}
if (m == NULL)
return;
len += n;
fWithTemporalBuffer = false;
/*
* Even if the number of bytes on the socket is greater than the MTU value,
* Slirp will be able to fragment it, but we won't create a temporary location
* here.
*/
{
pchBuffer = RTMemAlloc((n) * sizeof(char));
if (!pchBuffer)
{
return;
}
fWithTemporalBuffer = true;
}
if (fWithTemporalBuffer)
{
if (ret > 0)
{
/*
* If the comparison below holds, our size prediction has failed.
* That's not fatal; we have merely allocated for nothing. (@todo add a counter
* here to measure how rarely we get here)
*/
Log(("NAT:udp: Expected size(%d) lesser than real(%d) and less minimal mbuf size(%d)\n",
}
/* we're freeing buffer anyway */
}
else
if (ret < 0)
{
if (errno == EHOSTUNREACH)
else if (errno == ENETUNREACH)
if ( soIgnorableErrorCode(errno)
{
return;
}
}
else
{
/*
* Hack: domain name lookup will be used the most for UDP,
* and since they'll only be used once there's no need
* for the 4 minute (or whatever) timeout... So we time them
* out much quicker (10 seconds for now...)
*/
{
}
/*
* last argument should be changed if Slirp will inject IP attributes
* Note: Here we can't check if dnsproxy's sent initial request
*/
if ( pData->fUseDnsProxy
#if 0
{
m->m_len = 0;
}
#endif
/* packets will definitely be fragmented, which could confuse the receiving peer. */
m->m_flags |= M_SKIP_FIREWALL;
/*
* If this packet was destined for CTL_ADDR,
* make it look like that's where it came from, done by udp_output
*/
} /* rx error */
} /* if ping packet */
}
/*
* sendto() a socket
*/
int
{
int ret;
struct sockaddr_in *paddr;
#if 0
struct sockaddr_in host_addr;
#endif
int mlen;
#ifdef RT_OS_DARWIN
#endif
{
/* It's an alias */
switch(last_byte)
{
#if 0
/* handle this case at 'default:' */
case CTL_BROADCAST:
/* Send the packet to host to fully emulate broadcast */
/** @todo r=klaus: on Linux host this causes the host to receive
* the packet twice for some reason. And I cannot find any place
* in the man pages which states that sending a broadcast does not
* reach the host itself. */
break;
#endif
case CTL_DNS:
case CTL_ALIAS:
default:
else
break;
}
}
else
Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
/* Don't care what port we get */
/*
* > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
* generates bodyless messages, annoying the memory management system.
*/
if (mlen > 0)
{
{
return -1;
}
}
#ifdef VBOX_WITH_NAT_SEND2HOME
{
}
#endif
if (buf)
if (ret < 0)
{
return -1;
}
/*
* Kill the socket if there's no reply in 4 minutes,
* but only if it's an expirable socket
*/
return 0;
}
/*
* XXX This should really be tcp_listen
*/
struct socket *
{
struct sockaddr_in addr;
int s, opt = 1;
int status;
LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
{
/* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
return NULL;
}
/* Don't tcp_attach... we don't need so_snd nor so_rcv */
{
return NULL;
}
NSOCK_INC();
/*
* SS_FACCEPTONCE sockets must time out.
*/
if (flags & SS_FACCEPTONCE)
#ifdef RT_OS_DARWIN
#endif
/**
* kernel will choose the optimal value for requests queue length.
* @note: MSDN recommends low (2-4) values for bluetooth networking devices.
*/
{
#ifdef RT_OS_WINDOWS
closesocket(s);
/* Restore the real errno */
#else
close(s);
else
/* Restore the real errno */
#endif
return NULL;
}
fd_nonblock(s);
/* set socket buffers */
if (status < 0)
{
goto no_sockopt;
}
if (status < 0)
{
goto no_sockopt;
}
else
so->s = s;
return so;
}
/*
* Data is available in so_rcv
* Just write() the data to the socket
* XXX not yet...
* @todo do we really need this function, what it's intended to do?
*/
void
{
#if 0
#endif
}
/*
* Data has been freed in so_snd
* We have room for a read() if we want to
* For now, don't read, it'll be done in the main loop
*/
void
{
}
/*
* Various session state calls
* XXX Should be #define's
* The socket state stuff needs work, these often get called 2 or 3
* times each when only 1 was needed
*/
void
{
}
void
{
}
void
{
{
}
/* XXX close() here as well? */
else
}
void
{
else
}
void
{
#if 0
/*
* XXX Do nothing ... ?
*/
#endif
}
/*
* Set write drain mode
* Set CANTSENDMORE once all data has been write()n
*/
void
{
else
}
static void
{
char ip_copy[256];
int old_ip_len = 0;
int hlen, original_hlen = 0;
struct mbuf *m;
int type = 0;
/* Fix ip->ip_len to contain the total packet length including the header
* in _host_ byte order for all OSes. On Darwin, that value already is in
* host byte order. Solaris and Darwin report only the payload. */
#ifndef RT_OS_DARWIN
#endif
#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
#endif
{
Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
return;
}
{
return;
}
/*
* ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
* ICMP_ECHOREPLY assuming data 0
* icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
*/
{
Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
return;
}
if ( type == ICMP_TIMXCEED
|| type == ICMP_UNREACH)
{
/*
* ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
* icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
*/
{
Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
return;
}
}
{
Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
return;
}
if (!m)
{
return;
}
if (type == ICMP_ECHOREPLY)
{
{
Log(("NAT: we haven't found echo for this reply\n"));
return;
}
/*
* While assembling the buffer to send (see ip_icmp.c) we control the ICMP header only;
* the IP header is assembled by the OS network stack. Our local copy of the IP header contains values
* in host byte order, so no byte order conversion is required. IP header fields are converted
* in the ip_output0 routine only.
*/
{
Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
return;
}
}
/* ip points to the original ip header */
/* Now ip points to the header we've sent from the guest */
{
if (old_ip_len > sizeof(ip_copy))
old_ip_len = sizeof(ip_copy);
}
/* source address from original IP packet*/
/* override the tail of the old packet */
/* saves original ip header and options */
if ( type == ICMP_TIMXCEED
|| type == ICMP_UNREACH)
{
/* according to RFC 792, error messages must carry a copy of the initial IP header + 64 bits */
}
icmp_reflect(pData, m);
pData->cIcmpCacheSize--;
/* Don't call m_free here*/
if ( type == ICMP_TIMXCEED
|| type == ICMP_UNREACH)
{
switch (proto)
{
case IPPROTO_UDP:
/*XXX: so->so_m already freed so we shouldn't call sofree */
break;
case IPPROTO_TCP:
/*close tcp should be here */
break;
default:
/* do nothing */
break;
}
}
}
#ifdef RT_OS_WINDOWS
static void
{
int len;
int i;
struct mbuf *m;
int hlen = 0;
int nbytes = 0;
int out_len;
int size;
if (len < 0)
{
return;
}
if (len == 0)
return; /* no error */
for (i = 0; i < len; ++i)
{
LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
{
case IP_DEST_HOST_UNREACHABLE:
case IP_DEST_NET_UNREACHABLE:
case IP_DEST_PROT_UNREACHABLE:
/* UNREACH error inject here */
case IP_DEST_PORT_UNREACHABLE:
break;
case IP_SUCCESS: /* echo replied */
size;
else if (out_len < MJUM9BYTES)
size = MJUM9BYTES;
else if (out_len < MJUM16BYTES)
size = MJUM16BYTES;
else
AssertMsgFailed(("Unsupported size"));
LogFunc(("m_getjcl returns m: %p\n", m));
if (m == NULL)
return;
m->m_len = 0;
m->m_data += if_maxlinkhdr;
if (icm)
{
/* on this branch we don't need stored variant */
pData->cIcmpCacheSize--;
}
{
break;
}
icmp_reflect(pData, m);
break;
case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
return;
}
icmp_reflect(pData, m);
pData->cIcmpCacheSize--;
break;
default:
Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
break;
}
}
}
#else /* !RT_OS_WINDOWS */
{
struct sockaddr_in addr;
char *buff;
int len = 0;
/* 1- step: read the ip header */
if ( len < 0
&& ( soIgnorableErrorCode(errno)
{
Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
return;
}
|| len < 0
|| len == 0)
{
if (errno == EHOSTUNREACH)
else if (errno == ENETUNREACH)
Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
return;
}
/* basic check of IP header */
# ifndef RT_OS_DARWIN
# endif
)
{
Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
return;
}
# ifndef RT_OS_DARWIN
/* Darwin reports the IP length already in host byte order. */
# endif
# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
/* Solaris and Darwin report the payload only */
# endif
/* Note: ip->ip_len in host byte order (all OS) */
{
Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
return;
}
/* 2 - step: read the rest of the datagram into the buffer */
addrlen = sizeof(struct sockaddr_in);
if ( len < 0
&& ( soIgnorableErrorCode(errno)
{
Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
return;
}
if ( len < 0
|| len == 0)
{
Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
return;
}
/* len is modified by the 2nd read, when the rest of the datagram was read */
}
#endif /* !RT_OS_WINDOWS */