tcp_subr.c revision 56ce5cb366159dd5df5d864239dcc0327f262671
893N/A/* $Id$ */
3909N/A/** @file
893N/A * NAT - TCP support.
893N/A */
893N/A
893N/A/*
893N/A * Copyright (C) 2006-2010 Sun Microsystems, Inc.
893N/A *
893N/A * This file is part of VirtualBox Open Source Edition (OSE), as
893N/A * available from http://www.virtualbox.org. This file is free software;
893N/A * you can redistribute it and/or modify it under the terms of the GNU
893N/A * General Public License (GPL) as published by the Free Software
893N/A * Foundation, in version 2 as it comes in the "COPYING" file of the
893N/A * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
893N/A * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
893N/A *
893N/A * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
893N/A * Clara, CA 95054 USA or visit http://www.sun.com if you need
2362N/A * additional information or have any questions.
2362N/A */
2362N/A
893N/A/*
893N/A * This code is based on:
893N/A *
1433N/A * Copyright (c) 1982, 1986, 1988, 1990, 1993
3471N/A * The Regents of the University of California. All rights reserved.
893N/A *
893N/A * Redistribution and use in source and binary forms, with or without
1868N/A * modification, are permitted provided that the following conditions
1868N/A * are met:
893N/A * 1. Redistributions of source code must retain the above copyright
893N/A * notice, this list of conditions and the following disclaimer.
893N/A * 2. Redistributions in binary form must reproduce the above copyright
893N/A * notice, this list of conditions and the following disclaimer in the
893N/A * documentation and/or other materials provided with the distribution.
893N/A * 3. All advertising materials mentioning features or use of this software
893N/A * must display the following acknowledgement:
893N/A * This product includes software developed by the University of
893N/A * California, Berkeley and its contributors.
893N/A * 4. Neither the name of the University nor the names of its contributors
893N/A * may be used to endorse or promote products derived from this software
893N/A * without specific prior written permission.
893N/A *
893N/A * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
893N/A * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
893N/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
893N/A * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
893N/A * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
893N/A * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
893N/A * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1433N/A * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
893N/A * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
893N/A * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
893N/A * SUCH DAMAGE.
3471N/A *
3471N/A * @(#)tcp_subr.c 8.1 (Berkeley) 6/10/93
893N/A * tcp_subr.c,v 1.5 1994/10/08 22:39:58 phk Exp
893N/A */
893N/A
893N/A/*
893N/A * Changes and additions relating to SLiRP
893N/A * Copyright (c) 1995 Danny Gasparovski.
893N/A *
893N/A * Please read the file COPYRIGHT for the
893N/A * terms and conditions of the copyright.
893N/A */
893N/A
893N/A#define WANT_SYS_IOCTL_H
893N/A#include <slirp.h>
893N/A
893N/A
893N/A/*
3471N/A * Tcp initialization
893N/A */
3471N/Avoid
893N/Atcp_init(PNATState pData)
3471N/A{
893N/A tcp_iss = 1; /* wrong */
893N/A tcb.so_next = tcb.so_prev = &tcb;
1433N/A tcp_last_so = &tcb;
1433N/A tcp_reass_maxqlen = 48;
1433N/A tcp_reass_maxseg = 256;
1433N/A}
1433N/A
3471N/A/*
3471N/A * Create template to be used to send tcp packets on a connection.
1433N/A * Call after host entry created, fills
1433N/A * in a skeletal tcp/ip header, minimizing the amount of work
3471N/A * necessary when the connection is used.
3471N/A */
1433N/A/* struct tcpiphdr * */
1433N/Avoid
3471N/Atcp_template(struct tcpcb *tp)
1433N/A{
1433N/A struct socket *so = tp->t_socket;
3471N/A register struct tcpiphdr *n = &tp->t_template;
1433N/A
1433N/A memset(n->ti_x1, 0, 9);
1433N/A n->ti_pr = IPPROTO_TCP;
1433N/A n->ti_len = RT_H2N_U16(sizeof (struct tcpiphdr) - sizeof (struct ip));
1433N/A n->ti_src = so->so_faddr;
1433N/A n->ti_dst = so->so_laddr;
1433N/A n->ti_sport = so->so_fport;
1433N/A n->ti_dport = so->so_lport;
1433N/A
3471N/A n->ti_seq = 0;
3471N/A n->ti_ack = 0;
1433N/A n->ti_x2 = 0;
3471N/A n->ti_off = 5;
3471N/A n->ti_flags = 0;
1433N/A n->ti_win = 0;
3471N/A n->ti_sum = 0;
1433N/A n->ti_urp = 0;
1433N/A}
1433N/A
1433N/A/*
1433N/A * Send a single message to the TCP at address specified by
3471N/A * the given TCP/IP header. If m == 0, then we make a copy
3471N/A * of the tcpiphdr at ti and send directly to the addressed host.
3471N/A * This is used to force keep alive messages out using the TCP
1433N/A * template for a connection tp->t_template. If flags are given
3471N/A * then we send a message back to the TCP which originated the
3471N/A * segment ti, and discard the mbuf containing it and any other
3471N/A * attached mbufs.
1433N/A *
3471N/A * In any case the ack and sequence number of the transmitted
1433N/A * segment are as specified by the parameters.
1433N/A */
1433N/Avoid
3471N/Atcp_respond(PNATState pData, struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags)
3471N/A{
3471N/A register int tlen;
1433N/A int win = 0;
893N/A
893N/A DEBUG_CALL("tcp_respond");
893N/A DEBUG_ARG("tp = %lx", (long)tp);
893N/A DEBUG_ARG("ti = %lx", (long)ti);
893N/A DEBUG_ARG("m = %lx", (long)m);
893N/A DEBUG_ARG("ack = %u", ack);
3471N/A DEBUG_ARG("seq = %u", seq);
3471N/A DEBUG_ARG("flags = %x", flags);
893N/A
3471N/A if (tp)
893N/A win = sbspace(&tp->t_socket->so_rcv);
3471N/A if (m == 0)
893N/A {
893N/A#ifndef VBOX_WITH_SLIRP_BSD_MBUF
893N/A if ((m = m_get(pData)) == NULL)
893N/A#else
893N/A if ((m = m_gethdr(pData, M_DONTWAIT, MT_HEADER)) == NULL)
893N/A#endif
893N/A return;
3471N/A#ifdef TCP_COMPAT_42
3471N/A tlen = 1;
893N/A#else
893N/A tlen = 0;
3471N/A#endif
893N/A m->m_data += if_maxlinkhdr;
893N/A#ifdef VBOX_WITH_SLIRP_BSD_MBUF
893N/A m->m_pkthdr.header = mtod(m, void *);
893N/A#endif
3471N/A *mtod(m, struct tcpiphdr *) = *ti;
893N/A ti = mtod(m, struct tcpiphdr *);
893N/A flags = TH_ACK;
893N/A }
893N/A else
893N/A {
893N/A /*
893N/A * ti points into m so the next line is just making
893N/A * the mbuf point to ti
893N/A */
893N/A m->m_data = (caddr_t)ti;
893N/A
893N/A m->m_len = sizeof (struct tcpiphdr);
893N/A tlen = 0;
893N/A#define xchg(a,b,type) { type t; t = a; a = b; b = t; }
893N/A xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_int32_t);
893N/A xchg(ti->ti_dport, ti->ti_sport, u_int16_t);
893N/A#undef xchg
893N/A }
893N/A ti->ti_len = RT_H2N_U16((u_short)(sizeof (struct tcphdr) + tlen));
893N/A tlen += sizeof (struct tcpiphdr);
893N/A m->m_len = tlen;
893N/A
893N/A memset(ti->ti_x1, 0, 9);
893N/A ti->ti_seq = RT_H2N_U32(seq);
893N/A ti->ti_ack = RT_H2N_U32(ack);
893N/A ti->ti_x2 = 0;
893N/A ti->ti_off = sizeof (struct tcphdr) >> 2;
893N/A ti->ti_flags = flags;
893N/A if (tp)
ti->ti_win = RT_H2N_U16((u_int16_t) (win >> tp->rcv_scale));
else
ti->ti_win = RT_H2N_U16((u_int16_t)win);
ti->ti_urp = 0;
ti->ti_sum = 0;
ti->ti_sum = cksum(m, tlen);
((struct ip *)ti)->ip_len = tlen;
if(flags & TH_RST)
((struct ip *)ti)->ip_ttl = MAXTTL;
else
((struct ip *)ti)->ip_ttl = ip_defttl;
(void) ip_output(pData, (struct socket *)0, m);
}
/*
 * Allocate and initialise a new TCP control block for a socket.
 *
 * The block is zero-allocated, so anything not set explicitly below
 * (including the reassembly queue head) starts out as zero.  On success
 * the block is linked both ways: tp->t_socket = so and so->so_tcpcb = tp.
 *
 * Returns the new control block, or NULL if the allocation failed (in
 * which case so is left untouched).
 *
 * pData is not referenced by name in this body.
 * NOTE(review): presumably tcp_mssdflt, tcp_do_rfc1323 and tcp_rttdflt
 * resolve through pData -- confirm.
 */
struct tcpcb *
tcp_newtcpcb(PNATState pData, struct socket *so)
{
    struct tcpcb *cb = (struct tcpcb *)RTMemAllocZ(sizeof(*cb));

    if (cb == NULL)
        return NULL;

    cb->t_socket = so;
    cb->t_state = TCPS_CLOSED;
    cb->t_maxseg = tcp_mssdflt;
    cb->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;

    /*
     * srtt starts at TCPTV_SRTTBASE (0) so that "no rtt estimate yet" can
     * be recognised.  rttvar is chosen so that srtt + 2 * rttvar yields a
     * reasonable initial retransmit time.
     */
    cb->t_srtt = TCPTV_SRTTBASE;
    cb->t_rttvar = tcp_rttdflt * PR_SLOWHZ << 2;
    cb->t_rttmin = TCPTV_MIN;
    TCPT_RANGESET(cb->t_rxtcur,
                  ((TCPTV_SRTTBASE >> 2) + (TCPTV_SRTTDFLT << 2)) >> 1,
                  TCPTV_MIN, TCPTV_REXMTMAX);

    /* Congestion window and slow-start threshold both start at the maximum. */
    cb->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
    cb->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;

    so->so_tcpcb = cb;
    return cb;
}
/*
 * Drop a TCP connection.
 *
 * If a SYN has already been received on the connection
 * (TCPS_HAVERCVDSYN), the state is forced to TCPS_CLOSED and
 * tcp_output() is called once before closing; the intent stated by the
 * original comment is that this sends a RST to the peer.  Otherwise
 * only the "conndrops" statistic is bumped.  Either way the control
 * block is closed via tcp_close().
 *
 * tp   control block to drop; must not be NULL.
 * err  reason for the drop.  It is only reported in the debug trace; the
 *      code that stored it in the socket is compiled out (see below).
 *
 * Returns whatever tcp_close() returns.
 */
struct tcpcb *tcp_drop(PNATState pData, struct tcpcb *tp, int err)
{
    DEBUG_CALL("tcp_drop");
    DEBUG_ARG("tp = %lx", (long)tp);
    /*
     * Log the caller-supplied reason.  The parameter used to be named
     * "errno"; it was renamed to "err" but the trace kept reading the
     * global errno, which is unrelated to why the connection is dropped.
     */
    DEBUG_ARG("errno = %d", err);

    if (TCPS_HAVERCVDSYN(tp->t_state))
    {
        tp->t_state = TCPS_CLOSED;
        (void) tcp_output(pData, tp);
        tcpstat.tcps_drops++;
    }
    else
        tcpstat.tcps_conndrops++;

#if 0
    /* Disabled: propagating the error to the socket. */
    if (err == ETIMEDOUT && tp->t_softerror)
        err = tp->t_softerror;
    so->so_error = err;
#endif

    return (tcp_close(pData, tp));
}
/*
 * Close a TCP control block and tear down its socket.
 *
 * In order:
 *   - every queued reassembly segment is unlinked and freed, and
 *     tcp_reass_qsize is decremented for each one;
 *   - the control block itself is freed and so->so_tcpcb is cleared;
 *   - the socket is marked disconnected (soisfdisconnected);
 *   - the last-socket cache is reset if it pointed at this socket;
 *   - the host socket descriptor is closed;
 *   - the send/receive buffers are freed, unless the socket is a
 *     listening one (SS_FACCEPTCONN), which has none reserved;
 *   - the socket structure is released with sofree().
 *
 * tp must not be NULL and is invalid once this function returns.
 *
 * Always returns NULL.
 */
struct tcpcb *
tcp_close(PNATState pData, register struct tcpcb *tp)
{
    struct socket *so = tp->t_socket;
    struct tseg_qent *seg;

    DEBUG_CALL("tcp_close");
    DEBUG_ARG("tp = %lx", (long )tp);

    /* XXX: freeing the reassembly queue */
    for (seg = LIST_FIRST(&tp->t_segq); seg != NULL; seg = LIST_FIRST(&tp->t_segq))
    {
        LIST_REMOVE(seg, tqe_q);
        m_freem(pData, seg->tqe_m);
        RTMemFree(seg);
        tcp_reass_qsize--;
    }

    RTMemFree(tp);
    so->so_tcpcb = 0;
    soisfdisconnected(so);

    /* Clobber the input socket cache if the cached connection is closing. */
    if (tcp_last_so == so)
        tcp_last_so = &tcb;

    closesocket(so->s);

    /*
     * A listening socket has no sbufs reserved, so freeing them here
     * would be a double free.
     */
    if ((so->so_state & SS_FACCEPTCONN) == 0)
    {
        sbfree(&so->so_rcv);
        sbfree(&so->so_snd);
    }

    sofree(pData, so);
    /*
     * NOTE(review): so is referenced after sofree().  Whether that is
     * safe depends on how SOCKET_UNLOCK and sofree are defined, which is
     * not visible here -- confirm.
     */
    SOCKET_UNLOCK(so);

    tcpstat.tcps_closed++;
    return NULL;
}
/*
 * Placeholder with an empty body: takes no arguments, does nothing and
 * returns nothing.
 */
void
tcp_drain()
{
/* XXX: intentionally empty -- nothing is drained here. */
}
/*
 * When a source quench is received, close the congestion window
 * to one segment.  We will gradually open it again as we proceed.
 *
 * The whole function is compiled out by the #if 0 below.  As written it
 * would not build: the parameter list is malformed and the body refers
 * to "inp", which is not declared in this block.
 */
#if 0
void
tcp_quench(i, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
/* Shrink the congestion window to a single maximum-size segment. */
if (tp)
tp->snd_cwnd = tp->t_maxseg;
}
#endif
/*
* TCP protocol interface to socket abstraction.
*/
/*
 * The user closed the socket; walk the connection through the shutdown
 * states:
 *
 *   CLOSED, LISTEN, SYN_SENT   -> no SYN was ever received, so just
 *                                 forget the connection (tcp_close).
 *   SYN_RECEIVED, ESTABLISHED  -> got a SYN but have not sent a FIN yet:
 *                                 move to FIN_WAIT_1 to send one.
 *   CLOSE_WAIT                 -> peer's FIN already received: move to
 *                                 LAST_ACK.
 *   anything else              -> our FIN has already been sent; the
 *                                 state is left alone while we wait for
 *                                 the peer's FIN or for keep-alives to
 *                                 go unanswered.
 *
 * If the control block survives, the socket is marked disconnected once
 * the state is TCPS_FIN_WAIT_2 or later, and tcp_output() is called
 * unless the socket is flagged so_close.
 *
 * tp must not be NULL on entry.
 */
void
tcp_sockclosed(PNATState pData, struct tcpcb *tp)
{
    int state;

    DEBUG_CALL("tcp_sockclosed");
    DEBUG_ARG("tp = %lx", (long)tp);

    state = tp->t_state;
    if (   state == TCPS_CLOSED
        || state == TCPS_LISTEN
        || state == TCPS_SYN_SENT)
    {
        tp->t_state = TCPS_CLOSED;
        tp = tcp_close(pData, tp);
    }
    else if (   state == TCPS_SYN_RECEIVED
             || state == TCPS_ESTABLISHED)
        tp->t_state = TCPS_FIN_WAIT_1;
    else if (state == TCPS_CLOSE_WAIT)
        tp->t_state = TCPS_LAST_ACK;

    /* Nothing more to do when the control block did not survive. */
    if (tp == NULL)
        return;

    /* soisfdisconnecting(tp->t_socket); */
    if (tp->t_state >= TCPS_FIN_WAIT_2)
        soisfdisconnected(tp->t_socket);

    /*
     * (vasily) There are situations when the FIN or FIN,ACK is lost
     * (Windows host) and retransmitting keeps VBox busy sending closing
     * sequences *very* frequently, eating a lot of CPU.  To avoid this we
     * don't send on sockets marked as closed (see slirp.c for details
     * about setting the so_close member).
     */
    if (   tp->t_socket
        && !tp->t_socket->so_close)
        tcp_output(pData, tp);
}
/*
 * Connect to a host on the Internet on behalf of the guest.
 * Called by tcp_input.
 *
 * Only the host-side connect is done here; the TCP fields are set in
 * tcp_input.  A new host socket is created, stored in so->s and put in
 * non-blocking mode, SO_REUSEADDR and SO_OOBINLINE are requested
 * (best effort, results ignored), and connect() is issued.
 *
 * The target is so->so_faddr / so->so_fport, except that any foreign
 * address inside the NAT's special network is redirected to the host's
 * loopback address.
 *
 * Returns:
 *   - the (negative) socket() result if no socket could be created;
 *   - otherwise the connect() result: 0 if there is a result already,
 *     -1 otherwise.  Because the socket is non-blocking this is almost
 *     always -1, meaning "still connecting": connect() returns after the
 *     SYN is sent and does not wait for the SYN+ACK.
 */
int tcp_fconnect(PNATState pData, struct socket *so)
{
    int ret = 0;

    DEBUG_CALL("tcp_fconnect");
    DEBUG_ARG("so = %lx", (long )so);

    if ((ret = so->s = socket(AF_INET, SOCK_STREAM, 0)) >= 0)
    {
        int opt, s = so->s;
        struct sockaddr_in addr;

        /*
         * Start from an all-zero address so that sin_zero and any
         * platform-specific members are not passed to connect() holding
         * stack garbage.
         */
        memset(&addr, 0, sizeof(addr));

        fd_nonblock(s);
        opt = 1;
        setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&opt, sizeof(opt));
        opt = 1;
        setsockopt(s, SOL_SOCKET, SO_OOBINLINE, (char *)&opt, sizeof(opt));

        addr.sin_family = AF_INET;
        if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
        {
            /* It's an alias: every special address maps to the loopback. */
            switch(RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask)
            {
                case CTL_DNS:
                case CTL_ALIAS:
                default:
                    addr.sin_addr = loopback_addr;
                    break;
            }
        }
        else
            addr.sin_addr = so->so_faddr;
        addr.sin_port = so->so_fport;

        DEBUG_MISC((dfd, " connect()ing, addr.sin_port=%d, "
                    "addr.sin_addr.s_addr=%.16s\n",
                    RT_N2H_U16(addr.sin_port), inet_ntoa(addr.sin_addr)));

        /* We don't care what port we get */
        ret = connect(s,(struct sockaddr *)&addr,sizeof (addr));

        /*
         * The socket is marked as connecting whatever connect() returned;
         * the caller interprets ret.
         */
        soisfconnecting(so);
    }

    return(ret);
}
/*
 * Apply the configured buffer sizes to a freshly accepted host socket.
 *
 * Reads the current SO_RCVBUF, replaces it with pData->socket_rcv, then
 * does the same for SO_SNDBUF.  The first failing call is logged and
 * ends the tuning early; that is not treated as a connection error.
 * The old sizes are logged only until one full pass has succeeded.
 *
 * NOTE(review): SO_SNDBUF is also set from pData->socket_rcv.  This
 * looks like a copy-paste slip, but no send-side setting is visible
 * here -- confirm against the NAT state definition.
 */
static void
tcp_connect_tune_buffers(PNATState pData, int s)
{
    static int cVerbose = 1;
    socklen_t optlen;
    int opt = 1;
    int status;

    /* Receive buffer. */
    optlen = sizeof(int);
    status = getsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, &optlen);
    if (status < 0)
    {
        LogRel(("NAT: Error(%d) while getting RCV capacity\n", errno));
        return;
    }
    if (cVerbose > 0)
        LogRel(("NAT: old socket rcv size: %dKB\n", opt / 1024));

    /* @todo (r-vvl) make it configurable (via extra data) */
    opt = pData->socket_rcv;
    status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
    if (status < 0)
    {
        LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
        return;
    }

    /* Send buffer. */
    optlen = sizeof(int);
    status = getsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, &optlen);
    if (status < 0)
    {
        LogRel(("NAT: Error(%d) while getting SND capacity\n", errno));
        return;
    }
    if (cVerbose > 0)
        LogRel(("NAT: old socket snd size: %dKB\n", opt / 1024));

    opt = pData->socket_rcv;
    status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
    if (status < 0)
    {
        LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
        return;
    }

    if (cVerbose > 0)
        cVerbose--;
}

/*
 * Accept a connection on a listening socket and connect it to the
 * local host (the guest).
 *
 * We have a problem.  The correct thing to do would be to first connect
 * to the local host and only accept() here if that connection is
 * accepted.  But a) we need to know who is trying to connect in order to
 * SYN the local host, and b) we are already connected to the foreign
 * host by the time accept() is reached.  So we simply accept here and
 * SYN the local host.
 *
 * inso is the listening socket.  If it is flagged SS_FACCEPTONCE it is
 * reused for the accepted connection (it already has a tcpcb) and its
 * listening descriptor is closed; otherwise a new socket is created that
 * inherits inso's local address and port.
 *
 * Nothing is returned.  On failure to create the socket, attach a tcpcb
 * or accept(), the function cleans up what it created and returns
 * without starting a connection.
 */
void
tcp_connect(PNATState pData, struct socket *inso)
{
    struct socket *so;
    struct sockaddr_in addr;
    socklen_t addrlen = sizeof(struct sockaddr_in);
    struct tcpcb *tp;
    int s, opt;

    DEBUG_CALL("tcp_connect");
    DEBUG_ARG("inso = %lx", (long)inso);

    if (!(inso->so_state & SS_FACCEPTONCE))
    {
        so = socreate();
        if (so == NULL)
        {
            /* If it failed, get rid of the pending connection */
            closesocket(accept(inso->s,(struct sockaddr *)&addr,&addrlen));
            return;
        }
        if (tcp_attach(pData, so) < 0)
        {
            RTMemFree(so); /* NOT sofree */
            return;
        }
        so->so_laddr = inso->so_laddr;
        so->so_lport = inso->so_lport;
        so->so_la = inso->so_la;
    }
    else
    {
        /*
         * An accept-once socket needs no socreate(): it already has a
         * tcpcb, so the listening socket itself is used.
         */
        so = inso;
    }

    (void) tcp_mss(pData, sototcpcb(so), 0);

    fd_nonblock(inso->s);
    s = accept(inso->s,(struct sockaddr *)&addr,&addrlen);
    if (s < 0)
    {
        tcp_close(pData, sototcpcb(so)); /* This will sofree() as well */
        return;
    }

    fd_nonblock(s);
    opt = 1;
    setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int));
    opt = 1;
    setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
#if 0
    opt = 1;
    setsockopt(s, IPPROTO_TCP, TCP_NODELAY,(char *)&opt, sizeof(int));
#endif

    /* Best effort: a failure here is logged and otherwise ignored. */
    tcp_connect_tune_buffers(pData, s);

    so->so_fport = addr.sin_port;
    so->so_faddr = addr.sin_addr;
    /* Translate connections from localhost to the real hostname */
    if (so->so_faddr.s_addr == 0 || so->so_faddr.s_addr == loopback_addr.s_addr)
        so->so_faddr = alias_addr;

    /* Close the accept() socket, set right state */
    if (inso->so_state & SS_FACCEPTONCE)
    {
        closesocket(so->s); /* If we only accept once, close the accept() socket */
        so->so_state = SS_NOFDREF; /* Don't select it yet, even though we have an FD */
        /* if it's not FACCEPTONCE, it's already NOFDREF */
    }
    so->s = s;

    tp = sototcpcb(so);
    tcp_template(tp);

    /* Compute window scaling to request. */
    /* while (tp->request_r_scale < TCP_MAX_WINSHIFT
     *        && (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
     *     tp->request_r_scale++;
     */

    /* soisconnecting(so); */ /* NOFDREF used instead */
    tcpstat.tcps_connattempt++;

    /* Start the handshake towards the guest. */
    tp->t_state = TCPS_SYN_SENT;
    tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
    tp->iss = tcp_iss;
    tcp_iss += TCP_ISSINCR/2;
    tcp_sendseqinit(tp);
    tcp_output(pData, tp);
}
/*
 * Attach a new TCP control block to a socket and enqueue the socket.
 *
 * A control block is created with tcp_newtcpcb() and stored in
 * so->so_tcpcb.  On success the socket's lock is created and the socket
 * is inserted after the tcb list head, with the socket count
 * incremented, while the queue lock is held.
 *
 * Returns 0 on success, or -1 if the control block could not be
 * created; in that case so->so_tcpcb is NULL and the socket has not
 * been enqueued.
 */
int
tcp_attach(PNATState pData, struct socket *so)
{
    struct tcpcb *cb = tcp_newtcpcb(pData, so);

    so->so_tcpcb = cb;
    if (cb == NULL)
        return -1;

    SOCKET_LOCK_CREATE(so);

    QSOCKET_LOCK(tcb);
    insque(pData, so, &tcb);
    NSOCK_INC();
    QSOCKET_UNLOCK(tcb);

    return 0;
}