tcp.c revision 9f1fc992b281e57216b036e784b762829b875b4b
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* tcp.c, Code implementing the TCP protocol.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <socket_impl.h>
#include <socket_inet.h>
#include <sys/sysmacros.h>
#include <netinet/in_systm.h>
#include <net/if_types.h>
#include "ipv4.h"
#include "ipv4_impl.h"
#include "mac.h"
#include "mac_impl.h"
#include "v4_sum_impl.h"
#include <sys/bootdebug.h>
#include "tcp_inet.h"
#include "tcp_sack.h"
/*
* We need to redefine BUMP_MIB/UPDATE_MIB to not have DTrace probes.
*/
/*
 * Fully parenthesize both the arguments and the whole expansion so the
 * macros stay correct when given expression arguments (e.g. a value
 * reached through pointer arithmetic) or when used inside a larger
 * expression.
 */
#define	BUMP_MIB(x)		((x)++)
#define	UPDATE_MIB(x, y)	((x) += (y))
/*
* MIB-2 stuff for SNMP
*/
/* The TCP mib does not include the following errors. */
static uint_t tcp_cksum_errors;
/* Macros for timestamp comparisons */
/*
* Parameters for TCP Initial Send Sequence number (ISS) generation.
* The ISS is calculated by adding three components: a time component
* which grows by 1 every 4096 nanoseconds (versus every 4 microseconds
* suggested by RFC 793, page 27);
* a per-connection component which grows by 125000 for every new connection;
* and an "extra" component that grows by a random amount centered
* approximately on 64000. This causes the ISS generator to cycle every
* 4.89 hours if no TCP connections are made, and faster if connections are
* made.
*/
#define ISS_INCR 250000
#define ISS_NSEC_SHT 0
#define TCP_XMIT_LOWATER 4096
#define TCP_XMIT_HIWATER 49152
#define TCP_RECV_LOWATER 2048
#define TCP_RECV_HIWATER 49152
/*
* PAWS needs a timer for 24 days. This is the number of ms in 24 days
*/
/*
* TCP options struct returned from tcp_parse_options.
*/
typedef struct tcp_opt_s {
} tcp_opt_t;
/*
* RFC1323-recommended phrasing of TSTAMP option, for easier parsing
*/
#ifdef _BIG_ENDIAN
#else
#endif
/*
* Flags returned from tcp_parse_options.
*/
#define TCP_OPT_MSS_PRESENT 1
#define TCP_OPT_WSCALE_PRESENT 2
#define TCP_OPT_TSTAMP_PRESENT 4
#define TCP_OPT_SACK_OK_PRESENT 8
#define TCP_OPT_SACK_PRESENT 16
/* TCP option length */
#define TCPOPT_NOP_LEN 1
#define TCPOPT_MAXSEG_LEN 4
#define TCPOPT_WS_LEN 3
#define TCPOPT_TSTAMP_LEN 10
#define TCPOPT_SACK_OK_LEN 2
#define TCPOPT_REAL_SACK_LEN 4
#define TCPOPT_MAX_SACK_LEN 36
#define TCPOPT_HEADER_LEN 2
/* TCP cwnd burst factor. */
#define TCP_CWND_INFINITE 65535
#define TCP_CWND_SS 3
#define TCP_CWND_NORMAL 5
/* Named Dispatch Parameter Management Structure */
typedef struct tcpparam_s {
	char	*tcp_param_name;	/* name of the tunable parameter */
} tcpparam_t;
/* Max size IP datagram is 64k - 1 */
sizeof (tcph_t)))
/* Max of the above */
#define TCP_MSS_MAX TCP_MSS_MAX_IPV4
/* Largest TCP port number */
/* Round up the value to the nearest mss. */
#define MS 1L
/* All NDD params in the core TCP became static variables. */
static int tcp_conn_req_max_q = 128;
static int tcp_conn_req_max_q0 = 1024;
static int tcp_conn_req_min = 1;
static int tcp_conn_grace_period = 0 * SECONDS;
static int tcp_smallest_nonpriv_port = 1024;
static int tcp_ipv4_ttl = 64;
static int tcp_mss_def_ipv4 = 536;
static int tcp_mss_max_ipv4 = TCP_MSS_MAX_IPV4;
static int tcp_mss_min = 108;
static int tcp_dupack_fast_retransmit = 3;
static int tcp_largest_anon_port = TCP_MAX_PORT;
static int tcp_xmit_lowat = TCP_XMIT_LOWATER;
static int tcp_recv_hiwat_minmss = 4;
static int tcp_wscale_always = 1;
static int tcp_tstamp_always = 1;
static int tcp_tstamp_if_wscale = 1;
static int tcp_rexmit_interval_extra = 0;
static int tcp_slow_start_after_idle = 2;
static int tcp_slow_start_initial = 2;
static int tcp_sack_permitted = 2;
static int tcp_ecn_permitted = 2;
/* Extra room to fit in headers. */
static uint_t tcp_wroff_xtra;
/* Hint for next port to try. */
/*
* Figure out the value of window scale option. Note that the rwnd is
* ASSUMED to be rounded up to the nearest MSS before the calculation.
* We cannot find the scale value and then do a round up of tcp_rwnd
* because the scale value may not be correct after that.
*/
#define SET_WS_VALUE(tcp) \
{ \
int i; \
i++, rwnd >>= 1) \
; \
(tcp)->tcp_rcv_ws = i; \
}
/*
* Set ECN capable transport (ECT) code point in IP header.
*
* Note that there are 2 ECT code points '01' and '10', which are called
* ECT(1) and ECT(0) respectively. Here we follow the original ECT code
* point ECT(0) for TCP as described in RFC 2481.
*/
/* We need to clear the code point first. */ \
}
/*
* The format argument to pass to tcp_display().
* DISP_PORT_ONLY means that the returned string has only port info.
* DISP_ADDR_AND_PORT means that the returned string also contains the
* remote and local IP address.
*/
#define DISP_PORT_ONLY 1
#define DISP_ADDR_AND_PORT 2
/*
* TCP reassembly macros. We hide starting and ending sequence numbers in
* b_next and b_prev of messages on the reassembly queue. The messages are
* chained using b_cont. These macros are used in tcp_reass() so we don't
* have to see the ugly casts and assignments.
* Note. use uintptr_t to suppress the gcc warning.
*/
static char *tcp_display(tcp_t *, char *, char);
static int tcp_drain_input(tcp_t *, int, int);
static void tcp_drain_needed(int, tcp_t *);
static int tcp_header_len(struct inetgram *);
static int tcp_input(int);
static void tcp_iss_init(tcp_t *);
static int tcp_conn_check(tcp_t *);
static int tcp_close(int);
static void tcp_close_detached(tcp_t *);
static void tcp_eager_unlink(tcp_t *);
static int tcp_header_init_ipv4(tcp_t *);
static int tcp_random(void);
static void tcp_random_init(void);
static void tcp_set_cksum(mblk_t *);
static void tcp_ss_rexmit(tcp_t *, int);
static int tcp_state_wait(int, tcp_t *, int);
static void tcp_time_wait_append(tcp_t *);
static void tcp_time_wait_collector(void);
static void tcp_time_wait_remove(tcp_t *);
static int tcp_verify_cksum(mblk_t *);
int, uint_t, int);
int, uint_t);
static int tcp_xmit_end(tcp_t *, int);
#if DEBUG > 1
{ \
\
}
#else
#endif
#ifdef DEBUG
#else
#endif
/* Whether it is the first time TCP is used. */
/* TCP time wait list. */
static tcp_t *tcp_time_wait_head;
static tcp_t *tcp_time_wait_tail;
static uint32_t tcp_cum_timewait;
/* When the tcp_time_wait_collector is run. */
static uint32_t tcp_time_wait_runtime;
#define TCP_RUN_TIME_WAIT_COLLECTOR() \
if (prom_gettime() > tcp_time_wait_runtime) \
/*
* Accept will return with an error if there is no connection coming in
* after this (in ms).
*/
static int tcp_accept_timeout = 60000;
/*
* Initialize the TCP-specific parts of a socket.
*/
void
{
/* Do some initializations. */
if (!tcp_initialized) {
/* Extra head room for the MAC layer address. */
}
/* Schedule the first time wait cleanup time */
}
/* Socket layer should call tcp_send() directly. */
return;
}
return;
}
/*
* This is set last because this field is used to determine if
* a socket is in use or not.
*/
}
/*
* Return the size of a TCP header including TCP option.
*/
static int
{
int ipvers;
/* Just returns the standard TCP header without option */
return (sizeof (tcph_t));
return (0);
if (ipvers == IPV4_VERSION) {
} else {
dprintf("tcp_header_len: non-IPv4 packet.\n");
return (0);
}
}
/*
* Return the requested port number in network order.
*/
static in_port_t
{
}
/*
* Because inetboot is not interrupt driven, TCP can only poll. This
* means that there can be packets stuck in the NIC buffer waiting to
* be processed. Thus we need to drain them before, for example, sending
* anything because an ACK may actually be stuck there.
*
* The timeout arguments determine how long we should wait for draining.
*/
static int
{
struct inetgram *old_in_gram;
int old_timeout;
int i;
/*
* Since the driver uses the in_timeout value in the socket
* structure to determine the timeout value, we need to save
* the original one so that we can restore that after draining.
*/
/*
* We do this because the input queue may have some user
* data already.
*/
/* Go out and check the wire */
for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
return (-1);
}
}
}
#if DEBUG
printf("tcp_drain_input: done with checking packets\n");
#endif
/* Remove unknown inetgrams from the head of inq. */
#if DEBUG
printf("tcp_drain_input: unexpected packet "
#endif
continue;
}
/*
 * The other side may have closed this connection or
 * RST us. But we need to continue to process other
 * packets in the socket's queue because they may
 * belong to other TCP connections.
 */
}
return (-1);
else
return (0);
}
#if DEBUG
printf("tcp_drain_input: done with processing packets\n");
#endif
/*
* Data may have been received so indicate it is available
*/
return (0);
}
/*
* The receive entry point for upper layer to call to get data. Note
* that this follows the current architecture that lower layer receive
* routines have been called already. Thus if the inq of socket is
* not NULL, the packets must be for us.
*/
static int
{
return (-1);
/* Remove unknown inetgrams from the head of inq. */
#ifdef DEBUG
printf("tcp_input: unexpected packet "
#endif
continue;
}
/* The TCP may be gone because it gets a RST. */
return (-1);
}
/* Flush the receive list. */
} else {
/* The other side has closed the connection, report this up. */
return (0);
}
}
return (0);
}
/*
* The send entry point for upper layer to call to send data. In order
* to minimize changes to the core TCP code, we need to put the
* data into mblks.
*/
int
{
int cnt = 0;
int win_size;
/* We don't want to append 0 size mblk. */
if (len == 0)
return (0);
while (len > 0) {
}
/*
* If we cannot allocate more buffer, stop here and
* the number of bytes buffered will be returned.
*
* Note that we follow the core TCP optimization that
* each mblk contains only MSS bytes data.
*/
break;
}
} else {
}
}
/*
* Since inetboot is not interrupt driven, there may be
* some ACKs in the MAC's buffer. Drain them first,
* otherwise, we may not be able to send.
*
* We expect an ACK in two cases:
*
* 1) We have un-ACK'ed data.
*
* 2) All ACK's have been received and the sender's window has been
* closed. We need an ACK back to open the window so that we can
* send. In this case, call tcp_drain_input() if the window size is
* less than 2 * MSS.
*/
/* window size = MIN(swnd, cwnd) - unacked bytes */
return (-1);
/*
* errno should be reset here as it may be
* set to ETIMEDOUT. This may be set by
* the MAC driver in case it has timed out
* waiting for ARP reply. Any segment which
* was not transmitted because of ARP timeout
* will be retransmitted by TCP.
*/
errno = 0;
return (cnt);
}
/* Free up all TCP related stuff */
static void
{
}
}
}
}
sizeof (tcp_sack_info_t));
}
}
static void
{
}
/*
* If we are an eager connection hanging off a listener that hasn't
* formally accepted the connection yet, get off his list and blow off
* any data that we have accumulated.
*/
static void
{
/* Remove the eager tcp from q0 */
} else {
/*
* If we are unlinking the last
* element on the list, adjust
* tail pointer. Set tail pointer
* to nil when list is empty.
*/
if (listener->tcp_eager_last_q ==
NULL;
} else {
/*
* We won't get here if there
* is only one eager in the
* list.
*/
prev;
}
}
break;
}
}
}
}
/*
* Reset any eager connection hanging off this listener
* and then reclaim it's resources.
*/
static void
{
if (!q0_only) {
/* First cleanup q */
tcp_xmit_ctl("tcp_eager_cleanup, can't wait",
sock_id);
}
}
/* Then cleanup q0 */
tcp_xmit_ctl("tcp_eager_cleanup, can't wait",
}
}
/*
* To handle the shutdown request. Called from shutdown()
*/
/*
 * Shut down the send side of the TCP connection bound to sock_id
 * (half-close).  Returns 0 on success, -1 on failure.
 * NOTE(review): several guard conditions and the enclosing switch
 * statement are elided in this excerpt; the bare returns and case
 * labels below belong to them.
 */
int
tcp_shutdown(int sock_id)
{
/* NOTE(review): elided guard -- fails when the socket/TCP is invalid. */
return (-1);
}
/*
 * Since inetboot is not interrupt driven, there may be
 * some ACKs in the MAC's buffer. Drain them first,
 * otherwise, we may not be able to send.
 */
/*
 * If we return now without freeing TCP, there will be
 * a memory leak.
 */
return (-1);
}
case TCPS_SYN_RCVD:
/*
 * Shutdown during the connect 3-way handshake
 */
/* Falls through to the ESTABLISHED handling below. */
case TCPS_ESTABLISHED:
/*
 * Transmit the FIN
 * wait for the FIN to be ACKed,
 * then remain in FIN_WAIT_2
 */
dprintf("tcp_shutdown: sending fin\n");
/* During the wait, TCP may be gone... */
return (-1);
}
dprintf("tcp_shutdown: done\n");
break;
default:
break;
}
return (0);
}
/* To handle closing of the socket */
static int
{
char *msg;
int error = 0;
return (-1);
}
/*
* Since inetboot is not interrupt driven, there may be
* some ACKs in the MAC's buffer. Drain them first,
* otherwise, we may not be able to send.
*/
/*
* If we return now without freeing TCP, there will be
* a memory leak.
*/
return (-1);
}
/* Cleanup for listener */
}
case TCPS_CLOSED:
case TCPS_IDLE:
case TCPS_BOUND:
case TCPS_LISTEN:
break;
case TCPS_SYN_SENT:
msg = "tcp_close, during connect";
break;
case TCPS_SYN_RCVD:
/*
* Close during the connect 3-way handshake
* but here there may or may not be pending data
* already on queue. Process almost same as in
* the ESTABLISHED state.
*/
/* FALLTHRU */
default:
/*
* If SO_LINGER has set a zero linger time, abort the
* connection with a reset.
*/
msg = "tcp_close, zero lingertime";
break;
}
/*
* Abort connection if there is unread data queued.
*/
msg = "tcp_close, unread data";
break;
}
break;
/*
* Transmit the FIN before detaching the tcp_t.
* no longer owns the tcp_t thus others can modify it.
* The TCP could be closed in tcp_state_wait called by
* tcp_wput_data called by tcp_xmit_end.
*/
return (0);
/*
* If lingering on close then wait until the fin is acked,
*/
!(tcp->tcp_fin_acked) &&
tcp->tcp_client_errno = 0;
stoptime = prom_gettime() +
while (!(tcp->tcp_fin_acked) &&
tcp->tcp_client_errno == 0 &&
tcp, 0);
}
return (-1);
}
}
tcp->tcp_client_errno = 0;
}
/* During the wait, TCP may be gone... */
return (0);
msg = "tcp_close, couldn't detach";
} else {
return (0);
}
break;
}
/* Something went wrong... Send a RST and report the error */
sock_id);
}
return (error);
}
/* To make an endpoint a listener. */
int
{
return (-1);
}
/* We allow calling listen() multiple times to change the backlog. */
errno = EOPNOTSUPP;
return (-1);
}
/* The following initialization should only be done once. */
}
}
}
return (0);
}
/* To accept connections. */
int
{
int sd, new_sock_id;
int timeout;
/* Sanity check. */
*addr_len < sizeof (struct sockaddr_in) ||
return (-1);
}
else
timeout > prom_gettime()) {
#if DEBUG
printf("tcp_accept: Waiting in tcp_accept()\n");
#endif
return (-1);
}
}
/* If there is an eager, don't timeout... */
#if DEBUG
printf("tcp_accept: timeout\n");
#endif
return (-1);
}
#if DEBUG
printf("tcp_accept: got a connection\n");
#endif
/* Now create the socket for this new TCP. */
return (-1);
}
/* This should not happen! */
prom_panic("so_check_fd() fails in tcp_accept()");
/* Free the TCP PCB in the original socket. */
/* Dequeue the eager and attach it to the socket. */
/* Copy in the address info. */
sizeof (in_addr_t));
#ifdef DEBUG
#endif
return (sd);
}
/* Update the next anonymous port to use. */
static in_port_t
{
/* Don't allow the port to fall out of the anonymous port range. */
if (port < tcp_smallest_nonpriv_port)
return (port);
}
/* To check whether a bind to a port is allowed. */
static in_port_t
{
int i, count;
for (i = 0; i < MAXSOCKET; i++) {
continue;
}
/*
* Both TCPs have the same port. If SO_REUSEDADDR is
* set and the bound TCP has a state greater than
* TCPS_LISTEN, it is fine.
*/
continue;
}
*addr != INADDR_ANY &&
continue;
}
if (bind_to_req_port_only) {
return (0);
}
if (--count > 0) {
goto try_again;
} else {
return (0);
}
}
return (port);
}
/* To handle the bind request. */
int
{
return (-1);
}
/* We don't allow multiple bind(). */
return (-1);
}
/* The bound source can be INADDR_ANY. */
/* Verify the port is available. */
if (requested_port == 0)
else /* T_BIND_REQ and requested_port != 0 */
if (requested_port == 0) {
}
if (allocated_port == 0) {
errno = EADDRINUSE;
return (-1);
}
return (0);
}
/*
* Check for duplicate TCP connections.
*/
/*
 * NOTE(review): the function-name line is elided in this excerpt; the
 * forward declaration above gives tcp_conn_check(tcp_t *).  Walks
 * every socket slot and returns -1 if some other TCP already uses an
 * identical connection 4-tuple, 0 otherwise.
 */
static int
{
int i;
for (i = 0; i < MAXSOCKET; i++) {
/* NOTE(review): elided check -- skip slots that cannot conflict. */
continue;
/* Socket may not be closed but the TCP can be gone. */
continue;
/* We only care about TCP in states later than SYN_SENT. */
continue;
continue;
} else {
/* Exact duplicate of an existing connection: reject. */
return (-1);
}
}
/* No duplicate found. */
return (0);
}
/* To handle a connect request. */
int
tcp_connect(int sock_id)
{
int mss;
return (-1);
}
/*
* Check for attempt to connect to INADDR_ANY or non-unicast address.
* We don't have enough info to check for broadcast addr, except
* for the all 1 broadcast.
*/
dstaddr == INADDR_BROADCAST) {
/*
* SunOS 4.x and 4.3 BSD allow an application
* to connect a TCP socket to INADDR_ANY.
* When they do this, the kernel picks the
* address of one interface and uses it
* instead. The kernel usually ends up
* picking the address of the loopback
* interface. This is an undocumented feature.
* However, we provide the same thing here
* in order to have source and binary
* compatibility with SunOS 4.x.
* generate the T_CONN_CON.
*
* Fail this for inetboot TCP.
*/
return (-1);
}
/* It is not bound to any address yet... */
/* We don't have an address! */
INADDR_ANY) {
return (-1);
}
}
/*
* Don't let an endpoint connect to itself.
*/
return (-1);
}
/*
* Don't allow this connection to completely duplicate
* an existing connection.
*/
if (tcp_conn_check(tcp) < 0) {
errno = EADDRINUSE;
return (-1);
}
/*
* Just make sure our rwnd is at
* least tcp_recv_hiwat_mss * MSS
* large, and round up to the nearest
* MSS.
*
* We do the round up here because
* we need to get the interface
* MTU first before we can do the
* round up.
*/
/*
* Set tcp_snd_ts_ok to true
* so that tcp_xmit_mp will
* include the timestamp
* option in the SYN segment.
*/
if (tcp_tstamp_always ||
}
if (tcp_sack_permitted == 2 ||
tcp->tcp_snd_sack_ok) {
sizeof (tcp_sack_info_t))) == NULL) {
} else {
}
}
/*
* Should we use ECN? Note that the current
* default value (SunOS 5.9) of tcp_ecn_permitted
* is 2. The reason for doing this is that there
* are equipments out there that will drop ECN
* enabled IP packets. Setting it to 1 avoids
* compatibility problems.
*/
if (tcp_ecn_permitted == 2)
int ret;
/* Dump the packet when debugging. */
/* Send out the SYN packet. */
/*
* errno ETIMEDOUT is set by the mac driver
* in case it is not able to receive ARP reply.
* TCP will retransmit this segment so we can
* ignore the ARP timeout.
*/
return (-1);
}
/* tcp_state_wait() will finish the 3 way handshake. */
} else {
return (-1);
}
}
/*
* Common accept code. Called by tcp_conn_request.
* cr_pkt is the SYN packet.
*/
static int
{
#ifdef DEBUG
printf("tcp_accept_comm #######################\n");
#endif
/*
* When we get here, we know that the acceptor header template
* has already been initialized.
* However, it may not match the listener if the listener
* includes options...
* It may also not match the listener if the listener is v6 and
* and the acceptor is v4
*/
/*
* Listener had options of some sort; acceptor inherits.
* Free up the acceptor template and allocate one
* of the right size.
*/
acceptor->tcp_iphc_len = 0;
return (ENOMEM);
}
}
/*
* Copy the IP+TCP header template from listener to acceptor
*/
} else {
prom_panic("tcp_accept_comm: version not equal");
}
/* Copy our new dest and fport from the connection request packet */
} else {
prom_panic("tcp_accept_comm: not IPv4");
}
sizeof (in_port_t));
/*
* For an all-port proxy listener, the local port is determined by
* the port number field in the SYN packet.
*/
sizeof (in_port_t));
}
/* Inherit various TCP parameters from the listener */
/* Process all TCP options. */
/* Is the other end ECN capable? */
if (tcp_ecn_permitted >= 1 &&
}
/*
* listener->tcp_rq->q_hiwat should be the default window size or a
* window size changed via SO_RCVBUF option. First round up the
* acceptor's tcp_rwnd to the nearest MSS. Then find out the window
* scale option value if needed. Call tcp_rwnd_set() to finish the
* setting.
*
* Note if there is a rpipe metric associated with the remote host,
* we should not inherit receive window size from listener.
*/
if (acceptor->tcp_snd_ws_ok)
/*
* Note that this is the only place tcp_rwnd_set() is called for
* accepting a connection. We need to call it here instead of
* after the 3-way handshake because we need to tell the other
* side our rwnd in the SYN-ACK segment.
*/
return (0);
}
/*
* Defense for the SYN attack -
* 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest
* one that doesn't have the dontdrop bit set.
* 2. Don't drop a SYN request before its first timeout. This gives every
* request at least til the first timeout to complete its 3-way handshake.
* 3. The current threshold is - # of timeout > q0len/4 => SYN alert on
* # of timeout drops back to <= q0len/32 => SYN alert off
*/
static boolean_t
{
/*
* New one is added after next_q0 so prev_q0 points to the oldest
* Also do not drop any established connections that are deferred on
* q0 due to q being full
*/
/* XXX should move the eager to the head */
break;
}
}
dprintf("tcp_drop_q0: listen half-open queue (max=%d) overflow"
" (%d pending) on %s, drop one", tcp_conn_req_max_q0,
return (B_TRUE);
}
/* ARGSUSED */
static tcp_t *
{
int err;
#ifdef DEBUG
printf("tcp_conn_request ###################\n");
#endif
dprintf("tcp_conn_request: listen backlog (max=%d) "
"overflow (%d pending) on %s",
return (NULL);
}
if (tcp->tcp_conn_req_cnt_q0 >=
/*
* Q0 is full. Drop a pending half-open req from the queue
* to make room for the new SYN req. Also mark the time we
* drop a SYN.
*/
if (!tcp_drop_q0(tcp)) {
dprintf("tcp_conn_request: listen half-open queue "
"(max=%d) full (%d pending) on %s",
return (NULL);
}
}
return (NULL);
}
/*
* We allow the connection to proceed
* by generating a detached tcp state vector and put it in
* the eager queue. When an accept happens, it will be
* dequeued sequentially.
*/
return (NULL);
}
return (NULL);
}
/*
* Eager connection inherits address form from its listener,
* but its packet form comes from the version of the received
* SYN segment.
*/
if (err) {
return (NULL);
}
/* Set tcp_listener before adding it to tcp_conn_fanout */
return (eager);
}
/*
* To get around the non-interrupt problem of inetboot.
* Keep on processing packets until a certain state is reached or the
* TCP is destroyed because of getting a RST packet.
*/
static int
{
int i;
int timeout;
/*
* We need to make sure that the MAC does not wait longer
* than RTO for any packet so that TCP can do retransmission.
* But if the MAC timeout is less than tcp_rto, we are fine
* and do not need to change it.
*/
}
/* Go out and check the wire */
for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
if (changed) {
}
return (-1);
}
}
}
}
break;
/* Remove unknown inetgrams from the head of inq. */
#ifdef DEBUG
printf("tcp_state_wait for state %d: unexpected "
"packet level %d frame found\n", state,
#endif
continue;
}
/*
 * The other side may have closed this connection or
 * RST us. But we need to continue to process other
 * packets in the socket's queue because they may
 * belong to other TCP connections.
 */
}
}
/* If the other side has closed the connection, just return. */
#ifdef DEBUG
printf("tcp_state_wait other side dead: state %d "
#endif
return (-1);
else
return (0);
}
/*
* TCPS_ALL_ACKED is not a valid TCP state, it is just used as an
* indicator to tcp_state_wait to mean that it is being called
* to wait till we have received acks for all the new segments sent.
*/
goto done;
}
goto retry;
}
done:
if (changed)
return (0);
}
/* Verify the checksum of a segment. */
/*
 * NOTE(review): the function-name line is elided in this excerpt; the
 * forward declaration above gives tcp_verify_cksum(mblk_t *).
 * Returns 0 if the segment's TCP checksum is valid, -1 otherwise.
 */
static int
{
int len;
/*
 * Calculate the TCP checksum. Need to include the pseudo header,
 * which is similar to the real IP header starting at the TTL field.
 */
/* Checksum verified. */
return (0);
} else {
/* Bad checksum; the caller should drop the segment. */
return (-1);
}
}
/* To find a TCP connection matching the incoming segment. */
static tcp_t *
{
int i;
for (i = 0; i < MAXSOCKET; i++) {
*sock_id = i;
return (tcp);
}
}
}
/* Find it in the time wait list. */
*sock_id = -1;
return (tcp);
}
}
return (NULL);
}
/* To find a TCP listening connection matching the incoming segment. */
static tcp_t *
{
int i;
for (i = 0; i < MAXSOCKET; i++) {
*sock_id = i;
return (tcp);
}
}
}
return (NULL);
}
/* To find a TCP eager matching the incoming segment. */
static tcp_t *
{
#ifdef DEBUG
printf("tcp_lookup_eager_ipv4 ###############\n");
#endif
return (tcp);
}
}
return (tcp);
}
}
#ifdef DEBUG
printf("No eager found\n");
#endif
return (NULL);
}
/* To destroy a TCP control block. */
static void
{
if (sock_id >= 0) {
if (err != 0)
}
}
/*
* tcp_rwnd_set() is called to adjust the receive window to a desired value.
* We do not allow the receive window to shrink. After setting rwnd,
* set the flow control hiwat of the stream.
*
* This function is called in 2 cases:
*
* 1) Before data transfer begins, in tcp_accept_comm() for accepting a
* connection (passive open) and in tcp_rput_data() for active connect.
* This is called after tcp_mss_set() when the desired MSS value is known.
* This makes sure that our window size is a multiple of the other side's
* MSS.
* 2) Handling SO_RCVBUF option.
*
* It is ASSUMED that the requested size is a multiple of the current MSS.
*
* XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
* user requests so.
*/
/*
 * NOTE(review): the function-name line and several conditions are
 * elided in this excerpt; per the block comment above, this is
 * tcp_rwnd_set(), which adjusts the receive window to the requested
 * value (never shrinking it) and returns the resulting rwnd.
 */
static int
{
if (tcp->tcp_rwnd_max != 0)
else
/*
 * Insist on a receive window that is at least
 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
 * funny TCP interactions of Nagle algorithm, SWS avoidance
 * and delayed acknowledgement.
 */
/*
 * If window size info has already been exchanged, TCP should not
 * shrink the window. Shrinking window is doable if done carefully.
 * We may add that support later. But so far there is not a real
 * need to do that.
 */
/* MSS may have changed, do a round up again. */
}
/*
 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
 * can be applied even before the window scale option is decided.
 */
if (rwnd > max_transmittable_rwnd) {
/*
 * If we're over the limit we may have to back down tcp_rwnd.
 * The increment below won't work for us. So we set all three
 * here and the increment below will have no effect.
 */
}
/*
 * Increment the current rwnd by the amount the maximum grew (we
 * can not overwrite it since we might be in the middle of a
 * connection.)
 */
return (rwnd);
}
/*
* Extract option values from a tcp header. We put any found values into the
* tcpopt struct and return a bitmask saying which options were found.
*/
static int
{
int len;
int found = 0;
switch (*up) {
case TCPOPT_EOL:
break;
case TCPOPT_NOP:
up++;
continue;
case TCPOPT_MAXSEG:
if (len < TCPOPT_MAXSEG_LEN ||
break;
/* Caller must handle tcp_mss_min and tcp_mss_max_* */
up += TCPOPT_MAXSEG_LEN;
continue;
case TCPOPT_WSCALE:
break;
else
up += TCPOPT_WS_LEN;
continue;
case TCPOPT_SACK_PERMITTED:
if (len < TCPOPT_SACK_OK_LEN ||
break;
up += TCPOPT_SACK_OK_LEN;
continue;
case TCPOPT_SACK:
break;
/* If TCP is not interested in SACK blks... */
continue;
}
up += TCPOPT_HEADER_LEN;
/*
* If the list is empty, allocate one and assume
* nothing is sack'ed.
*/
&(tcp->tcp_num_notsack_blk),
&(tcp->tcp_cnt_notsack_list));
/*
* Make sure tcp_notsack_list is not NULL.
* This happens when kmem_alloc(KM_NOSLEEP)
* returns NULL.
*/
continue;
}
}
while (sack_len > 0) {
break;
}
up += 4;
up += 4;
sack_len -= 8;
/*
* Bounds checking. Make sure the SACK
* info is within tcp_suna and tcp_snxt.
* If this SACK blk is out of bound, ignore
* it but continue to parse the following
* blks.
*/
continue;
}
&(tcp->tcp_num_notsack_blk),
&(tcp->tcp_cnt_notsack_list));
}
}
continue;
case TCPOPT_TSTAMP:
if (len < TCPOPT_TSTAMP_LEN ||
break;
up += TCPOPT_TSTAMP_LEN;
continue;
default:
break;
continue;
}
break;
}
return (found);
}
/*
* Set the mss associated with a particular tcp based on its current value,
* and a new one passed in. Observe minimums and maximums, and reset
* other state variables that we want to view as multiples of mss.
*
* This function is called in various places mainly because
* 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the
* 2) PMTUd may get us a new MSS.
* 3) If the other side stops sending us timestamp option, we need to
* increase the MSS size to use the extra bytes available.
*/
/*
 * NOTE(review): the function-name line and most assignments are elided
 * in this excerpt; per the block comment above, this sets the
 * connection's MSS, clamping it to the configured minimum, and resets
 * the MSS-derived state (naglim, buffering, initial cwnd).
 */
static void
{
if (mss < tcp_mss_min)
mss = tcp_mss_min;
/*
 * Unless naglim has been set by our client to
 * a non-mss value, force naglim to track mss.
 * This can help to aggregate small writes.
 */
/*
 * TCP should be able to buffer at least 4 MSS data for obvious
 * performance reason.
 */
/*
 * Initialize cwnd according to draft-floyd-incr-init-win-01.txt.
 * Previously, we use tcp_slow_start_initial to control the size
 * of the initial cwnd. Now, when tcp_slow_start_initial * mss
 * is smaller than the cwnd calculated from the formula suggested in
 * the draft, we use tcp_slow_start_initial * mss as the cwnd.
 * Otherwise, use the cwnd from the draft's formula. The default
 * of tcp_slow_start_initial is 2.
 */
/* Restart the congestion-avoidance byte counter for the new MSS. */
tcp->tcp_cwnd_cnt = 0;
}
/*
* Process all TCP option in SYN segment.
*
* This function sets up the correct tcp_mss value according to the
* MSS option value and our header size. It also sets up the window scale
* and timestamp values, and initialize SACK info blocks. But it does not
* change receive window size after setting the tcp_mss value. The caller
* should do the appropriate change.
*/
void
{
int options;
char *tmp_tcph;
/*
* Process MSS option. Note that MSS option value does not account
* for IP or TCP options. This means that it is equal to MTU - minimum
* IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for
* IPv6.
*/
if (!(options & TCP_OPT_MSS_PRESENT)) {
} else {
}
/* Process Window Scale option. */
if (options & TCP_OPT_WSCALE_PRESENT) {
} else {
}
/* Process Timestamp option. */
if ((options & TCP_OPT_TSTAMP_PRESENT) &&
/* Fill in our template header with basic timestamp option. */
tmp_tcph[0] = TCPOPT_NOP;
} else {
}
/*
* Process SACK options. If SACK is enabled for this connection,
* then allocate the SACK info structure.
*/
if ((options & TCP_OPT_SACK_OK_PRESENT) &&
(tcp->tcp_snd_sack_ok ||
/* This should be true only in the passive case. */
sizeof (tcp_sack_info_t));
}
} else {
if (tcp->tcp_snd_ts_ok) {
} else {
}
}
} else {
/*
* Resetting tcp_snd_sack_ok to B_FALSE so that
* no SACK info will be used for this
* connection. This assumes that SACK usage
* permission is negotiated. This may need
* to be changed once this is clarified.
*/
sizeof (tcp_sack_info_t));
}
}
/*
* that from tcp_mss to get our side's MSS.
*/
/*
* Here we assume that the other side's header size will be equal to
* our header size. We calculate the real MSS accordingly. Need to
* take into additional stuffs IPsec puts in.
*
*/
/*
* Set MSS to the smaller one of both ends of the connection.
* We should not have called tcp_mss_set() before, but our
* side of the MSS should have been set to a proper value
* by tcp_adapt_ire(). tcp_mss_set() will also set up the
* STREAM head parameters properly.
*
* If we have a larger-than-16-bit window but the other side
* didn't want to do window scale, tcp_rwnd_set() will take
* care of that.
*/
}
/*
 * NOTE(review): the function signature line is missing from this
 * extract and parts of the body are elided (unbalanced braces below).
 * Consult the complete original before modifying.
 */
/*
* This function does PAWS protection check. Returns B_TRUE if the
* segment passes the PAWS test, else returns B_FALSE.
*/
{
int options;
/*
* If timestamp option is aligned nicely, get values inline,
* otherwise call general routine to parse. Only do that
* if timestamp is the only option.
*/
} else {
if (tcp->tcp_snd_sack_ok) {
} else {
}
}
if (options & TCP_OPT_TSTAMP_PRESENT) {
/*
* Do PAWS per RFC 1323 section 4.2. Accept RST
* regardless of the timestamp, page 18 RFC 1323.bis.
*/
tcp->tcp_ts_recent)) {
if (TSTMP_LT(prom_gettime(),
/* This segment is not acceptable. */
return (B_FALSE);
} else {
/*
* Connection has been idle for
* too long. Reset the timestamp
* and assume the segment is valid.
*/
tcp->tcp_ts_recent =
}
}
} else {
/*
* If we don't get a timestamp on every packet, we
* figure we can't really trust 'em, so we stop sending
* and parsing them.
*/
if (tcp->tcp_snd_sack_ok) {
}
}
return (B_TRUE);
}
/*
 * NOTE(review): body statements appear elided in this extract (the
 * loop header and name line are missing); code left byte-identical.
 */
/*
* tcp_get_seg_mp() is called to get the pointer to a segment in the
* send queue which starts at the given seq. no.
*
* Parameters:
* tcp_t *tcp: the tcp instance pointer.
* uint32_t seq: the starting seq. no of the requested segment.
* int32_t *off: after the execution, *off will be the offset to
* the returned mblk which points to the requested seq no.
*
* Return:
* A mblk_t pointer pointing to the requested segment in send queue.
*/
static mblk_t *
{
/* Defensive coding. Make sure we don't send incorrect data. */
return (NULL);
}
if (cnt < 0) {
break;
}
}
return (mp);
}
/*
 * NOTE(review): large portions of this function's body are missing in
 * this extract (truncated conditions, unreachable-looking statements
 * such as the bare "usable_swnd -= seg_len;" after a return).  Treat
 * the surviving lines as a skeleton only; verify against the full
 * original source before editing.
 */
/*
* This function handles all retransmissions if SACK is enabled for this
* connection. First it calculates how many segments can be retransmitted
* based on tcp_pipe. Then it goes thru the notsack list to find eligible
* segments. A segment is eligible if sack_cnt for that segment is greater
* than or equal tcp_dupack_fast_retransmit. After it has retransmitted
* all eligible segments, it checks to see if TCP can send some new segments
* (fast recovery). If it can, it returns 1. Otherwise it returns 0.
*
* Parameters:
* tcp_t *tcp: the tcp structure of the connection.
*
* Return:
* 1 if the pipe is not full (new data can be sent), 0 otherwise
*/
static int32_t
{
/* Defensive coding in case there is a bug... */
return (0);
}
/*
* Limit the num of outstanding data in the network to be
* tcp_cwnd_ssthresh, which is half of the original congestion wnd.
*/
/* At least retransmit 1 MSS of data. */
if (usable_swnd <= 0) {
usable_swnd = mss;
}
/* Make sure no new RTT samples will be taken. */
while (usable_swnd > 0) {
(notsack_blk->sack_cnt >=
}
break;
}
}
/*
* All holes are filled. Manipulate tcp_cwnd to send more
* if we can. Note that after the SACK recovery, tcp_cwnd is
* set to tcp_cwnd_ssthresh.
*/
if (notsack_blk == NULL) {
if (usable_swnd <= 0) {
return (0);
} else {
return (1);
}
}
/*
* Note that we may send more than usable_swnd allows here
* because of round off, but no more than 1 MSS of data.
*/
/* This should not happen. Defensive coding again... */
return (0);
}
return (0);
usable_swnd -= seg_len;
/*
* Update the send timestamp to avoid false retransmission.
* Note. use uintptr_t to suppress the gcc warning.
*/
/*
* Update tcp_rexmit_max to extend this SACK recovery phase.
* This happens when new data sent during fast recovery is
* also lost. If TCP retransmits those new data, it needs
* to extend SACK recover phase to avoid starting another
* fast retransmit/recovery unnecessarily.
*/
}
}
return (0);
}
/*
 * NOTE(review): per its DEBUG printf strings this is tcp_rput_data(),
 * the TCP receive-path state machine (SYN/ACK/FIN processing, PAWS,
 * window checks, fast retransmit, ECN, zero-window probes).  The
 * extract has elided most of the executable statements -- switch
 * headers, labels, and many expressions are missing and braces are
 * unbalanced throughout.  Code is left byte-identical; do not edit
 * without the complete original source.
 */
static void
{
int seg_len;
int mss;
int npkt;
#ifdef DEBUG
printf("tcp_rput_data sock %d mp %x mp_datap %x #################\n",
#endif
/* Dump the packet when debugging. */
if (ip_hdr_len != IP_SIMPLE_HDR_LENGTH) {
#ifdef DEBUG
printf("Not simple IP header\n");
#endif
/* We cannot handle IP option yet... */
tcp_drops++;
return;
}
/* The TCP header must be aligned. */
/* In inetboot, b_cont should always be NULL. */
/* Verify the checksum. */
if (tcp_verify_cksum(mp) < 0) {
#ifdef DEBUG
printf("tcp_rput_data: wrong cksum\n");
#endif
return;
}
/*
* This segment is not for us, try to find its
* intended receiver.
*/
#ifdef DEBUG
printf("tcp_rput_data: not for us, state %d\n",
#endif
/*
* First try to find a established connection. If none
* is found, look for a listener.
*
* If a listener is found, we need to check to see if the
* incoming segment is for one of its eagers. If it is,
* give it to the eager. If not, listener should take care
* of it.
*/
/* No eager... sent to listener */
#ifdef DEBUG
printf("found the listener: %s\n",
#endif
}
#ifdef DEBUG
else {
printf("found the eager: %s\n",
}
#endif
} else {
/* Non listener found... */
#ifdef DEBUG
printf("found the connection: %s\n",
#endif
}
} else {
/*
* No connection for this segment...
* Send a RST to the other side.
*/
return;
}
}
return;
}
/*
* From this point we can assume that the tcp is not compressed,
* since we would have branched off to tcp_time_wait_processing()
* in such a case.
*/
/*
* After this point, we know we have the correct TCP, so update
* the receive time.
*/
/* In inetboot, we do not handle urgent pointer... */
DEBUG_1("tcp_rput_data(%d): received segment with urgent "
"pointer\n", sock_id);
tcp_drops++;
return;
}
case TCPS_LISTEN:
return;
}
tcp_xmit_early_reset("TCPS_LISTEN-TH_ACK",
return;
}
return;
}
prom_panic("inetboot");
}
if (tcp->tcp_conn_req_max > 0) {
return;
}
#ifdef DEBUG
printf("tcp_rput_data: new tcp created\n");
#endif
}
goto syn_rcvd;
case TCPS_SYN_SENT:
/*
* Note that our stack cannot send data before a
* connection is established, therefore the
* following check is valid. Otherwise, it has
* to be changed.
*/
return;
}
tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
return;
}
}
}
return;
}
return;
}
/* Process all TCP options. */
/*
* The following changes our rwnd to be a multiple of the
* MIN(peer MSS, our MSS) for performance reason.
*/
/* Is the other end ECN capable? */
if (tcp->tcp_ecn_ok) {
}
}
/*
* Clear ECN flags because it may interfere with later
* processing.
*/
/* One for the SYN */
/*
* If SYN was retransmitted, need to reset all
* retransmission info. This is because this
* segment will be treated as a dup ACK.
*/
if (tcp->tcp_rexmit) {
/*
* Set tcp_cwnd back to 1 MSS, per
* recommendation from
* Increasing TCP's Initial Window.
*/
}
/*
* Always send the three-way handshake ack immediately
* in order to make the connection complete as soon as
* possible on the accepting host.
*/
flags |= TH_ACK_NEEDED;
/*
* Check to see if there is data to be sent. If
* yes, set the transmit flag. Then check to see
* if received data processing needs to be done.
* If not, go straight to xmit_check. This short
*/
if (tcp->tcp_unsent)
flags |= TH_XMIT_NEEDED;
if (seg_len == 0) {
goto xmit_check;
}
seg_seq++;
break;
}
/*
* Let's wait till our SYN has been ACKED since we
* don't have a timer.
*/
return;
}
}
return;
default:
break;
}
if (tcp->tcp_snd_ts_ok) {
/*
* This segment is not acceptable.
* Drop it and send back an ACK.
*/
flags |= TH_ACK_NEEDED;
goto ack_check;
}
} else if (tcp->tcp_snd_sack_ok) {
/*
* SACK info in already updated in tcp_parse_options. Ignore
* all other TCP options...
*/
}
/*
* gap is the amount of sequence space between what we expect to see
* and what we got for seg_seq. A positive value for gap means
* something got lost. A negative value means we got some old stuff.
*/
if (gap < 0) {
/* Old stuff present. Is the SYN in there? */
(seg_len != 0)) {
seg_seq++;
/* Recompute the gaps after noting the SYN. */
goto try_again;
}
/* Remove the old stuff from seg_len. */
/*
* Anything left?
* Make sure to check for unack'd FIN when rest of data
* has been previously ack'd.
*/
/*
* Resets are only valid if they lie within our offered
* window. If the RST bit is set, we just ignore this
* segment.
*/
return;
}
/*
* This segment is "unacceptable". None of its
* sequence space lies within our advertized window.
*
* Adjust seg_len to the original value for tracing.
*/
#ifdef DEBUG
printf("tcp_rput: unacceptable, gap %d, rgap "
"%d, flags 0x%x, seg_seq %u, seg_ack %u, "
"seg_len %d, rnxt %u, snxt %u, %s",
#endif
/*
* Arrange to send an ACK in response to the
* unacceptable segment per RFC 793 page 69. There
* is only one small difference between ours and the
* acceptability test in the RFC - we accept ACK-only
* packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK
* will be generated.
*
* Note that we have to ACK an ACK-only packet at least
* for stacks that send 0-length keep-alives with
* SEG.SEQ = SND.NXT-1 as recommended by RFC1122,
* section 4.2.3.6. As long as we don't ever generate
* an unacceptable packet in response to an incoming
* packet that is unacceptable, it should not cause
* "ACK wars".
*/
flags |= TH_ACK_NEEDED;
/*
* Continue processing this segment in order to use the
* ACK information it contains, but skip all other
* sequence-number processing. Processing the ACK
* information is necessary in order to
* re-synchronize connections that may have lost
* synchronization.
*
* We clear seg_len and flag fields related to
* sequence number processing as they are not
* to be trusted for an unacceptable segment.
*/
seg_len = 0;
goto process_ack;
}
/* Fix seg_seq, and chew the gap off the front. */
do {
if (gap > 0) {
break;
}
} while (gap < 0);
}
/*
* rgap is the amount of stuff received out of window. A negative
* value is the amount out of window.
*/
if (rgap < 0) {
else {
}
/*
* seg_len does not include the FIN, so if more than
* just the FIN is out of window, we act like we don't
* see it. (If just the FIN is out of window, rgap
* will be zero and we will go ahead and acknowledge
* the FIN.)
*/
/* Fix seg_len and make sure there is something left. */
if (seg_len <= 0) {
/*
* Resets are only valid if they lie within our offered
* window. If the RST bit is set, we just ignore this
* segment.
*/
return;
}
/* Per RFC 793, we need to send back an ACK. */
flags |= TH_ACK_NEEDED;
/*
* If this is a zero window probe, continue to
* process the ACK part. But we need to set seg_len
* to 0 to avoid data processing. Otherwise just
* drop the segment and send back an ACK.
*/
seg_len = 0;
/* Let's see if we can update our rwnd */
goto process_ack;
} else {
goto ack_check;
}
}
/* Pitch out of window stuff off the end. */
do {
if (rgap < 0) {
}
break;
}
}
ok:;
/*
* TCP should check ECN info for segments inside the window only.
* Therefore the check should be done here.
*/
if (tcp->tcp_ecn_ok) {
}
/*
* Note that both ECN_CE and CWR can be set in the
* same segment. In this case, we once again turn
* on ECN_ECHO.
*/
}
}
/*
* Check whether we can update tcp_ts_recent. This test is
* NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP
* Extensions for High Performance: An Update", Internet Draft.
*/
if (tcp->tcp_snd_ts_ok &&
}
/*
* FIN in an out of order segment. We record this in
* tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq.
* Clear the FIN so that any check on FIN flag will fail.
* Remember that FIN also counts in the sequence number
* space. So we need to ack out of order FIN only segments.
*/
flags |= TH_ACK_NEEDED;
}
if (seg_len > 0) {
/* Fill in the SACK blk list. */
if (tcp->tcp_snd_sack_ok) {
&(tcp->tcp_num_sack_blk));
}
/*
* Attempt reassembly and see if we have something
* ready to go.
*/
/* Always ack out of order packets */
/*
* A gap is filled and the seq num and len
* of the gap match that of a previously
* received FIN, put the FIN flag back in.
*/
tcp->tcp_valid_bits &=
}
} else {
/*
* Keep going even with NULL mp.
* There may be a useful ACK or something else
* we don't want to miss.
*
* But TCP should not perform fast retransmit
* because of the ack number. TCP uses
* seg_len == 0 to determine if it is a pure
* ACK. And this is not a pure ACK.
*/
seg_len = 0;
}
}
} else if (seg_len > 0) {
/*
* If an out of order FIN was received before, and the seq
* num and len of the new segment match that of the FIN,
* put the FIN flag back in.
*/
}
}
case TCPS_SYN_RCVD:
break;
case TCPS_ESTABLISHED:
case TCPS_FIN_WAIT_1:
case TCPS_FIN_WAIT_2:
case TCPS_CLOSE_WAIT:
break;
case TCPS_CLOSING:
case TCPS_LAST_ACK:
break;
default:
break;
}
return;
}
/*
* See RFC 793, Page 71
*
* The seq number must be in the window as it should
* be "fixed" above. If it is outside window, it should
* be already rejected. Note that we allow seg_seq to be
* rnxt + rwnd because we want to accept 0 window probe.
*/
/*
* If the ACK flag is not set, just use our snxt as the
* seq number of the RST segment.
*/
}
return;
}
#ifdef DEBUG
#endif
goto xmit_check;
}
}
#ifdef DEBUG
printf("Done with eager 3-way handshake\n");
#endif
/*
* NOTE: RFC 793 pg. 72 says this should be 'bytes_acked < 0'
* but that would mean we have an ack that ignored our SYN.
*/
tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack",
return;
}
/*
* if the conn_req_q is full defer processing
* until space is availabe after accept()
* processing
*/
if (listener->tcp_conn_req_cnt_q <
/* Move from SYN_RCVD to ESTABLISHED list */
/*
* Insert at end of the queue because sockfs
* sends down T_CONN_RES in chronological
* order. Leaving the older conn indications
* at front of the queue helps reducing search
* time.
*/
} else {
}
} else {
/*
* Defer connection on q0 and set deferred
* connection bit true
*/
/* take tcp out of q0 ... */
/* ... and place it at the end of q0 */
}
bytes_acked--;
/*
* If SYN was retransmitted, need to reset all
* retransmission info as this segment will be
* treated as a dup ACK.
*/
if (tcp->tcp_rexmit) {
tcp->tcp_ms_we_have_waited = 0;
}
/*
* We set the send window to zero here.
* This is needed if there is data to be
* processed already on the queue.
* Later (at swnd_update label), the
* "new_swnd > tcp_swnd" condition is satisfied
* the XMIT_NEEDED flag is set in the current
* (SYN_RCVD) state. This ensures tcp_wput_data() is
* called if there is already data on queue in
* this state.
*/
}
/* This code follows 4.4BSD-Lite2 mostly. */
if (bytes_acked < 0)
goto est;
/*
* If TCP is ECN capable and the congestion experience bit is
* set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
* done once per window (or more loosely, per RTT).
*/
/*
* If the cwnd is 0, use the timer to clock out
* new segments. This is required by the ECN spec.
*/
if (npkt == 0) {
/*
* This makes sure that when the ACK comes
* back, we will increase tcp_cwnd by 1 MSS.
*/
tcp->tcp_cwnd_cnt = 0;
}
/*
* This marks the end of the current window of in
* flight data. That is why we don't use
* tcp_suna + tcp_swnd. Only data in flight can
* provide ECN info.
*/
}
}
if (bytes_acked == 0) {
int dupack_cnt;
/*
* Fast retransmit. When we have seen exactly three
* identical ACKs while we have unacked data
* outstanding we take it as a hint that our peer
* dropped something.
*
* If TCP is retransmitting, don't do fast retransmit.
*/
! tcp->tcp_rexmit) {
/* Do Limited Transmit */
/*
* RFC 3042
*
* What we need to do is temporarily
* increase tcp_cwnd so that new
* data can be sent if it is allowed
* by the receive window (tcp_rwnd).
* tcp_wput_data() will take care of
* the rest.
*
* If the connection is SACK capable,
* only do limited xmit when there
* is SACK info.
*
* Note how tcp_cwnd is incremented.
* The first dup ACK will increase
* it by 1 MSS. The second dup ACK
* will increase it by 2 MSS. This
* means that only 1 new segment will
* be sent for each dup ACK.
*/
if (tcp->tcp_unsent > 0 &&
(!tcp->tcp_snd_sack_ok ||
(tcp->tcp_snd_sack_ok &&
flags |= TH_LIMIT_XMIT;
}
} else if (dupack_cnt ==
/*
* If we have reduced tcp_ssthresh
* because of ECN, do not reduce it again
* unless it is already one window of data
* away. After one window of data, tcp_cwr
* should then be cleared. Note that
* for non ECN capable connection, tcp_cwr
* should always be false.
*
* Adjust cwnd since the duplicate
* ack indicates that a packet was
* dropped (due to congestion.)
*/
if (npkt < 2)
npkt = 2;
}
if (tcp->tcp_ecn_ok) {
}
/*
* We do Hoe's algorithm. Refer to her
* paper "Improving the Start-up Behavior
* of a Congestion Control Scheme for TCP,"
* appeared in SIGCOMM'96.
*
* Save highest seq no we have sent so far.
* Be careful about the invisible FIN byte.
*/
(tcp->tcp_unsent == 0)) {
} else {
}
/*
* Do not allow bursty traffic during.
* fast recovery. Refer to Fall and Floyd's
* paper "Simulation-based Comparisons of
* Tahoe, Reno and SACK TCP" (in CCR ??)
* This is a best current practise.
*/
/*
* For SACK:
* Calculate tcp_pipe, which is the
* estimated number of bytes in
* network.
*
* tcp_fack is the highest sack'ed seq num
* TCP has received.
*
* tcp_pipe is explained in the above quoted
* Fall and Floyd's paper. tcp_fack is
* explained in Mathis and Mahdavi's
* "Forward Acknowledgment: Refining TCP
* Congestion Control" in SIGCOMM '96.
*/
if (tcp->tcp_snd_sack_ok) {
} else {
/*
* Always initialize tcp_pipe
* even though we don't have
* any SACK info. If later
* we get SACK info and
* tcp_pipe is not initialized,
* funny things will happen.
*/
}
} else {
} /* tcp_snd_sack_ok */
} else {
/*
* Here we perform congestion
* avoidance, but NOT slow start.
* This is known as the Fast
* Recovery Algorithm.
*/
if (tcp->tcp_snd_sack_ok &&
} else {
/*
* We know that one more packet has
* left the pipe thus we can update
* cwnd.
*/
flags |= TH_XMIT_NEEDED;
}
}
}
} else if (tcp->tcp_zero_win_probe) {
/*
* If the window has opened, need to arrange
* to send additional data.
*/
if (new_swnd != 0) {
/* tcp_suna != tcp_snxt */
/* Packet contains a window update */
tcp->tcp_zero_win_probe = 0;
tcp->tcp_timer_backoff = 0;
tcp->tcp_ms_we_have_waited = 0;
/*
* Transmit starting with tcp_suna since
* the one byte probe is not ack'ed.
* If TCP has sent more than one identical
* probe, tcp_rexmit will be set. That means
* tcp_ss_rexmit() will send out the one
* byte along with new data. Otherwise,
* fake the retransmission.
*/
flags |= TH_XMIT_NEEDED;
if (!tcp->tcp_rexmit) {
tcp->tcp_dupack_cnt = 0;
}
}
}
goto swnd_update;
}
/*
* Check for "acceptability" of ACK value per RFC 793, pages 72 - 73.
* If the ACK value acks something that we have not yet sent, it might
* be an old duplicate segment. Send an ACK to re-synchronize the
* other side.
* Note: reset in response to unacceptable ACK in SYN_RECEIVE
* state is handled above, so we can always just drop the segment and
* send an ACK here.
*
* Should we send ACKs in response to ACK only segments?
*/
/* drop the received segment */
/* Send back an ACK. */
return;
}
return;
}
/*
* TCP gets a new ACK, update the notsack'ed list to delete those
* blocks that are covered by this ACK.
*/
}
/*
* If we got an ACK after fast retransmit, check to see
* if it is a partial ACK. If it is not and the congestion
* window was inflated to account for the other side's
* cached packets, retract it. If it is, do Hoe's algorithm.
*/
tcp->tcp_dupack_cnt = 0;
/*
* Restore the orig tcp_cwnd_ssthresh after
* fast retransmit phase.
*/
}
tcp->tcp_cwnd_cnt = 0;
/*
* Remove all notsack info to avoid confusion with
* the next fast retrasnmit/recovery phase.
*/
if (tcp->tcp_snd_sack_ok &&
}
} else {
if (tcp->tcp_snd_sack_ok &&
} else {
/*
* Hoe's algorithm:
*
* Retransmit the unack'ed segment and
* restart fast recovery. Note that we
* need to scale back tcp_cwnd to the
* original value when we started fast
* recovery. This is to prevent overly
* aggressive behaviour in sending new
* segments.
*/
}
}
} else {
tcp->tcp_dupack_cnt = 0;
if (tcp->tcp_rexmit) {
/*
* TCP is retranmitting. If the ACK ack's all
* outstanding data, update tcp_rexmit_max and
* tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
* to the correct value.
*
* Note that SEQ_LEQ() is used. This is to avoid
* unnecessary fast retransmit caused by dup ACKs
* received when TCP does slow start retransmission
* after a time out. During this phase, TCP may
* send out segments which are already received.
* This causes dup ACKs to be sent back.
*/
}
flags |= TH_XMIT_NEEDED;
}
} else {
}
tcp->tcp_ms_we_have_waited = 0;
}
}
if (tcp->tcp_zero_win_probe != 0) {
tcp->tcp_zero_win_probe = 0;
tcp->tcp_timer_backoff = 0;
}
/*
* If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
* Note that it cannot be the SYN being ack'ed. The code flow
* will not reach here.
*/
goto fin_acked;
}
/*
* Update the congestion window.
*
* If TCP is not ECN capable or TCP is ECN capable but the
* congestion experience bit is not set, increase the tcp_cwnd as
* usual.
*/
/*
* This is to prevent an increase of less than 1 MSS of
* tcp_cwnd. With partial increase, tcp_wput_data()
* may send out tinygrams in order to preserve mblk
* boundaries.
*
* By initializing tcp_cwnd_cnt to new tcp_cwnd and
* decrementing it by 1 MSS for every ACKs, tcp_cwnd is
* increased by 1 MSS for every RTTs.
*/
if (tcp->tcp_cwnd_cnt <= 0) {
} else {
add = 0;
}
}
}
/* Can we update the RTT estimates? */
if (tcp->tcp_snd_ts_ok) {
/* Ignore zero timestamp echo-reply. */
if (tcpopt.tcp_opt_ts_ecr != 0) {
}
/* If needed, restart the timer. */
tcp->tcp_set_timer = 0;
}
/*
* Update tcp_csuna in case the other side stops sending
* us timestamps.
*/
/*
* An ACK sequence we haven't seen before, so get the RTT
* and update the RTO.
* Note. use uintptr_t to suppress the gcc warning.
*/
/* Remeber the last sequence to be ACKed */
tcp->tcp_set_timer = 0;
}
} else {
}
/* Eat acknowledged bytes off the xmit queue. */
for (;;) {
if (bytes_acked < 0) {
break;
}
if (bytes_acked == 0) {
/* Everything is ack'ed, clear the tail. */
goto pre_swnd_update;
}
break;
break;
}
/*
* More was acked but there is nothing more
* outstanding. This means that the FIN was
* just acked or that we're talking to a clown.
*/
if (tcp->tcp_fin_sent) {
} else {
/*
* We should never got here because
* we have already checked that the
* number of bytes ack'ed should be
* smaller than or equal to what we
* have sent so far (it is the
* acceptability check of the ACK).
* We can only get here if the send
* queue is corrupted.
*
* Terminate the connection and
* panic the system. It is better
* for us to panic instead of
* continuing to avoid other disaster.
*/
printf("Memory corruption "
"detected for connection %s.\n",
/* We should never get here... */
prom_panic("tcp_rput_data");
return;
}
goto pre_swnd_update;
}
}
if (tcp->tcp_unsent) {
flags |= TH_XMIT_NEEDED;
}
/*
* The following check is different from most other implementations.
* For bi-directional transfer, when segments are dropped, the
* "normal" check will not accept a window update in those
* retransmitted segemnts. Failing to do that, TCP may send out
* segments which are outside receiver's window. As TCP accepts
* the ack in those retransmitted segments, if the window update in
* the same segment is not accepted, TCP will incorrectly calculates
* that it can send more segments. This can create a deadlock
* with the receiver if its window becomes zero.
*/
/*
* The criteria for update is:
*
* 1. the segment acknowledges some data. Or
* 2. the segment is new, i.e. it has a higher seq num. Or
* 3. the segment is not old and the advertised window is
* larger than the previous advertised window.
*/
flags |= TH_XMIT_NEEDED;
}
est:
case TCPS_FIN_WAIT_1:
if (tcp->tcp_fin_acked) {
/*
* FIN_WAIT_2 flushing algorithm.
* If there is no user attached to this
* TCP endpoint, then this TCP struct
* could hang around forever in FIN_WAIT_2
* state if the peer forgets to send us
* a FIN. To prevent this, we wait only
* 2*MSL (a convenient time value) for
* the FIN to arrive. If it doesn't show up,
* we flush the TCP endpoint. This algorithm,
* though a violation of RFC-793, has worked
* for over 10 years in BSD systems.
* Note: SunOS 4.x waits 675 seconds before
* flushing the FIN_WAIT_2 connection.
*/
}
break;
case TCPS_FIN_WAIT_2:
break; /* Shutdown hook? */
case TCPS_LAST_ACK:
if (tcp->tcp_fin_acked) {
return;
}
goto xmit_check;
case TCPS_CLOSING:
if (tcp->tcp_fin_acked) {
}
/*FALLTHRU*/
case TCPS_CLOSE_WAIT:
goto xmit_check;
default:
break;
}
}
/* Make sure we ack the fin */
flags |= TH_ACK_NEEDED;
if (!tcp->tcp_fin_rcvd) {
case TCPS_SYN_RCVD:
case TCPS_ESTABLISHED:
/* Keepalive? */
break;
case TCPS_FIN_WAIT_1:
if (!tcp->tcp_fin_acked) {
break;
}
/* FALLTHRU */
case TCPS_FIN_WAIT_2:
if (seg_len) {
/*
* implies data piggybacked on FIN.
* break to handle data.
*/
break;
}
goto ack_check;
}
}
}
goto xmit_check;
if (seg_len == 0) {
goto xmit_check;
}
/*
* The header has been consumed, so we remove the
* zero-length mblk here.
*/
}
/*
* ACK every other segments, unless the input queue is empty
* as we don't have a timer available.
*/
flags |= TH_ACK_NEEDED;
tcp->tcp_rack_cnt = 0;
}
/* Update SACK list */
&(tcp->tcp_num_sack_blk));
}
if (tcp->tcp_listener) {
/*
* Side queue inbound data until the accept happens.
* tcp_accept/tcp_rput drains this when the accept happens.
*/
} else {
/* Just queue the data until the app calls read. */
/*
* Make sure the timer is running if we have data waiting
* for a push bit. This provides resiliency against
* implementations that do not correctly generate push bits.
*/
flags |= TH_TIMER_NEEDED;
}
/* Is there anything left to do? */
return;
/* Any transmit work to do and a non-zero window? */
if (flags & TH_REXMIT_NEEDED) {
B_TRUE);
/* use uintptr_t to suppress the gcc warning */
}
}
if (flags & TH_NEED_SACK_REXMIT) {
flags |= TH_XMIT_NEEDED;
}
}
/*
* For TH_LIMIT_XMIT, tcp_wput_data() is called to send
* out new segment. Note that tcp_rexmit should not be
* set, otherwise TH_LIMIT_XMIT should not be set.
*/
if (!tcp->tcp_rexmit) {
} else {
}
/*
* The TCP could be closed in tcp_state_wait via
* tcp_wput_data (tcp_ss_rexmit could call
* tcp_wput_data as well).
*/
return;
}
/*
* Adjust tcp_cwnd back to normal value after sending
* new data segments.
*/
if (flags & TH_LIMIT_XMIT) {
}
/* Anything more to do? */
return;
}
if (flags & TH_ACK_NEEDED) {
/*
* Time to send an ack for some reason.
*/
}
}
}
/*
 * NOTE(review): the function's loop structure and most statements are
 * elided in this extract (signature line missing, braces unbalanced).
 * Code left byte-identical; verify against the full original.
 */
/*
* tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
* retransmission after a timeout.
*
* To limit the number of duplicate segments, we limit the number of segment
* to be sent in one time to tcp_snd_burst, the burst variable.
*/
static void
{
/*
* Note that tcp_rexmit can be set even though TCP has retransmitted
* all unack'ed segments.
*/
}
}
}
return;
/*
* Update the send timestamp to avoid false
* retransmission.
* Note. use uintptr_t to suppress the gcc warning.
*/
burst--;
}
/*
* If we have transmitted all we have at the time
* we started the retranmission, we can leave
* the rest of the job to tcp_wput_data(). But we
* need to check the send window first. If the
* win is not 0, go on with tcp_wput_data().
*/
return;
}
}
/* Only call tcp_wput_data() if there is data to be sent. */
if (tcp->tcp_unsent) {
}
}
/*
 * NOTE(review): large parts of this timer routine are elided in this
 * extract (the switch header, many expressions, and several labels
 * are missing; braces are unbalanced).  Code left byte-identical.
 */
/*
* tcp_timer is the timer service routine. It handles all timer events for
* a tcp instance except keepalives. It figures out from the state of the
* tcp instance what kind of action needs to be done at the time it is called.
*/
static void
{
case TCPS_IDLE:
case TCPS_BOUND:
case TCPS_LISTEN:
return;
case TCPS_SYN_RCVD:
case TCPS_SYN_SENT:
break;
case TCPS_ESTABLISHED:
case TCPS_FIN_WAIT_1:
case TCPS_CLOSING:
case TCPS_CLOSE_WAIT:
case TCPS_LAST_ACK:
/* If we have data to rexmit */
break;
/* use uintptr_t to suppress the gcc warning */
if (time_to_wait > 0) {
/*
* Timer fired too early, so restart it.
*/
return;
}
/*
* When we probe zero windows, we force the swnd open.
* If our peer acks with a closed window swnd will be
* set to zero by tcp_rput(). As long as we are
* receiving acks tcp_rput will
* reset 'tcp_ms_we_have_waited' so as not to trip the
* first and second interval actions. NOTE: the timer
* interval is allowed to continue its exponential
* backoff.
*/
break;
} else {
/*
* After retransmission, we need to do
* slow start. Set the ssthresh to one
* half of current effective window and
* cwnd to one MSS. Also reset
* tcp_cwnd_cnt.
*
* Note that if tcp_ssthresh is reduced because
* of ECN, do not reduce it again unless it is
* already one window of data away (tcp_cwr
* should then be cleared) or this is a
* timeout for a retransmitted segment.
*/
if (npkt < 2)
npkt = 2;
}
tcp->tcp_cwnd_cnt = 0;
if (tcp->tcp_ecn_ok) {
}
}
break;
}
/*
* We have something to send yet we cannot send. The
* reason can be:
*
* 1. Zero send window: we need to do zero window probe.
* 2. Zero cwnd: because of ECN, we need to "clock out
* segments.
* 3. SWS avoidance: receiver may have shrunk window,
* reset our knowledge.
*
* Note that condition 2 can happen with either 1 or
* 3. But 1 and 3 are exclusive.
*/
if (tcp->tcp_unsent != 0) {
/*
* Set tcp_cwnd to 1 MSS so that a
* new segment can be sent out. We
* are "clocking out" new data when
* the network is really congested.
*/
}
/* Extend window for zero window probe */
} else {
/*
* Handle timeout from sender SWS avoidance.
* Reset our knowledge of the max send window
* since the receiver might have reduced its
* receive buffer. Avoid setting tcp_max_swnd
* to one since that will essentially disable
* the SWS checks.
*
* Note that since we don't have a SWS
* state variable, if the timeout is set
* for ECN but not for SWS, this
* code will also be executed. This is
* fine as tcp_max_swnd is updated
* constantly and it will not affect
* anything.
*/
}
return;
}
/* Is there a FIN that needs to be to re retransmitted? */
!tcp->tcp_fin_acked)
break;
/* Nothing to do, return without restarting timer. */
return;
case TCPS_FIN_WAIT_2:
/*
* User closed the TCP endpoint and peer ACK'ed our FIN.
* We waited some time for for peer's FIN, but it hasn't
* arrived. We flush the connection now to avoid
* case where the peer has rebooted.
*/
/* FALLTHRU */
case TCPS_TIME_WAIT:
return;
default:
return;
}
/*
* For zero window probe, we need to send indefinitely,
* unless we have not heard from the other side for some
* time...
*/
if ((tcp->tcp_zero_win_probe == 0) ||
second_threshold)) {
/*
* If TCP is in SYN_RCVD state, send back a
* RST|ACK as BSD does. Note that tcp_zero_win_probe
* should be zero in TCPS_SYN_RCVD state.
*/
tcp_xmit_ctl("tcp_timer: RST sent on timeout "
"in SYN_RCVD",
}
return;
} else {
/*
* Set tcp_ms_we_have_waited to second_threshold
* so that in next timeout, we will do the above
* check (lbolt - tcp_last_recv_time). This is
* also to avoid overflow.
*
* We don't need to decrement tcp_timer_backoff
* to avoid overflow because it will be decremented
* later if new timeout value is greater than
* tcp_rexmit_interval_max. In the case when
* tcp_rexmit_interval_max is greater than
* second_threshold, it means that we will wait
* longer than second_threshold to send the next
* window probe.
*/
}
/*
* We have been retransmitting for too long... The RTT
* we calculated is probably incorrect. Reinitialize it.
* Need to compensate for 0 tcp_rtt_sa. Reset
* tcp_rtt_update so that we won't accidentally cache a
* bad value. But only do this if this is not a zero
* window probe.
*/
if (tcp->tcp_zero_win_probe == 0) {
tcp->tcp_rtt_sa = 0;
tcp->tcp_rtt_update = 0;
}
}
tcp->tcp_timer_backoff++;
/*
* This means the original RTO is tcp_rexmit_interval_min.
* So we will use tcp_rexmit_interval_min as the RTO value
* and do the backoff.
*/
} else {
}
if (ms > tcp_rexmit_interval_max) {
/*
* ms is at max, decrement tcp_timer_backoff to avoid
* overflow.
*/
tcp->tcp_timer_backoff--;
}
if (tcp->tcp_zero_win_probe == 0) {
}
/*
* This is after a timeout and tcp_rto is backed off. Set
* tcp_set_timer to 1 so that next time RTO is updated, we will
* restart the timer with a correct value.
*/
/* use uintptr_t to suppress the gcc warning */
}
B_TRUE);
return;
/* Dump the packet when debugging. */
/*
* When slow start after retransmission begins, start with
* this seq no. tcp_rexmit_max marks the end of special slow
* start phase. tcp_snd_burst controls how many segments
* can be sent because of an ack.
*/
(tcp->tcp_unsent == 0)) {
} else {
}
tcp->tcp_dupack_cnt = 0;
/*
* Remove all rexmit SACK blk to start from fresh.
*/
tcp->tcp_num_notsack_blk = 0;
tcp->tcp_cnt_notsack_list = 0;
}
}
/*
* The TCP normal data output path.
* NOTE: the logic of the fast path is duplicated from this function.
*/
static void
{
int len;
int tail_unsent;
int tcpstate;
int usable = 0;
int32_t num_sack_blk = 0;
#ifdef DEBUG
#endif
/* Really tacky... but we need this for detached closes. */
goto data_null;
}
/*
* Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
* or before a connection attempt has begun.
*
* The following should not happen in inetboot....
*/
printf("tcp_wput_data: data after ordrel, %s\n",
}
return;
}
/* Strip empties */
for (;;) {
if (len > 0)
break;
return;
}
}
/* If we are the first on the list ... */
} else {
}
/* Tack on however many more positive length mblks we have */
do {
int tlen;
if (tlen <= 0) {
} else {
}
}
/*
* Note that tcp_mss has been adjusted to take into account the
* timestamp option if applicable. Because SACK options do not
* appear in every TCP segment and they are of variable lengths,
* they cannot be included in tcp_mss. Thus we need to calculate
* the actual segment length when we need to send a segment which
* includes SACK options.
*/
2 + TCPOPT_HEADER_LEN;
} else {
}
}
if (tcpstate == TCPS_SYN_RCVD) {
/*
* The three-way connection establishment handshake is not
* complete yet. We want to queue the data for transmission
* after entering ESTABLISHED state (RFC793). Setting usable to
* zero causes a jump to the "done" label, effectively leaving data
* on the queue.
*/
usable = 0;
} else {
/*
* In the special case when cwnd is zero, which can only
* happen if the connection is ECN capable, return now.
* New segments are sent using tcp_timer(). The timer
* is set in tcp_rput_data().
*/
/*
* Note that tcp_cwnd is 0 before 3-way handshake is
* finished.
*/
return;
}
/* usable = MIN(swnd, cwnd) - unacked_bytes */
/* NOTE: trouble if xmitting while SYN not acked? */
/* usable = MIN(usable, unsent) */
/* usable = MAX(usable, {1 for urgent, 0 for data}) */
if (usable_r != 0)
}
/* use uintptr_t to suppress the gcc warning */
/*
* "Our" Nagle Algorithm. This is not the same as in the old
* BSD. This is more in line with the true intent of Nagle.
*
* The conditions are:
* 1. The amount of unsent data (or amount of data which can be
* sent, whichever is smaller) is less than Nagle limit.
* 2. The last sent size is also less than Nagle limit.
* 3. There is unack'ed data.
* 4. Urgent pointer is not set. Send urgent data ignoring the
* Nagle algorithm. This reduces the probability that urgent
* bytes get "merged" together.
* 5. The app has not closed the connection. This eliminates the
* wait time of the receiving side waiting for the last piece of
* (small) data.
*
* If all are satisfied, exit without sending anything. Note
* that Nagle limit can be smaller than 1 MSS. Nagle limit is
* the smaller of 1 MSS and global tcp_naglim_def (default to be
* 4095).
*/
goto done;
for (;;) {
if (num_burst_seg-- == 0)
goto done;
if (len <= 0) {
/* Terminate the loop */
goto done;
}
/*
* Sender silly-window avoidance.
* Ignore this if we are going to send a
* zero window probe out.
*
* TODO: force data into microscopic window ??
* ==> (!pushed || (unsent > usable))
*/
/*
* If the retransmit timer is not running
* we start it so that we will retransmit
* in the case when the receiver has
* decremented the window.
*/
/*
* We are not supposed to send
* anything. So let's wait a little
* bit longer before breaking SWS
* avoidance.
*
* What should the value be?
* Suggestion: MAX(init rexmit time,
* tcp->tcp_rto)
*/
}
goto done;
}
}
if (usable > 0)
else
if (tcp->tcp_valid_bits) {
if (tail_unsent == 0) {
} else {
}
/* Restore tcp_snxt so we get amount sent right. */
else
break;
}
/* Dump the packet when debugging. */
continue;
}
if (tail_unsent) {
/* Are the bytes above us in flight? */
tail_unsent -= len;
len += tcp_hdr_len;
if (!mp)
break;
goto must_alloc;
}
} else {
}
tail_unsent -= len;
len += tcp_hdr_len;
goto out_of_mem;
len = tcp_hdr_len;
/*
* There are four reasons to allocate a new hdr mblk:
* 1) The bytes above us are in use by another packet
* 2) We don't have good alignment
* 3) The mblk is being shared
* 4) We don't have enough room for a header
*/
/* NOTE: we assume allocb returns an OK_32PTR */
tcp_wroff_xtra, 0);
goto out_of_mem;
}
/* Leave room for Link Level header */
len = tcp_hdr_len;
}
if (tcp->tcp_snd_ts_ok) {
/* use uintptr_t to suppress the gcc warning */
} else {
}
/* Copy the template header. */
if (len -= 40) {
len >>= 2;
dst += 10;
src += 10;
do {
} while (--len);
}
/*
* Set tcph to point to the header of the outgoing packet,
* not to the template header.
*/
/*
* Set the ECN info in the TCP header if it is not a zero
* window probe. Zero window probe is only sent in
* tcp_wput_data() and tcp_timer().
*/
if (tcp->tcp_ecn_echo_on)
}
}
/* Fill in SACK options */
if (num_sack_blk > 0) {
int32_t i;
wptr[0] = TCPOPT_NOP;
sizeof (sack_blk_t);
for (i = 0; i < num_sack_blk; i++) {
}
<< 4);
}
if (tail_unsent) {
/*
* If we're a little short, tack on more mblks
* as long as we don't need to split an mblk.
*/
while (tail_unsent < 0 &&
/* Stash for rtt use later */
goto out_of_mem;
}
}
/* Trim back any surplus on the last mblk */
if (tail_unsent > 0)
if (tail_unsent < 0) {
/*
* We did not send everything we could in
* order to preserve mblk boundaries.
*/
usable -= tail_unsent;
snxt += tail_unsent;
/* Adjust the IP length field. */
tail_unsent = 0;
}
}
goto out_of_mem;
/*
* Performance hit! We need to pullup the whole message
* in order to do checksum and for the MAC output routine.
*/
int mp_size;
#ifdef DEBUG
#endif
}
}
}
/* Pretend that all we were trying to send really got sent */
if (tail_unsent < 0) {
do {
} while (tail_unsent < 0);
}
done:;
if (len) {
/*
* If new data was sent, need to update the notsack
* list, which is, afterall, data blocks that have
* not been sack'ed by the receiver. New data is
* not sack'ed.
*/
/* len is a negative value. */
&(tcp->tcp_num_notsack_blk),
&(tcp->tcp_cnt_notsack_list));
}
tcp->tcp_rack_cnt = 0;
}
/*
* Note that len is the amount we just sent but with a negative
* sign. We update tcp_unsent here since we may come back to
* tcp_wput_data from tcp_state_wait.
*/
/*
* Let's wait till all the segments have been acked, since we
* don't have a timer.
*/
return;
/*
* Didn't send anything. Make sure the timer is running
* so that we will probe a zero window.
*/
}
/* Note that len is the amount we just sent but with a negative sign */
}
static void
int sock_id)
{
#ifdef DEBUG
printf("Time wait processing called ###############3\n");
#endif
/* Just make sure we send the right sock_id to tcp_clean_death */
sock_id = -1;
if (tcp->tcp_snd_ts_ok) {
return;
}
}
if (gap < 0) {
return;
}
/*
* When TCP receives a duplicate FIN in
* TIME_WAIT state, restart the 2 MSL timer.
* See page 73 in RFC 793. Make sure this TCP
* is already on the TIME_WAIT list. If not,
* just restart the timer.
*/
return;
}
flags |= TH_ACK_NEEDED;
seg_len = 0;
goto process_ack;
}
/* Fix seg_seq, and chew the gap off the front. */
}
/*
* Make sure that when we accept the connection, pick
* an ISS greater than (tcp_snxt + ISS_INCR/2) for the
* old connection.
*
* The next ISS generated is equal to tcp_iss_incr_extra
* + ISS_INCR/2 + other components depending on the
* value of tcp_strong_iss. We pre-calculate the new
* ISS here and compare with tcp_snxt to determine if
* we need to make adjustment to tcp_iss_incr_extra.
*
* Note that since we are now in the global queue
* perimeter and need to do a lateral_put() to the
* listener queue, there can be other connection requests/
* attempts while the lateral_put() is going on. That
* means what we calculate here may not be correct. This
* is extremely difficult to solve unless TCP and IP
* modules are merged and there is no perimeter, but just
* locks. The above calculation is ugly and is a
* waste of CPU cycles...
*/
/* Add time component and min random (i.e. 1). */
/*
* New ISS not guaranteed to be ISS_INCR/2
* ahead of the current tcp_snxt, so add the
* difference to tcp_iss_incr_extra.
*/
}
/*
* This is a passive open. Right now we do not
* do anything...
*/
return;
}
/*
* rgap is the amount of stuff received out of window. A negative
* value is the amount out of window.
*/
if (rgap < 0) {
/* Fix seg_len and make sure there is something left. */
if (seg_len <= 0) {
return;
}
flags |= TH_ACK_NEEDED;
seg_len = 0;
goto process_ack;
}
}
/*
* Check whether we can update tcp_ts_recent. This test is
* NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP
* Extensions for High Performance: An Update", Internet Draft.
*/
if (tcp->tcp_snd_ts_ok &&
}
/* Always ack out of order packets */
flags |= TH_ACK_NEEDED;
seg_len = 0;
} else if (seg_len > 0) {
}
return;
}
/*
* Do not delete the TCP structure if it is in
* TIME_WAIT state. Refer to RFC 1122, 4.2.2.13.
*/
return;
}
if (bytes_acked <= 0) {
if (bytes_acked == 0 && seg_len == 0 &&
} else {
/* Acks something not sent */
flags |= TH_ACK_NEEDED;
}
}
if (flags & TH_ACK_NEEDED) {
/*
* Time to send an ack for some reason.
*/
}
}
static int
{
int err;
/*
* Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
* will be close to tcp_rexmit_interval_initial. By doing this, we
* allow the algorithm to adjust slowly to large fluctuations of RTT
* during first few transmissions of a connection as seen in slow
* links.
*/
tcp->tcp_timer_backoff = 0;
tcp->tcp_ms_we_have_waited = 0;
/* For Ethernet, the mtu returned is actually 1550... */
if (mac_get_type() == IFT_ETHER) {
} else {
}
/*
* Fix it to tcp_ip_abort_linterval later if it turns out to be a
* passive open.
*/
/* NOTE: ISS is now set in tcp_adapt_ire(). */
/* Initialize the header template */
}
if (err)
return (err);
/*
* Init the window scale to the max so tcp_rwnd_set() won't pare
* down tcp_rwnd. tcp_adapt_ire() will set the right value later.
*/
}
return (0);
}
/*
* Initialize the IPv4 header. Loses any record of any IP options.
*/
static int
{
/*
* This is a simple initialization. If there's
* already a template, it should never be too small,
* so reuse it. Otherwise, allocate space for the new one.
*/
} else {
tcp->tcp_iphc_len = 0;
return (ENOMEM);
}
}
/*
* Note that it does not include TCP options yet. It will
* after the connection is established.
*/
/* We don't support IP options... */
/* We are not supposed to do PMTU discovery... */
return (0);
}
/*
* Send out a control packet on the tcp connection specified. This routine
* is typically called where we need a simple ACK or RST generated.
*
* This function is called with or without a mp.
*/
static void
{
int tcp_hdr_len;
int tcp_ip_hdr_len;
if (mp) {
assert(ip_hdr_len != 0);
/* Don't reply to a RST segment. */
return;
}
} else {
assert(ip_hdr_len == 0);
}
/* If a text string is passed in with the request, print it out. */
dprintf("tcp_xmit_ctl(%d): '%s', seq 0x%x, ack 0x%x, "
}
return;
}
/*
* Don't send TSopt w/ TH_RST packets per RFC 1323.
*/
}
}
if (tcp->tcp_snd_ts_ok) {
}
tcp->tcp_rack_cnt = 0;
}
}
/* Generate an ACK-only (no data) segment for a TCP endpoint */
static mblk_t *
{
if (tcp->tcp_valid_bits) {
/*
* For the complex case where we have to send some
* controls (FIN or SYN), let tcp_xmit_mp do it.
* When sending an ACK-only segment (no data)
* into a zero window, always set the seq number to
* suna, since snxt will be extended past the window.
* If we used snxt, the receiver might consider the ACK
* unacceptable.
*/
(tcp->tcp_zero_win_probe) ?
} else {
/* Generate a simple ACK */
int32_t num_sack_blk = 0;
/*
* Allocate space for TCP + IP headers
* and link-level header
*/
} else {
}
return (NULL);
/* copy in prototype TCP + IP header */
/*
* Set the TCP sequence number.
* When sending an ACK-only segment (no data)
* into a zero window, always set the seq number to
* suna, since snxt will be extended past the window.
* If we used snxt, the receiver might consider the ACK
* unacceptable.
*/
/* Set up the TCP flag field. */
if (tcp->tcp_ecn_echo_on)
tcp->tcp_rack_cnt = 0;
/* fill in timestamp option if in use */
if (tcp->tcp_snd_ts_ok) {
}
/* Fill in SACK options */
if (num_sack_blk > 0) {
int32_t i;
wptr[0] = TCPOPT_NOP;
sizeof (sack_blk_t);
for (i = 0; i < num_sack_blk; i++) {
}
<< 4);
}
return (mp1);
}
}
/*
* tcp_xmit_mp is called to return a pointer to an mblk chain complete with
* ip and tcp header ready to pass down to IP. If the mp passed in is
* non-NULL, then up to max_to_send bytes of data will be dup'ed off that
* mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
* otherwise it will dup partial mblks.)
* Otherwise, an appropriate ACK packet will be generated. This
* routine is not usually called to send new data for the first time. It
* is mostly called out of the timer for retransmits, and to generate ACKs.
*
* If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
* be adjusted by *offset. And after dupb(), the offset and the ending mblk
* of the original mblk chain will be returned in *offset and *end_mp.
*/
static mblk_t *
{
int data_length;
int32_t num_sack_blk = 0;
int32_t sack_opt_len = 0;
/* Allocate for our maximum TCP header + link-level */
tcp_wroff_xtra, 0);
return (NULL);
data_length = 0;
/*
* Note that tcp_mss has been adjusted to take into account the
* timestamp option if applicable. Because SACK options do not
* appear in every TCP segment and they are of variable lengths,
* they cannot be included in tcp_mss. Thus we need to calculate
* the actual segment length when we need to send a segment which
* includes SACK options.
*/
}
/* We use offset as an indicator that end_mp is not NULL. */
}
/* This could be faster with cooperation from downstream */
/*
* Don't send the next mblk since the whole mblk
* does not fit.
*/
break;
return (NULL);
}
if (data_length > max_to_send) {
break;
} else {
off = 0;
}
}
}
*seg_len = data_length;
}
/*
* Use tcp_unsent to determine if the PUSH bit should be used assumes
* that this function was called from tcp_wput_data. Thus, when called
* to retransmit data the setting of the PUSH bit may appear some
* what random in that it might get set when it should not. This
* should not pose any performance issues.
*/
} else {
}
if (tcp->tcp_ecn_ok) {
if (tcp->tcp_ecn_echo_on)
/*
* Only set ECT bit and ECN_CWR if a segment contains new data.
* There is no TCP flow control for non-data segments, and
* only data segment is transmitted reliably.
*/
if (data_length > 0 && !rexmit) {
}
}
}
if (tcp->tcp_valid_bits) {
/*
* Tack on the MSS option. It is always needed
* for both active and passive open.
*/
wptr[0] = TCPOPT_MAXSEG;
wptr += 2;
/*
* MSS option value should be interface MTU - MIN
*/
/* Update the offset to cover the additional word */
/*
* Note that the following way of filling in
* TCP options are not optimal. Some NOPs can
* be saved. But there is no need at this time
* to optimize it. When it is needed, we will
* do it.
*/
case TCPS_SYN_SENT:
if (tcp->tcp_snd_ws_ok) {
wptr[0] = TCPOPT_NOP;
tcph->th_offset_and_rsrvd[0] +=
(1 << 4);
}
if (tcp->tcp_snd_ts_ok) {
llbolt = prom_gettime();
wptr[0] = TCPOPT_NOP;
wptr += 4;
wptr += 4;
U32_TO_BE32(0L, wptr);
tcph->th_offset_and_rsrvd[0] +=
(3 << 4);
}
if (tcp->tcp_snd_sack_ok) {
wptr[0] = TCPOPT_NOP;
tcph->th_offset_and_rsrvd[0] +=
(1 << 4);
}
/*
* Set up all the bits to tell other side
* we are ECN capable.
*/
if (tcp->tcp_ecn_ok) {
}
break;
case TCPS_SYN_RCVD:
if (tcp->tcp_snd_ws_ok) {
wptr[0] = TCPOPT_NOP;
}
if (tcp->tcp_snd_sack_ok) {
wptr[0] = TCPOPT_NOP;
tcph->th_offset_and_rsrvd[0] +=
(1 << 4);
}
/*
* If the other side is ECN capable, reply
* that we are also ECN capable.
*/
if (tcp->tcp_ecn_ok) {
}
break;
default:
break;
}
/* allocb() of adequate mblk assures space */
}
if (!tcp->tcp_fin_acked) {
}
if (!tcp->tcp_fin_sent) {
case TCPS_SYN_RCVD:
case TCPS_ESTABLISHED:
break;
case TCPS_CLOSE_WAIT:
break;
}
}
}
}
tcp->tcp_rack_cnt = 0;
if (tcp->tcp_snd_ts_ok) {
}
}
if (num_sack_blk > 0) {
int32_t i;
wptr[0] = TCPOPT_NOP;
sizeof (sack_blk_t);
for (i = 0; i < num_sack_blk; i++) {
}
}
/*
* Performance hit! We need to pullup the whole message
* in order to do checksum and for the MAC output routine.
*/
int mp_size;
#ifdef DEBUG
#endif
}
}
/* Fill in the TTL field as it is 0 in the header template. */
return (mp1);
}
/*
* Generate a "no listener here" reset in response to the
* connection request contained within 'mp'
*/
static void
{
tcp_xmit_early_reset("no tcp, reset",
} else {
seg_len++;
}
}
/* Non overlapping byte exchanger */
static void
{
while (len-- > 0) {
}
}
/*
* Generate a reset based on an inbound packet for which there is no active
* tcp state that we can find.
*/
static void
{
int i;
dprintf("tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
}
/*
* We skip reversing source route here.
* (for now we replace all IP options with EOL)
*/
for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
/*
* Make sure that src address is not a limited broadcast
* address. Not all broadcast address checking for the
* src address is possible, since we don't know the
* netmask of the src addr.
* No check for destination address is done, since
* IP will not pass up a packet with a broadcast dest address
* to TCP.
*/
return;
}
return;
}
/*
* Now copy the original header to a new buffer. The reason
* for doing this is that we need to put extra room before
* the header for the MAC layer address. The original mblk
* does not have this extra head room.
*/
return;
}
}
/* Swap addresses */
/* Dump the packet when debugging. */
}
static void
{
int len;
/*
* Calculate the TCP checksum. Need to include the pseudo header,
* which is similar to the real IP header starting at the TTL field.
*/
}
static uint16_t
{
/*
* Compute Internet Checksum for "count" bytes
* beginning at location "addr".
*/
while (len > 1) {
/* This is the inner loop */
len -= 2;
}
/* Add left-over byte, if any */
if (len > 0)
/* Fold 32-bit sum to 16 bits */
while (sum >> 16)
}
/*
* Type three generator adapted from the random() function in 4.4 BSD:
*/
/*
* Copyright (c) 1983, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* Type 3 -- x**31 + x**3 + 1 */
#define DEG_3 31
#define SEP_3 3
/* Protected by tcp_random_lock */
static void
tcp_random_init(void)
{
int i;
/*
*
* XXX We don't have high resolution time in standalone... The
* following is just some approximation on the comment below.
*
* Use high-res timer and current time for seed. Gethrtime() returns
* a longlong, which may contain resolution down to nanoseconds.
* The current time will either be a 32-bit or a 64-bit quantity.
* XOR the two together in a 64-bit result variable.
* Convert the result to a 32-bit value by multiplying the high-order
* 32-bits by the low-order 32-bits.
*
* XXX We don't have gethrtime() in prom and the wallclock....
*/
hrt = prom_gettime();
tcp_random_state[0] = result;
for (i = 1; i < DEG_3; i++)
+ 12345;
tcp_random_rptr = &tcp_random_state[0];
for (i = 0; i < 10 * DEG_3; i++)
(void) tcp_random();
}
/*
* tcp_random: Return a random number in the range [1 - (128K + 1)].
* This range is selected to be approximately centered on TCP_ISS / 2,
* and easy to compute. We get this value by generating a 32-bit random
* number, selecting out the high-order 17 bits, and then adding one so
* that we never return zero.
*/
static int
tcp_random(void)
{
int i;
/*
* The high-order bits are more random than the low-order bits,
* so we select out the high-order 17 bits and add one so that
* we never return zero.
*/
if (++tcp_random_fptr >= tcp_random_end_ptr) {
} else if (++tcp_random_rptr >= tcp_random_end_ptr)
return (i);
}
/*
* Generate ISS, taking into account NDD changes may happen halfway through.
* (If the iss is not zero, set it.)
*/
static void
{
}
/*
* Diagnostic routine used to return a string associated with the tcp state.
* Note that if the caller does not supply a buffer, it will use an internal
* static string. This means that if multiple threads call this function at
* the same time, output can be corrupted... Note also that this function
* does not check the size of the supplied buffer. The caller has to make
* sure that it is big enough.
*/
static char *
{
char buf1[30];
char *buf;
char *cp;
char local_addrbuf[INET_ADDRSTRLEN];
char remote_addrbuf[INET_ADDRSTRLEN];
else
return ("NULL_TCP");
case TCPS_CLOSED:
cp = "TCP_CLOSED";
break;
case TCPS_IDLE:
cp = "TCP_IDLE";
break;
case TCPS_BOUND:
cp = "TCP_BOUND";
break;
case TCPS_LISTEN:
cp = "TCP_LISTEN";
break;
case TCPS_SYN_SENT:
cp = "TCP_SYN_SENT";
break;
case TCPS_SYN_RCVD:
cp = "TCP_SYN_RCVD";
break;
case TCPS_ESTABLISHED:
cp = "TCP_ESTABLISHED";
break;
case TCPS_CLOSE_WAIT:
cp = "TCP_CLOSE_WAIT";
break;
case TCPS_FIN_WAIT_1:
cp = "TCP_FIN_WAIT_1";
break;
case TCPS_CLOSING:
cp = "TCP_CLOSING";
break;
case TCPS_LAST_ACK:
cp = "TCP_LAST_ACK";
break;
case TCPS_FIN_WAIT_2:
cp = "TCP_FIN_WAIT_2";
break;
case TCPS_TIME_WAIT:
cp = "TCP_TIME_WAIT";
break;
default:
break;
}
switch (format) {
case DISP_ADDR_AND_PORT:
/*
* Note that we use the remote address in the tcp_b
* structure. This means that it will print out
* the real destination address, not the next hop's
* address if source routing is used.
*/
break;
case DISP_PORT_ONLY:
default:
break;
}
return (buf);
}
/*
* Add a new piece to the tcp reassembly queue. If the gap at the beginning
* is filled, return as much as we can. The message passed in may be
* multi-part, chained using b_cont. "start" is the starting sequence
* number for this piece.
*/
static mblk_t *
{
/* Walk through all the new pieces. */
do {
/* Empty. Blast it. */
continue;
}
if (!mp1) {
continue;
}
/* New stuff completely beyond tail? */
/* Link it on end. */
continue;
}
/* New stuff at the front? */
/* Yes... Check for overlap. */
continue;
}
/*
* The new piece fits somewhere between the head and tail.
* We find our slot, where mp1 precedes us and mp2 trails.
*/
break;
}
/* Link ourselves in */
/* Trim overlap with following mblk(s) first */
/* Trim overlap with preceding mblk */
/* Anything ready to go? */
return (NULL);
/* Eat what we can off the queue */
for (;;) {
TCP_REASS_SET_SEQ(mp1, 0);
TCP_REASS_SET_END(mp1, 0);
if (!mp) {
break;
}
break;
}
}
return (mp1);
}
/* Eliminate any overlap that mp may have over later mblks */
static void
{
break;
break;
}
}
if (!mp1)
}
/*
* Remove a connection from the list of detached TIME_WAIT connections.
*/
static void
{
if (tcp->tcp_time_wait_expire == 0) {
return;
}
if (tcp == tcp_time_wait_head) {
if (tcp_time_wait_head != NULL) {
} else {
}
} else if (tcp == tcp_time_wait_tail) {
} else {
}
tcp->tcp_time_wait_expire = 0;
}
/*
* Add a connection to the list of detached TIME_WAIT connections
* and set its time to expire ...
*/
static void
{
if (tcp->tcp_time_wait_expire == 0)
if (tcp_time_wait_head == NULL) {
} else {
}
/* for ndd stats about compression */
}
/*
* Periodic qtimeout routine run on the default queue.
* Performs 2 functions.
* 1. Does TIME_WAIT compression on all recently added tcps. List
* traversal is done backwards from the tail.
* 2. Blows away all tcps whose TIME_WAIT has expired. List traversal
* is done forwards from the head.
*/
void
tcp_time_wait_collector(void)
{
/*
* In order to reap time waits reliably, we should use a
* source of time that is not adjustable by the user
*/
now = prom_gettime();
/*
* Compare times using modular arithmetic, since
* lbolt can wrapover.
*/
break;
}
/*
* Note that the err must be 0 as there is no socket
* associated with this TCP...
*/
}
/* Schedule next run time. */
}
void
tcp_time_wait_report(void)
{
}
}
/*
* Send up all messages queued on tcp_rcv_list.
* Have to set tcp_co_norm since we use putnext.
*/
static void
{
int len;
/* Don't drain if the app has not finished reading all the data. */
return;
/* We might have come here just to updated the rwnd */
goto win_update;
return;
}
return;
}
}
tcp->tcp_rcv_cnt = 0;
/* This means that so_rcvbuf can be less than 0. */
/*
* Increase the receive window to max. But we need to do receiver
* SWS avoidance. This means that we need to check that the increase
* of the receive window is at least 1 MSS.
*/
}
}
/*
* Wrapper for recvfrom to call
*/
void
{
return;
}
/*
* If the inq == NULL and the tcp_rcv_list != NULL, we have data that
* recvfrom could read. Place a magic message in the inq to let recvfrom
* know that it needs to call tcp_rcv_drain_sock to pullup the data.
*/
static void
{
#ifdef DEBUG
printf("tcp_drain_needed: inq %x, tcp_rcv_list %x\n",
#endif
return;
return;
}
/*
* Queue data on tcp_rcv_list which is a b_next chain.
* Each element of the chain is a b_cont chain.
*
* M_DATA messages are added to the current element.
* Other messages are added as new (b_next) elements.
*/
static void
{
} else {
}
#ifdef DEBUG
#endif
}
/* The minimum of smoothed mean deviation in RTO calculation. */
#define TCP_SD_MIN 400
/*
* Set RTO for this connection. The formula is from Jacobson and Karels'
* "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
* are the same as those in Appendix A.2 of that paper.
*
* m = new measurement
* sa = smoothed RTT average (8 * average estimates).
* sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
*/
static void
{
tcp->tcp_rtt_update++;
/* tcp_rtt_sa is not 0 means this is a new sample. */
if (sa != 0) {
/*
* Update average estimator:
* new rtt = 7/8 old rtt + 1/8 Error
*/
/* m is now Error in estimate. */
m -= sa >> 3;
/*
* Don't allow the smoothed average to be negative.
* We use 0 to denote reinitialization of the
* variables.
*/
sa = 1;
}
/*
* Update deviation estimator:
* new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
*/
if (m < 0)
m = -m;
m -= sv >> 2;
sv += m;
} else {
/*
* This follows BSD's implementation. So the reinitialized
* RTO is 3 * m. We cannot go less than 2 because if the
* link is bandwidth dominated, doubling the window size
* during slow start means doubling the RTT. We want to be
* more conservative when we reinitialize our estimates. 3
* is just a convenient number.
*/
sa = m << 3;
sv = m << 1;
}
if (sv < TCP_SD_MIN) {
/*
* We do not know that if sa captures the delay ACK
* effect as in a long train of segments, a receiver
* does not delay its ACKs. So set the minimum of sv
* to be TCP_SD_MIN, which is default to 400 ms, twice
* of BSD DATO. That means the minimum of mean
* deviation is 100 ms.
*
*/
sv = TCP_SD_MIN;
}
/*
* RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
*
* Add tcp_rexmit_interval extra in case of extreme environment
* where the algorithm fails to work. The default value of
* tcp_rexmit_interval_extra should be 0.
*
* As we use a finer grained clock than BSD and update
* RTO for every ACKs, add in another .25 of RTT to the
* deviation of RTO to accommodate burstiness of 1/4 of
* window size.
*/
if (rto > tcp_rexmit_interval_max) {
} else if (rto < tcp_rexmit_interval_min) {
} else {
}
/* Now, we can reset tcp_timer_backoff to use the new RTO... */
tcp->tcp_timer_backoff = 0;
}
/*
* Initiate closedown sequence on an active connection.
* Return value zero for OK return, non-zero for error return.
*/
static int
{
/*
* Invalid state, only states TCPS_SYN_RCVD,
* TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
*/
return (-1);
}
/*
* If there is nothing more unsent, send the FIN now.
* Otherwise, it will go out with the last segment.
*/
if (tcp->tcp_unsent == 0) {
/* Dump the packet when debugging. */
} else {
/*
* Couldn't allocate msg. Pretend we got it out.
* Wait for rexmit timeout.
*/
}
/*
* If needed, update tcp_rexmit_snxt as tcp_snxt is
* changed.
*/
}
} else {
}
return (0);
}
int
{
switch (level) {
case SOL_SOCKET: {
switch (option) {
case SO_RCVBUF:
if (optlen == sizeof (int)) {
if (val > tcp_max_buf) {
break;
}
/* Silently ignore zero */
if (val != 0) {
}
} else {
}
break;
case SO_SNDBUF:
if (optlen == sizeof (int)) {
} else {
}
break;
case SO_LINGER:
} else {
tcp->tcp_linger = 0;
tcp->tcp_lingertime = 0;
}
} else {
}
break;
default:
errno = ENOPROTOOPT;
break;
}
break;
} /* case SOL_SOCKET */
case IPPROTO_TCP: {
switch (option) {
default:
errno = ENOPROTOOPT;
break;
}
break;
} /* case IPPROTO_TCP */
case IPPROTO_IP: {
switch (option) {
default:
errno = ENOPROTOOPT;
break;
}
break;
} /* case IPPROTO_IP */
default:
errno = ENOPROTOOPT;
break;
} /* switch (level) */
if (errno != 0)
return (-1);
else
return (0);
}