tcp_timers.c revision 3d0a255c417cf2e7b69e770de43f195b0eeffacb
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/squeue_impl.h>
#include <inet/tcp_impl.h>
/*
* Implementation of TCP Timers.
* =============================
*
* INTERFACE:
*
* There are two basic functions dealing with tcp timers:
*
* timeout_id_t tcp_timeout(connp, func, time)
* clock_t tcp_timeout_cancel(connp, timeout_id)
* TCP_TIMER_RESTART(tcp, intvl)
*
* tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
* after 'time' ticks have passed. The function called by timeout() must adhere
* the same restrictions as a driver soft interrupt handler - it must not sleep
* or call other functions that might sleep. The value returned is the opaque
* non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
* cancel the request. The call to tcp_timeout() may fail in which case it
* returns zero. This is different from the timeout(9F) function which never
* fails.
*
* The call-back function 'func' always receives 'connp' as its single
* argument. It is always executed in the squeue corresponding to the tcp
* structure. The tcp structure is guaranteed to be present at the time the
* call-back is called.
*
* NOTE: The call-back function 'func' is never called if tcp is in
* the TCPS_CLOSED state.
*
* tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
* request. Locks acquired by the call-back routine should not be held across
* the call to tcp_timeout_cancel() or a deadlock may result.
*
* tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
* Otherwise, it returns an integer value greater than or equal to 0. In
* particular, if the call-back function is already placed on the squeue, it can
* not be canceled.
*
* NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
* within squeue context corresponding to the tcp instance. Since the
* call-back is also called via the same squeue, there are no race
* conditions described in untimeout(9F) manual page since all calls are
* strictly serialized.
*
* TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
* stored in tcp_timer_tid and starts a new one using
* MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
* and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
* field.
*
* NOTE: since the timeout cancellation is not guaranteed, the cancelled
* call-back may still be called, so it is possible tcp_timer() will be
* called several times. This should not be a problem since tcp_timer()
* should always check the tcp instance state.
*
*
* IMPLEMENTATION:
*
* TCP timers are implemented using three-stage process. The call to
* tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
* when the timer expires. The tcp_timer_callback() arranges the call of the
* tcp_timer_handler() function via squeue corresponding to the tcp
* instance. The tcp_timer_handler() calls actual requested timeout call-back
* and passes tcp instance as an argument to it. Information is passed between
* stages using the tcp_timer_t structure which contains the connp pointer, the
* tcp call-back to call and the timeout id returned by the timeout(9F).
*
* The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
* like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
* mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
* returns the pointer to this mblk.
*
* The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
* looks like a normal mblk without actual dblk attached to it.
*
* To optimize performance each tcp instance holds a small cache of timer
* mblocks. In the current implementation it caches up to two timer mblocks per
* tcp instance. The cache is preserved over tcp frees and is only freed when
* the whole tcp structure is destroyed by its kmem destructor. Since all tcp
* timer processing happens on a corresponding squeue, the cache manipulation
* does not require any locks. Experiments show that majority of timer mblocks
* allocations are satisfied from the tcp cache and do not involve kmem calls.
*
* The tcp_timeout() places a refhold on the connp instance which guarantees
* that it will be present at the time the call-back function fires. The
* tcp_timer_handler() drops the reference after calling the call-back, so the
* call-back function does not need to manipulate the references explicitly.
*/
static void tcp_ip_notify(tcp_t *);
static void tcp_timer_callback(void *);
/*
* tim is in millisec.
*/
{
} else {
}
/*
* TCP timers are normal timeouts. Plus, they do not require more than
* a 10 millisecond resolution. By choosing a coarser resolution and by
* rounding up the expiration to the next resolution boundary, we can
* batch timers in the callout subsystem to make TCP timers more
* efficient. The roundup also protects short timers from expiring too
* early before they have a chance to be cancelled.
*/
return ((timeout_id_t)mp);
}
/*
 * First stage of the timer chain: invoked by timeout(9F) when the timer
 * fires (soft-interrupt context -- must not sleep).
 *
 * NOTE(review): the body of this function appears to have been lost in
 * this copy of the file. Per the IMPLEMENTATION comment above, it should
 * arrange for tcp_timer_handler() to run on the squeue corresponding to
 * the tcp instance -- confirm against the upstream source before use.
 */
static void
tcp_timer_callback(void *arg)
{
}
/* ARGSUSED */
static void
{
/*
* If the TCP has reached the closed state, don't proceed any
* further. This TCP logically does not exist on the system.
* tcpt_proc could, for example, access queues that have already
* been qprocoff'ed.
*/
} else {
tcp->tcp_timer_tid = 0;
}
}
/*
* There is potential race with untimeout and the handler firing at the same
* time. The mblock may be freed by the handler while we are trying to use
* it. But since both should execute on the same squeue, this race should not
* occur.
*/
{
return (-1);
if (delta >= 0) {
}
return (TICK_TO_MSEC(delta));
}
/*
* Allocate space for the timer event. The allocation looks like mblk, but it is
* not a proper mblk. To avoid confusion we set b_wptr to NULL.
*
* Dealing with failures: If we can't allocate from the timer cache we try
* allocating from dblock caches using allocb_tryhard(). In this case b_wptr
* points to b_rptr.
* If we can't allocate anything using allocb_tryhard(), we perform a last
* attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
* save the actual allocation size in b_datap.
*/
mblk_t *
tcp_timermp_alloc(int kmflags)
{
/*
* Failed to allocate memory for the timer. Try allocating from
* dblock caches.
*/
/* ipclassifier calls this from a constructor - hence no tcps */
/*
* Memory is really low. Try tryhard allocation.
*
* ipclassifier calls this from a constructor -
* hence no tcps
*/
}
}
/* ipclassifier calls this from a constructor - hence no tcps */
return (mp);
}
/*
* Free per-tcp timer cache.
* It can only contain entries from tcp_timercache.
*/
void
{
}
}
/*
* Free timer event. Put it on the per-tcp timer cache if there is not too many
* events there already (currently at most two events are cached).
* If the event is not allocated from the timer cache, free it right away.
*/
static void
{
/*
* This allocation is not from a timer cache, free it right
* away.
*/
else
/* Cache this timer block for future allocations */
} else {
}
}
/*
* Stop all TCP timers.
*/
void
{
if (tcp->tcp_timer_tid != 0) {
tcp->tcp_timer_tid = 0;
}
if (tcp->tcp_ka_tid != 0) {
tcp->tcp_ka_tid = 0;
}
if (tcp->tcp_ack_tid != 0) {
tcp->tcp_ack_tid = 0;
}
if (tcp->tcp_push_tid != 0) {
tcp->tcp_push_tid = 0;
}
if (tcp->tcp_reass_tid != 0) {
tcp->tcp_reass_tid = 0;
}
}
/*
* Timer callback routine for keepalive probe. We do a fake resend of
* last ACKed byte. Then set a timer using RTO. When the timer expires,
* check to see if we have heard anything from the other end for the last
* RTO period. If we have, set the timer to expire for another
* tcp_keepalive_intrvl and check again. If we have not, set a timer using
* RTO << 1 and check again when it expires. Keep exponentially increasing
* the timeout if we have not heard from the other side. If for more than
* (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
* kill the connection unless the keepalive abort threshold is 0. In
* that case, we will probe "forever."
* If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
* the exponential backoff, but send probes tcp_ka_cnt times in regular
* intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
* Kill the connection if we don't hear back from peer after tcp_ka_cnt
* probes are sent.
*/
void
tcp_keepalive_timer(void *arg)
{
tcp->tcp_ka_tid = 0;
return;
/*
* Keepalive probe should only be sent if the application has not
* done a close on the connection.
*/
return;
}
/* Timer fired too early, restart it. */
return;
}
/*
* If we have not heard from the other side for a long
* time, kill the connection unless the keepalive abort
* threshold is 0. In that case, we will probe "forever."
*/
if (tcp->tcp_ka_abort_thres != 0 &&
return;
}
/* Fake resend of last ACKed byte. */
/*
* if allocation failed, fall through to start the
* timer back.
*/
if (tcp->tcp_ka_rinterval) {
} else if (tcp->tcp_ka_last_intrvl != 0) {
int max;
/*
* We should probe again at least
* in ka_intrvl, but not more than
* tcp_rto_max.
*/
} else {
}
return;
}
}
} else {
tcp->tcp_ka_last_intrvl = 0;
}
/* firetime can be negative if (mp1 == NULL || mp == NULL) */
}
}
void
tcp_reass_timer(void *arg)
{
tcp->tcp_reass_tid = 0;
return;
}
}
/* This function handles the push timeout. */
void
tcp_push_timer(void *arg)
{
tcp->tcp_push_tid = 0;
}
/*
* This function handles delayed ACK timeout.
*/
void
tcp_ack_timer(void *arg)
{
tcp->tcp_ack_tid = 0;
return;
/*
* Do not send ACK if there is no outstanding unack'ed data.
*/
return;
}
/*
* Make sure we don't allow deferred ACKs to result in
* timer-based ACKing. If we have held off an ACK
* when there was more than an mss here, and the timer
* goes off, we have to worry about the possibility
* that the sender isn't doing slow-start, or is out
* of step with us for some other reason. We fall
* permanently back in the direction of
* ACK-every-other-packet as suggested in RFC 1122.
*/
tcp->tcp_rack_abs_max--;
}
}
}
/*
* Notify IP that we are having trouble with this connection. IP should
* make note so it can potentially use a different IRE.
*/
static void
{
/*
* Note: in the case of source routing we want to blow away the
* route to the first source route hop.
*/
/*
* As per RFC 1122, we send an RTM_LOSING to inform
* routing protocols.
*/
connp->conn_laddr_v4, 0, 0, 0,
}
(void) ire_no_good(ire);
}
}
/*
* tcp_timer is the timer service routine. It handles the retransmission,
* FIN_WAIT_2 flush, and zero window probe timeout events. It figures out
* from the state of the tcp instance what kind of action needs to be done
* at the time it is called.
*/
void
{
tcp->tcp_timer_tid = 0;
return;
case TCPS_IDLE:
case TCPS_BOUND:
case TCPS_LISTEN:
return;
case TCPS_SYN_RCVD: {
/* it's our first timeout */
/*
* Make this eager available for drop if we
* need to drop one to accommodate a new
* incoming SYN request.
*/
}
if (!listener->tcp_syn_defense &&
/* We may be under attack. Put on a defense. */
"rate! System (port %d) may be under a "
"SYN flood attack!",
IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
}
!tcp->tcp_closemp_used) {
/*
* This is our second timeout. Put the tcp in
* the list of droppable eagers to allow it to
* be dropped, if needed. We don't check
* whether tcp_dontdrop is set or not to
* protect ourselves from a SYN attack where a
* remote host can spoof itself as one of the
* good IP source and continue to hold
* resources too long.
*/
}
}
}
/* FALLTHRU */
case TCPS_SYN_SENT:
/*
* If an app has set the second_threshold to 0, it means that
* we need to retransmit forever, unless this is a passive
* open. We need to set second_threshold back to a normal
* value such that later comparison with it still makes
* sense. But we set dont_timeout to B_TRUE so that we will
* never time out.
*/
if (second_threshold == 0) {
if (tcp->tcp_active_open)
}
break;
case TCPS_ESTABLISHED:
case TCPS_CLOSE_WAIT:
/*
* If the end point has not been closed, TCP can retransmit
* forever. But if the end point is closed, the normal
* timeout applies.
*/
if (second_threshold == 0) {
}
/* FALLTHRU */
case TCPS_FIN_WAIT_1:
case TCPS_CLOSING:
case TCPS_LAST_ACK:
/* If we have data to rexmit */
if (!tcp->tcp_xmit_head)
break;
time_to_wait = ddi_get_lbolt() -
/*
* If the timer fires too early, 1 clock tick earlier,
* restart the timer.
*/
if (time_to_wait > msec_per_tick) {
return;
}
/*
* When we probe zero windows, we force the swnd open.
* If our peer acks with a closed window swnd will be
* set to zero by tcp_rput(). As long as we are
* receiving acks tcp_rput will
* reset 'tcp_ms_we_have_waited' so as not to trip the
* first and second interval actions. NOTE: the timer
* interval is allowed to continue its exponential
* backoff.
*/
if (connp->conn_debug) {
SL_TRACE, "tcp_timer: zero win");
}
} else {
/*
* After retransmission, we need to do
* slow start. Set the ssthresh to one
* half of current effective window and
* cwnd to one MSS. Also reset
* tcp_cwnd_cnt.
*
* Note that if tcp_ssthresh is reduced because
* of ECN, do not reduce it again unless it is
* already one window of data away (tcp_cwr
* should then be cleared) or this is a
* timeout for a retransmitted segment.
*/
}
tcp->tcp_cwnd_cnt = 0;
if (tcp->tcp_ecn_ok) {
}
}
break;
}
/*
* We have something to send yet we cannot send. The
* reason can be:
*
* 1. Zero send window: we need to do zero window probe.
* 2. Zero cwnd: because of ECN, we need to "clock out"
* segments.
* 3. SWS avoidance: receiver may have shrunk window,
* reset our knowledge.
*
* Note that condition 2 can happen with either 1 or
* 3. But 1 and 3 are exclusive.
*/
if (tcp->tcp_unsent != 0) {
/*
* Should not hold the zero-copy messages for too long.
*/
/*
* Set tcp_cwnd to 1 MSS so that a
* new segment can be sent out. We
* are "clocking out" new data when
* the network is really congested.
*/
}
/* Extend window for zero window probe */
} else {
/*
* Handle timeout from sender SWS avoidance.
* Reset our knowledge of the max send window
* since the receiver might have reduced its
* receive buffer. Avoid setting tcp_max_swnd
* to one since that will essentially disable
* the SWS checks.
*
* Note that since we don't have a SWS
* state variable, if the timeout is set
* for ECN but not for SWS, this
* code will also be executed. This is
* fine as tcp_max_swnd is updated
* constantly and it will not affect
* anything.
*/
}
return;
}
/* Is there a FIN that needs to be retransmitted? */
!tcp->tcp_fin_acked)
break;
/* Nothing to do, return without restarting timer. */
return;
case TCPS_FIN_WAIT_2:
/*
* User closed the TCP endpoint and peer ACK'ed our FIN.
* We waited some time for the peer's FIN, but it hasn't
* arrived. We flush the connection now to avoid the
* case where the peer has rebooted.
*/
if (TCP_IS_DETACHED(tcp)) {
(void) tcp_clean_death(tcp, 0);
} else {
}
return;
case TCPS_TIME_WAIT:
(void) tcp_clean_death(tcp, 0);
return;
default:
if (connp->conn_debug) {
"tcp_timer: strange state (%d) %s",
}
return;
}
/*
* If the system is under memory pressure or the max number of
* connections have been established for the listener, be more
* aggressive in aborting connections.
*/
/* We will ignore the never timeout promise in this case... */
}
ASSERT(second_threshold != 0);
/*
* Should not hold the zero-copy messages for too long.
*/
if (dont_timeout) {
/*
* Reset tcp_ms_we_have_waited to avoid overflow since
* we are going to retransmit forever.
*/
goto timer_rexmit;
}
/*
* For zero window probe, we need to send indefinitely,
* unless we have not heard from the other side for some
* time...
*/
if ((tcp->tcp_zero_win_probe == 0) ||
second_threshold)) {
/*
* If TCP is in SYN_RCVD state, send back a
* RST|ACK as BSD does. Note that tcp_zero_win_probe
* should be zero in TCPS_SYN_RCVD state.
*/
tcp_xmit_ctl("tcp_timer: RST sent on timeout "
"in SYN_RCVD",
}
(void) tcp_clean_death(tcp,
return;
} else {
/*
* If the system is under memory pressure, we also
* abort connection in zero window probing.
*/
if (tcps->tcps_reclaim) {
(void) tcp_clean_death(tcp,
return;
}
/*
* Set tcp_ms_we_have_waited to second_threshold
* so that in next timeout, we will do the above
* check (ddi_get_lbolt() - tcp_last_recv_time).
* This is also to avoid overflow.
*
* We don't need to decrement tcp_timer_backoff
* to avoid overflow because it will be decremented
* later if new timeout value is greater than
* tcp_rto_max. In the case when tcp_rto_max is
* greater than second_threshold, it means that we
* will wait longer than second_threshold to send
* the next
* window probe.
*/
}
} else if (ms > first_threshold) {
/*
* Should not hold the zero-copy messages for too long.
*/
/*
* We have been retransmitting for too long... The RTT
* we calculated is probably incorrect. Reinitialize it.
* Need to compensate for 0 tcp_rtt_sa. Reset
* tcp_rtt_update so that we won't accidentally cache a
* bad value. But only do this if this is not a zero
* window probe.
*/
tcp->tcp_rtt_sa = 0;
tcp->tcp_rtt_update = 0;
}
}
tcp->tcp_timer_backoff++;
tcp->tcp_rto_min) {
/*
* This means the original RTO is tcp_rexmit_interval_min.
* So we will use tcp_rexmit_interval_min as the RTO value
* and do the backoff.
*/
} else {
}
/*
* ms is at max, decrement tcp_timer_backoff to avoid
* overflow.
*/
tcp->tcp_timer_backoff--;
}
if (tcp->tcp_zero_win_probe == 0) {
}
/*
* This is after a timeout and tcp_rto is backed off. Set
* tcp_set_timer to 1 so that next time RTO is updated, we will
* restart the timer with a correct value.
*/
B_TRUE);
/*
* When slow start after retransmission begins, start with
* this seq no. tcp_rexmit_max marks the end of special slow
* start phase. tcp_snd_burst controls how many segments
* can be sent because of an ack.
*/
(tcp->tcp_unsent == 0)) {
} else {
}
tcp->tcp_dupack_cnt = 0;
/*
* Remove all rexmit SACK blk to start from fresh.
*/
if (tcp->tcp_snd_sack_ok)
return;
}
}
/*
* Handle lingering timeouts. This function is called when the SO_LINGER timeout
* expires.
*/
/*
 * SO_LINGER timeout call-back (see the comment immediately above).
 *
 * NOTE(review): the body appears to have been elided in this copy of the
 * file; it should complete the lingering close of the connection owned by
 * the connp passed in 'arg' -- confirm against the upstream source.
 */
void
tcp_close_linger_timeout(void *arg)
{
}