/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Joyent, Inc.
*/

/*
* This file contains functions related to TCP time wait processing. Also
* refer to the time wait handling comments in tcp_impl.h.
*/

#include <sys/types.h>
#include <sys/strsun.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/callo.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>

static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
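
/*
 * The TIME_WAIT expiry buckets form a timing wheel: TW_BUCKET() maps an
 * expiration time (in lbolt ticks) to one of TCP_TIME_WAIT_BUCKETS slots,
 * each spanning TCP_TIME_WAIT_DELAY milliseconds, and TW_BUCKET_NEXT()
 * steps through the wheel with wrap-around.
 */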
#define TW_BUCKET(t) \
(((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
#define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS)

/*
* Remove a connection from the list of detached TIME_WAIT connections.
 * It returns B_FALSE if the connection has already been removed from
 * the list by an earlier call to tcp_time_wait_remove(); otherwise it
 * returns B_TRUE.
*/
boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
boolean_t locked = B_FALSE;
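
	/*
	 * If the caller did not supply the squeue-private TCP state, look
	 * it up from the connection's squeue and take the time wait lock
	 * on the caller's behalf.
	 */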
if (tsp == NULL) {
tsp = *((tcp_squeue_priv_t **)
squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
mutex_enter(&tsp->tcp_time_wait_lock);
locked = B_TRUE;
} else {
ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
}
/* 0 means that the tcp_t has not been added to the time wait list. */
if (tcp->tcp_time_wait_expire == 0) {
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
if (locked)
mutex_exit(&tsp->tcp_time_wait_lock);
return (B_FALSE);
}
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
ASSERT(tsp->tcp_time_wait_cnt > 0);
if (tcp->tcp_time_wait_next != NULL) {
tcp->tcp_time_wait_next->tcp_time_wait_prev =
tcp->tcp_time_wait_prev;
}
if (tcp->tcp_time_wait_prev != NULL) {
tcp->tcp_time_wait_prev->tcp_time_wait_next =
tcp->tcp_time_wait_next;
} else {
unsigned int bucket;
bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
}
tcp->tcp_time_wait_next = NULL;
tcp->tcp_time_wait_prev = NULL;
tcp->tcp_time_wait_expire = 0;
tsp->tcp_time_wait_cnt--;
if (locked)
mutex_exit(&tsp->tcp_time_wait_lock);
return (B_TRUE);
}

/* Constants used for fast checking of a localhost address */
#if defined(_BIG_ENDIAN)
#define IPv4_LOCALHOST 0x7f000000U
#define IPv4_LH_MASK 0xffffff00U
#else
#define IPv4_LOCALHOST 0x0000007fU
#define IPv4_LH_MASK 0x00ffffffU
#endif
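
/*
 * Note that the fast IPv4 check below matches the 127.0.0.x addresses
 * (the common loopback assignments) rather than all of 127.0.0.0/8.
 */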
#define IS_LOCAL_HOST(x) ( \
((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))

/*
* Add a connection to the list of detached TIME_WAIT connections
* and set its time to expire.
*/
void
tcp_time_wait_append(tcp_t *tcp)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
squeue_t *sqp = tcp->tcp_connp->conn_sqp;
tcp_squeue_priv_t *tsp =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
int64_t now, schedule;
unsigned int bucket;
tcp_timers_stop(tcp);
/* Freed above */
ASSERT(tcp->tcp_timer_tid == 0);
ASSERT(tcp->tcp_ack_tid == 0);
/* must have happened at the time of detaching the tcp */
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
ASSERT(tcp->tcp_ptpahn == NULL);
ASSERT(tcp->tcp_flow_stopped == 0);
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
ASSERT(tcp->tcp_time_wait_expire == 0);
ASSERT(tcp->tcp_listener == NULL);
TCP_DBGSTAT(tcps, tcp_time_wait);
mutex_enter(&tsp->tcp_time_wait_lock);
/*
* Immediately expire loopback connections. Since there is no worry
* about packets on the local host showing up after a long network
* delay, this is safe and allows much higher rates of connection churn
* for applications operating locally.
*
* This typically bypasses the tcp_free_list fast path due to squeue
* re-entry for the loopback close operation.
*/
if (tcp->tcp_loopback) {
tcp_time_wait_purge(tcp, tsp);
mutex_exit(&tsp->tcp_time_wait_lock);
return;
}
/*
* In order to reap TIME_WAITs reliably, we should use a source of time
* that is not adjustable by the user. While it would be more accurate
* to grab this timestamp before (potentially) sleeping on the
* tcp_time_wait_lock, doing so complicates bucket addressing later.
*/
now = ddi_get_lbolt64();
/*
* Each squeue uses an arbitrary time offset when scheduling
* expiration timers. This prevents the bucketing from forcing
	 * tcp_time_wait_collector to run in lockstep across squeues.
*
* This offset is (re)initialized when a new TIME_WAIT connection is
* added to an squeue which has no connections waiting to expire.
*/
if (tsp->tcp_time_wait_tid == 0) {
ASSERT(tsp->tcp_time_wait_cnt == 0);
tsp->tcp_time_wait_offset =
now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
}
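
	/*
	 * All subsequent bucket arithmetic is performed in offset-adjusted
	 * time so that each squeue's bucket boundaries fall at different
	 * absolute times.
	 */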
now -= tsp->tcp_time_wait_offset;
/*
* Use the netstack-defined timeout, rounded up to the minimum
* time_wait_collector interval.
*/
schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
tcp->tcp_time_wait_expire = schedule;
/*
* Append the connection into the appropriate bucket.
*/
bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
tsp->tcp_time_wait_bucket[bucket] = tcp;
if (tcp->tcp_time_wait_next != NULL) {
ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
}
tsp->tcp_time_wait_cnt++;
/*
* Round delay up to the nearest bucket boundary.
*/
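	/*
	 * Adding a full interval before truncating to a boundary ensures
	 * that the boundary chosen is never earlier than the expiration
	 * time itself.
	 */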
schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
/*
* The newly inserted entry may require a tighter schedule for the
* expiration timer.
*/
if (schedule < tsp->tcp_time_wait_schedule) {
callout_id_t old_tid = tsp->tcp_time_wait_tid;
tsp->tcp_time_wait_schedule = schedule;
tsp->tcp_time_wait_tid =
timeout_generic(CALLOUT_NORMAL,
tcp_time_wait_collector, sqp,
TICK_TO_NSEC(schedule - now),
CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
/*
* It is possible for the timer to fire before the untimeout
* action is able to complete. In that case, the exclusion
* offered by the tcp_time_wait_collector_active flag will
* prevent multiple collector threads from processing records
* simultaneously from the same squeue.
*/
mutex_exit(&tsp->tcp_time_wait_lock);
(void) untimeout_default(old_tid, 0);
return;
}
/*
* Start a fresh timer if none exists.
*/
if (tsp->tcp_time_wait_schedule == 0) {
ASSERT(tsp->tcp_time_wait_tid == 0);
tsp->tcp_time_wait_schedule = schedule;
tsp->tcp_time_wait_tid =
timeout_generic(CALLOUT_NORMAL,
tcp_time_wait_collector, sqp,
TICK_TO_NSEC(schedule - now),
CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
}
mutex_exit(&tsp->tcp_time_wait_lock);
}

/*
* Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
* tcp_t. Used in tcp_time_wait_collector().
*/
/* ARGSUSED */
static void
tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
ASSERT(tcp != NULL);
if (tcp->tcp_state == TCPS_CLOSED) {
return;
}
ASSERT((connp->conn_family == AF_INET &&
connp->conn_ipversion == IPV4_VERSION) ||
(connp->conn_family == AF_INET6 &&
(connp->conn_ipversion == IPV4_VERSION ||
connp->conn_ipversion == IPV6_VERSION)));
ASSERT(!tcp->tcp_listener);
ASSERT(TCP_IS_DETACHED(tcp));
/*
* Because they have no upstream client to rebind or tcp_close()
* them later, we axe the connection here and now.
*/
tcp_close_detached(tcp);
}

static void
tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
mblk_t *mp;
conn_t *connp = tcp->tcp_connp;
kmutex_t *lock;
ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
ASSERT(connp->conn_fanout != NULL);
lock = &connp->conn_fanout->connf_lock;
/*
* This is essentially a TIME_WAIT reclaim fast path optimization for
	 * performance where the connection is checked under the fanout lock
	 * (so that no one else can get access to the conn_t) to confirm that
	 * the refcnt is 2 (one each for TCP and the classifier hash list).
	 * If that is the case and clustering callbacks are not enabled, the
	 * conn can be removed under the fanout lock, avoiding clean-up under
	 * the squeue.
	 *
	 * This optimization is forgone when clustering is enabled since the
	 * clustering callback must be made before setting the CONDEMNED flag
	 * and after dropping all locks.
*
* See the comments in tcp_closei_local for additional information
* regarding the refcnt logic.
*/
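	/*
	 * Take the fast path only if the fanout lock can be acquired
	 * without blocking while tcp_time_wait_lock is held.
	 */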
if (mutex_tryenter(lock)) {
mutex_enter(&connp->conn_lock);
if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
ipcl_hash_remove_locked(connp, connp->conn_fanout);
/*
			 * Set the CONDEMNED flag now so that the refcnt
* cannot increase due to any walker.
*/
connp->conn_state_flags |= CONN_CONDEMNED;
mutex_exit(&connp->conn_lock);
mutex_exit(lock);
if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
/*
* Add to head of tcp_free_list
*/
tcp_cleanup(tcp);
ASSERT(connp->conn_latch == NULL);
ASSERT(connp->conn_policy == NULL);
ASSERT(tcp->tcp_tcps == NULL);
ASSERT(connp->conn_netstack == NULL);
tcp->tcp_time_wait_next = tsp->tcp_free_list;
tcp->tcp_in_free_list = B_TRUE;
tsp->tcp_free_list = tcp;
tsp->tcp_free_list_cnt++;
} else {
/*
* Do not add to tcp_free_list
*/
tcp_bind_hash_remove(tcp);
ixa_cleanup(tcp->tcp_connp->conn_ixa);
tcp_ipsec_cleanup(tcp);
CONN_DEC_REF(tcp->tcp_connp);
}
/*
* With the fast-path complete, we can bail.
*/
return;
} else {
/*
* Fall back to slow path.
*/
CONN_INC_REF_LOCKED(connp);
mutex_exit(&connp->conn_lock);
mutex_exit(lock);
}
} else {
CONN_INC_REF(connp);
}
/*
	 * We can reuse the closemp here since the conn has been detached
	 * (otherwise we wouldn't even be on the time_wait list). It is safe
	 * to change tcp_closemp_used without taking a lock, as no other
	 * thread can concurrently access it at this point in the connection
	 * lifecycle.
*/
if (tcp->tcp_closemp.b_prev == NULL) {
tcp->tcp_closemp_used = B_TRUE;
} else {
cmn_err(CE_PANIC,
"tcp_timewait_collector: concurrent use of tcp_closemp: "
"connp %p tcp %p\n", (void *)connp, (void *)tcp);
}
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
mutex_exit(&tsp->tcp_time_wait_lock);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
SQ_FILL, SQTAG_TCP_TIMEWAIT);
mutex_enter(&tsp->tcp_time_wait_lock);
}

/*
* Purge any tcp_t instances associated with this squeue which have expired
* from the TIME_WAIT state.
*/
void
tcp_time_wait_collector(void *arg)
{
tcp_t *tcp;
int64_t now, sched_active, sched_cur, sched_new;
unsigned int idx;
squeue_t *sqp = (squeue_t *)arg;
tcp_squeue_priv_t *tsp =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
mutex_enter(&tsp->tcp_time_wait_lock);
/*
* Because of timer scheduling complexity and the fact that the
* tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
* possible for multiple tcp_time_wait_collector threads to run against
* the same squeue. This flag is used to exclude other collectors from
* the squeue during execution.
*/
if (tsp->tcp_time_wait_collector_active) {
mutex_exit(&tsp->tcp_time_wait_lock);
return;
}
tsp->tcp_time_wait_collector_active = B_TRUE;
/*
* After its assignment here, the value of sched_active must not be
* altered as it is used to validate the state of the
* tcp_time_wait_collector callout schedule for this squeue.
*
* The same does not hold true of sched_cur, which holds the timestamp
* of the bucket undergoing processing. While it is initially equal to
* sched_active, certain conditions below can walk it forward,
* triggering the retry loop.
*/
sched_cur = sched_active = tsp->tcp_time_wait_schedule;
/*
* Purge the free list if necessary
*/
if (tsp->tcp_free_list != NULL) {
TCP_G_STAT(tcp_freelist_cleanup);
while ((tcp = tsp->tcp_free_list) != NULL) {
tsp->tcp_free_list = tcp->tcp_time_wait_next;
tcp->tcp_time_wait_next = NULL;
tsp->tcp_free_list_cnt--;
ASSERT(tcp->tcp_tcps == NULL);
CONN_DEC_REF(tcp->tcp_connp);
}
ASSERT(tsp->tcp_free_list_cnt == 0);
}
/*
* If there are no connections pending, clear timer-related state to be
* reinitialized by the next caller.
*/
if (tsp->tcp_time_wait_cnt == 0) {
tsp->tcp_time_wait_offset = 0;
tsp->tcp_time_wait_schedule = 0;
tsp->tcp_time_wait_tid = 0;
tsp->tcp_time_wait_collector_active = B_FALSE;
mutex_exit(&tsp->tcp_time_wait_lock);
return;
}
retry:
/*
* Grab the bucket which we were scheduled to cleanse.
*/
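	/*
	 * Since sched_cur falls on a bucket boundary, the bucket which has
	 * just expired is the one containing the tick immediately before
	 * it, hence sched_cur - 1.
	 */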
idx = TW_BUCKET(sched_cur - 1);
now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
tcp = tsp->tcp_time_wait_bucket[idx];
while (tcp != NULL) {
/*
* Since the bucket count is sized to prevent wrap-around
		 * during typical operation and timers are scheduled to process
* buckets with only expired connections, there is only one
* reason to encounter a connection expiring in the future:
* The tcp_time_wait_collector thread has been so delayed in
* its processing that connections have wrapped around the
* timing wheel into this bucket.
*
		 * In that case, the remaining entries in the bucket can be
* ignored since, being appended sequentially, they should all
* expire in the future.
*/
if (now < tcp->tcp_time_wait_expire) {
break;
}
/*
* Pull the connection out of the bucket.
*/
VERIFY(tcp_time_wait_remove(tcp, tsp));
/*
* Purge the connection.
*
* While tcp_time_wait_lock will be temporarily dropped as part
* of the process, there is no risk of the timer being
* (re)scheduled while the collector is running since a value
* corresponding to the past is left in tcp_time_wait_schedule.
*/
tcp_time_wait_purge(tcp, tsp);
/*
* Because tcp_time_wait_remove clears the tcp_time_wait_next
* field, the next item must be grabbed directly from the
* bucket itself.
*/
tcp = tsp->tcp_time_wait_bucket[idx];
}
if (tsp->tcp_time_wait_cnt == 0) {
/*
		 * There is no need for the collector to schedule a new
* timer if no pending items remain. The timer state can be
* cleared only if it was untouched while the collector dropped
* its locks during tcp_time_wait_purge.
*/
if (tsp->tcp_time_wait_schedule == sched_active) {
tsp->tcp_time_wait_offset = 0;
tsp->tcp_time_wait_schedule = 0;
tsp->tcp_time_wait_tid = 0;
}
tsp->tcp_time_wait_collector_active = B_FALSE;
mutex_exit(&tsp->tcp_time_wait_lock);
return;
} else {
unsigned int nidx;
/*
* Locate the next bucket containing entries.
*/
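		/*
		 * sched_new begins at the boundary of the bucket following
		 * idx and advances by one interval for every empty bucket
		 * skipped, ending as the expiration boundary of the next
		 * occupied bucket.
		 */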
sched_new = sched_cur + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
nidx = TW_BUCKET_NEXT(idx);
while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
if (nidx == idx) {
break;
}
nidx = TW_BUCKET_NEXT(nidx);
sched_new += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
}
ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
}
/*
* It is possible that the system is under such dire load that between
* the timer scheduling and TIME_WAIT processing delay, execution
* overran the interval allocated to this bucket.
*/
now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
if (sched_new <= now) {
/*
* Attempt to right the situation by immediately performing a
* purge on the next bucket. This loop will continue as needed
* until the schedule can be pushed out ahead of the clock.
*/
sched_cur = sched_new;
DTRACE_PROBE3(tcp__time__wait__overrun,
tcp_squeue_priv_t *, tsp, int64_t, sched_new, int64_t, now);
goto retry;
}
/*
* Another thread may have snuck in to reschedule the timer while locks
* were dropped during tcp_time_wait_purge. Defer to the running timer
* if that is the case.
*/
if (tsp->tcp_time_wait_schedule != sched_active) {
tsp->tcp_time_wait_collector_active = B_FALSE;
mutex_exit(&tsp->tcp_time_wait_lock);
return;
}
/*
* Schedule the next timer.
*/
tsp->tcp_time_wait_schedule = sched_new;
tsp->tcp_time_wait_tid =
timeout_generic(CALLOUT_NORMAL,
tcp_time_wait_collector, sqp,
TICK_TO_NSEC(sched_new - now),
CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
tsp->tcp_time_wait_collector_active = B_FALSE;
mutex_exit(&tsp->tcp_time_wait_lock);
}

/*
* tcp_time_wait_processing() handles processing of incoming packets when
* the tcp_t is in the TIME_WAIT state.
*
* A TIME_WAIT tcp_t that has an associated open TCP end point (not in
* detached state) is never put on the time wait list.
*/
void
tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
{
int32_t bytes_acked;
int32_t gap;
int32_t rgap;
tcp_opt_t tcpopt;
uint_t flags;
uint32_t new_swnd = 0;
conn_t *nconnp;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
BUMP_LOCAL(tcp->tcp_ibsegs);
DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
flags = (unsigned int)tcpha->tha_flags & 0xFF;
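
	/* The window field of a segment carrying SYN is never scaled. */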
new_swnd = ntohs(tcpha->tha_win) <<
((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
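
	/*
	 * When timestamps have been negotiated, apply the PAWS test
	 * (RFC 7323) to everything but RST segments: a segment lacking the
	 * timestamp option is dropped, and one failing the test is dropped
	 * after an ACK is sent.
	 */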
if (tcp->tcp_snd_ts_ok && !(tcpha->tha_flags & TH_RST)) {
int options;
if (tcp->tcp_snd_sack_ok)
tcpopt.tcp = tcp;
else
tcpopt.tcp = NULL;
options = tcp_parse_options(tcpha, &tcpopt);
if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
goto done;
} else if (!tcp_paws_check(tcp, &tcpopt)) {
tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
TH_ACK);
goto done;
}
}
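
	/*
	 * gap is the number of bytes between tcp_rnxt and the start of the
	 * segment (negative when the segment overlaps already-received
	 * data). rgap is the receive window space remaining past the end
	 * of the segment (negative when the segment extends beyond the
	 * window).
	 */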
gap = seg_seq - tcp->tcp_rnxt;
rgap = tcp->tcp_rwnd - (gap + seg_len);
if (gap < 0) {
TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
(seg_len > -gap ? -gap : seg_len));
seg_len += gap;
if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
if (flags & TH_RST) {
goto done;
}
if ((flags & TH_FIN) && seg_len == -1) {
/*
* When TCP receives a duplicate FIN in
* TIME_WAIT state, restart the 2 MSL timer.
* See page 73 in RFC 793. Make sure this TCP
* is already on the TIME_WAIT list. If not,
* just restart the timer.
*/
if (TCP_IS_DETACHED(tcp)) {
if (tcp_time_wait_remove(tcp, NULL) ==
B_TRUE) {
tcp_time_wait_append(tcp);
TCP_DBGSTAT(tcps,
tcp_rput_time_wait);
}
} else {
ASSERT(tcp != NULL);
TCP_TIMER_RESTART(tcp,
tcps->tcps_time_wait_interval);
}
tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
tcp->tcp_rnxt, TH_ACK);
goto done;
}
flags |= TH_ACK_NEEDED;
seg_len = 0;
goto process_ack;
}
/* Fix seg_seq, and chew the gap off the front. */
seg_seq = tcp->tcp_rnxt;
}
if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
/*
* Make sure that when we accept the connection, pick
* an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
* old connection.
*
* The next ISS generated is equal to tcp_iss_incr_extra
* + tcp_iss_incr/2 + other components depending on the
* value of tcp_strong_iss. We pre-calculate the new
* ISS here and compare with tcp_snxt to determine if
* we need to make adjustment to tcp_iss_incr_extra.
*
* The above calculation is ugly and is a
* waste of CPU cycles...
*/
uint32_t new_iss = tcps->tcps_iss_incr_extra;
int32_t adj;
ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
switch (tcps->tcps_strong_iss) {
case 2: {
/* Add time and MD5 components. */
uint32_t answer[4];
struct {
uint32_t ports;
in6_addr_t src;
in6_addr_t dst;
} arg;
MD5_CTX context;
mutex_enter(&tcps->tcps_iss_key_lock);
context = tcps->tcps_iss_key;
mutex_exit(&tcps->tcps_iss_key_lock);
arg.ports = connp->conn_ports;
/* We use MAPPED addresses in tcp_iss_init */
arg.src = connp->conn_laddr_v6;
arg.dst = connp->conn_faddr_v6;
MD5Update(&context, (uchar_t *)&arg,
sizeof (arg));
MD5Final((uchar_t *)answer, &context);
answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
break;
}
case 1:
/* Add time component and min random (i.e. 1). */
new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
break;
default:
/* Add only time component. */
new_iss += (uint32_t)gethrestime_sec() *
tcps->tcps_iss_incr;
break;
}
if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
/*
* New ISS not guaranteed to be tcp_iss_incr/2
* ahead of the current tcp_snxt, so add the
* difference to tcp_iss_incr_extra.
*/
tcps->tcps_iss_incr_extra += adj;
}
/*
		 * If tcp_clean_death() cannot perform the task now,
* drop the SYN packet and let the other side re-xmit.
* Otherwise pass the SYN packet back in, since the
* old tcp state has been cleaned up or freed.
*/
if (tcp_clean_death(tcp, 0) == -1)
goto done;
nconnp = ipcl_classify(mp, ira, ipst);
if (nconnp != NULL) {
TCP_STAT(tcps, tcp_time_wait_syn_success);
/* Drops ref on nconnp */
tcp_reinput(nconnp, mp, ira, ipst);
return;
}
goto done;
}
/*
	 * rgap is the receive window space remaining past the end of the
	 * segment. A negative value is the number of bytes received beyond
	 * the window.
*/
if (rgap < 0) {
TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
/* Fix seg_len and make sure there is something left. */
seg_len += rgap;
if (seg_len <= 0) {
if (flags & TH_RST) {
goto done;
}
flags |= TH_ACK_NEEDED;
seg_len = 0;
goto process_ack;
}
}
/*
* Check whether we can update tcp_ts_recent. This test is from RFC
* 7323, section 5.3.
*/
if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
}
if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
/* Always ack out of order packets */
flags |= TH_ACK_NEEDED;
seg_len = 0;
} else if (seg_len > 0) {
TCPS_BUMP_MIB(tcps, tcpInClosed);
TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
}
if (flags & TH_RST) {
(void) tcp_clean_death(tcp, 0);
goto done;
}
if (flags & TH_SYN) {
tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
TH_RST|TH_ACK);
/*
* Do not delete the TCP structure if it is in
* TIME_WAIT state. Refer to RFC 1122, 4.2.2.13.
*/
goto done;
}
process_ack:
if (flags & TH_ACK) {
bytes_acked = (int)(seg_ack - tcp->tcp_suna);
if (bytes_acked <= 0) {
if (bytes_acked == 0 && seg_len == 0 &&
new_swnd == tcp->tcp_swnd)
TCPS_BUMP_MIB(tcps, tcpInDupAck);
} else {
/* Acks something not sent */
flags |= TH_ACK_NEEDED;
}
}
if (flags & TH_ACK_NEEDED) {
/*
* Time to send an ack for some reason.
*/
tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
tcp->tcp_rnxt, TH_ACK);
}
done:
freemsg(mp);
}