/* tcp_socket.c revision dd49f125507979bb2ab505a8daf2a46d1be27051 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* This file contains all TCP kernel socket related functions. */
#include <sys/types.h>
#include <sys/strlog.h>
#include <sys/policy.h>
#include <sys/sockio.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/tpicommon.h>
#include <sys/socketvar.h>
#include <inet/common.h>
#include <inet/proto_set.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
/*
* Forward declarations of the file-local socket downcall entry points.
* They are declared ahead of their definitions so that the
* sock_tcp_downcalls initializer can reference them.  tcp_getpeername()
* and tcp_getsockname() also appear in that table but are not static and
* are not declared here.
*/
static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
sock_upcalls_t *, int, cred_t *);
static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
sock_upper_handle_t, cred_t *);
static int tcp_bind(sock_lower_handle_t, struct sockaddr *,
socklen_t, cred_t *);
static int tcp_listen(sock_lower_handle_t, int, cred_t *);
static int tcp_connect(sock_lower_handle_t, const struct sockaddr *,
socklen_t, sock_connid_t *, cred_t *);
static int tcp_getsockopt(sock_lower_handle_t, int, int, void *,
socklen_t *, cred_t *);
static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
socklen_t, cred_t *);
static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
cred_t *cr);
static int tcp_shutdown(sock_lower_handle_t, int, cred_t *);
static void tcp_clr_flowctrl(sock_lower_handle_t);
static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
cred_t *);
static int tcp_close(sock_lower_handle_t, int, cred_t *);
/*
* Downcall vector for non-STREAMS TCP sockets.  tcp_create() hands its
* address back through the sock_downcalls argument and
* tcp_newconn_notify() passes it to the su_newconn upcall for each new
* connection.
*
* The initializer is positional, so the order of the entries must follow
* the member order of sock_downcalls_t.
*/
sock_downcalls_t sock_tcp_downcalls = {
tcp_activate,
tcp_accept,
tcp_bind,
tcp_listen,
tcp_connect,
tcp_getpeername,
tcp_getsockname,
tcp_getsockopt,
tcp_setsockopt,
tcp_sendmsg,
/*
* Three entry points that TCP does not provide.
* NOTE(review): confirm against sock_downcalls_t which members these
* three slots correspond to.
*/
NULL,
NULL,
NULL,
tcp_shutdown,
tcp_clr_flowctrl,
tcp_ioctl,
tcp_close,
};
/*
 * Socket downcall that attaches the upper (socket) handle and upcall
 * vector to the conn and then pushes TCP's initial socket properties up
 * through the su_set_proto_props upcall.
 *
 * `flags' is not used.  `cr' must not be NULL.
 */
/* ARGSUSED */
static void
tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
    sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
{
	extern struct module_info tcp_rinfo;
	struct sock_proto_props	props;
	conn_t			*conn = (conn_t *)proto_handle;

	/* A conn may only be activated once. */
	ASSERT(conn->conn_upper_handle == NULL);

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	/* Announce which of the property fields filled in below are valid. */
	props.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;

	/* Receive-side water marks, timer and threshold. */
	props.sopp_rxhiwat = SOCKET_RECVHIWATER;
	props.sopp_rxlowat = SOCKET_RECVLOWATER;
	props.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
	props.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;

	/* Packet and block size limits. */
	props.sopp_maxpsz = INFPSZ;
	props.sopp_maxblk = INFPSZ;

	/* A module minimum packet size of 1 is reported upward as 0. */
	if (tcp_rinfo.mi_minpsz == 1)
		props.sopp_minpsz = 0;
	else
		props.sopp_minpsz = tcp_rinfo.mi_minpsz;

	/* Large enough for an IPv6 socket address. */
	props.sopp_maxaddrlen = sizeof (sin6_t);

	conn->conn_upcalls = sock_upcalls;
	conn->conn_upper_handle = sock_handle;

	ASSERT(conn->conn_rcvbuf != 0 &&
	    conn->conn_rcvbuf == conn->conn_tcp->tcp_rwnd);

	(*sock_upcalls->su_set_proto_props)(sock_handle, &props);
}
/*
 * Socket downcall for accept: completes the hand-off of an eager
 * (`eproto_handle') to the socket identified by `sock_handle'.
 *
 * Returns 0 after unlinking the eager from its listener and dropping a
 * reference on the listener's conn, or ECONNABORTED if the eager is no
 * longer at least ESTABLISHED.
 */
/*ARGSUSED*/
static int
tcp_accept(sock_lower_handle_t lproto_handle,
    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
    cred_t *cr)
{
	conn_t	*econnp = (conn_t *)eproto_handle;
	tcp_t	*eager = econnp->conn_tcp;
	tcp_t	*listener;
	conn_t	*lconnp;

	/*
	 * The listener is always taken from the eager rather than from
	 * `lproto_handle': KSSL can move a socket from one listener to
	 * another, in which case `lproto_handle' refers to the new
	 * listener, and it is the original one that must be used here.
	 */
	ASSERT(IPCL_IS_NONSTR(econnp));
	ASSERT(eager->tcp_listener != NULL);
	listener = eager->tcp_listener;
	lconnp = (conn_t *)listener->tcp_connp;
	ASSERT(listener->tcp_state == TCPS_LISTEN);
	ASSERT(lconnp->conn_upper_handle != NULL);

	/*
	 * This thread may race with the one that made the su_newconn
	 * upcall in tcp_newconn_notify.  Both paths must leave
	 * conn_upper_handle and conn_upcalls set before returning, so
	 * both store them; the values stored are guaranteed to be the
	 * same in either thread.
	 */
	ASSERT(econnp->conn_upper_handle == NULL ||
	    econnp->conn_upper_handle == sock_handle);
	ASSERT(econnp->conn_upcalls == NULL ||
	    econnp->conn_upcalls == lconnp->conn_upcalls);
	econnp->conn_upper_handle = sock_handle;
	econnp->conn_upcalls = lconnp->conn_upcalls;

	ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == listener->tcp_tcps);

	/*
	 * At least two references are expected here: one for TCP and one
	 * for the newconn notification (now taken over by IP).  Normally
	 * there is a third for the classifier hash list, but an RST may
	 * already have made tcp_closei_local remove the eager from that
	 * list, so the third reference cannot be asserted.
	 */
	ASSERT(econnp->conn_ref >= 2);

	if (eager->tcp_state >= TCPS_ESTABLISHED) {
		mutex_enter(&listener->tcp_eager_lock);
		/*
		 * Non-STREAMS listeners never defer the notification of
		 * new connections.
		 */
		ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
		tcp_eager_unlink(eager);
		mutex_exit(&listener->tcp_eager_lock);
		CONN_DEC_REF(lconnp);
		return (0);
	}

	/*
	 * The conn has been reset.  Returning an error makes the socket
	 * get closed immediately; the eager is unlinked from the listener
	 * as part of that close.
	 */
	return (ECONNABORTED);
}
/*
 * Socket downcall for bind.  A NULL `sa' requests an unbind, which is
 * only permitted while the endpoint's state is below TCPS_LISTEN.
 *
 * Returns a UNIX errno: ENOSR if the squeue could not be entered,
 * otherwise the (translated) result of the bind or unbind.
 */
static int
tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
    socklen_t len, cred_t *cr)
{
	conn_t	*connp = (conn_t *)proto_handle;
	int	rc;

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);
	ASSERT(connp->conn_upper_handle != NULL);

	if (squeue_synch_enter(connp, NULL) != 0)
		return (ENOSR);

	if (sa != NULL)
		rc = tcp_do_bind(connp, sa, len, cr, B_TRUE);
	else if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
		rc = tcp_do_unbind(connp);
	else
		rc = EINVAL;

	squeue_synch_exit(connp);

	/* Non-negative results are already UNIX errnos (or success). */
	if (rc >= 0)
		return (rc);

	/* Negative results are negated TLI errors; convert them. */
	if (rc == -TOUTSTATE)
		return (EINVAL);
	return (proto_tlitosyserr(-rc));
}
/*
 * Socket downcall for listen.  On success the socket layer is told, via
 * the SOCK_OPCTL_ENAB_ACCEPT upcall, how many connections can be queued
 * on this listener.
 *
 * Returns a UNIX errno: ENOBUFS if the squeue could not be entered,
 * otherwise the (translated) result of tcp_do_listen().
 */
/* ARGSUSED */
static int
tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
{
	conn_t	*connp = (conn_t *)proto_handle;
	tcp_t	*tcp = connp->conn_tcp;
	int	rc;

	ASSERT(connp->conn_upper_handle != NULL);

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	if (squeue_synch_enter(connp, NULL) != 0)
		return (ENOBUFS);

	rc = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
	if (rc < 0) {
		/* Negative results are negated TLI errors. */
		if (rc == -TOUTSTATE)
			rc = EINVAL;
		else
			rc = proto_tlitosyserr(-rc);
	} else if (rc == 0) {
		/*
		 * The limit reported to sockfs is the sum of the
		 * listener's tcp_conn_req_max and the stack-wide
		 * tcps_conn_req_max_q0.
		 */
		uintptr_t max_queued = (uintptr_t)(tcp->tcp_conn_req_max +
		    tcp->tcp_tcps->tcps_conn_req_max_q0);

		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
		    SOCK_OPCTL_ENAB_ACCEPT, max_queued);
	}

	squeue_synch_exit(connp);
	return (rc);
}
/*
 * Socket downcall for connect.
 *
 * `sa'/`len' are first validated against the conn's address family; the
 * connect itself runs with the conn's squeue held.
 *
 * Returns:
 *   EINPROGRESS  tcp_do_connect() succeeded; the connection id has been
 *                stored in `*id'.
 *   ENOSR        the squeue could not be entered.
 *   other errno  the address check failed, or tcp_do_connect() failed.
 *                Negative results from tcp_do_connect() are translated:
 *                -TOUTSTATE is mapped according to the current TCP state
 *                (EALREADY, EISCONN, EOPNOTSUPP or EINVAL) and anything
 *                else goes through proto_tlitosyserr().
 */
static int
tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
    socklen_t len, sock_connid_t *id, cred_t *cr)
{
	conn_t	*connp = (conn_t *)proto_handle;
	int	error;

	ASSERT(connp->conn_upper_handle != NULL);

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	error = squeue_synch_enter(connp, NULL);
	if (error != 0) {
		/* failed to enter */
		return (ENOSR);
	}

	/*
	 * TCP supports quick connect, so no need to do an implicit bind
	 */
	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
	if (error == 0) {
		*id = connp->conn_tcp->tcp_connid;
	} else if (error < 0) {
		if (error == -TOUTSTATE) {
			/* Pick the errno that matches the endpoint's state. */
			switch (connp->conn_tcp->tcp_state) {
			case TCPS_SYN_SENT:
				error = EALREADY;
				break;
			case TCPS_ESTABLISHED:
				error = EISCONN;
				break;
			case TCPS_LISTEN:
				error = EOPNOTSUPP;
				break;
			default:
				error = EINVAL;
				break;
			}
		} else {
			error = proto_tlitosyserr(-error);
		}
	}

	/*
	 * Tell the socket layer when the connection is a loopback one.
	 * This is checked regardless of the outcome of the connect above.
	 */
	if (connp->conn_tcp->tcp_loopback) {
		struct sock_proto_props sopp;

		sopp.sopp_flags = SOCKOPT_LOOPBACK;
		sopp.sopp_loopback = B_TRUE;

		(*connp->conn_upcalls->su_set_proto_props)(
		    connp->conn_upper_handle, &sopp);
	}

	squeue_synch_exit(connp);

	return ((error == 0) ? EINPROGRESS : error);
}
/*
 * Return the peer's address through `addr'/`addrlenp'.  Fails with
 * ENOTCONN while the TCP state is below TCPS_SYN_RCVD; otherwise the
 * result of conn_getpeername() is returned.
 */
/* ARGSUSED3 */
int
tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
    socklen_t *addrlenp, cred_t *cr)
{
	conn_t	*connp = (conn_t *)proto_handle;
	tcp_t	*tcp = connp->conn_tcp;

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);
	ASSERT(tcp != NULL);

	return ((tcp->tcp_state >= TCPS_SYN_RCVD) ?
	    conn_getpeername(connp, addr, addrlenp) : ENOTCONN);
}
/*
 * Return the local address through `addr'/`addrlenp'; a thin wrapper
 * around conn_getsockname().
 */
/* ARGSUSED3 */
int
tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
    socklen_t *addrlenp, cred_t *cr)
{
	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	return (conn_getsockname((conn_t *)proto_handle, addr, addrlenp));
}
/*
 * Socket downcall for getsockopt.
 *
 * `optlen' is a value-result argument: on entry it holds the size of the
 * caller's buffer `optvalp', on successful return it holds the number of
 * bytes actually copied there (the option length, truncated to the
 * caller's buffer size).
 *
 * The option is first fetched into a temporary kernel buffer sized by
 * proto_opt_check(), with the conn's squeue held, and then copied out.
 *
 * Returns a UNIX errno:
 *   0        success
 *   ENOMEM   the squeue could not be entered
 *   EINVAL   tcp_opt_get() rejected the request
 *   other    failure reported by proto_opt_check() (TLI errors are
 *            converted with proto_tlitosyserr())
 */
static int
tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
    void *optvalp, socklen_t *optlen, cred_t *cr)
{
	conn_t		*connp = (conn_t *)proto_handle;
	int		error;
	t_uscalar_t	max_optbuf_len;
	t_uscalar_t	size;
	void		*optvalp_buf;
	int		len;

	ASSERT(connp->conn_upper_handle != NULL);

	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
	    tcp_opt_obj.odb_opt_des_arr,
	    tcp_opt_obj.odb_opt_arr_cnt,
	    B_FALSE, B_TRUE, cr);
	if (error != 0) {
		if (error < 0) {
			error = proto_tlitosyserr(-error);
		}
		return (error);
	}

	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);

	/*
	 * Treat every non-zero return as a failure to enter, as the other
	 * downcalls in this file do.  Continuing after a failed enter
	 * would read the option without holding the squeue and then exit
	 * a squeue that was never entered.
	 */
	error = squeue_synch_enter(connp, NULL);
	if (error != 0) {
		kmem_free(optvalp_buf, max_optbuf_len);
		return (ENOMEM);
	}

	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
	squeue_synch_exit(connp);

	if (len == -1) {
		kmem_free(optvalp_buf, max_optbuf_len);
		return (EINVAL);
	}

	/*
	 * update optlen and copy option value
	 */
	size = MIN(len, *optlen);

	bcopy(optvalp_buf, optvalp, size);
	*optlen = (socklen_t)size;

	kmem_free(optvalp_buf, max_optbuf_len);
	return (0);
}
/*
 * Socket downcall for setsockopt.
 *
 * IPPROTO_TCP/TCP_NODELAY is handled on a fast path that does not enter
 * the squeue: under tcp_non_sq_lock, tcp_naglim is set to 1 when the
 * option value is non-zero and to tcp_mss otherwise.  Every other option
 * is validated with proto_opt_check() and applied with tcp_opt_set()
 * while the conn's squeue is held.
 *
 * Returns a UNIX errno:
 *   0        success
 *   EINVAL   TCP_NODELAY was passed with an option length other than
 *            sizeof (int32_t)
 *   ENOMEM   the squeue could not be entered
 *   other    failure reported by proto_opt_check() (TLI errors are
 *            converted with proto_tlitosyserr()) or by tcp_opt_set()
 */
static int
tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
    const void *optvalp, socklen_t optlen, cred_t *cr)
{
	conn_t	*connp = (conn_t *)proto_handle;
	int	error;

	ASSERT(connp->conn_upper_handle != NULL);

	/*
	 * Entering the squeue synchronously can result in a context switch,
	 * which can cause a rather severe performance degradation. So we try
	 * to handle whatever options we can without entering the squeue.
	 */
	if (level == IPPROTO_TCP) {
		switch (option_name) {
		case TCP_NODELAY:
			if (optlen != sizeof (int32_t))
				return (EINVAL);

			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
			connp->conn_tcp->tcp_naglim =
			    *(const int *)optvalp ? 1 :
			    connp->conn_tcp->tcp_mss;
			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
			return (0);
		default:
			break;
		}
	}

	/*
	 * Treat every non-zero return as a failure to enter, as the other
	 * downcalls in this file do; the option must not be applied
	 * without holding the squeue.
	 */
	error = squeue_synch_enter(connp, NULL);
	if (error != 0) {
		return (ENOMEM);
	}

	error = proto_opt_check(level, option_name, optlen, NULL,
	    tcp_opt_obj.odb_opt_des_arr,
	    tcp_opt_obj.odb_opt_arr_cnt,
	    B_TRUE, B_FALSE, cr);

	if (error != 0) {
		if (error < 0) {
			error = proto_tlitosyserr(-error);
		}
		squeue_synch_exit(connp);
		return (error);
	}

	/*
	 * tcp_opt_set() takes separate input and output value/length
	 * arguments; the same buffer and the local copy of `optlen' are
	 * passed for both.
	 */
	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
	    NULL, cr);
	squeue_synch_exit(connp);

	ASSERT(error >= 0);

	return (error);
}
/*
 * Socket downcall for sending data.  Ownership of `mp' passes to this
 * function: it is either queued on the conn's squeue for tcp_output()
 * (or tcp_output_urgent() when MSG_OOB is set) or freed.
 *
 * Returns:
 *   0            the message was queued
 *   EOPNOTSUPP   control data was supplied
 *   ENOTCONN     the endpoint is connecting or was never connected
 *   EPIPE        the endpoint has been disconnected or can no longer send
 */
/* ARGSUSED */
static int
tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
    cred_t *cr)
{
	conn_t		*connp = (conn_t *)proto_handle;
	tcp_t		*tcp;
	int32_t		state;
	uint32_t	nbytes;

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	ASSERT(connp->conn_ref >= 2);
	ASSERT(connp->conn_upper_handle != NULL);

	/* Ancillary data is not accepted on this path. */
	if (msg->msg_controllen != 0) {
		freemsg(mp);
		return (EOPNOTSUPP);
	}

	/* Only M_DATA is expected; anything else is dropped. */
	if (DB_TYPE(mp) != M_DATA) {
		ASSERT(0);
		freemsg(mp);
		return (0);
	}

	tcp = connp->conn_tcp;
	ASSERT(tcp != NULL);

	state = tcp->tcp_state;
	if (state < TCPS_ESTABLISHED) {
		freemsg(mp);
		/*
		 * EPIPE means "was connected, now disconnected"; ENOTCONN
		 * means "still connecting" or "never connected".  A
		 * non-zero connection id is what tells a previously
		 * connected endpoint apart from one that never was.
		 */
		if (state != TCPS_SYN_SENT && tcp->tcp_connid > 0)
			return (EPIPE);
		return (ENOTCONN);
	}
	if (state > TCPS_CLOSE_WAIT) {
		freemsg(mp);
		return (EPIPE);
	}

	nbytes = msgdsize(mp);

	/* Squeue flow control: account for the bytes about to be queued. */
	mutex_enter(&tcp->tcp_non_sq_lock);
	tcp->tcp_squeue_bytes += nbytes;
	if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
		tcp_setqfull(tcp);
	}
	mutex_exit(&tcp->tcp_non_sq_lock);

	/*
	 * Any destination address in the msghdr is deliberately ignored.
	 * As in BSD, sendto/sendmsg with an address on a TCP (i.e.
	 * CONNREQUIRED) socket is not an error; the data simply goes out
	 * on the existing connection as if no address had been given.
	 */
	CONN_INC_REF(connp);

	if (msg->msg_flags & MSG_OOB) {
		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
		    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
	} else {
		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
		    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
	}

	return (0);
}
/*
 * Socket downcall for shutdown.  Unless `how' is SHUT_RD the send side
 * is shut down; unless `how' is SHUT_WR the receive side is shut down.
 *
 * Returns ENOTCONN when the TCP state is below TCPS_SYN_SENT, otherwise 0.
 */
/* ARGSUSED */
static int
tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
{
	conn_t	*connp = (conn_t *)proto_handle;
	tcp_t	*tcp = connp->conn_tcp;

	ASSERT(connp->conn_upper_handle != NULL);

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	/* X/Open requires that we check the connected state. */
	if (tcp->tcp_state < TCPS_SYN_SENT)
		return (ENOTCONN);

	if (how != SHUT_RD) {
		/*
		 * Send side: queue tcp_shutdown_output() on the squeue,
		 * carried by a zero-length mblk, then tell the socket
		 * layer that sending is shut down.
		 */
		mblk_t *shut_mp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);

		CONN_INC_REF(connp);
		SQUEUE_ENTER_ONE(connp->conn_sqp, shut_mp,
		    tcp_shutdown_output, connp, NULL, SQ_NODRAIN,
		    SQTAG_TCP_SHUTDOWN_OUTPUT);

		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
		    SOCK_OPCTL_SHUT_SEND, 0);
	}

	if (how != SHUT_WR) {
		/* Receive side: only the socket layer needs to be told. */
		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
		    SOCK_OPCTL_SHUT_RECV, 0);
	}

	return (0);
}
/*
* Socket downcall invoked to lift receive-side flow control on the conn.
*
* The function enters the conn's squeue using the reserved mblk
* tcp_rsrv_mp, then either back-enables a fused connection or reopens
* the receive window (sending a window update when one is warranted).
* It returns without doing anything if another invocation already holds
* the reserved mblk.
*/
static void
tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
{
conn_t *connp = (conn_t *)proto_handle;
tcp_t *tcp = connp->conn_tcp;
mblk_t *mp;
int error;
ASSERT(connp->conn_upper_handle != NULL);
/*
* If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
* is currently running.
*/
mutex_enter(&tcp->tcp_rsrv_mp_lock);
if ((mp = tcp->tcp_rsrv_mp) == NULL) {
mutex_exit(&tcp->tcp_rsrv_mp_lock);
return;
}
/* Claim the reserved mblk so that concurrent callers back off. */
tcp->tcp_rsrv_mp = NULL;
mutex_exit(&tcp->tcp_rsrv_mp_lock);
/*
* Enter the squeue, supplying the reserved mblk.
* NOTE(review): the ASSERT suggests that entering cannot fail when an
* mblk is supplied -- confirm against squeue_synch_enter().  `error'
* is only examined by the ASSERT.
*/
error = squeue_synch_enter(connp, mp);
ASSERT(error == 0);
/* Now inside the squeue: give the reserved mblk back for the next call. */
mutex_enter(&tcp->tcp_rsrv_mp_lock);
tcp->tcp_rsrv_mp = mp;
mutex_exit(&tcp->tcp_rsrv_mp_lock);
if (tcp->tcp_fused) {
/* Fused connections are back-enabled by the fusion code. */
tcp_fuse_backenable(tcp);
} else {
/* Reopen the receive window to the full receive buffer size. */
tcp->tcp_rwnd = connp->conn_rcvbuf;
/*
* Send back a window update immediately if TCP is above
* ESTABLISHED state and the increase of the rcv window
* that the other side knows is at least 1 MSS after flow
* control is lifted.
*/
if (tcp->tcp_state >= TCPS_ESTABLISHED &&
tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
/*
* The ACK carries tcp_suna as its sequence number when the
* send window is zero, and tcp_snxt otherwise.
*/
tcp_xmit_ctl(NULL, tcp,
(tcp->tcp_swnd == 0) ? tcp->tcp_suna :
tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
}
}
squeue_synch_exit(connp);
}
/*
 * Socket downcall for ioctl.  A fixed set of commands is rejected with
 * EINVAL; everything else is forwarded to IP through the conn's helper
 * stream (created on first use) with ldi_ioctl().
 *
 * Returns a UNIX errno: the helper-stream creation error, EINVAL for a
 * rejected command or a closing conn, or the result of ldi_ioctl().
 */
/* ARGSUSED */
static int
tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
    int mode, int32_t *rvalp, cred_t *cr)
{
	conn_t	*connp = (conn_t *)proto_handle;
	int	rc;

	ASSERT(connp->conn_upper_handle != NULL);

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	/*
	 * Create the helper stream if there is none yet.  The NULL test
	 * is only a performance optimization; ip_create_helper_stream
	 * does its own locking of the conn_t.
	 */
	if (connp->conn_helper_info == NULL) {
		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;

		rc = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
		if (rc != 0) {
			ip0dbg(("tcp_ioctl: create of IP helper stream "
			    "failed %d\n", rc));
			return (rc);
		}
	}

	/* Commands that are refused on a non-STREAMS socket. */
	switch (cmd) {
	case ND_SET:
	case ND_GET:
	case _SIOCSOCKFALLBACK:
	case TCP_IOC_ABORT_CONN:
	case TI_GETPEERNAME:
	case TI_GETMYNAME:
		ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
		    cmd));
		return (EINVAL);
	default:
		break;
	}

	/*
	 * Forward to IP over the helper stream unless the conn is
	 * closing.  The ioctlref is bumped around the call so that
	 * tcp_close cannot tear down the rq/wq underneath an ioctl that
	 * ends up queued or aborted/interrupted.
	 */
	mutex_enter(&connp->conn_lock);
	if (connp->conn_state_flags & (CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (EINVAL);
	}
	/*
	 * NOTE(review): conn_lock is not released explicitly on this
	 * path; presumably CONN_INC_IOCTLREF_LOCKED drops it -- confirm
	 * against the macro definition.
	 */
	CONN_INC_IOCTLREF_LOCKED(connp);
	rc = ldi_ioctl(connp->conn_helper_info->iphs_handle,
	    cmd, arg, mode, cr, rvalp);
	CONN_DEC_IOCTLREF(connp);

	return (rc);
}
/*
 * Socket downcall for close.  Runs the common TCP close path, frees the
 * helper stream and drops IP's reference on the conn.
 *
 * Always returns EINPROGRESS, which tells sockfs to wait for a 'closed'
 * upcall before it frees the socket.
 */
/* ARGSUSED */
static int
tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
{
	conn_t	*conn = (conn_t *)proto_handle;

	ASSERT(conn->conn_upper_handle != NULL);

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	tcp_close_common(conn, flags);
	ip_free_helper_stream(conn);

	/*
	 * Release IP's reference.  If the connection never got as far as
	 * established this is the last reference.  A connection that has
	 * entered timewait still has two: one for TCP and one for the
	 * classifier connected hash list, where timewait connections stay
	 * until closed.
	 *
	 * The count cannot be asserted here, because walkers or packets
	 * still queued in the squeue for the timewait state may hold
	 * transient references.
	 */
	CONN_DEC_REF(conn);

	return (EINPROGRESS);
}
/*
 * Create a non-STREAMS TCP endpoint for the socket layer.
 *
 * family          AF_INET or AF_INET6
 * type            must be SOCK_STREAM
 * proto           0 or IPPROTO_TCP
 * sock_downcalls  out: set to the TCP downcall vector
 * smodep          out: set to the socket mode flags TCP supports
 * errorp          out: 0 on success, otherwise an errno; must not be NULL
 * flags           unused here
 * credp           credentials passed on to tcp_create_common()
 *
 * Returns the new conn as a lower handle, or NULL on failure.  For an
 * unsupported family/type/proto combination *errorp is EPROTONOSUPPORT;
 * when tcp_create_common() fails, *errorp is whatever that function left
 * in it.  The output arguments other than *errorp are only written on
 * success.
 */
/* ARGSUSED */
sock_lower_handle_t
tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
    uint_t *smodep, int *errorp, int flags, cred_t *credp)
{
	conn_t		*connp;
	boolean_t	isv6 = family == AF_INET6;

	/*
	 * Check the error pointer before its first use; every path below
	 * writes through it.
	 */
	ASSERT(errorp != NULL);

	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
	    (proto != 0 && proto != IPPROTO_TCP)) {
		*errorp = EPROTONOSUPPORT;
		return (NULL);
	}

	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
	if (connp == NULL) {
		return (NULL);
	}

	/*
	 * Put the ref for TCP. Ref for IP was already put
	 * by ipcl_conn_create. Also Make the conn_t globally
	 * visible to walkers
	 */
	mutex_enter(&connp->conn_lock);
	CONN_INC_REF_LOCKED(connp);
	ASSERT(connp->conn_ref == 2);
	connp->conn_state_flags &= ~CONN_INCIPIENT;

	/* Mark the conn as belonging to a non-STREAMS socket. */
	connp->conn_flags |= IPCL_NONSTR;
	mutex_exit(&connp->conn_lock);

	*errorp = 0;
	*sock_downcalls = &sock_tcp_downcalls;
	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
	    SM_SENDFILESUPP;

	return ((sock_lower_handle_t)connp);
}
/*
* tcp_fallback
*
* A direct socket is falling back to using STREAMS. The queue
* that is being passed down was created using tcp_open() with
* the SO_FALLBACK flag set. As a result, the queue is not
* associated with a conn, and the q_ptrs instead contain the
* dev and minor area that should be used.
*
* The 'issocket' flag indicates whether the FireEngine
* optimizations should be used. The common case would be that
* optimizations are enabled, and they might be subsequently
* disabled using the _SIOCSOCKFALLBACK ioctl.
*/
/*
* An active connection is falling back to TPI. Gather all the information
* required by the STREAM head and TPI sonode and send it up.
*
* tcp          the endpoint that is falling back
* stropt_mp    pre-allocated mblk that is turned into the M_SETOPTS
*              message and sent up the read side; consumed here
* q            queue of the new stream; on entry its q_ptr fields carry
*              the dev (read side) and minor arena (write side)
* issocket     B_FALSE selects pure TPI behaviour (tcp_use_pure_tpi())
* quiesced_cb  callback told that the protocol is quiescent; returns
*              the chain of messages (linked by b_next) to send up
* arg          opaque argument passed through to quiesced_cb
*
* Called from tcp_fallback() with the conn's squeue entered.
*/
static void
tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
sock_quiesce_arg_t *arg)
{
conn_t *connp = tcp->tcp_connp;
struct stroptions *stropt;
struct T_capability_ack tca;
struct sockaddr_in6 laddr, faddr;
socklen_t laddrlen, faddrlen;
short opts;
int error;
mblk_t *mp, *mpnext;
/*
* Pick up the dev and minor arena that were stashed in the queue's
* q_ptr fields, then point both sides of the queue at the conn.
*/
connp->conn_dev = (dev_t)RD(q)->q_ptr;
connp->conn_minor_arena = WR(q)->q_ptr;
RD(q)->q_ptr = WR(q)->q_ptr = connp;
connp->conn_rq = RD(q);
connp->conn_wq = WR(q);
/* Install the socket write-side qinit on the write queue. */
WR(q)->q_qinfo = &tcp_sock_winit;
if (!issocket)
tcp_use_pure_tpi(tcp);
/*
* free the helper stream
*/
ip_free_helper_stream(connp);
/*
* Notify the STREAM head about options
*/
DB_TYPE(stropt_mp) = M_SETOPTS;
stropt = (struct stroptions *)stropt_mp->b_rptr;
stropt_mp->b_wptr += sizeof (struct stroptions);
stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
/*
* Write offset: the conn's header template length, plus the stack's
* extra write offset unless this is a loopback connection, plus room
* for SACK options when SACK is in use.
*/
stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
tcp->tcp_tcps->tcps_wroff_xtra);
if (tcp->tcp_snd_sack_ok)
stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
stropt->so_hiwat = connp->conn_rcvbuf;
stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
putnext(RD(q), stropt_mp);
/*
* Collect the information needed to sync with the sonode
*/
tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
laddrlen = faddrlen = sizeof (sin6_t);
(void) tcp_getsockname((sock_lower_handle_t)connp,
(struct sockaddr *)&laddr, &laddrlen, CRED());
error = tcp_getpeername((sock_lower_handle_t)connp,
(struct sockaddr *)&faddr, &faddrlen, CRED());
/* No peer address is reported if it could not be obtained. */
if (error != 0)
faddrlen = 0;
/* Carry over the socket options that the sonode has to know about. */
opts = 0;
if (connp->conn_oobinline)
opts |= SO_OOBINLINE;
if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
opts |= SO_DONTROUTE;
/*
* Notify the socket that the protocol is now quiescent,
* and it's therefore safe to move data from the socket
* to the stream head.
*/
mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
(struct sockaddr *)&laddr, laddrlen,
(struct sockaddr *)&faddr, faddrlen, opts);
/*
* Send each message of the returned chain upstream, one at a time.
* tcp_rcv_list is kept pointing at the part of the chain that has not
* been sent yet, so it is NULL once the loop has run to completion.
* NOTE(review): this putnext() uses `q' directly whereas the M_SETOPTS
* above went to RD(q); presumably `q' is already the read side --
* confirm with the caller.
*/
while (mp != NULL) {
mpnext = mp->b_next;
tcp->tcp_rcv_list = mp->b_next;
mp->b_next = NULL;
putnext(q, mp);
mp = mpnext;
}
ASSERT(tcp->tcp_rcv_last_head == NULL);
ASSERT(tcp->tcp_rcv_last_tail == NULL);
ASSERT(tcp->tcp_rcv_cnt == 0);
/*
* All eagers in q0 are marked as being non-STREAM, so they will
* make su_newconn upcalls when the handshake completes, which
* will fail (resulting in the conn being closed). So we just blow
* off everything in q0 instead of waiting for the inevitable.
*/
if (tcp->tcp_conn_req_cnt_q0 != 0)
tcp_eager_cleanup(tcp, B_TRUE);
}
/*
 * Fallback to TPI for an eager, i.e. an endpoint that still has a
 * listener.  Any data the socket was holding is taken back onto the
 * eager's receive list, the eager is marked detached, and the saved
 * T_CONN_IND is sent up the listener's read queue.
 *
 * `issocket' being B_FALSE selects TLI/XTI behaviour: tcp_issocket is
 * cleared and the option part of the T_CONN_IND is stripped.
 * `quiesced_cb' and `arg' are the socket's quiesce callback and its
 * opaque argument.
 */
static void
tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
    so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
{
	conn_t	*connp = eager->tcp_connp;
	tcp_t	*listener = eager->tcp_listener;
	mblk_t	*data_mp;
	mblk_t	*conn_ind_mp;

	ASSERT(listener != NULL);

	/*
	 * Tell the socket that the protocol is quiescent, which makes it
	 * safe to move data from the socket to tcp's rcv queue.
	 */
	data_mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
	    NULL, 0, 0);

	if (data_mp != NULL) {
		mblk_t	*last_msg;
		mblk_t	*last_blk;

		ASSERT(eager->tcp_rcv_cnt == 0);

		eager->tcp_rcv_list = data_mp;

		/*
		 * Total up the data in every message of the b_next chain
		 * and stop on the last message.
		 */
		eager->tcp_rcv_cnt = 0;
		for (last_msg = data_mp; ; last_msg = last_msg->b_next) {
			eager->tcp_rcv_cnt += msgdsize(last_msg);
			if (last_msg->b_next == NULL)
				break;
		}
		eager->tcp_rcv_last_head = last_msg;

		/* The tail is the final b_cont block of the last message. */
		last_blk = last_msg;
		while (last_blk->b_cont != NULL)
			last_blk = last_blk->b_cont;
		eager->tcp_rcv_last_tail = last_blk;

		/* Shrink the receive window by the amount queued. */
		if (eager->tcp_rcv_cnt <= eager->tcp_rwnd)
			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
		else
			eager->tcp_rwnd = 0;
	}

	if (!issocket)
		eager->tcp_issocket = B_FALSE;

	/*
	 * No stream exists for this eager yet, so it is marked detached
	 * and borrows the listener's queues.
	 */
	eager->tcp_detached = B_TRUE;
	eager->tcp_hard_binding = B_TRUE;
	connp->conn_rq = listener->tcp_connp->conn_rq;
	connp->conn_wq = listener->tcp_connp->conn_wq;

	/* Take over the saved connection indication. */
	conn_ind_mp = eager->tcp_conn.tcp_eager_conn_ind;
	ASSERT(conn_ind_mp != NULL);
	eager->tcp_conn.tcp_eager_conn_ind = NULL;

	/*
	 * Passing the eager as an option violates option semantics and
	 * confuses TLI/XTI applications, which have no use for it anyway,
	 * so the option is removed for them.
	 */
	if (!issocket) {
		struct T_conn_ind *conn_ind =
		    (struct T_conn_ind *)conn_ind_mp->b_rptr;

		conn_ind->OPT_length = 0;
		conn_ind->OPT_offset = 0;
	}

	/*
	 * Using the listener's queue is safe: sockfs guarantees that the
	 * listener is not closed during fallback.
	 */
	putnext(listener->tcp_connp->conn_rq, conn_ind_mp);
}
/*
 * Convert a direct (non-STREAMS) TCP socket to a STREAMS/TPI one.
 *
 * proto_handle   the conn that is falling back
 * q              queue of the new stream (used for non-eagers only)
 * direct_sockfs  passed on as the `issocket' argument of the helpers
 * quiesced_cb    socket callback invoked once the protocol is quiescent
 * arg            opaque argument for quiesced_cb
 *
 * Returns 0 on success, or ENOMEM if the conn's squeue could not be
 * entered (in which case nothing has been changed).
 */
int
tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
	conn_t			*connp = (conn_t *)proto_handle;
	tcp_t			*tcp = connp->conn_tcp;
	struct T_ordrel_ind	*ordrel;
	mblk_t			*stropt_mp;
	mblk_t			*ordrel_mp;

	/* Both mblks are allocated up front, before the squeue is entered. */
	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
	    NULL);

	/* Pre-allocate and fill in the T_ordrel_ind mblk. */
	ASSERT(tcp->tcp_ordrel_mp == NULL);
	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
	    STR_NOSIG, NULL);
	DB_TYPE(ordrel_mp) = M_PROTO;
	ordrel = (struct T_ordrel_ind *)ordrel_mp->b_rptr;
	ordrel->PRIM_type = T_ORDREL_IND;
	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);

	/* Enter the squeue so that no new packets can come in. */
	if (squeue_synch_enter(connp, NULL) != 0) {
		/* failed to enter, free all the pre-allocated messages. */
		freeb(stropt_mp);
		freeb(ordrel_mp);
		return (ENOMEM);
	}

	/*
	 * Fusion requires both endpoints to be of the same type (both
	 * STREAMS or both non-STREAMS), so a fused connection has to be
	 * unfused before this end changes type.
	 */
	if (tcp->tcp_fused)
		tcp_unfuse(tcp);

	if (tcp->tcp_listener == NULL) {
		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
		    quiesced_cb, arg);
	} else {
		/* The eager will deal with opts when accept() is called */
		freeb(stropt_mp);
		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
	}

	/*
	 * The conn is no longer a direct socket.  The upper_handle and
	 * upcalls are intentionally left in place, since eagers may
	 * still be using them.
	 */
	connp->conn_flags &= ~IPCL_NONSTR;
	tcp->tcp_ordrel_mp = ordrel_mp;

	/* There should be at least two refs (IP + TCP). */
	ASSERT(connp->conn_ref >= 2);

	squeue_synch_exit(connp);

	return (0);
}
/*
* Notifies a non-STREAMS based listener about a new connection. This
* function is executed on the *eager*'s squeue once the 3 way handshake
* has completed. Note that the behavior differs from STREAMS, where the
* T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s
* squeue.
*
* `tcp' is the eager; `ira' supplies the credential and pid that are
* passed to the su_newconn upcall.
*
* The eager is moved from the listener's SYN_RCVD list (q0) to the end
* of its ESTABLISHED list (q) before the upcall is made.
*
* Returns B_TRUE if the notification succeeded. Otherwise B_FALSE is
* returned; in that case `tcp' is killed here with tcp_eager_kill(),
* unless its tcp_closemp is already in use.
*/
boolean_t
tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
{
tcp_t *listener = tcp->tcp_listener;
conn_t *lconnp = listener->tcp_connp;
conn_t *econnp = tcp->tcp_connp;
tcp_t *tail;
ipaddr_t *addr_cache;
sock_upper_handle_t upper;
struct sock_proto_props sopp;
mblk_t *mp;
/* The listener's eager lists and counters are protected by tcp_eager_lock. */
mutex_enter(&listener->tcp_eager_lock);
/*
* Take the eager out, if it is in the list of droppable eagers
* as we are here because the 3W handshake is over.
*/
MAKE_UNDROPPABLE(tcp);
/*
* The eager already has an extra ref put in tcp_input_data
* so that it stays till accept comes back even though it
* might get into TCPS_CLOSED as a result of a TH_RST etc.
*/
ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
listener->tcp_conn_req_cnt_q0--;
listener->tcp_conn_req_cnt_q++;
/*
* Move from SYN_RCVD to ESTABLISHED list: first unlink the eager from
* the doubly linked q0 list.
*/
tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
tcp->tcp_eager_prev_q0 = NULL;
tcp->tcp_eager_next_q0 = NULL;
/*
* Insert at end of the queue because connections are accepted
* in chronological order. Leaving the older connections at front
* of the queue helps reducing search time.
*/
tail = listener->tcp_eager_last_q;
if (tail != NULL)
tail->tcp_eager_next_q = tcp;
else
listener->tcp_eager_next_q = tcp;
listener->tcp_eager_last_q = tcp;
tcp->tcp_eager_next_q = NULL;
/* we have timed out before */
if (tcp->tcp_syn_rcvd_timeout != 0) {
/* This eager no longer counts towards the listener's timed-out total. */
tcp->tcp_syn_rcvd_timeout = 0;
listener->tcp_syn_rcvd_timeout--;
/*
* Defense mode ends when the number of timed-out eagers is at most
* 1/32 of tcps_conn_req_max_q0 and more than ten minutes have passed
* since tcp_last_rcv_lbolt.
*/
if (listener->tcp_syn_defense &&
listener->tcp_syn_rcvd_timeout <=
(listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
listener->tcp_last_rcv_lbolt)) {
/*
* Turn off the defense mode if we
* believe the SYN attack is over.
*/
listener->tcp_syn_defense = B_FALSE;
if (listener->tcp_ip_addr_cache) {
kmem_free((void *)listener->tcp_ip_addr_cache,
IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
listener->tcp_ip_addr_cache = NULL;
}
}
}
/*
* This is read after the block above, which may just have freed the
* cache and set the pointer to NULL.
*/
addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
if (addr_cache != NULL) {
/*
* We have finished a 3-way handshake with this
* remote host. This proves the IP addr is good.
* Cache it!
*/
addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
tcp->tcp_connp->conn_faddr_v4;
}
mutex_exit(&listener->tcp_eager_lock);
/*
* Notify the ULP about the newconn. It is guaranteed that no
* tcp_accept() call will be made for the eager if the
* notification fails.
*
* The upcall returns the upper handle for the new connection and
* stores the eager's upcall vector through its last argument.
*/
if ((upper = (*lconnp->conn_upcalls->su_newconn)
(lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
&sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
&econnp->conn_upcalls)) == NULL) {
/*
* Normally this should not happen, but the listener might
* have done a fallback to TPI followed by a close(), in
* which case tcp_closemp for this conn might have been
* used by tcp_eager_cleanup().
*/
mutex_enter(&listener->tcp_eager_lock);
if (tcp->tcp_closemp_used) {
/* The close mblk is already in use; do not kill the eager again. */
mutex_exit(&listener->tcp_eager_lock);
return (B_FALSE);
}
/* Claim the embedded close mblk and use it to kill the eager. */
tcp->tcp_closemp_used = B_TRUE;
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
mutex_exit(&listener->tcp_eager_lock);
tcp_eager_kill(econnp, mp, NULL, NULL);
return (B_FALSE);
}
econnp->conn_upper_handle = upper;
/* The eager now has a socket of its own. */
tcp->tcp_detached = B_FALSE;
tcp->tcp_hard_binding = B_FALSE;
tcp->tcp_tconnind_started = B_TRUE;
/* Start the keepalive timer if keepalive is enabled on the conn. */
if (econnp->conn_keepalive) {
tcp->tcp_ka_last_intrvl = 0;
tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
tcp->tcp_ka_interval);
}
/* Update the necessary parameters */
tcp_get_proto_props(tcp, &sopp);
(*econnp->conn_upcalls->su_set_proto_props)
(econnp->conn_upper_handle, &sopp);
return (B_TRUE);
}