/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/xti_xtiopt.h>
#include <sys/xti_inet.h>
#include <inet/proto_set.h>
#include <inet/tcp_impl.h>
/*
* Table of all known options handled on a TCP protocol stack.
*
* Note: This table contains options processed by both TCP and IP levels
* and is the superset of options that can be performed on a TCP over IP
* stack.
*/
sizeof (struct linger), 0 },
},
sizeof (struct timeval), 0 },
sizeof (struct timeval), 0 },
},
0 },
0 },
0 },
0 },
},
536 },
0 },
sizeof (int), 0 },
},
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), -1 /* not initialized */ },
sizeof (int), 0 /* no ifindex */ },
sizeof (int), 0 },
sizeof (int), -1 /* not initialized */ },
sizeof (int), 0 /* no ifindex */ },
sizeof (int), 0 },
-1 /* not initialized */ },
-1 /* not initialized */ },
-1 /* not initialized */ },
-1 /* not initialized */ },
sizeof (int), -1 /* not initialized */ },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
/* Enable receipt of ancillary data */
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (int), 0 },
sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};
/*
* Table of all supported levels.
* Note: Some levels (e.g. XTI_GENERIC) may be valid without having
* any supported options, so this information is kept separately.
*
* This is needed only for topmost TPI providers and is used only by
* XTI interfaces.
*/
};
/*
* Initialize the option database object for TCP.
*
* This object represents the database of options that is passed to the
* {sock,tpi}optcom_req() interface routines, which search it to take
* care of option management and the associated methods.
*/
tcp_opt_default, /* TCP default value function pointer */
tcp_tpi_opt_get, /* TCP get function pointer */
tcp_tpi_opt_set, /* TCP set function pointer */
TCP_OPT_ARR_CNT, /* TCP option database count of entries */
tcp_opt_arr, /* TCP option database */
TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */
tcp_valid_levels_arr /* TCP valid level array */
};
/*
* Some TCP options can be "set" by requesting them in the option
* buffer. This is needed for the XTI feature test, though we do not
* allow it in general: we interpret this mechanism as being more
* applicable to OSI protocols, so it need not be allowed in general.
* This routine filters out the options for which it is not allowed
* (most) and lets through those (few) for which it is. [ The XTI
* interface test suite specifics imply that any XTI_GENERIC level
* XTI_* option, if ever implemented, will have to be allowed here ].
*
* The decision is made purely on the (level, name) pair:
* returns B_TRUE only for level IPPROTO_TCP with name TCP_NODELAY;
* every other level/name combination yields B_FALSE.
*/
static boolean_t
{
switch (level) {
case IPPROTO_TCP:
switch (name) {
case TCP_NODELAY:
/* The single option that may be requested this way. */
return (B_TRUE);
default:
return (B_FALSE);
}
/*NOTREACHED*/
default:
/* No option at any other level is let through. */
return (B_FALSE);
}
/*NOTREACHED*/
}
/*
* This routine gets the default values of certain options whose
* default values are maintained by protocol-specific code.
*
* (level, name) pairs recognized by the switch below:
*	IPPROTO_TCP:	TCP_NOTIFY_THRESHOLD, TCP_ABORT_THRESHOLD,
*			TCP_CONN_ABORT_THRESHOLD
*	IPPROTO_IP:	IP_TTL
*	IPPROTO_IPV6:	IPV6_UNICAST_HOPS
*
* Returns sizeof (int) for a recognized pair and -1 for any other
* level or name.
*
* NOTE(review): none of the case arms shown here stores a value before
* breaking out; presumably each one is meant to write the default into
* a caller-supplied buffer -- confirm against the full routine.
*/
/* ARGSUSED */
static int
{
switch (level) {
case IPPROTO_TCP:
switch (name) {
case TCP_NOTIFY_THRESHOLD:
break;
case TCP_ABORT_THRESHOLD:
break;
/*
* NOTE(review): as shown, this break directly follows the one
* above and cannot be reached; a case label may belong in front
* of it -- confirm.
*/
break;
case TCP_CONN_ABORT_THRESHOLD:
break;
default:
/* Not an option whose default is maintained here. */
return (-1);
}
break;
case IPPROTO_IP:
switch (name) {
case IP_TTL:
break;
default:
return (-1);
}
break;
case IPPROTO_IPV6:
switch (name) {
case IPV6_UNICAST_HOPS:
break;
default:
return (-1);
}
break;
default:
/* Unknown level. */
return (-1);
}
/* Every recognized option is reported with the size of an int. */
return (sizeof (int));
}
/*
* TCP routine to get the values of options.
*
* The option is selected by the (level, name) pair. Each option that
* is answered directly in this routine returns the size of its value:
* sizeof (int) for most of them, sizeof (uint32_t) for the three
* TCP_RTO_* options. Several paths return -1. Anything that is not
* answered directly breaks out of the switch and returns `retval'.
*
* NOTE(review): no assignment to `retval' is visible in this routine;
* it is presumably filled in by a generic option lookup before the
* final return -- confirm.
*/
int
{
int retval;
coas.coa_changed = 0;
switch (level) {
case SOL_SOCKET:
switch (name) {
case SO_SND_COPYAVOID:
SO_SND_COPYAVOID : 0;
return (sizeof (int));
case SO_ACCEPTCONN:
return (sizeof (int));
}
break;
case IPPROTO_TCP:
switch (name) {
case TCP_NODELAY:
return (sizeof (int));
case TCP_MAXSEG:
return (sizeof (int));
case TCP_NOTIFY_THRESHOLD:
return (sizeof (int));
case TCP_ABORT_THRESHOLD:
return (sizeof (int));
/*
* NOTE(review): as shown, this return directly follows another
* return and cannot be reached; a case label may belong in front
* of it -- confirm.
*/
return (sizeof (int));
case TCP_CONN_ABORT_THRESHOLD:
return (sizeof (int));
case TCP_INIT_CWND:
return (sizeof (int));
case TCP_KEEPALIVE_THRESHOLD:
return (sizeof (int));
/*
* TCP_KEEPIDLE is expressed in seconds, but
* tcp_ka_interval is in milliseconds.
*/
case TCP_KEEPIDLE:
return (sizeof (int));
case TCP_KEEPCNT:
return (sizeof (int));
/*
* TCP_KEEPINTVL is expressed in seconds, but
* tcp_ka_rinterval is in milliseconds.
*/
case TCP_KEEPINTVL:
return (sizeof (int));
/*
* NOTE(review): unreachable as shown (follows a return); a case
* label may belong in front of it -- confirm.
*/
return (sizeof (int));
case TCP_CORK:
return (sizeof (int));
/* The three RTO options are reported as uint32_t values. */
case TCP_RTO_INITIAL:
return (sizeof (uint32_t));
case TCP_RTO_MIN:
return (sizeof (uint32_t));
case TCP_RTO_MAX:
return (sizeof (uint32_t));
case TCP_LINGER2:
return (sizeof (int));
}
break;
case IPPROTO_IP:
/*
* NOTE(review): as shown, this return is unconditional, which
* makes the switch below unreachable; presumably it is guarded by
* a check on the socket's address family -- confirm.
*/
return (-1);
switch (name) {
case IP_OPTIONS:
case T_IP_OPTIONS:
/* Caller ensures enough space */
default:
break;
}
break;
case IPPROTO_IPV6:
/*
* IPPROTO_IPV6 options are only supported for sockets
* that are using IPv6 on the wire.
*/
return (-1);
}
switch (name) {
case IPV6_PATHMTU:
return (-1);
/* Not reached as shown: the return above always leaves first. */
break;
}
break;
}
return (retval);
}
/*
* TCP routine to set (or, for a check-only request, merely validate)
* the value of an option selected by (level, name).
*
* We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
* Parameters are assumed to be verified by the caller.
*
* Returns 0 on success, otherwise an errno value; the ones visible
* below are EINVAL, ENOBUFS, EOPNOTSUPP, and whatever has been stored
* in `reterr'. The error paths that clear *outlenp do so immediately
* before returning.
*/
/* ARGSUSED */
int
{
int reterr;
coas.coa_changed = 0;
/* First decide, from the calling context, whether to proceed at all. */
switch (optset_context) {
case SETFN_OPTCOM_CHECKONLY:
/*
* Note: Implies T_CHECK semantics for T_OPTCOM_REQ.
* inlen != 0 implies that a value was supplied and
* we have to "pretend" to set it.
* inlen == 0 implies that there is no
* value part in the T_CHECK request, so the validation
* done elsewhere should be enough and we just return here.
*/
if (inlen == 0) {
*outlenp = 0;
return (0);
}
break;
case SETFN_OPTCOM_NEGOTIATE:
break;
case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
case SETFN_CONN_NEGOTIATE:
/*
* Negotiating local and "association-related" options
* from other (T_CONN_REQ, T_CONN_RES, T_UNITDATA_REQ)
* primitives is allowed by XTI, but we choose
* not to implement this style of negotiation for Internet
* protocols (we interpret it as a must for the OSI world but
* optional for Internet protocols) for all options.
* [ It is done only for the few options needed so that test
* suites can verify that our XTI implementation of this
* feature works for transports that do allow it ]
*/
*outlenp = 0;
return (EINVAL);
}
break;
default:
/*
* We should never get here
*/
*outlenp = 0;
return (EINVAL);
}
/*
* For TCP, we should have no ancillary data sent down
* (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
* has to be zero.
*/
/*
* For fixed length options, no sanity check
* of passed in length is done. It is assumed *_optcom_req()
* routines do the right thing.
*/
switch (level) {
case SOL_SOCKET:
switch (name) {
case SO_KEEPALIVE:
if (checkonly) {
/* check only case */
break;
}
if (!onoff) {
/*
* Turning keepalive off: cancel a pending keepalive timer, if
* any, and clear the per-connection flag.
*/
if (connp->conn_keepalive) {
if (tcp->tcp_ka_tid != 0) {
(void) TCP_TIMER_CANCEL(tcp,
tcp->tcp_ka_tid);
tcp->tcp_ka_tid = 0;
}
connp->conn_keepalive = 0;
}
break;
}
if (!connp->conn_keepalive) {
/* Crank up the keepalive timer */
tcp->tcp_ka_last_intrvl = 0;
}
break;
case SO_SNDBUF: {
*outlenp = 0;
return (ENOBUFS);
}
if (checkonly)
break;
if (tcps->tcps_snd_lowat_fraction != 0) {
}
/*
* If we are flow-controlled, recheck the condition.
* There are apps that increase SO_SNDBUF size when
* flow-controlled (EWOULDBLOCK), and expect the flow
* control condition to be lifted right away.
*/
if (tcp->tcp_flow_stopped &&
}
return (0);
}
case SO_RCVBUF:
*outlenp = 0;
return (ENOBUFS);
}
/* Silently ignore zero */
}
/*
* XXX should we return the rwnd here
* and tcp_opt_get ?
*/
return (0);
case SO_SND_COPYAVOID:
if (!checkonly) {
if (tcp->tcp_loopback ||
*outlenp = 0;
return (EOPNOTSUPP);
}
}
return (0);
}
break;
case IPPROTO_TCP:
switch (name) {
case TCP_NODELAY:
if (!checkonly)
break;
case TCP_NOTIFY_THRESHOLD:
if (!checkonly)
break;
case TCP_ABORT_THRESHOLD:
if (!checkonly)
break;
if (!checkonly)
break;
case TCP_CONN_ABORT_THRESHOLD:
if (!checkonly)
break;
case TCP_RECVDSTADDR:
*outlenp = 0;
return (EOPNOTSUPP);
}
/* Setting done in conn_opt_set */
break;
case TCP_INIT_CWND:
if (checkonly)
break;
/*
* Only allow a socket with the network configuration
* privilege to set the initial cwnd to be larger
* than allowed by RFC 3390.
*/
!= 0) {
*outlenp = 0;
return (reterr);
}
/* Even a privileged caller may not exceed tcp_max_init_cwnd. */
if (val > tcp_max_init_cwnd) {
*outlenp = 0;
return (EINVAL);
}
}
/*
* If the socket is connected, AND no outbound data
* has been sent, reset the actual cwnd values.
*/
}
break;
/*
* TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
* is in milliseconds. TCP_KEEPIDLE is introduced for
* compatibility with other Unix flavors.
* We can fall through to the TCP_KEEPALIVE_THRESHOLD logic
* after converting the input to milliseconds.
*
* NOTE(review): the conversion is applied in place through i1
* and happens before the checkonly test below, so it also runs
* for check-only requests -- confirm that this is intended.
*/
case TCP_KEEPIDLE:
*i1 *= 1000;
/* FALLTHRU */
case TCP_KEEPALIVE_THRESHOLD:
if (checkonly)
break;
*outlenp = 0;
return (EINVAL);
}
/*
* Check if we need to restart the
* keepalive timer.
*/
if (tcp->tcp_ka_tid != 0) {
(void) TCP_TIMER_CANCEL(tcp,
tcp->tcp_ka_tid);
tcp->tcp_ka_last_intrvl = 0;
}
}
break;
/*
* tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
* So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all
* three members - tcp_ka_abort_thres, tcp_ka_rinterval and
* tcp_ka_cnt.
*/
case TCP_KEEPCNT:
if (checkonly)
break;
/* A probe count of zero is rejected outright. */
if (*i1 == 0) {
return (EINVAL);
} else if (tcp->tcp_ka_rinterval == 0) {
tcp->tcp_rto_min ||
return (EINVAL);
} else {
return (EINVAL);
}
break;
case TCP_KEEPINTVL:
/*
* TCP_KEEPINTVL is specified in seconds, but
* tcp_ka_rinterval is in milliseconds.
*/
if (checkonly)
break;
return (EINVAL);
if (tcp->tcp_ka_cnt == 0) {
tcp->tcp_ka_cnt =
} else {
return (EINVAL);
}
break;
if (!checkonly) {
if (*i1 <
*i1 >
*outlenp = 0;
return (EINVAL);
}
/*
* The count and retry interval are zeroed here. NOTE(review):
* presumably because this option sets the abort threshold
* directly, leaving the other two to be derived later -- confirm.
*/
tcp->tcp_ka_cnt = 0;
tcp->tcp_ka_rinterval = 0;
}
break;
case TCP_CORK:
if (!checkonly) {
/*
* If tcp->tcp_cork was set and is now
* being unset, we have to make sure that
* the remaining data gets sent out. Also
* unset tcp->tcp_cork so that tcp_wput_data()
* can send data even if it is less than mss.
*/
tcp->tcp_unsent > 0) {
}
}
break;
case TCP_RTO_INITIAL: {
break;
/*
* Sanity checks
*
* The initial RTO should be bounded by the minimum
* and maximum RTO. And it should also be smaller
* than the connect attempt abort timeout. Otherwise,
* the connection won't be aborted in a period
* reasonably close to that timeout.
*/
*outlenp = 0;
return (EINVAL);
}
/*
* If TCP has not sent anything, we need to re-calculate
* tcp_rto. Otherwise, this option change does not
* really affect anything.
*/
break;
break;
}
case TCP_RTO_MIN:
break;
*outlenp = 0;
return (EINVAL);
}
break;
case TCP_RTO_MAX:
break;
/*
* Sanity checks
*
* The maximum RTO should not be larger than the
* connection abort timeout. Otherwise, the
* connection won't be aborted in a period reasonably
* close to that timeout.
*/
*outlenp = 0;
return (EINVAL);
}
break;
case TCP_LINGER2:
break;
/*
* Note that the option value's unit is seconds. And
* the value should be bigger than the private
* parameter tcp_fin_wait_2_flush_interval's lower
* bound and smaller than the current value of that
* parameter. It should be smaller than the current
* value to avoid an app setting TCP_LINGER2 to a big
* value, causing resources to be held up too long in
* FIN-WAIT-2 state.
*/
if (*i1 < 0 ||
*i1 ||
*i1) {
*outlenp = 0;
return (EINVAL);
}
break;
default:
break;
}
break;
case IPPROTO_IP:
*outlenp = 0;
return (EINVAL);
}
switch (name) {
case IP_SEC_OPT:
/*
* We should not allow policy setting after
* we start listening for connections.
*/
return (EINVAL);
}
break;
}
break;
case IPPROTO_IPV6:
/*
* IPPROTO_IPV6 options are only supported for sockets
* that are using IPv6 on the wire.
*/
*outlenp = 0;
return (EINVAL);
}
switch (name) {
case IPV6_RECVPKTINFO:
if (!checkonly) {
/* Force it to be sent up with the next msg */
tcp->tcp_recvifindex = 0;
}
break;
case IPV6_RECVTCLASS:
if (!checkonly) {
/* Force it to be sent up with the next msg */
}
break;
case IPV6_RECVHOPLIMIT:
if (!checkonly) {
/* Force it to be sent up with the next msg */
}
break;
case IPV6_PKTINFO:
/* This is an extra check for TCP */
if (inlen == sizeof (struct in6_pktinfo)) {
/*
* RFC 3542 states that ipi6_addr must be
* the unspecified address when setting the
* IPV6_PKTINFO sticky socket option on a
* TCP socket.
*/
return (EINVAL);
}
break;
case IPV6_SEC_OPT:
/*
* We should not allow policy setting after
* we start listening for connections.
*/
return (EINVAL);
}
break;
}
break;
}
/* A failure recorded in reterr is reported with no output value. */
if (reterr != 0) {
*outlenp = 0;
return (reterr);
}
/*
* Common case of OK return with outval same as inval
*/
}
/* If we are connected we rebuild the headers */
if (reterr != 0)
return (reterr);
}
}
/*
* If we are connected we re-cache the information.
* We ignore errors to preserve BSD behavior.
* Note that we don't redo the IPsec policy lookup here
* since the final destination (or source) didn't change.
*/
}
}
}
connp->conn_wroff);
}
if (IPCL_IS_NONSTR(connp))
}
return (0);
}