sockstr.c revision 7d6c035b71d7c7b33a49c71bff266bf8aa9e0c24
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/proc.h>
#include <sys/ddi.h>
#include <sys/kmem_impl.h>
#include <sys/suntpi.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <sys/tiuser.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <inet/kssl/ksslapi.h>
#include <c2/audit.h>
/* Default value for the socket version; initialized to SOV_SOCKSTREAM. */
int so_default_version = SOV_SOCKSTREAM;
#ifdef DEBUG
/* Set sockdebug to print debug messages when SO_DEBUG is set */
int sockdebug = 0;
/* Set sockprinterr to print error messages when SO_DEBUG is set */
int sockprinterr = 0;
/*
 * Set so_default_options to SO_DEBUG if all sockets should be created
 * with SO_DEBUG set. This is needed to get debug printouts from the
 * socket() call itself.
 */
int so_default_options = 0;
#endif /* DEBUG */
#ifdef SOCK_TEST
/*
 * Set to number of ticks to limit cv_waits for code coverage testing.
 * Set to 1000 when SO_DEBUG is set to 2.
 * sowaitack() substitutes this value when it is called with a zero wait.
 */
clock_t sock_test_timelimit = 0;
#endif /* SOCK_TEST */
/*
 * For concurrency testing of e.g. opening /dev/ip which does not
 * handle T_INFO_REQ messages.
 * When non-zero, do_tinfo() and do_tcapability() skip the info request
 * and set so_addr_size to 0 instead.
 */
int so_no_tinfo = 0;
/*
 * Timeout for getting a T_CAPABILITY_ACK - it is possible for a provider
 * to simply ignore the T_CAPABILITY_REQ.
 * do_tcapability() multiplies this by hz before passing it to sowaitprim().
 */
clock_t sock_capability_timeout = 2; /* seconds */

/* Forward declarations of file-local helpers. */
static int do_tcapability(struct sonode *so, t_uscalar_t cap_bits1);
static void so_removehooks(struct sonode *so);
static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
static int tlitosyserr(int terr);
/*
 * Convert a socket to a stream. Invoked when the illusory sockmod
 * is popped from the stream.
 * Change the stream head back to default operation without losing
 * any messages (T_conn_ind's are moved to the stream head queue).
 *
 * The caller must hold so_plumb_lock. so_lock is acquired and released
 * internally and the socket is single-threaded (so_lock_single) for the
 * duration of the call.
 *
 * Returns 0 on success. Returns the error from the _SIOCSOCKFALLBACK
 * ioctl if the transport refuses the fallback, or EINTR if a private copy
 * of a shared T_CONN_IND could not be allocated.
 */
int
so_sock2stream(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);
	queue_t *rq;
	mblk_t *mp;
	int error = 0;

	ASSERT(MUTEX_HELD(&so->so_plumb_lock));

	mutex_enter(&so->so_lock);
	so_lock_single(so);

	ASSERT(so->so_version != SOV_STREAM);

	if (so->so_state & SS_DIRECT) {
		mblk_t **mpp;
		int rval;

		/*
		 * Tell the transport below that sockmod is being popped
		 */
		mutex_exit(&so->so_lock);
		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
		    &rval);
		mutex_enter(&so->so_lock);
		if (error != 0) {
			dprintso(so, 0, ("so_sock2stream(%p): "
			    "_SIOCSOCKFALLBACK failed\n", so));
			goto exit;
		}
		so->so_state &= ~SS_DIRECT;

		/*
		 * Walk the queued T_CONN_INDs through a pointer to the link
		 * that references each message so that a message can be
		 * replaced in place.
		 */
		for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL;
		    mpp = &mp->b_next) {
			struct T_conn_ind *conn_ind;

			/*
			 * strsock_proto() has already verified the length of
			 * this message block.
			 */
			ASSERT(MBLKL(mp) >= sizeof (struct T_conn_ind));

			conn_ind = (struct T_conn_ind *)mp->b_rptr;
			if (conn_ind->OPT_length == 0 &&
			    conn_ind->OPT_offset == 0)
				continue;

			if (DB_REF(mp) > 1) {
				mblk_t *newmp;
				size_t length;
				cred_t *cr;

				/*
				 * Copy the message block because it is used
				 * elsewhere, too.
				 */
				length = MBLKL(mp);
				newmp = soallocproto(length, _ALLOC_INTR);
				if (newmp == NULL) {
					error = EINTR;
					goto exit;
				}
				bcopy(mp->b_rptr, newmp->b_wptr, length);
				newmp->b_wptr += length;
				newmp->b_next = mp->b_next;
				cr = DB_CRED(mp);
				if (cr != NULL)
					mblk_setcred(newmp, cr);
				DB_CPID(newmp) = DB_CPID(mp);

				/*
				 * Link the new message block into the queue
				 * and free the old one.
				 */
				*mpp = newmp;
				mp->b_next = NULL;

				/*
				 * If the replaced message was the last one
				 * on the list the tail pointer must follow
				 * it; otherwise so_conn_ind_tail would keep
				 * referencing the mblk freed below.
				 */
				if (so->so_conn_ind_tail == mp)
					so->so_conn_ind_tail = newmp;

				freemsg(mp);

				mp = newmp;
				conn_ind = (struct T_conn_ind *)mp->b_rptr;
			}

			/*
			 * Remove options added by TCP for accept fast-path.
			 */
			conn_ind->OPT_length = 0;
			conn_ind->OPT_offset = 0;
		}
	}

	so->so_version = SOV_STREAM;
	so->so_priv = NULL;

	/*
	 * Remove the hooks in the stream head to avoid queuing more
	 * packets in sockfs.
	 */
	mutex_exit(&so->so_lock);
	so_removehooks(so);
	mutex_enter(&so->so_lock);

	/*
	 * Clear any state related to urgent data. Leave any T_EXDATA_IND
	 * on the queue - the behavior of urgent data after a switch is
	 * left undefined.
	 */
	so->so_error = so->so_delayed_error = 0;
	freemsg(so->so_oobmsg);
	so->so_oobmsg = NULL;
	so->so_oobsigcnt = so->so_oobcnt = 0;

	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
	    SS_HASCONNIND|SS_SAVEDEOR);
	ASSERT(so_verify_oobstate(so));

	freemsg(so->so_ack_mp);
	so->so_ack_mp = NULL;

	/*
	 * Flush the T_DISCON_IND on so_discon_ind_mp.
	 */
	so_flush_discon_ind(so);

	/*
	 * Move any queued T_CONN_IND messages to stream head queue.
	 */
	rq = RD(strvp2wq(vp));
	while ((mp = so->so_conn_ind_head) != NULL) {
		so->so_conn_ind_head = mp->b_next;
		mp->b_next = NULL;
		if (so->so_conn_ind_head == NULL) {
			ASSERT(so->so_conn_ind_tail == mp);
			so->so_conn_ind_tail = NULL;
		}
		dprintso(so, 0,
		    ("so_sock2stream(%p): moving T_CONN_IND\n",
		    so));

		/* Drop lock across put() */
		mutex_exit(&so->so_lock);
		put(rq, mp);
		mutex_enter(&so->so_lock);
	}

exit:
	ASSERT(MUTEX_HELD(&so->so_lock));
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
/*
 * Convert a stream back to a socket. This is invoked when the illusory
 * sockmod is pushed on a stream (where the stream was "created" by
 * popping the illusory sockmod).
 * This routine can not recreate the socket state (certain aspects of
 * it like urgent data state and the bound/connected addresses for AF_UNIX
 * sockets can not be recreated by asking the transport for information).
 * Thus this routine implicitly assumes that the socket is in an initial
 * state (as if it was just created). It flushes any messages queued on the
 * read queue to avoid dealing with e.g. TPI acks or T_exdata_ind messages.
 *
 * The caller must hold so_plumb_lock; so_lock is taken and dropped
 * internally.
 */
void
so_stream2sock(struct sonode *so)
{
struct vnode *vp = SOTOV(so);
ASSERT(MUTEX_HELD(&so->so_plumb_lock));
/* Single-thread the socket while the version is switched back. */
mutex_enter(&so->so_lock);
so_lock_single(so);
ASSERT(so->so_version == SOV_STREAM);
so->so_version = SOV_SOCKSTREAM;
so->so_pushcnt = 0;
mutex_exit(&so->so_lock);
/*
 * Set a permanent error to force any thread in sorecvmsg to
 * return (and drop SOREADLOCKED). Clear the error once
 * we have SOREADLOCKED.
 * This makes a read sleeping during the I_PUSH of sockmod return
 * EIO.
 */
strsetrerror(SOTOV(so), EIO, 1, NULL);
/*
 * Get the read lock before flushing data to avoid
 * problems with the T_EXDATA_IND MSG_PEEK code in sorecvmsg.
 */
mutex_enter(&so->so_lock);
(void) so_lock_read(so, 0); /* Set SOREADLOCKED */
mutex_exit(&so->so_lock);
/* The read lock is held now, so the forced read error can be cleared. */
strsetrerror(SOTOV(so), 0, 0, NULL);
so_installhooks(so);
/*
 * Flush everything on the read queue.
 * This ensures that no T_CONN_IND remain and that no T_EXDATA_IND
 * remain; those types of messages would confuse sockfs.
 */
strflushrq(vp, FLUSHALL);
mutex_enter(&so->so_lock);
/*
 * Flush the T_DISCON_IND on so_discon_ind_mp.
 */
so_flush_discon_ind(so);
so_unlock_read(so); /* Clear SOREADLOCKED */
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
}
/*
 * Register the sockfs hooks with the stream head of this socket.
 *
 * The read side gets strsock_proto() and strsock_misc() as put hooks
 * together with the SH_SIGALLDATA, SH_IGN_ZEROLEN and SH_CONSOL_DATA
 * flags. The write side gets the SH_SIGPIPE and SH_RECHECK_ERR flags
 * and a zero third argument.
 */
void
so_installhooks(struct sonode *so)
{
	struct vnode *sovp = SOTOV(so);

	/* Read side: flags plus the two sockfs put hooks. */
	strsetrputhooks(sovp,
	    SH_SIGALLDATA | SH_IGN_ZEROLEN | SH_CONSOL_DATA,
	    strsock_proto, strsock_misc);

	/* Write side: flags only. */
	strsetwputhooks(sovp, SH_SIGPIPE | SH_RECHECK_ERR, 0);
}
/*
 * Unregister the sockfs hooks from the stream head of this socket.
 *
 * The read side is reset to no flags and no put hooks; the write side is
 * reset to no flags with STRTIMOUT as the third argument.
 *
 * Read behavior is deliberately left as it would be for an ordinary
 * stream, i.e. a read of an M_PROTO will fail.
 */
static void
so_removehooks(struct sonode *so)
{
	struct vnode *sovp = SOTOV(so);

	strsetrputhooks(sovp, 0, NULL, NULL);
	strsetwputhooks(sovp, 0, STRTIMOUT);
}
/*
 * Initialize the streams side of a socket including
 * T_info_req/ack processing. If tso is not NULL its values are used thereby
 * avoiding the T_INFO_REQ.
 *
 * Returns 0 on success, or the error returned by do_tcapability().
 * On an error return the preallocated so_unbind_mp is left attached to
 * the sonode; this function does not free it.
 */
int
so_strinit(struct sonode *so, struct sonode *tso)
{
struct vnode *vp = SOTOV(so);
struct stdata *stp;
mblk_t *mp;
int error;
dprintso(so, 1, ("so_strinit(%p)\n", so));
/* Preallocate an unbind_req message */
mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
mutex_enter(&so->so_lock);
so->so_unbind_mp = mp;
#ifdef DEBUG
so->so_options = so_default_options;
#endif /* DEBUG */
mutex_exit(&so->so_lock);
so_installhooks(so);
/*
 * The T_CAPABILITY_REQ should be the first message sent down because
 * at least TCP has a fast-path for this which avoids timeouts while
 * waiting for the T_CAPABILITY_ACK under high system load.
 */
if (tso == NULL) {
/* No template socket: ask the transport for everything. */
error = do_tcapability(so, TC1_ACCEPTOR_ID | TC1_INFO);
if (error)
return (error);
} else {
/* Inherit the transport parameters from the template socket. */
mutex_enter(&so->so_lock);
so->so_tsdu_size = tso->so_tsdu_size;
so->so_etsdu_size = tso->so_etsdu_size;
so->so_addr_size = tso->so_addr_size;
so->so_opt_size = tso->so_opt_size;
so->so_tidu_size = tso->so_tidu_size;
so->so_serv_type = tso->so_serv_type;
/* The acceptor id is per-socket, so that mode bit is not inherited. */
so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
mutex_exit(&so->so_lock);
/* the following do_tcapability may update so->so_mode */
if ((tso->so_serv_type != T_CLTS) &&
!(tso->so_state & SS_DIRECT)) {
error = do_tcapability(so, TC1_ACCEPTOR_ID);
if (error)
return (error);
}
}
/*
 * If the addr_size is 0 we treat it as already bound
 * and connected. This is used by the routing socket.
 * We set the addr_size to something to allocate the address
 * structures.
 */
if (so->so_addr_size == 0) {
so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
/* Address size can vary with address families. */
if (so->so_family == AF_INET6)
so->so_addr_size =
(t_scalar_t)sizeof (struct sockaddr_in6);
else
so->so_addr_size =
(t_scalar_t)sizeof (struct sockaddr_in);
ASSERT(so->so_unbind_mp);
}
/*
 * Allocate the addresses.
 * A single allocation of twice the rounded-up address size holds both:
 * the local address comes first and the foreign address follows it.
 */
ASSERT(so->so_laddr_sa == NULL && so->so_faddr_sa == NULL);
ASSERT(so->so_laddr_len == 0 && so->so_faddr_len == 0);
so->so_laddr_maxlen = so->so_faddr_maxlen =
P2ROUNDUP(so->so_addr_size, KMEM_ALIGN);
so->so_laddr_sa = kmem_alloc(so->so_laddr_maxlen * 2, KM_SLEEP);
so->so_faddr_sa = (struct sockaddr *)((caddr_t)so->so_laddr_sa
+ so->so_laddr_maxlen);
if (so->so_family == AF_UNIX) {
/*
 * Initialize AF_UNIX related fields.
 */
bzero(&so->so_ux_laddr, sizeof (so->so_ux_laddr));
bzero(&so->so_ux_faddr, sizeof (so->so_ux_faddr));
}
stp = vp->v_stream;
/*
 * Have to keep minpsz at zero in order to allow write/send of zero
 * bytes.
 */
mutex_enter(&stp->sd_lock);
if (stp->sd_qn_minpsz == 1)
stp->sd_qn_minpsz = 0;
mutex_exit(&stp->sd_lock);
return (0);
}
/*
 * Record the values of a T_info_ack in the sonode and derive the socket
 * state and mode flags from them.
 *
 * The size and service-type fields are copied verbatim. CURRENT_state
 * of TS_IDLE marks the socket bound; TS_DATA_XFER marks it bound and
 * connected. In both cases the cached address lengths are zeroed and the
 * matching *_VALID bits are cleared. Any other state leaves so_state alone.
 *
 * The mode flags (SM_ATOMIC, SM_CONNREQUIRED, SM_ADDR, SM_FDPASSING,
 * SM_EXDATA, SM_OPTDATA and SM_BYTESTREAM) are then chosen heuristically
 * from the service type, socket type, address family and the sizes.
 * Mode bits are only ever added here, never removed.
 */
static void
copy_tinfo(struct sonode *so, struct T_info_ack *tia)
{
	/* Straight copies from the ack. */
	so->so_serv_type = tia->SERV_type;
	so->so_tidu_size = tia->TIDU_size;
	so->so_opt_size = tia->OPT_size;
	so->so_addr_size = tia->ADDR_size;
	so->so_etsdu_size = tia->ETSDU_size;
	so->so_tsdu_size = tia->TSDU_size;

	/* Transport state -> socket state. TS_UNBND needs nothing. */
	if (tia->CURRENT_state == TS_IDLE) {
		so->so_laddr_len = 0;
		so->so_state = (so->so_state | SS_ISBOUND) & ~SS_LADDR_VALID;
	} else if (tia->CURRENT_state == TS_DATA_XFER) {
		so->so_laddr_len = 0;
		so->so_faddr_len = 0;
		so->so_state = (so->so_state | SS_ISBOUND | SS_ISCONNECTED) &
		    ~(SS_LADDR_VALID | SS_FADDR_VALID);
	}

	/* Connection-oriented vs. connectionless service. */
	if (so->so_serv_type != T_CLTS) {
		so->so_mode |= SM_CONNREQUIRED;
		/* Expedited data is available unless ETSDU is 0 or -2. */
		if (so->so_etsdu_size != 0 && so->so_etsdu_size != -2)
			so->so_mode |= SM_EXDATA;
	} else {
		so->so_mode |= SM_ATOMIC | SM_ADDR;
	}

	/* Semantics are to discard tail end of messages */
	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW)
		so->so_mode |= SM_ATOMIC;

	if (so->so_family != AF_UNIX) {
		/*
		 * Logic extracted from sockmod - have to pick some max
		 * address length in order to preallocate the addresses.
		 */
		if (so->so_addr_size == -1)
			so->so_addr_size = SOA_DEFSIZE;
	} else {
		so->so_mode |= SM_FDPASSING | SM_OPTDATA;

		/* MAXPATHLEN + soun_family + nul termination */
		if (so->so_addr_size == -1) {
			so->so_addr_size = (t_scalar_t)(MAXPATHLEN +
			    sizeof (short) + 1);
		}

		/*
		 * Make SOCK_STREAM into a byte-stream transport.
		 * SOCK_SEQPACKET sockets are unchanged.
		 */
		if (so->so_type == SOCK_STREAM)
			so->so_tsdu_size = 0;
	}

	/* Must come last: the AF_UNIX case above may have zeroed the TSDU. */
	if (so->so_tsdu_size == 0)
		so->so_mode |= SM_BYTESTREAM;
}
/*
 * Sanity-check the transport information recorded in the sonode against
 * the socket type and address family.
 *
 * Returns EPROTO when the service type does not fit the socket type
 * (SOCK_DGRAM requires T_CLTS; SOCK_STREAM and SOCK_SEQPACKET must not be
 * T_CLTS). Returns EMSGSIZE when an AF_INET or AF_INET6 socket reports an
 * address size other than that of sockaddr_in or sockaddr_in6
 * respectively. Returns 0 otherwise.
 */
static int
check_tinfo(struct sonode *so)
{
	int clts = (so->so_serv_type == T_CLTS);
	int mismatch;

	/* Service type vs. socket type. */
	if (so->so_type == SOCK_DGRAM)
		mismatch = !clts;
	else if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET)
		mismatch = clts;
	else
		mismatch = 0;

	if (mismatch) {
		eprintso(so, ("service type and socket type mismatch\n"));
		eprintsoline(so, EPROTO);
		return (EPROTO);
	}

	/* Address length vs. address family. */
	if (so->so_family == AF_INET) {
		if (so->so_addr_size !=
		    (t_scalar_t)sizeof (struct sockaddr_in)) {
			eprintso(so,
			    ("AF_INET must have sockaddr_in address length. Got %d\n",
			    so->so_addr_size));
			eprintsoline(so, EMSGSIZE);
			return (EMSGSIZE);
		}
	} else if (so->so_family == AF_INET6) {
		if (so->so_addr_size !=
		    (t_scalar_t)sizeof (struct sockaddr_in6)) {
			eprintso(so,
			    ("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
			    so->so_addr_size));
			eprintsoline(so, EMSGSIZE);
			return (EMSGSIZE);
		}
	}

	dprintso(so, 1, (
	    "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
	    so->so_serv_type, so->so_tsdu_size, so->so_etsdu_size,
	    so->so_addr_size, so->so_opt_size,
	    so->so_tidu_size));
	dprintso(so, 1, ("tinfo: so_state %s\n",
	    pr_state(so->so_state, so->so_mode)));
	return (0);
}
/*
* Send down T_info_req and wait for the ack.
* Record interesting T_info_ack values in the sonode.
*/
static int
do_tinfo(struct sonode *so)
{
struct T_info_req tir;
mblk_t *mp;
int error;
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
if (so_no_tinfo) {
so->so_addr_size = 0;
return (0);
}
dprintso(so, 1, ("do_tinfo(%p)\n", so));
/* Send T_INFO_REQ */
tir.PRIM_type = T_INFO_REQ;
mp = soallocproto1(&tir, sizeof (tir),
sizeof (struct T_info_req) + sizeof (struct T_info_ack),
_ALLOC_INTR);
if (mp == NULL) {
eprintsoline(so, ENOBUFS);
return (ENOBUFS);
}
/* T_INFO_REQ has to be M_PCPROTO */
DB_TYPE(mp) = M_PCPROTO;
error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
if (error) {
eprintsoline(so, error);
return (error);
}
mutex_enter(&so->so_lock);
/* Wait for T_INFO_ACK */
if ((error = sowaitprim(so, T_INFO_REQ, T_INFO_ACK,
(t_uscalar_t)sizeof (struct T_info_ack), &mp, 0))) {
mutex_exit(&so->so_lock);
eprintsoline(so, error);
return (error);
}
ASSERT(mp);
copy_tinfo(so, (struct T_info_ack *)mp->b_rptr);
mutex_exit(&so->so_lock);
freemsg(mp);
return (check_tinfo(so));
}
/*
 * Send down T_capability_req and wait for the ack.
 * Record interesting T_capability_ack values in the sonode.
 *
 * cap_bits1 selects what is requested and may only contain
 * TC1_ACCEPTOR_ID and/or TC1_INFO. Must be called without so_lock held.
 *
 * If the provider is already known not to support T_CAPABILITY_REQ
 * (tpi_capability == PI_NO) this simply calls do_tinfo().
 *
 * Returns 0 on success; ENOBUFS if the request could not be allocated;
 * the error from kstrputmsg(); or, when waiting for the ack fails and
 * TC1_INFO was requested, the result of falling back to do_tinfo()
 * (with EPROTO mapped to ENOSR if the wait had timed out). If the wait
 * fails and TC1_INFO was not requested, 0 is returned.
 */
static int
do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
{
struct T_capability_req tcr;
struct T_capability_ack *tca;
mblk_t *mp;
int error;
ASSERT(cap_bits1 != 0);
ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
if (so->so_provinfo->tpi_capability == PI_NO)
return (do_tinfo(so));
if (so_no_tinfo) {
so->so_addr_size = 0;
/* Drop the info request; nothing to send if that was all. */
if ((cap_bits1 &= ~TC1_INFO) == 0)
return (0);
}
dprintso(so, 1, ("do_tcapability(%p)\n", so));
/* Send T_CAPABILITY_REQ */
tcr.PRIM_type = T_CAPABILITY_REQ;
tcr.CAP_bits1 = cap_bits1;
mp = soallocproto1(&tcr, sizeof (tcr),
sizeof (struct T_capability_req) + sizeof (struct T_capability_ack),
_ALLOC_INTR);
if (mp == NULL) {
eprintsoline(so, ENOBUFS);
return (ENOBUFS);
}
/* T_CAPABILITY_REQ should be M_PCPROTO here */
DB_TYPE(mp) = M_PCPROTO;
error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
if (error) {
eprintsoline(so, error);
return (error);
}
mutex_enter(&so->so_lock);
/* Wait for T_CAPABILITY_ACK */
if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
(t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
mutex_exit(&so->so_lock);
/*
 * No usable ack: if support was still undetermined, record the
 * provider as not supporting T_CAPABILITY_REQ.
 */
PI_PROVLOCK(so->so_provinfo);
if (so->so_provinfo->tpi_capability == PI_DONTKNOW)
so->so_provinfo->tpi_capability = PI_NO;
PI_PROVUNLOCK(so->so_provinfo);
ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
if (cap_bits1 & TC1_INFO) {
/*
 * If the T_CAPABILITY_REQ timed out and then a
 * T_INFO_REQ gets a protocol error, most likely
 * the capability was slow (vs. unsupported). Return
 * ENOSR for this case as a best guess.
 */
if (error == ETIME) {
return ((error = do_tinfo(so)) == EPROTO ?
ENOSR : error);
}
return (do_tinfo(so));
}
return (0);
}
/* Got an ack: remember that the provider supports the primitive. */
if (so->so_provinfo->tpi_capability == PI_DONTKNOW) {
PI_PROVLOCK(so->so_provinfo);
so->so_provinfo->tpi_capability = PI_YES;
PI_PROVUNLOCK(so->so_provinfo);
}
ASSERT(mp);
tca = (struct T_capability_ack *)mp->b_rptr;
ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
/* From here on act on what the provider actually returned. */
cap_bits1 = tca->CAP_bits1;
if (cap_bits1 & TC1_ACCEPTOR_ID) {
so->so_acceptor_id = tca->ACCEPTOR_id;
so->so_mode |= SM_ACCEPTOR_ID;
}
if (cap_bits1 & TC1_INFO)
copy_tinfo(so, &tca->INFO_ack);
mutex_exit(&so->so_lock);
freemsg(mp);
if (cap_bits1 & TC1_INFO)
return (check_tinfo(so));
return (0);
}
/*
 * Fetch the pending socket error and reset it to zero.
 * The caller must hold so_lock. Returns the value so_error had on entry.
 */
int
sogeterr(struct sonode *so)
{
	int pending;

	ASSERT(MUTEX_HELD(&so->so_lock));

	pending = so->so_error;
	so->so_error = 0;

	return (pending);
}
/*
 * This routine is registered with the stream head to retrieve read
 * side errors.
 *
 * Returns the current so_error. For a peeking operation (ispeek non-zero)
 * the error is left in place and *clearerr is set to 0. Otherwise
 * so_error is reset and *clearerr is set to 1.
 */
int
sogetrderr(vnode_t *vp, int ispeek, int *clearerr)
{
	struct sonode *so = VTOSO(vp);
	int error;

	mutex_enter(&so->so_lock);
	error = so->so_error;
	if (!ispeek) {
		/* Consuming read: the error is reported only once. */
		so->so_error = 0;
		*clearerr = 1;
	} else {
		*clearerr = 0;
	}
	mutex_exit(&so->so_lock);

	return (error);
}
/*
 * This routine is registered with the stream head to retrieve write
 * side errors.
 *
 * If the socket can no longer send (SS_CANTSENDMORE) the result is EPIPE;
 * so_error is not touched and *clearerr is set to 0. Otherwise the
 * current so_error is returned. For a peeking operation (ispeek non-zero)
 * it is left in place with *clearerr set to 0; if not peeking it is reset
 * and *clearerr is set to 1.
 */
int
sogetwrerr(vnode_t *vp, int ispeek, int *clearerr)
{
	struct sonode *so = VTOSO(vp);
	int error;

	mutex_enter(&so->so_lock);

	/* Only the non-peeking, still-writable case clears the error. */
	*clearerr = 0;
	if (so->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
	} else {
		error = so->so_error;
		if (!ispeek) {
			so->so_error = 0;
			*clearerr = 1;
		}
	}

	mutex_exit(&so->so_lock);
	return (error);
}
/*
 * Record a nonpersistent read and write error on the socket.
 * Used when there is a T_uderror_ind for a connected socket.
 *
 * The caller must hold so_lock and pass a non-zero error. After dropping
 * the lock the caller also needs to call strsetrerror and strsetwerror.
 * The value is stored truncated to ushort_t.
 */
void
soseterror(struct sonode *so, int error)
{
	ASSERT(error != 0);
	ASSERT(MUTEX_HELD(&so->so_lock));

	so->so_error = (ushort_t)error;
}
/*
 * Move the socket into the "connecting" state: SS_ISCONNECTED and
 * SS_ISDISCONNECTING are cleared, SS_ISCONNECTING is set, and all
 * waiters on so_state_cv are woken. The caller must hold so_lock.
 */
void
soisconnecting(struct sonode *so)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	so->so_state = (so->so_state &
	    ~(SS_ISCONNECTED|SS_ISDISCONNECTING)) | SS_ISCONNECTING;
	cv_broadcast(&so->so_state_cv);
}
/*
 * Move the socket into the "connected" state: SS_ISCONNECTING and
 * SS_ISDISCONNECTING are cleared, SS_ISCONNECTED is set, and all
 * waiters on so_state_cv are woken. The caller must hold so_lock.
 */
void
soisconnected(struct sonode *so)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	so->so_state = (so->so_state &
	    ~(SS_ISCONNECTING|SS_ISDISCONNECTING)) | SS_ISCONNECTED;
	cv_broadcast(&so->so_state_cv);
}
/*
 * Mark the socket as fully disconnected.
 *
 * All connection-progress bits and both address-valid bits are cleared,
 * the socket is flagged as unable to send or receive, so_error is set to
 * the supplied error (truncated to ushort_t), any held peer credential is
 * released, and waiters on so_state_cv are woken.
 *
 * The caller must hold so_lock, and also needs to call strsetrerror,
 * strsetwerror and strseteof.
 */
void
soisdisconnected(struct sonode *so, int error)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	so->so_state = (so->so_state &
	    ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING|
	    SS_LADDR_VALID|SS_FADDR_VALID)) |
	    (SS_CANTRCVMORE|SS_CANTSENDMORE);
	so->so_error = (ushort_t)error;

	/* Drop our hold on the peer's credential, if we have one. */
	if (so->so_peercred != NULL) {
		crfree(so->so_peercred);
		so->so_peercred = NULL;
	}

	cv_broadcast(&so->so_state_cv);
}
/*
 * For connected AF_UNIX SOCK_DGRAM sockets when the peer closes.
 *
 * Clears the connection-progress bits, stores the error (truncated to
 * ushort_t) in so_error and wakes waiters on so_state_cv. Unlike
 * soisdisconnected() it sets neither SS_CANTSENDMORE nor SS_CANTRCVMORE,
 * so the write side is not affected.
 *
 * The caller must hold so_lock and also has to call strsetrerror.
 */
static void
sobreakconn(struct sonode *so, int error)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	so->so_error = (ushort_t)error;
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

	cv_broadcast(&so->so_state_cv);
}
/*
 * Can no longer send.
 * The caller must hold so_lock and must also call strsetwerror.
 *
 * We mark the peer address as no longer valid for getpeername, but
 * leave it around for so_unix_close to notify the peer (that
 * transport has no addressing held at that layer).
 */
void
socantsendmore(struct sonode *so)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	so->so_state &= ~SS_FADDR_VALID;
	so->so_state |= SS_CANTSENDMORE;

	cv_broadcast(&so->so_state_cv);
}
/*
 * Can no longer receive: set SS_CANTRCVMORE and wake waiters on
 * so_state_cv.
 *
 * The caller must hold so_lock and must call strseteof(,1) in addition
 * to this routine to change the socket state.
 */
void
socantrcvmore(struct sonode *so)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	so->so_state = so->so_state | SS_CANTRCVMORE;
	cv_broadcast(&so->so_state_cv);
}
/*
 * The caller has sent down a "request_prim" primitive and wants to wait for
 * an ack ("ack_prim") or an T_ERROR_ACK for it.
 * The specified "ack_prim" can be a T_OK_ACK.
 *
 * Assumes that all the TPI acks are M_PCPROTO messages.
 *
 * Note that the socket is single-threaded (using so_lock_single)
 * for all operations that generate TPI ack messages. Since
 * only TPI ack messages are M_PCPROTO we should never receive
 * anything except either the ack we are expecting or a T_ERROR_ACK
 * for the same primitive.
 *
 * The caller must hold so_lock. "wait" is handed to sowaitack(); zero
 * means wait without a time limit.
 *
 * On success 0 is returned and *mpp is set to the ack message, which is
 * at least min_size bytes long and now belongs to the caller. On any
 * failure the received message (if any) is freed and *mpp is left
 * untouched. Failures are:
 *	- whatever sowaitack() returned (e.g. ETIME);
 *	- the error carried by a T_ERROR_ACK for request_prim (UNIX_error
 *	  for TSYSERR, otherwise the TLI error mapped by tlitosyserr());
 *	- EPROTO for a message that is not M_PCPROTO, is too short, or is
 *	  not the expected primitive.
 */
int
sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
    t_uscalar_t min_size, mblk_t **mpp, clock_t wait)
{
	mblk_t *mp;
	union T_primitives *tpr;
	int error;

	dprintso(so, 1, ("sowaitprim(%p, %d, %d, %d, %p, %lu)\n",
	    so, request_prim, ack_prim, min_size, mpp, wait));

	ASSERT(MUTEX_HELD(&so->so_lock));

	error = sowaitack(so, &mp, wait);
	if (error)
		return (error);

	dprintso(so, 1, ("got msg %p\n", mp));
	if (DB_TYPE(mp) != M_PCPROTO ||
	    MBLKL(mp) < sizeof (tpr->type)) {
		freemsg(mp);
		eprintsoline(so, EPROTO);
		return (EPROTO);
	}
	tpr = (union T_primitives *)mp->b_rptr;

	/*
	 * Only the primitive type has been length-checked so far. The
	 * code below reads further fields of T_OK_ACK (CORRECT_prim) and
	 * T_ERROR_ACK (ERROR_prim, TLI_error, UNIX_error), so reject a
	 * truncated instance of either before touching them.
	 */
	if ((tpr->type == T_OK_ACK &&
	    MBLKL(mp) < sizeof (tpr->ok_ack)) ||
	    (tpr->type == T_ERROR_ACK &&
	    MBLKL(mp) < sizeof (tpr->error_ack))) {
		freemsg(mp);
		eprintsoline(so, EPROTO);
		return (EPROTO);
	}

	/*
	 * Did we get the primitive that we were asking for?
	 * For T_OK_ACK we also check that it matches the request primitive.
	 */
	if (tpr->type == ack_prim &&
	    (ack_prim != T_OK_ACK ||
	    tpr->ok_ack.CORRECT_prim == request_prim)) {
		if (MBLKL(mp) >= (ssize_t)min_size) {
			/* Found what we are looking for */
			*mpp = mp;
			return (0);
		}
		/* Too short */
		freemsg(mp);
		eprintsoline(so, EPROTO);
		return (EPROTO);
	}

	if (tpr->type == T_ERROR_ACK &&
	    tpr->error_ack.ERROR_prim == request_prim) {
		/* Error to the primitive we were looking for */
		if (tpr->error_ack.TLI_error == TSYSERR) {
			error = tpr->error_ack.UNIX_error;
		} else {
			error = tlitosyserr(tpr->error_ack.TLI_error);
		}
		dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
		    tpr->error_ack.ERROR_prim,
		    tpr->error_ack.TLI_error,
		    tpr->error_ack.UNIX_error,
		    error));
		freemsg(mp);
		return (error);
	}

	/*
	 * Wrong primitive or T_ERROR_ACK for the wrong primitive
	 */
#ifdef DEBUG
	if (tpr->type == T_ERROR_ACK) {
		dprintso(so, 0, ("error_ack for %d: %d/%d\n",
		    tpr->error_ack.ERROR_prim,
		    tpr->error_ack.TLI_error,
		    tpr->error_ack.UNIX_error));
	} else if (tpr->type == T_OK_ACK) {
		dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
		    tpr->ok_ack.CORRECT_prim,
		    ack_prim, request_prim));
	} else {
		dprintso(so, 0,
		    ("unexpected primitive %d, expected %d for %d\n",
		    tpr->type, ack_prim, request_prim));
	}
#endif /* DEBUG */

	freemsg(mp);
	eprintsoline(so, EPROTO);
	return (EPROTO);
}
/*
 * Wait (without a time limit) for the T_OK_ACK that acknowledges
 * request_prim and discard it.
 *
 * Returns 0 once a T_OK_ACK of at least sizeof (struct T_ok_ack) bytes
 * has been received, otherwise the error from sowaitprim().
 */
int
sowaitokack(struct sonode *so, t_scalar_t request_prim)
{
	mblk_t *ackmp;
	int error;

	error = sowaitprim(so, request_prim, T_OK_ACK,
	    (t_uscalar_t)sizeof (struct T_ok_ack), &ackmp, 0);
	if (error == 0) {
		/* Only the arrival of the ack matters, not its contents. */
		freemsg(ackmp);
		return (0);
	}
	return (error);
}
/*
 * Hand a received TPI ack message to the thread waiting in sowaitack()
 * by placing it on so_ack_mp.
 *
 * A message that is not M_PCPROTO is logged and dropped. If an earlier
 * ack is still sitting on so_ack_mp it is freed and replaced by the new
 * one. All waiters on so_ack_cv are woken. so_lock is taken internally.
 */
void
soqueueack(struct sonode *so, mblk_t *mp)
{
	mblk_t *stale;

	if (DB_TYPE(mp) != M_PCPROTO) {
		zcmn_err(getzoneid(), CE_WARN,
		    "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
		    *(t_scalar_t *)mp->b_rptr);
		freemsg(mp);
		return;
	}

	mutex_enter(&so->so_lock);

	stale = so->so_ack_mp;
	if (stale != NULL) {
		dprintso(so, 1, ("so_ack_mp already set\n"));
		freemsg(stale);
	}
	so->so_ack_mp = mp;

	cv_broadcast(&so->so_ack_cv);
	mutex_exit(&so->so_lock);
}
/*
 * Wait for a TPI ack ignoring signals and errors.
 *
 * The caller must hold so_lock. Blocks on so_ack_cv until so_ack_mp is
 * non-NULL, then takes the message off so_ack_mp and returns it through
 * *mpp with a return value of 0.
 *
 * A non-zero "wait" bounds each individual sleep; if such a sleep times
 * out ETIME is returned and *mpp is not written. A zero "wait" sleeps
 * without a limit (except under SOCK_TEST, where sock_test_timelimit is
 * substituted when it is non-zero).
 */
int
sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	while (so->so_ack_mp == NULL) {
		clock_t deadline;

#ifdef SOCK_TEST
		if (wait == 0 && sock_test_timelimit != 0)
			wait = sock_test_timelimit;
#endif
		if (wait == 0) {
			/* No limit requested. */
			cv_wait(&so->so_ack_cv, &so->so_lock);
			continue;
		}

		/*
		 * Only wait for the time limit.
		 */
		time_to_wait(&deadline, wait);
		if (cv_timedwait(&so->so_ack_cv, &so->so_lock,
		    deadline) == -1) {
			eprintsoline(so, ETIME);
			return (ETIME);
		}
	}

	*mpp = so->so_ack_mp;
#ifdef DEBUG
	{
		/* Sanity-check that this really is one of the TPI acks. */
		mblk_t *mp = *mpp;
		union T_primitives *tpr;

		tpr = (union T_primitives *)mp->b_rptr;
		ASSERT(DB_TYPE(mp) == M_PCPROTO);
		ASSERT(tpr->type == T_OK_ACK ||
		    tpr->type == T_ERROR_ACK ||
		    tpr->type == T_BIND_ACK ||
		    tpr->type == T_CAPABILITY_ACK ||
		    tpr->type == T_INFO_ACK ||
		    tpr->type == T_OPTMGMT_ACK);
	}
#endif /* DEBUG */
	so->so_ack_mp = NULL;
	return (0);
}
/*
 * Append a received T_CONN_IND message to the so_conn_ind_head/tail list.
 *
 * A message that is not M_PROTO is logged and dropped. Otherwise the
 * message is linked at the tail, SS_HASCONNIND is set when the list was
 * empty, and one waiter on so_connind_cv is woken. so_lock is taken
 * internally.
 */
void
soqueueconnind(struct sonode *so, mblk_t *mp)
{
	if (DB_TYPE(mp) != M_PROTO) {
		zcmn_err(getzoneid(), CE_WARN,
		    "sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
		freemsg(mp);
		return;
	}

	mutex_enter(&so->so_lock);
	ASSERT(mp->b_next == NULL);

	if (so->so_conn_ind_head != NULL) {
		/* Non-empty list: chain behind the current tail. */
		ASSERT(so->so_state & SS_HASCONNIND);
		ASSERT(so->so_conn_ind_tail->b_next == NULL);
		so->so_conn_ind_tail->b_next = mp;
	} else {
		/* First entry: it becomes the head as well. */
		so->so_conn_ind_head = mp;
		so->so_state |= SS_HASCONNIND;
	}
	so->so_conn_ind_tail = mp;

	/* Wakeup a single consumer of the T_CONN_IND */
	cv_signal(&so->so_connind_cv);
	mutex_exit(&so->so_lock);
}
/*
 * Wait for a T_CONN_IND.
 * Don't wait if nonblocking.
 * Accept signals and socket errors.
 *
 * Must be called without so_lock held. On success returns 0 with the
 * dequeued message in *mpp. Otherwise *mpp is not written and the return
 * value is the pending socket error (which is consumed), EWOULDBLOCK if
 * the list is empty and fmode has FNDELAY or FNONBLOCK set, or EINTR if
 * the wait was interrupted by a signal.
 */
int
sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
{
	mblk_t *mp;
	int error = 0;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));

	mutex_enter(&so->so_lock);
	for (;;) {
		/* A pending socket error takes precedence. */
		if (so->so_error) {
			error = sogeterr(so);
			if (error)
				break;
		}

		mp = so->so_conn_ind_head;
		if (mp != NULL) {
			/* Take the first entry off the list. */
			so->so_conn_ind_head = mp->b_next;
			mp->b_next = NULL;
			if (so->so_conn_ind_head == NULL) {
				ASSERT(so->so_conn_ind_tail == mp);
				so->so_conn_ind_tail = NULL;
				so->so_state &= ~SS_HASCONNIND;
			}
			*mpp = mp;
			break;
		}

		/* Nothing queued. */
		if (fmode & (FNDELAY|FNONBLOCK)) {
			error = EWOULDBLOCK;
			break;
		}
		if (!cv_wait_sig_swap(&so->so_connind_cv, &so->so_lock)) {
			error = EINTR;
			break;
		}
		/* Woken up: re-check the error and the list. */
	}
	mutex_exit(&so->so_lock);

	return (error);
}
/*
 * Remove and free the queued T_CONN_IND whose sequence number is seqno.
 * Return zero if found; non-zero (-1) otherwise.
 * This is called very infrequently thus it is ok to do a linear search.
 *
 * When a match is found so_error is set to ECONNABORTED. so_lock is
 * taken internally and is dropped before the message is freed.
 */
int
soflushconnind(struct sonode *so, t_scalar_t seqno)
{
	mblk_t *prevmp = NULL;
	mblk_t *mp;
	struct T_conn_ind *tci;

	mutex_enter(&so->so_lock);

	/* Locate the entry, remembering its predecessor. */
	for (mp = so->so_conn_ind_head; mp != NULL; mp = mp->b_next) {
		tci = (struct T_conn_ind *)mp->b_rptr;
		if (tci->SEQ_number == seqno)
			break;
		prevmp = mp;
	}

	if (mp == NULL) {
		mutex_exit(&so->so_lock);
		dprintso(so, 1,
		    ("t_discon_ind: NOT found T_CONN_IND %d\n", seqno));
		return (-1);
	}

	dprintso(so, 1,
	    ("t_discon_ind: found T_CONN_IND %d\n", seqno));

	/* Unlink it, fixing up tail and head as needed. */
	if (so->so_conn_ind_tail == mp)
		so->so_conn_ind_tail = prevmp;
	if (prevmp != NULL)
		prevmp->b_next = mp->b_next;
	else
		so->so_conn_ind_head = mp->b_next;
	mp->b_next = NULL;

	if (so->so_conn_ind_head != NULL) {
		ASSERT(so->so_conn_ind_tail != NULL);
	} else {
		ASSERT(so->so_conn_ind_tail == NULL);
		so->so_state &= ~SS_HASCONNIND;
	}

	so->so_error = ECONNABORTED;
	mutex_exit(&so->so_lock);

	/*
	 * T_SSL_PROXY_CONN_IND may carry a handle for
	 * an SSL context, and needs to be released.
	 */
	if ((tci->PRIM_type == T_SSL_PROXY_CONN_IND) &&
	    (mp->b_cont != NULL)) {
		kssl_ctx_t kssl_ctx;

		ASSERT(MBLKL(mp->b_cont) ==
		    sizeof (kssl_ctx_t));
		kssl_ctx = *((kssl_ctx_t *)mp->b_cont->b_rptr);
		kssl_release_ctx(kssl_ctx);
	}
	freemsg(mp);
	return (0);
}
/*
 * Wait until the socket is connected or there is an error.
 * fmode should contain any nonblocking flags. nosig should be
 * set if the caller does not want the wait to be interrupted by a signal.
 *
 * The caller must hold so_lock. Returns 0 once SS_ISCONNECTED is set.
 * Returns EINPROGRESS if the connect is still in progress and fmode is
 * nonblocking, EINTR if a signal interrupted the wait, the pending socket
 * error (consumed) if there is one, or ECONNREFUSED if the socket ended
 * up neither connected nor with an error recorded.
 */
int
sowaitconnected(struct sonode *so, int fmode, int nosig)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));

	for (;;) {
		/* Done waiting once we are no longer purely "connecting". */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
		    SS_ISCONNECTING)
			break;
		if (so->so_error != 0)
			break;

		dprintso(so, 1, ("waiting for SS_ISCONNECTED on %p\n", so));
		if (fmode & (FNDELAY|FNONBLOCK))
			return (EINPROGRESS);

		if (nosig) {
			cv_wait(&so->so_state_cv, &so->so_lock);
		} else if (!cv_wait_sig_swap(&so->so_state_cv,
		    &so->so_lock)) {
			/*
			 * Return EINTR and let the application use
			 * nonblocking techniques for detecting when
			 * the connection has been established.
			 */
			return (EINTR);
		}
		dprintso(so, 1, ("awoken on %p\n", so));
	}

	if (so->so_error != 0) {
		error = sogeterr(so);
		ASSERT(error != 0);
		dprintso(so, 1, ("sowaitconnected: error %d\n", error));
		return (error);
	}

	if (so->so_state & SS_ISCONNECTED)
		return (0);

	/*
	 * Could have received a T_ORDREL_IND or a T_DISCON_IND with
	 * zero errno. Or another thread could have consumed so_error
	 * e.g. by calling read.
	 */
	error = ECONNREFUSED;
	dprintso(so, 1, ("sowaitconnected: error %d\n", error));
	return (error);
}
/*
 * Handle the signal generation aspect of urgent data.
 *
 * The caller must hold so_lock. so_oobsigcnt counts urgent "events" that
 * have been signalled and so_oobcnt is the counter it is compared with.
 *
 * If this event has not been signalled yet (the two counters are equal)
 * so_oobsigcnt is bumped, SS_OOBPEND is set, any previously stored urgent
 * data is discarded, and S_RDBAND / POLLRDBAND are added to *signals and
 * *pollwakeups.
 *
 * If the event was already signalled, S_RDBAND / POLLRDBAND are added
 * again only when extrasig is non-zero (TCP can receive updated urgent
 * pointers); nothing else changes.
 */
static void
so_oob_sig(struct sonode *so, int extrasig,
    strsigset_t *signals, strpollset_t *pollwakeups)
{
	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(so_verify_oobstate(so));
	ASSERT(so->so_oobsigcnt >= so->so_oobcnt);

	if (so->so_oobsigcnt <= so->so_oobcnt) {
		/* First notification for this urgent event. */
		so->so_oobsigcnt++;
		ASSERT(so->so_oobsigcnt > 0);	/* Wraparound */
		ASSERT(so->so_oobsigcnt > so->so_oobcnt);

		/*
		 * Record (for select/poll) that urgent data is pending.
		 */
		so->so_state |= SS_OOBPEND;

		/*
		 * New urgent data on the way so forget about any old
		 * urgent data.
		 */
		so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
		if (so->so_oobmsg != NULL) {
			dprintso(so, 1, ("sock: discarding old oob\n"));
			freemsg(so->so_oobmsg);
			so->so_oobmsg = NULL;
		}

		*signals |= S_RDBAND;
		*pollwakeups |= POLLRDBAND;
		ASSERT(so_verify_oobstate(so));
		return;
	}

	/*
	 * Already signalled once for this event; signal again only if the
	 * caller asked for it.
	 */
	ASSERT(so->so_state & SS_OOBPEND);
	if (extrasig) {
		*signals |= S_RDBAND;
		*pollwakeups |= POLLRDBAND;
	}
}
/*
 * Handle the processing of the T_EXDATA_IND with urgent data.
 *
 * The caller must hold so_lock, and a signal must already have been
 * accounted for (so_oobsigcnt > so_oobcnt). Bumps so_oobcnt and flags the
 * message with MSGMARK so that SIOCATMARK can find it.
 *
 * Returns the T_EXDATA_IND if it should be queued on the read queue;
 * this implementation always returns mp. The signals and pollwakeups
 * arguments are not used.
 */
/* ARGSUSED2 */
static mblk_t *
so_oob_exdata(struct sonode *so, mblk_t *mp,
    strsigset_t *signals, strpollset_t *pollwakeups)
{
	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(so_verify_oobstate(so));
	ASSERT(so->so_oobsigcnt > so->so_oobcnt);

	so->so_oobcnt += 1;
	ASSERT(so->so_oobcnt > 0);	/* wraparound? */
	ASSERT(so->so_oobsigcnt >= so->so_oobcnt);

	/*
	 * Set MSGMARK for SIOCATMARK.
	 */
	mp->b_flag = mp->b_flag | MSGMARK;

	ASSERT(so_verify_oobstate(so));
	return (mp);
}
/*
 * Handle the processing of the actual urgent data.
 *
 * The caller must hold so_lock and no urgent data may currently be stored
 * in so_oobmsg.
 *
 * With SO_OOBINLINE the data stays in the message: normal-input poll
 * events and signals are requested and mp is returned so it gets queued
 * on the read queue. Without SO_OOBINLINE the message is stashed in
 * so_oobmsg, SS_HAVEOOBDATA is set, POLLRDBAND is requested, and NULL is
 * returned so that nothing is queued.
 */
static mblk_t *
so_oob_data(struct sonode *so, mblk_t *mp,
    strsigset_t *signals, strpollset_t *pollwakeups)
{
	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(so_verify_oobstate(so));
	ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
	ASSERT(mp != NULL);

	/*
	 * For OOBINLINE we keep the data in the T_EXDATA_IND.
	 * Otherwise we store it in so_oobmsg.
	 */
	ASSERT(so->so_oobmsg == NULL);

	if (!(so->so_options & SO_OOBINLINE)) {
		*pollwakeups |= POLLRDBAND;
		so->so_state |= SS_HAVEOOBDATA;
		so->so_oobmsg = mp;
		ASSERT(so_verify_oobstate(so));
		return (NULL);
	}

	*pollwakeups |= POLLIN | POLLRDNORM | POLLRDBAND;
	*signals |= S_INPUT | S_RDNORM;
	ASSERT(so_verify_oobstate(so));
	return (mp);
}
/*
 * Park a T_DISCON_IND received from below on so_discon_ind_mp so that it
 * can be processed later.  The caller must hold so_lock.
 *
 * The callback is stored in the message's b_prev field; when the saved
 * message is eventually processed the framework calls:
 *	(*func)(so, mp);
 *
 * Only one T_DISCON_IND is kept.  If one is already saved, or one is being
 * processed (SOASYNC_UNBIND is set), the new message is logged and freed.
 */
static void
so_save_discon_ind(struct sonode *so,
    mblk_t *mp,
    void (*func)(struct sonode *so, mblk_t *))
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_discon_ind_mp == NULL &&
	    !(so->so_flag & SOASYNC_UNBIND)) {
		/* Nothing pending: remember the message and its handler. */
		mp->b_prev = (mblk_t *)func;
		mp->b_next = NULL;
		so->so_discon_ind_mp = mp;
		return;
	}

	/*
	 * An earlier T_DISCON_IND is either saved on so_discon_ind_mp
	 * or currently being processed; drop this additional one.
	 */
	zcmn_err(getzoneid(), CE_WARN,
	    "sockfs: received unexpected additional T_DISCON_IND\n");
	freemsg(mp);
}
/*
* Caller must hold the mutex and make sure that either SOLOCKED
* or SOASYNC_UNBIND is set. Called from so_unlock_single().
* Perform delayed processing of T_DISCON_IND message on so_discon_ind_mp.
* Need to ensure that strsock_proto() will not end up sleeping for
* SOASYNC_UNBIND, while executing this function.
*/
void
so_drain_discon_ind(struct sonode *so)
{
mblk_t *bp;
void (*func)(struct sonode *so, mblk_t *);
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
/* Process T_DISCON_IND on so_discon_ind_mp */
if ((bp = so->so_discon_ind_mp) != NULL) {
so->so_discon_ind_mp = NULL;
func = (void (*)())bp->b_prev;
bp->b_prev = NULL;
/*
* This (*func) is supposed to generate a message downstream
* and we need to have a flag set until the corresponding
* upstream message reaches stream head.
* When processing T_DISCON_IND in strsock_discon_ind
* we hold SOASYN_UNBIND when sending T_UNBIND_REQ down and
* drop the flag after we get the ACK in strsock_proto.
*/
(void) (*func)(so, bp);
}
}
/*
 * Throw away the T_DISCON_IND saved on so_discon_ind_mp, if there is one.
 * The caller must hold so_lock.
 */
void
so_flush_discon_ind(struct sonode *so)
{
	mblk_t *saved_mp;

	ASSERT(MUTEX_HELD(&so->so_lock));

	saved_mp = so->so_discon_ind_mp;
	if (saved_mp == NULL)
		return;

	/*
	 * Unhook the message from the sonode.  b_prev holds the callback
	 * stored by so_save_discon_ind(), so clear it before freeing.
	 */
	so->so_discon_ind_mp = NULL;
	saved_mp->b_prev = NULL;
	freemsg(saved_mp);
}
/*
 * Caller must hold the mutex.
 *
 * This function is used to process the T_DISCON_IND message. It does
 * immediate processing when called from strsock_proto and delayed
 * processing of discon_ind saved on so_discon_ind_mp when called from
 * so_drain_discon_ind. When a T_DISCON_IND message is saved in
 * so_discon_ind_mp for delayed processing, this function is registered
 * as the callback function to process the message.
 *
 * SOASYNC_UNBIND should be held in this function, during the non-blocking
 * unbind operation, and should be released only after we receive the ACK
 * in strsock_proto, for the T_UNBIND_REQ sent here. Since SOLOCKED is not set,
 * no TPI messages would be sent down at this time. This is to prevent M_FLUSH
 * sent from either this function or tcp_unbind(), flushing away any TPI
 * message that is being sent down and stays in a lower module's queue.
 *
 * This function drops so_lock and grabs it again; so_lock is held again
 * when it returns. discon_mp is consumed (freed) by this function.
 */
static void
strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
{
struct vnode *vp;
struct stdata *stp;
union T_primitives *tpr;
struct T_unbind_req *ubr;
mblk_t *mp;
int error;
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(discon_mp);
ASSERT(discon_mp->b_rptr);
/*
 * tpr aliases the contents of discon_mp. It is read again further down
 * (DISCON_reason), so discon_mp must not be freed before that point.
 */
tpr = (union T_primitives *)discon_mp->b_rptr;
ASSERT(tpr->type == T_DISCON_IND);
vp = SOTOV(so);
stp = vp->v_stream;
ASSERT(stp);
/*
 * Not a listener
 */
ASSERT((so->so_state & SS_ACCEPTCONN) == 0);
/*
 * This assumes that the name space for DISCON_reason
 * is the errno name space.
 */
soisdisconnected(so, tpr->discon_ind.DISCON_reason);
/*
 * Unbind with the transport without blocking.
 * If we've already received a T_DISCON_IND do not unbind.
 *
 * If there is no preallocated unbind message, we have already
 * unbound with the transport
 *
 * If the socket is not bound, no need to unbind.
 *
 * Every branch below drops so_lock before continuing.
 */
mp = so->so_unbind_mp;
if (mp == NULL) {
ASSERT(!(so->so_state & SS_ISBOUND));
mutex_exit(&so->so_lock);
} else if (!(so->so_state & SS_ISBOUND)) {
mutex_exit(&so->so_lock);
} else {
/* Take the preallocated unbind message off the sonode. */
so->so_unbind_mp = NULL;
/*
 * Is another T_DISCON_IND being processed.
 */
ASSERT((so->so_flag & SOASYNC_UNBIND) == 0);
/*
 * Make strsock_proto ignore T_OK_ACK and T_ERROR_ACK for
 * this unbind. Set SOASYNC_UNBIND. This should be cleared
 * only after we receive the ACK in strsock_proto.
 */
so->so_flag |= SOASYNC_UNBIND;
ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
mutex_exit(&so->so_lock);
/*
 * Send down T_UNBIND_REQ ignoring flow control.
 * XXX Assumes that MSG_IGNFLOW implies that this thread
 * does not run service procedures.
 */
ASSERT(DB_TYPE(mp) == M_PROTO);
/*
 * Build the T_UNBIND_REQ in place.
 * NOTE(review): advancing b_wptr by sizeof (*ubr) presumes the
 * preallocated mblk is empty and large enough for a T_unbind_req;
 * confirm against the code that allocates so_unbind_mp.
 */
ubr = (struct T_unbind_req *)mp->b_rptr;
mp->b_wptr += sizeof (*ubr);
ubr->PRIM_type = T_UNBIND_REQ;
/*
 * Flush the read and write side (except stream head read queue)
 * and send down T_UNBIND_REQ.
 */
(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
/* A failure to send the unbind is only logged, not propagated. */
/* LINTED - warning: statement has no consequent: if */
if (error) {
eprintsoline(so, error);
}
}
/*
 * so_lock is not held from here until the end of the function.
 * The read-side error is only refreshed when the transport supplied
 * a non-zero disconnect reason; the write-side error and EOF are
 * always set.
 */
if (tpr->discon_ind.DISCON_reason != 0)
strsetrerror(SOTOV(so), 0, 0, sogetrderr);
strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
strseteof(SOTOV(so), 1);
/*
 * strseteof takes care of read side wakeups,
 * pollwakeups, and signals.
 */
dprintso(so, 1, ("T_DISCON_IND: error %d\n", so->so_error));
/* Last use of tpr was above; the T_DISCON_IND can be released now. */
freemsg(discon_mp);
/* The write side is handled here: poll, sleeping writers, SIGPOLL. */
pollwakeup(&stp->sd_pollist, POLLOUT);
mutex_enter(&stp->sd_lock);
/*
 * Wake sleeping write
 */
if (stp->sd_flag & WSLEEP) {
stp->sd_flag &= ~WSLEEP;
cv_broadcast(&stp->sd_wrq->q_wait);
}
/*
 * strsendsig can handle multiple signals with a
 * single call. Send SIGPOLL for S_OUTPUT event.
 */
if (stp->sd_sigflags & S_OUTPUT)
strsendsig(stp->sd_siglist, S_OUTPUT, 0, 0);
mutex_exit(&stp->sd_lock);
/* Re-acquire so_lock so it is held on return, as it was on entry. */
mutex_enter(&so->so_lock);
}
/*
 * This routine is registered with the stream head to receive M_PROTO
 * and M_PCPROTO messages.
 *
 * Returns NULL if the message was consumed.
 * Returns an mblk to make that mblk be processed (and queued) by the stream
 * head.
 *
 * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
 * *pollwakeups) for the stream head to take action on. Note that since
 * sockets always deliver SIGIO for every new piece of data this routine
 * never sets *firstmsgsigs; any signals are returned in *allmsgsigs.
 *
 * This routine handles all data related TPI messages independent of
 * the type of the socket i.e. it doesn't care if T_UNITDATA_IND message
 * arrive on a SOCK_STREAM.
 *
 * Every primitive is length-checked against its TPI structure before any
 * field beyond the primitive type is read. Pointers such as tpr, tudi and
 * tdi alias the contents of mp, so mp must not be freed until the last use
 * of those pointers (including uses in log messages).
 */
static mblk_t *
strsock_proto(vnode_t *vp, mblk_t *mp,
    strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
    strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
{
	union T_primitives *tpr;
	struct sonode *so;

	so = VTOSO(vp);

	dprintso(so, 1, ("strsock_proto(%p, %p)\n", vp, mp));

	/* Set default return values */
	*firstmsgsigs = *wakeups = *allmsgsigs = *pollwakeups = 0;

	ASSERT(DB_TYPE(mp) == M_PROTO ||
	    DB_TYPE(mp) == M_PCPROTO);

	if (MBLKL(mp) < sizeof (tpr->type)) {
		/* The message is too short to even contain the primitive */
		zcmn_err(getzoneid(), CE_WARN,
		    "sockfs: Too short TPI message received. Len = %ld\n",
		    (ptrdiff_t)(MBLKL(mp)));
		freemsg(mp);
		return (NULL);
	}
	if (!__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
		/* The read pointer is not aligned correctly for TPI */
		zcmn_err(getzoneid(), CE_WARN,
		    "sockfs: Unaligned TPI message received. rptr = %p\n",
		    (void *)mp->b_rptr);
		freemsg(mp);
		return (NULL);
	}
	tpr = (union T_primitives *)mp->b_rptr;
	dprintso(so, 1, ("strsock_proto: primitive %d\n", tpr->type));

	switch (tpr->type) {

	case T_DATA_IND:
		if (MBLKL(mp) < sizeof (struct T_data_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_DATA_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		/*
		 * Ignore zero-length T_DATA_IND messages. These might be
		 * generated by some transports.
		 * This is needed to prevent read (which skips the M_PROTO
		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
		 * on a non-blocking socket after select/poll has indicated
		 * that data is available).
		 */
		if (msgdsize(mp->b_cont) == 0) {
			dprintso(so, 0,
			    ("strsock_proto: zero length T_DATA_IND\n"));
			freemsg(mp);
			return (NULL);
		}
		*allmsgsigs = S_INPUT | S_RDNORM;
		*pollwakeups = POLLIN | POLLRDNORM;
		*wakeups = RSLEEP;
		return (mp);

	case T_UNITDATA_IND: {
		struct T_unitdata_ind *tudi = &tpr->unitdata_ind;
		void *addr;
		t_uscalar_t addrlen;

		if (MBLKL(mp) < sizeof (struct T_unitdata_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_UNITDATA_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}

		/* Is this not a connected datagram socket? */
		if ((so->so_mode & SM_CONNREQUIRED) ||
		    !(so->so_state & SS_ISCONNECTED)) {
			/*
			 * Not a connected datagram socket. Look for
			 * the SO_UNIX_CLOSE option. If such an option is found
			 * discard the message (since it has no meaning
			 * unless connected).
			 */
			if (so->so_family == AF_UNIX && msgdsize(mp) == 0 &&
			    tudi->OPT_length != 0) {
				void *opt;
				t_uscalar_t optlen = tudi->OPT_length;

				opt = sogetoff(mp, tudi->OPT_offset,
				    optlen, __TPI_ALIGN_SIZE);
				if (opt == NULL) {
					/* The len/off falls outside mp */
					mutex_enter(&so->so_lock);
					soseterror(so, EPROTO);
					mutex_exit(&so->so_lock);
					zcmn_err(getzoneid(), CE_WARN,
					    "sockfs: T_unidata_ind with "
					    "invalid optlen/offset %u/%d\n",
					    optlen, tudi->OPT_offset);
					/*
					 * tudi points into mp, so the
					 * message is freed only after the
					 * warning above has read it.
					 */
					freemsg(mp);
					return (NULL);
				}
				if (so_getopt_unix_close(opt, optlen)) {
					freemsg(mp);
					return (NULL);
				}
			}
			*allmsgsigs = S_INPUT | S_RDNORM;
			*pollwakeups = POLLIN | POLLRDNORM;
			*wakeups = RSLEEP;
#ifdef C2_AUDIT
			if (audit_active)
				audit_sock(T_UNITDATA_IND, strvp2wq(vp),
				    mp, 0);
#endif /* C2_AUDIT */
			return (mp);
		}

		/*
		 * A connect datagram socket. For AF_INET{,6} we verify that
		 * the source address matches the "connected to" address.
		 * The semantics of AF_UNIX sockets is to not verify
		 * the source address.
		 * Note that this source address verification is transport
		 * specific. Thus the real fix would be to extend TPI
		 * to allow T_CONN_REQ messages to be sent to connectionless
		 * transport providers and always let the transport provider
		 * do whatever filtering is needed.
		 *
		 * The verification/filtering semantics for transports
		 * other than AF_INET and AF_UNIX are unknown. The choice
		 * would be to either filter using bcmp or let all messages
		 * get through. This code does not filter other address
		 * families since this at least allows the application to
		 * work around any missing filtering.
		 *
		 * XXX Should we move filtering to UDP/ICMP???
		 * That would require passing e.g. a T_DISCON_REQ to UDP
		 * when the socket becomes unconnected.
		 */
		addrlen = tudi->SRC_length;
		/*
		 * The alignment restriction is really too strict but
		 * we want enough alignment to inspect the fields of
		 * a sockaddr_in.
		 */
		addr = sogetoff(mp, tudi->SRC_offset, addrlen,
		    __TPI_ALIGN_SIZE);
		if (addr == NULL) {
			mutex_enter(&so->so_lock);
			soseterror(so, EPROTO);
			mutex_exit(&so->so_lock);
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: T_unidata_ind with invalid "
			    "addrlen/offset %u/%d\n",
			    addrlen, tudi->SRC_offset);
			/* Free only after tudi (inside mp) was logged. */
			freemsg(mp);
			return (NULL);
		}

		if (so->so_family == AF_INET) {
			/*
			 * For AF_INET we allow wildcarding both sin_addr
			 * and sin_port.
			 */
			struct sockaddr_in *faddr, *sin;

			/* Prevent so_faddr_sa from changing while accessed */
			mutex_enter(&so->so_lock);
			ASSERT(so->so_faddr_len ==
			    (socklen_t)sizeof (struct sockaddr_in));
			faddr = (struct sockaddr_in *)so->so_faddr_sa;
			sin = (struct sockaddr_in *)addr;
			/*
			 * The length test comes first so that sin is only
			 * dereferenced when it is a full sockaddr_in.
			 */
			if (addrlen !=
			    (t_uscalar_t)sizeof (struct sockaddr_in) ||
			    (sin->sin_addr.s_addr != faddr->sin_addr.s_addr &&
			    faddr->sin_addr.s_addr != INADDR_ANY) ||
			    (so->so_type != SOCK_RAW &&
			    sin->sin_port != faddr->sin_port &&
			    faddr->sin_port != 0)) {
#ifdef DEBUG
				dprintso(so, 0,
				    ("sockfs: T_UNITDATA_IND mismatch: %s",
				    pr_addr(so->so_family,
				    (struct sockaddr *)addr,
				    addrlen)));
				dprintso(so, 0, (" - %s\n",
				    pr_addr(so->so_family, so->so_faddr_sa,
				    (t_uscalar_t)so->so_faddr_len)));
#endif /* DEBUG */
				mutex_exit(&so->so_lock);
				freemsg(mp);
				return (NULL);
			}
			mutex_exit(&so->so_lock);
		} else if (so->so_family == AF_INET6) {
			/*
			 * For AF_INET6 we allow wildcarding both sin6_addr
			 * and sin6_port.
			 */
			struct sockaddr_in6 *faddr6, *sin6;
			static struct in6_addr zeroes; /* inits to all zeros */

			/* Prevent so_faddr_sa from changing while accessed */
			mutex_enter(&so->so_lock);
			ASSERT(so->so_faddr_len ==
			    (socklen_t)sizeof (struct sockaddr_in6));
			faddr6 = (struct sockaddr_in6 *)so->so_faddr_sa;
			sin6 = (struct sockaddr_in6 *)addr;
			/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
			if (addrlen !=
			    (t_uscalar_t)sizeof (struct sockaddr_in6) ||
			    (!IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
			    &faddr6->sin6_addr) &&
			    !IN6_ARE_ADDR_EQUAL(&faddr6->sin6_addr, &zeroes)) ||
			    (so->so_type != SOCK_RAW &&
			    sin6->sin6_port != faddr6->sin6_port &&
			    faddr6->sin6_port != 0)) {
#ifdef DEBUG
				dprintso(so, 0,
				    ("sockfs: T_UNITDATA_IND mismatch: %s",
				    pr_addr(so->so_family,
				    (struct sockaddr *)addr,
				    addrlen)));
				dprintso(so, 0, (" - %s\n",
				    pr_addr(so->so_family, so->so_faddr_sa,
				    (t_uscalar_t)so->so_faddr_len)));
#endif /* DEBUG */
				mutex_exit(&so->so_lock);
				freemsg(mp);
				return (NULL);
			}
			mutex_exit(&so->so_lock);
		} else if (so->so_family == AF_UNIX &&
		    msgdsize(mp->b_cont) == 0 &&
		    tudi->OPT_length != 0) {
			/*
			 * Attempt to extract AF_UNIX
			 * SO_UNIX_CLOSE indication from options.
			 */
			void *opt;
			t_uscalar_t optlen = tudi->OPT_length;

			opt = sogetoff(mp, tudi->OPT_offset,
			    optlen, __TPI_ALIGN_SIZE);
			if (opt == NULL) {
				/* The len/off falls outside mp */
				mutex_enter(&so->so_lock);
				soseterror(so, EPROTO);
				mutex_exit(&so->so_lock);
				zcmn_err(getzoneid(), CE_WARN,
				    "sockfs: T_unidata_ind with invalid "
				    "optlen/offset %u/%d\n",
				    optlen, tudi->OPT_offset);
				/* Free only after tudi (inside mp) was logged. */
				freemsg(mp);
				return (NULL);
			}
			/*
			 * If we received a unix close indication mark the
			 * socket and discard this message.
			 */
			if (so_getopt_unix_close(opt, optlen)) {
				mutex_enter(&so->so_lock);
				sobreakconn(so, ECONNRESET);
				mutex_exit(&so->so_lock);
				strsetrerror(SOTOV(so), 0, 0, sogetrderr);
				freemsg(mp);
				*pollwakeups = POLLIN | POLLRDNORM;
				*allmsgsigs = S_INPUT | S_RDNORM;
				*wakeups = RSLEEP;
				return (NULL);
			}
		}
		*allmsgsigs = S_INPUT | S_RDNORM;
		*pollwakeups = POLLIN | POLLRDNORM;
		*wakeups = RSLEEP;
		return (mp);
	}

	case T_OPTDATA_IND: {
		struct T_optdata_ind *tdi = &tpr->optdata_ind;

		if (MBLKL(mp) < sizeof (struct T_optdata_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_OPTDATA_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		/*
		 * Allow zero-length messages carrying options.
		 * This is used when carrying the SO_UNIX_CLOSE option.
		 */
		if (so->so_family == AF_UNIX && msgdsize(mp->b_cont) == 0 &&
		    tdi->OPT_length != 0) {
			/*
			 * Attempt to extract AF_UNIX close indication
			 * from the options. Ignore any other options -
			 * those are handled once the message is removed
			 * from the queue.
			 * The close indication message should not carry data.
			 */
			void *opt;
			t_uscalar_t optlen = tdi->OPT_length;

			opt = sogetoff(mp, tdi->OPT_offset,
			    optlen, __TPI_ALIGN_SIZE);
			if (opt == NULL) {
				/* The len/off falls outside mp */
				mutex_enter(&so->so_lock);
				soseterror(so, EPROTO);
				mutex_exit(&so->so_lock);
				zcmn_err(getzoneid(), CE_WARN,
				    "sockfs: T_optdata_ind with invalid "
				    "optlen/offset %u/%d\n",
				    optlen, tdi->OPT_offset);
				/* Free only after tdi (inside mp) was logged. */
				freemsg(mp);
				return (NULL);
			}
			/*
			 * If we received a close indication mark the
			 * socket and discard this message.
			 */
			if (so_getopt_unix_close(opt, optlen)) {
				mutex_enter(&so->so_lock);
				socantsendmore(so);
				mutex_exit(&so->so_lock);
				strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
				freemsg(mp);
				return (NULL);
			}
		}
		*allmsgsigs = S_INPUT | S_RDNORM;
		*pollwakeups = POLLIN | POLLRDNORM;
		*wakeups = RSLEEP;
		return (mp);
	}

	case T_EXDATA_IND: {
		mblk_t *mctl, *mdata;

		if (MBLKL(mp) < sizeof (struct T_exdata_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_EXDATA_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		/*
		 * Ignore zero-length T_EXDATA_IND messages. These might be
		 * generated by some transports.
		 *
		 * This is needed to prevent read (which skips the M_PROTO
		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
		 * on a non-blocking socket after select/poll has indicated
		 * that data is available).
		 */
		dprintso(so, 1,
		    ("T_EXDATA_IND(%p): counts %d/%d state %s\n",
		    vp, so->so_oobsigcnt, so->so_oobcnt,
		    pr_state(so->so_state, so->so_mode)));

		if (msgdsize(mp->b_cont) == 0) {
			dprintso(so, 0,
			    ("strsock_proto: zero length T_EXDATA_IND\n"));
			freemsg(mp);
			return (NULL);
		}

		/*
		 * Split into the T_EXDATA_IND and the M_DATA part.
		 * We process these three pieces separately:
		 *	signal generation
		 *	handling T_EXDATA_IND
		 *	handling M_DATA component
		 */
		mctl = mp;
		mdata = mctl->b_cont;
		mctl->b_cont = NULL;
		mutex_enter(&so->so_lock);
		so_oob_sig(so, 0, allmsgsigs, pollwakeups);
		mctl = so_oob_exdata(so, mctl, allmsgsigs, pollwakeups);
		mdata = so_oob_data(so, mdata, allmsgsigs, pollwakeups);

		/*
		 * Pass the T_EXDATA_IND and the M_DATA back separately
		 * by using b_next linkage. (The stream head will queue any
		 * b_next linked messages separately.) This is needed
		 * since MSGMARK applies to the last byte of the message
		 * hence we can not have any M_DATA component attached
		 * to the marked T_EXDATA_IND. Note that the stream head
		 * will not consolidate M_DATA messages onto an MSGMARK'ed
		 * message in order to preserve the constraint that
		 * the T_EXDATA_IND always is a separate message.
		 */
		ASSERT(mctl != NULL);
		mctl->b_next = mdata;
		mp = mctl;
#ifdef DEBUG
		if (mdata == NULL) {
			dprintso(so, 1,
			    ("after outofline T_EXDATA_IND(%p): "
			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
			    vp, so->so_oobsigcnt,
			    so->so_oobcnt, *pollwakeups, *allmsgsigs,
			    pr_state(so->so_state, so->so_mode)));
		} else {
			dprintso(so, 1,
			    ("after inline T_EXDATA_IND(%p): "
			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
			    vp, so->so_oobsigcnt,
			    so->so_oobcnt, *pollwakeups, *allmsgsigs,
			    pr_state(so->so_state, so->so_mode)));
		}
#endif /* DEBUG */
		mutex_exit(&so->so_lock);
		*wakeups = RSLEEP;
		return (mp);
	}

	case T_CONN_CON: {
		struct T_conn_con *conn_con;
		void *addr;
		t_uscalar_t addrlen;

		/*
		 * Verify the state, update the state to ISCONNECTED,
		 * record the potentially new address in the message,
		 * and drop the message.
		 */
		if (MBLKL(mp) < sizeof (struct T_conn_con)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_CONN_CON. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}

		mutex_enter(&so->so_lock);
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
		    SS_ISCONNECTING) {
			mutex_exit(&so->so_lock);
			dprintso(so, 1,
			    ("T_CONN_CON: state %x\n", so->so_state));
			freemsg(mp);
			return (NULL);
		}

		conn_con = &tpr->conn_con;
		addrlen = conn_con->RES_length;
		/*
		 * Allow the address to be of different size than sent down
		 * in the T_CONN_REQ as long as it doesn't exceed the maxlen.
		 * For AF_UNIX require the identical length.
		 */
		if (so->so_family == AF_UNIX ?
		    addrlen != (t_uscalar_t)sizeof (so->so_ux_laddr) :
		    addrlen > (t_uscalar_t)so->so_faddr_maxlen) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: T_conn_con with different "
			    "length %u/%d\n",
			    addrlen, conn_con->RES_length);
			soisdisconnected(so, EPROTO);
			mutex_exit(&so->so_lock);
			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
			strseteof(SOTOV(so), 1);
			freemsg(mp);
			/*
			 * strseteof takes care of read side wakeups,
			 * pollwakeups, and signals.
			 */
			*wakeups = WSLEEP;
			*allmsgsigs = S_OUTPUT;
			*pollwakeups = POLLOUT;
			return (NULL);
		}
		addr = sogetoff(mp, conn_con->RES_offset, addrlen, 1);
		if (addr == NULL) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: T_conn_con with invalid "
			    "addrlen/offset %u/%d\n",
			    addrlen, conn_con->RES_offset);
			/*
			 * Mark the socket disconnected with EPROTO, as the
			 * length-mismatch path above does, so that it does
			 * not remain in SS_ISCONNECTING after the stream
			 * has been put into the EOF state below.
			 */
			soisdisconnected(so, EPROTO);
			mutex_exit(&so->so_lock);
			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
			strseteof(SOTOV(so), 1);
			freemsg(mp);
			/*
			 * strseteof takes care of read side wakeups,
			 * pollwakeups, and signals.
			 */
			*wakeups = WSLEEP;
			*allmsgsigs = S_OUTPUT;
			*pollwakeups = POLLOUT;
			return (NULL);
		}

		/*
		 * Save for getpeername.
		 */
		if (so->so_family != AF_UNIX) {
			so->so_faddr_len = (socklen_t)addrlen;
			ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
			bcopy(addr, so->so_faddr_sa, addrlen);
			so->so_state |= SS_FADDR_VALID;
		}

		/* Replace any old peer credential with the one in mp. */
		if (so->so_peercred != NULL)
			crfree(so->so_peercred);
		so->so_peercred = DB_CRED(mp);
		so->so_cpid = DB_CPID(mp);
		if (so->so_peercred != NULL)
			crhold(so->so_peercred);

		/* Wakeup anybody sleeping in sowaitconnected */
		soisconnected(so);
		mutex_exit(&so->so_lock);

		/*
		 * The socket is now available for sending data.
		 */
		*wakeups = WSLEEP;
		*allmsgsigs = S_OUTPUT;
		*pollwakeups = POLLOUT;
		freemsg(mp);
		return (NULL);
	}

	/*
	 * Extra processing in case of an SSL proxy, before queuing or
	 * forwarding to the fallback endpoint
	 */
	case T_SSL_PROXY_CONN_IND:
	case T_CONN_IND:
		/*
		 * Verify the min size and queue the message on
		 * the so_conn_ind_head/tail list.
		 */
		if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_CONN_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}

#ifdef C2_AUDIT
		if (audit_active)
			audit_sock(T_CONN_IND, strvp2wq(vp), mp, 0);
#endif /* C2_AUDIT */
		if (!(so->so_state & SS_ACCEPTCONN)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: T_conn_ind on non-listening socket\n");
			freemsg(mp);
			return (NULL);
		}

		if (tpr->type == T_SSL_PROXY_CONN_IND && mp->b_cont == NULL) {
			/* No context: need to fall back */
			struct sonode *fbso;
			stdata_t *fbstp;

			tpr->type = T_CONN_IND;

			fbso = kssl_find_fallback(so->so_kssl_ent);

			/*
			 * No fallback: the remote will timeout and
			 * disconnect.
			 */
			if (fbso == NULL) {
				freemsg(mp);
				return (NULL);
			}
			fbstp = SOTOV(fbso)->v_stream;
			qreply(fbstp->sd_wrq->q_next, mp);
			return (NULL);
		}
		soqueueconnind(so, mp);
		*allmsgsigs = S_INPUT | S_RDNORM;
		*pollwakeups = POLLIN | POLLRDNORM;
		*wakeups = RSLEEP;
		return (NULL);

	case T_ORDREL_IND:
		if (MBLKL(mp) < sizeof (struct T_ordrel_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_ORDREL_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}

		/*
		 * Some providers send this when not fully connected.
		 * SunLink X.25 needs to retrieve disconnect reason after
		 * disconnect for compatibility. It uses T_ORDREL_IND
		 * instead of T_DISCON_IND so that it may use the
		 * endpoint after a connect failure to retrieve the
		 * reason using an ioctl. Thus we explicitly clear
		 * SS_ISCONNECTING here for SunLink X.25.
		 * This is a needed TPI violation.
		 */
		mutex_enter(&so->so_lock);
		so->so_state &= ~SS_ISCONNECTING;
		socantrcvmore(so);
		mutex_exit(&so->so_lock);
		strseteof(SOTOV(so), 1);
		/*
		 * strseteof takes care of read side wakeups,
		 * pollwakeups, and signals.
		 */
		freemsg(mp);
		return (NULL);

	case T_DISCON_IND:
		if (MBLKL(mp) < sizeof (struct T_discon_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_DISCON_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		if (so->so_state & SS_ACCEPTCONN) {
			/*
			 * This is a listener. Look for a queued T_CONN_IND
			 * with a matching sequence number and remove it
			 * from the list.
			 * It is normal to not find the sequence number since
			 * the soaccept might have already dequeued it
			 * (in which case the T_CONN_RES will fail with
			 * TBADSEQ).
			 */
			(void) soflushconnind(so, tpr->discon_ind.SEQ_number);
			freemsg(mp);
			return (NULL);
		}

		/*
		 * Not a listener
		 *
		 * If SS_CANTRCVMORE for AF_UNIX ignore the discon_reason.
		 * Such a discon_ind appears when the peer has first done
		 * a shutdown() followed by a close() in which case we just
		 * want to record socantsendmore.
		 * In this case sockfs first receives a T_ORDREL_IND followed
		 * by a T_DISCON_IND.
		 * Note that for other transports (e.g. TCP) we need to handle
		 * the discon_ind in this case since it signals an error.
		 */
		mutex_enter(&so->so_lock);
		if ((so->so_state & SS_CANTRCVMORE) &&
		    (so->so_family == AF_UNIX)) {
			socantsendmore(so);
			mutex_exit(&so->so_lock);
			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
			dprintso(so, 1,
			    ("T_DISCON_IND: error %d\n", so->so_error));
			freemsg(mp);
			/*
			 * Set these variables for caller to process them.
			 * For the else part where T_DISCON_IND is processed,
			 * this will be done in the function being called
			 * (strsock_discon_ind())
			 */
			*wakeups = WSLEEP;
			*allmsgsigs = S_OUTPUT;
			*pollwakeups = POLLOUT;
		} else if (so->so_flag & (SOASYNC_UNBIND | SOLOCKED)) {
			/*
			 * Deferred processing of T_DISCON_IND
			 */
			so_save_discon_ind(so, mp, strsock_discon_ind);
			mutex_exit(&so->so_lock);
		} else {
			/*
			 * Process T_DISCON_IND now
			 */
			(void) strsock_discon_ind(so, mp);
			mutex_exit(&so->so_lock);
		}
		return (NULL);

	case T_UDERROR_IND: {
		struct T_uderror_ind *tudi = &tpr->uderror_ind;
		void *addr;
		t_uscalar_t addrlen;
		int error;

		if (MBLKL(mp) < sizeof (struct T_uderror_ind)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_UDERROR_IND. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		/* ERROR_type is only read once the length is verified. */
		dprintso(so, 0,
		    ("T_UDERROR_IND: error %d\n", tudi->ERROR_type));

		/* Ignore on connection-oriented transports */
		if (so->so_mode & SM_CONNREQUIRED) {
			freemsg(mp);
			eprintsoline(so, 0);
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: T_uderror_ind on connection-oriented "
			    "transport\n");
			return (NULL);
		}
		addrlen = tudi->DEST_length;
		addr = sogetoff(mp, tudi->DEST_offset, addrlen, 1);
		if (addr == NULL) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: T_uderror_ind with invalid "
			    "addrlen/offset %u/%d\n",
			    addrlen, tudi->DEST_offset);
			freemsg(mp);
			return (NULL);
		}

		/* Verify source address for connected socket. */
		mutex_enter(&so->so_lock);
		if (so->so_state & SS_ISCONNECTED) {
			void *faddr;
			t_uscalar_t faddr_len;
			boolean_t match = B_FALSE;

			switch (so->so_family) {
			case AF_INET: {
				/* Compare just IP address and port */
				struct sockaddr_in *sin1, *sin2;

				sin1 = (struct sockaddr_in *)so->so_faddr_sa;
				sin2 = (struct sockaddr_in *)addr;
				if (addrlen == sizeof (struct sockaddr_in) &&
				    sin1->sin_port == sin2->sin_port &&
				    sin1->sin_addr.s_addr ==
				    sin2->sin_addr.s_addr)
					match = B_TRUE;
				break;
			}
			case AF_INET6: {
				/* Compare just IP address and port. Not flow */
				struct sockaddr_in6 *sin1, *sin2;

				sin1 = (struct sockaddr_in6 *)so->so_faddr_sa;
				sin2 = (struct sockaddr_in6 *)addr;
				if (addrlen == sizeof (struct sockaddr_in6) &&
				    sin1->sin6_port == sin2->sin6_port &&
				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
				    &sin2->sin6_addr))
					match = B_TRUE;
				break;
			}
			case AF_UNIX:
				faddr = &so->so_ux_faddr;
				faddr_len =
				    (t_uscalar_t)sizeof (so->so_ux_faddr);
				if (faddr_len == addrlen &&
				    bcmp(addr, faddr, addrlen) == 0)
					match = B_TRUE;
				break;
			default:
				faddr = so->so_faddr_sa;
				faddr_len = (t_uscalar_t)so->so_faddr_len;
				if (faddr_len == addrlen &&
				    bcmp(addr, faddr, addrlen) == 0)
					match = B_TRUE;
				break;
			}

			if (!match) {
#ifdef DEBUG
				dprintso(so, 0,
				    ("sockfs: T_UDERR_IND mismatch: %s - ",
				    pr_addr(so->so_family,
				    (struct sockaddr *)addr,
				    addrlen)));
				dprintso(so, 0, ("%s\n",
				    pr_addr(so->so_family, so->so_faddr_sa,
				    so->so_faddr_len)));
#endif /* DEBUG */
				mutex_exit(&so->so_lock);
				freemsg(mp);
				return (NULL);
			}
			/*
			 * Make the write error nonpersistent. If the error
			 * is zero we use ECONNRESET.
			 * This assumes that the name space for ERROR_type
			 * is the errno name space.
			 */
			if (tudi->ERROR_type != 0)
				error = tudi->ERROR_type;
			else
				error = ECONNRESET;

			soseterror(so, error);
			mutex_exit(&so->so_lock);
			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
			*wakeups = RSLEEP | WSLEEP;
			*allmsgsigs = S_INPUT | S_RDNORM | S_OUTPUT;
			*pollwakeups = POLLIN | POLLRDNORM | POLLOUT;
			freemsg(mp);
			return (NULL);
		}
		/*
		 * If the application asked for delayed errors
		 * record the T_UDERROR_IND so_eaddr_mp and the reason in
		 * so_delayed_error for delayed error posting. If the reason
		 * is zero use ECONNRESET.
		 * Note that delayed error indications do not make sense for
		 * AF_UNIX sockets since sendto checks that the destination
		 * address is valid at the time of the sendto.
		 */
		if (!(so->so_options & SO_DGRAM_ERRIND)) {
			mutex_exit(&so->so_lock);
			freemsg(mp);
			return (NULL);
		}
		/* The sonode keeps mp from here on; it is not freed. */
		if (so->so_eaddr_mp != NULL)
			freemsg(so->so_eaddr_mp);

		so->so_eaddr_mp = mp;
		if (tudi->ERROR_type != 0)
			error = tudi->ERROR_type;
		else
			error = ECONNRESET;
		so->so_delayed_error = (ushort_t)error;
		mutex_exit(&so->so_lock);
		return (NULL);
	}

	case T_ERROR_ACK:
		if (MBLKL(mp) < sizeof (struct T_error_ack)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_ERROR_ACK. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		/* The ack fields are only read once the length is verified. */
		dprintso(so, 0,
		    ("strsock_proto: T_ERROR_ACK for %d, error %d/%d\n",
		    tpr->error_ack.ERROR_prim,
		    tpr->error_ack.TLI_error,
		    tpr->error_ack.UNIX_error));

		/*
		 * Check if we were waiting for the async message
		 */
		mutex_enter(&so->so_lock);
		if ((so->so_flag & SOASYNC_UNBIND) &&
		    tpr->error_ack.ERROR_prim == T_UNBIND_REQ) {
			so_unlock_single(so, SOASYNC_UNBIND);
			mutex_exit(&so->so_lock);
			freemsg(mp);
			return (NULL);
		}
		mutex_exit(&so->so_lock);
		soqueueack(so, mp);
		return (NULL);

	case T_OK_ACK:
		if (MBLKL(mp) < sizeof (struct T_ok_ack)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_OK_ACK. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		/*
		 * Check if we were waiting for the async message
		 */
		mutex_enter(&so->so_lock);
		if ((so->so_flag & SOASYNC_UNBIND) &&
		    tpr->ok_ack.CORRECT_prim == T_UNBIND_REQ) {
			dprintso(so, 1,
			    ("strsock_proto: T_OK_ACK async unbind\n"));
			so_unlock_single(so, SOASYNC_UNBIND);
			mutex_exit(&so->so_lock);
			freemsg(mp);
			return (NULL);
		}
		mutex_exit(&so->so_lock);
		soqueueack(so, mp);
		return (NULL);

	case T_INFO_ACK:
		if (MBLKL(mp) < sizeof (struct T_info_ack)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_INFO_ACK. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		soqueueack(so, mp);
		return (NULL);

	case T_CAPABILITY_ACK:
		/*
		 * A T_capability_ack need only be large enough to hold
		 * the PRIM_type and CAP_bits1 fields; checking for anything
		 * larger might reject a correct response from an older
		 * provider.
		 */
		if (MBLKL(mp) < 2 * sizeof (t_uscalar_t)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_CAPABILITY_ACK. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		soqueueack(so, mp);
		return (NULL);

	case T_BIND_ACK:
		if (MBLKL(mp) < sizeof (struct T_bind_ack)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_BIND_ACK. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		soqueueack(so, mp);
		return (NULL);

	case T_OPTMGMT_ACK:
		if (MBLKL(mp) < sizeof (struct T_optmgmt_ack)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "sockfs: Too short T_OPTMGMT_ACK. Len = %ld\n",
			    (ptrdiff_t)(MBLKL(mp)));
			freemsg(mp);
			return (NULL);
		}
		soqueueack(so, mp);
		return (NULL);

	default:
#ifdef DEBUG
		zcmn_err(getzoneid(), CE_WARN,
		    "sockfs: unknown TPI primitive %d received\n",
		    tpr->type);
#endif /* DEBUG */
		freemsg(mp);
		return (NULL);
	}
}
/*
 * Stream head hook for messages that are neither data nor M_PROTO or
 * M_PCPROTO.
 *
 * A NULL return means the message was consumed here.  A non-NULL return
 * hands that mblk back to the stream head for its own processing.
 *
 * The out parameters (*wakeups, *firstmsgsigs, *allmsgsigs and
 * *pollwakeups) tell the stream head which actions to take; all four are
 * cleared on entry and only the M_PCSIG/SIGURG case may set any of them.
 */
static mblk_t *
strsock_misc(vnode_t *vp, mblk_t *mp,
    strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
    strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
{
	struct sonode *so = VTOSO(vp);

	dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
	    vp, mp, DB_TYPE(mp)));

	/* Start from "no action requested". */
	*wakeups = 0;
	*allmsgsigs = 0;
	*firstmsgsigs = 0;
	*pollwakeups = 0;

	switch (DB_TYPE(mp)) {
	case M_PCSIG:
		/*
		 * The M_PCSIG for urgent data is assumed to arrive ahead
		 * of the T_EXDATA_IND it belongs to.
		 *
		 * Note: as in SunOS 4.X and 4.4BSD, a poll is woken up
		 * before the urgent data itself shows up.  With OOBINLINE
		 * this can make select report only an exception rather
		 * than except|read.
		 */
		if (*mp->b_rptr == SIGURG) {
			mutex_enter(&so->so_lock);
			dprintso(so, 1,
			    ("SIGURG(%p): counts %d/%d state %s\n",
			    vp, so->so_oobsigcnt,
			    so->so_oobcnt,
			    pr_state(so->so_state, so->so_mode)));
			so_oob_sig(so, 1, allmsgsigs, pollwakeups);
			dprintso(so, 1,
			    ("after SIGURG(%p): counts %d/%d "
			    " poll 0x%x sig 0x%x state %s\n",
			    vp, so->so_oobsigcnt,
			    so->so_oobcnt, *pollwakeups, *allmsgsigs,
			    pr_state(so->so_state, so->so_mode)));
			mutex_exit(&so->so_lock);
		}
		break;

	case M_SIG:
	case M_HANGUP:
	case M_UNHANGUP:
	case M_ERROR:
		/* These message types are dropped without any action. */
		break;

	case M_FLUSH:
		/*
		 * The read queue is never flushed: if the M_FLUSH precedes
		 * a T_discon_ind, socket semantics require that data which
		 * is already queued stays readable.  A write-side flush is
		 * passed on with the read-flush bit removed.
		 */
		if (*mp->b_rptr & FLUSHW) {
			*mp->b_rptr &= ~FLUSHR;
			return (mp);
		}
		break;

	default:
		/* Anything else is left to the stream head. */
		return (mp);
	}

	/* Every case that breaks out of the switch consumed the message. */
	freemsg(mp);
	return (NULL);
}
/*
 * Register pid/pgrp `pgrp' to be signalled for the stream events in
 * `events' by issuing a kernel-to-kernel I_ESETSIG ioctl on vp.
 *
 * Must be called without so_lock held.  Returns whatever strioctl()
 * returns.
 */
int
so_set_asyncsigs(vnode_t *vp, pid_t pgrp, int events, int mode, cred_t *cr)
{
	struct strsigset sigreq;
	int32_t ioc_rval;
	int error;

	/*
	 * Callers hold SOLOCKED rather than so_lock here, with the
	 * exception of the call made from soaccept().
	 */
	ASSERT(!mutex_owned(&VTOSO(vp)->so_lock));

	sigreq.ss_events = events;
	sigreq.ss_pid = pgrp;

	error = strioctl(vp, I_ESETSIG, (intptr_t)&sigreq, mode, K_TO_K, cr,
	    &ioc_rval);
	return (error);
}
/*
 * Register so->so_pgrp for the set of stream events that corresponds to
 * the socket's current SS_ASYNC setting.  Returns the result of
 * so_set_asyncsigs().
 */
int
so_set_events(struct sonode *so, vnode_t *vp, cred_t *cr)
{
	/* Urgent-data events are always of interest ... */
	int events = S_RDBAND | S_BANDURG;

	/* ... normal read/write events only while SS_ASYNC is set. */
	if (so->so_state & SS_ASYNC)
		events |= S_RDNORM | S_OUTPUT;

	return (so_set_asyncsigs(vp, so->so_pgrp, events, 0, cr));
}
/*
 * Toggle SS_ASYNC in so_state.  If a signal owner (so_pgrp) is set, its
 * event registration is first updated to match the new flag value; when
 * that update fails the flag is left untouched and the error is returned.
 *
 * Called with so_lock held.  so_lock is dropped (with SOLOCKED held
 * instead) around the registration call and is held again on return.
 * Returns 0 on success.
 */
int
so_flip_async(struct sonode *so, vnode_t *vp, int mode, cred_t *cr)
{
	int error = 0;

	ASSERT(mutex_owned(&so->so_lock));

	if (so->so_pgrp != 0) {
		/*
		 * Build the event mask for the state we are about to enter:
		 * SS_ASYNC currently clear means it is being turned on, so
		 * the normal read/write events are added.
		 */
		int events = S_RDBAND | S_BANDURG;

		if (!(so->so_state & SS_ASYNC))
			events |= S_RDNORM | S_OUTPUT;

		/* so_set_asyncsigs() must not be called with so_lock held */
		so_lock_single(so);
		mutex_exit(&so->so_lock);

		error = so_set_asyncsigs(vp, so->so_pgrp, events, mode, cr);

		mutex_enter(&so->so_lock);
		so_unlock_single(so, SOLOCKED);
	}

	if (error == 0)
		so->so_state ^= SS_ASYNC;

	return (error);
}
/*
 * Set new pid/pgrp for SIGPOLL (or SIGIO for FIOASYNC mode), replacing
 * any existing one. If passed zero, just clear the existing one.
 *
 * Called with so_lock held.  so_lock is dropped (with SOLOCKED held
 * instead) around the registration calls and is held again on return.
 *
 * Returns 0 on success, in which case so_pgrp has been updated to `pgrp'.
 * If registering the new owner fails, that error is returned and so_pgrp
 * as well as the previous registration are left alone.  Failure to remove
 * the previous owner is logged but otherwise ignored.
 */
int
so_set_siggrp(struct sonode *so, vnode_t *vp, pid_t pgrp, int mode, cred_t *cr)
{
	int events = so->so_state & SS_ASYNC ?
	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
	    S_RDBAND | S_BANDURG;
	int error = 0;

	ASSERT(mutex_owned(&so->so_lock));

	/*
	 * Change socket process (group).
	 *
	 * strioctl (via so_set_asyncsigs) will perform permission check and
	 * also keep a PID_HOLD to prevent the pid from being reused.
	 */
	so_lock_single(so);
	mutex_exit(&so->so_lock);

	if (pgrp != 0) {
		dprintso(so, 1, ("setown: adding pgrp %d ev 0x%x\n",
		    pgrp, events));
		error = so_set_asyncsigs(vp, pgrp, events, mode, cr);
		if (error != 0) {
			eprintsoline(so, error);
			goto done;
		}
	}

	/*
	 * Remove the previously registered process/group.
	 *
	 * This is skipped when the caller re-set the owner that is already
	 * in place: the call above has just (re)registered that very id,
	 * and removing it here would leave so_pgrp naming an owner that is
	 * no longer registered for any events.
	 */
	if (so->so_pgrp != 0 && so->so_pgrp != pgrp) {
		dprintso(so, 1, ("setown: removing pgrp %d\n", so->so_pgrp));
		error = so_set_asyncsigs(vp, so->so_pgrp, 0, mode, cr);
		if (error != 0) {
			/* Best effort only; do not fail the whole request. */
			eprintsoline(so, error);
			error = 0;
		}
	}

done:
	mutex_enter(&so->so_lock);
	so_unlock_single(so, SOLOCKED);
	if (error == 0)
		so->so_pgrp = pgrp;
	return (error);
}
/*
 * Translate a TLI(/XTI) error into a system error as best we can.
 *
 * The table is positional: tlitosyserr() uses the TLI error number directly
 * as the index, so entry N holds the errno returned for TLI error N.  The
 * comment on each entry names the TLI/XTI error that slot stands for.
 * Entries must therefore never be reordered, and a new error can only be
 * appended at the end.
 */
static const int tli_errs[] = {
0, /* no error */
EADDRNOTAVAIL, /* TBADADDR */
ENOPROTOOPT, /* TBADOPT */
EACCES, /* TACCES */
EBADF, /* TBADF */
EADDRNOTAVAIL, /* TNOADDR */
EPROTO, /* TOUTSTATE */
ECONNABORTED, /* TBADSEQ */
0, /* TSYSERR - will never get (tlitosyserr() asserts against it) */
EPROTO, /* TLOOK - should never be sent by transport */
EMSGSIZE, /* TBADDATA */
EMSGSIZE, /* TBUFOVFLW */
EPROTO, /* TFLOW */
EWOULDBLOCK, /* TNODATA */
EPROTO, /* TNODIS */
EPROTO, /* TNOUDERR */
EINVAL, /* TBADFLAG */
EPROTO, /* TNOREL */
EOPNOTSUPP, /* TNOTSUPPORT */
EPROTO, /* TSTATECHNG */
/* following represent error namespace expansion with XTI */
EPROTO, /* TNOSTRUCTYPE - never sent by transport */
EPROTO, /* TBADNAME - never sent by transport */
EPROTO, /* TBADQLEN - never sent by transport */
EADDRINUSE, /* TADDRBUSY */
EBADF, /* TINDOUT */
EBADF, /* TPROVMISMATCH */
EBADF, /* TRESQLEN */
EBADF, /* TRESADDR */
EPROTO, /* TQFULL - never sent by transport */
EPROTO, /* TPROTO */
};
/*
 * Map the TLI/XTI error number `terr' to an errno value by looking it up
 * in tli_errs[].  Any value that does not index into the table yields
 * EPROTO.  TSYSERR must not be passed in (asserted).
 */
static int
tlitosyserr(int terr)
{
	const size_t nerrs = sizeof (tli_errs) / sizeof (tli_errs[0]);

	ASSERT(terr != TSYSERR);

	/*
	 * Compare as unsigned so that a negative terr turns into a huge
	 * value and is rejected along with the too-large ones.
	 */
	if ((size_t)terr < nerrs)
		return (tli_errs[terr]);

	return (EPROTO);
}